diff --git a/SpecForge-ext/.github/ISSUE_TEMPLATE/1-bug-report.yaml b/SpecForge-ext/.github/ISSUE_TEMPLATE/1-bug-report.yaml new file mode 100644 index 0000000000000000000000000000000000000000..41fa058c4aff03bdeb9e04b06e5d2129fd2e57f1 --- /dev/null +++ b/SpecForge-ext/.github/ISSUE_TEMPLATE/1-bug-report.yaml @@ -0,0 +1,38 @@ +name: 🐞 Bug report +description: Create a report to help us reproduce and fix the bug +title: "[Bug] " +labels: ['Bug'] + +body: +- type: checkboxes + attributes: + label: Checklist + options: + - label: 1. I have searched related issues but cannot get the expected help. + - label: 2. The bug has not been fixed in the latest version. + - label: 3. Please note that if the bug-related issue you submitted lacks corresponding environment info and a minimal reproducible demo, it will be challenging for us to reproduce and resolve the issue, reducing the likelihood of receiving feedback. + - label: 4. If the issue you raised is not a bug but a question, please raise a discussion at https://github.com/sgl-project/SpecForge/discussions/new/choose Otherwise, it will be closed. + - label: 5. Please use English, otherwise it will be closed. +- type: textarea + attributes: + label: Describe the bug + description: A clear and concise description of what the bug is. + validations: + required: true +- type: textarea + attributes: + label: Reproduction + description: | + What command or script did you run? Which **model** are you using? + placeholder: | + A placeholder for the command. + validations: + required: true +- type: textarea + attributes: + label: Environment + description: | + Please provide necessary environment information here. Otherwise the issue will be closed. + placeholder: Environment here. 
+ validations: + required: true diff --git a/SpecForge-ext/.github/ISSUE_TEMPLATE/2-feature-request.yaml b/SpecForge-ext/.github/ISSUE_TEMPLATE/2-feature-request.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a6fc81989429af5d3096a389db9adb6ec3993d60 --- /dev/null +++ b/SpecForge-ext/.github/ISSUE_TEMPLATE/2-feature-request.yaml @@ -0,0 +1,23 @@ +name: 🚀 Feature request +description: Suggest an idea for this project +title: "[Feature] " + +body: +- type: checkboxes + attributes: + label: Checklist + options: + - label: 1. If the issue you raised is not a feature but a question, please raise a discussion at https://github.com/sgl-project/SpecForge/discussions/new/choose Otherwise, it will be closed. + - label: 2. Please use English, otherwise it will be closed. +- type: textarea + attributes: + label: Motivation + description: | + A clear and concise description of the motivation of the feature. + validations: + required: true +- type: textarea + attributes: + label: Related resources + description: | + If there is an official code release or third-party implementations, please also provide the information here, which would be very helpful. 
diff --git a/SpecForge-ext/.github/workflows/lint.yaml b/SpecForge-ext/.github/workflows/lint.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3cf35a6be5986ecd8e9f90cef12a75438e8401d6 --- /dev/null +++ b/SpecForge-ext/.github/workflows/lint.yaml @@ -0,0 +1,22 @@ +name: Lint + +on: [ pull_request ] + +jobs: + lint: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.11' + + - name: Install pre-commit hook + run: | + python -m pip install pre-commit + pre-commit install + + - name: Linting + run: pre-commit run --all-files --show-diff-on-failure diff --git a/SpecForge-ext/.github/workflows/publish_docs.yaml b/SpecForge-ext/.github/workflows/publish_docs.yaml new file mode 100644 index 0000000000000000000000000000000000000000..27f4639d2eb35474f4865f57f9031e18df722942 --- /dev/null +++ b/SpecForge-ext/.github/workflows/publish_docs.yaml @@ -0,0 +1,72 @@ +name: Release Documentation + +on: + push: + branches: + - main + paths: + - "docs/**" + - "version.txt" + workflow_dispatch: + +concurrency: + group: release-docs-${{ github.ref }} + cancel-in-progress: true + +jobs: + deploy-github-pages: + runs-on: ubuntu-latest + if: github.repository == 'sgl-project/specforge' || github.repository == 'sleepcoo/SpecForge' + permissions: + contents: write + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: '3.13' + + - name: Setup Node.js + uses: actions/setup-node@v4 + with: + node-version: '20' + cache: 'npm' + cache-dependency-path: docs/spec_bundle/package-lock.json + + - name: Install dependencies + run: | + sudo apt-get update && sudo apt-get install -y pandoc parallel retry + pip install -r docs/requirements.txt + + - name: Build spec bundle dashboard + run: | + # Copy logos to public directory + cp assets/logo.png docs/spec_bundle/public/logo.png + cp 
docs/_static/imgs/specbundle-logo.png docs/spec_bundle/public/specbundle-logo.png + cd docs/spec_bundle + npm ci + npm run build + # Clean up node_modules to prevent Sphinx from processing them + rm -rf node_modules + cd .. + + - name: Build documentation + run: | + cd docs + make compile + make html + # Copy SpecBundle to root of output directory + mkdir -p _build/html/SpecBundle + cp -r spec_bundle/dist/* _build/html/SpecBundle/ + + - name: Add .nojekyll file + run: | + touch ./docs/_build/html/.nojekyll + + - name: Deploy + uses: peaceiris/actions-gh-pages@v4 + with: + github_token: ${{ secrets.GITHUB_TOKEN }} + publish_dir: ./docs/_build/html diff --git a/SpecForge-ext/.github/workflows/publish_pypi.yaml b/SpecForge-ext/.github/workflows/publish_pypi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7b2c68f1cd16ae6ada552340aef58265c7daeafb --- /dev/null +++ b/SpecForge-ext/.github/workflows/publish_pypi.yaml @@ -0,0 +1,33 @@ +name: Publish to PyPI + +on: + workflow_dispatch: + +jobs: + build-n-publish: + if: github.event_name == 'workflow_dispatch' + name: Build and publish Python distributions to PyPI + runs-on: ubuntu-latest + timeout-minutes: 20 + environment: + name: pypi + url: https://pypi.org/p/specforgeee + permissions: + id-token: write + steps: + - uses: actions/checkout@v2 + + - uses: actions/setup-python@v2 + with: + python-version: '3.11' + + - run: python setup.py sdist build + + # publish to PyPI if executed on the main branch + - name: Publish package to PyPI + id: publish + uses: pypa/gh-action-pypi-publish@release/v1 + with: + user: __token__ + password: ${{ secrets.PYPI_TOKEN }} + verbose: true diff --git a/SpecForge-ext/.github/workflows/test.yaml b/SpecForge-ext/.github/workflows/test.yaml new file mode 100644 index 0000000000000000000000000000000000000000..328dd8a17769929b692afac84729b34a2551448f --- /dev/null +++ b/SpecForge-ext/.github/workflows/test.yaml @@ -0,0 +1,63 @@ +name: PR Test + +on: + pull_request: + 
branches: [ main ] + workflow_dispatch: + +concurrency: + group: pr-test-${{ github.ref }} + cancel-in-progress: true + +permissions: + contents: read + +jobs: + unit-test: + if: (github.repository == 'sgl-project/SpecForge' || github.event_name == 'pull_request') && + github.event.pull_request.draft == false + runs-on: [self-hosted] + container: + image: lmsysorg/sglang:v0.5.5 # we lock to this version to avoid repeated docker pull + options: --gpus all --shm-size=2g --rm -v /dev/shm + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Restore cache + run: | + if [ -d /github/home/cache ] && [ ! -z "$(ls -A /github/home/cache/)" ]; then + cp -p -r /github/home/cache ./ + fi + + if [ -d /github/home/sf ] && [ ! -z "$(ls -A /github/home/sf/)" ]; then + cp -p -r /github/home/sf ./ + fi + + - name: Remove flashinfer # this is needed to avoid flashinfer jit compilation makes the program hang + run: | + rm -rf /github/home/.cache/flashinfer + + - name: Install dependencies + shell: bash + run: | + # if sf venv does not exist, create it + if [ ! 
-d sf ]; then + uv venv sf -p 3.11 + fi + source sf/bin/activate + uv pip install setuptools + MAX_JOBS=8 uv pip install -v ".[fa]" --prerelease=allow --no-build-isolation + + - name: Run test + timeout-minutes: 30 + shell: bash + run: | + source sf/bin/activate + export PYTHONPATH=$PWD + python -m unittest discover -s ./tests -p "test_*.py" -v + + - name: Save cache + run: | + cp -p -r sf /github/home/ + cp -p -r cache /github/home/ diff --git a/SpecForge-ext/cache/compiled_kernels/fxgraph/b7/fb7yof3yk2k4yeeufmf3rp4g2vv24l2ugw2wy6l6vsmz4q7x37uw/dqu44yvnqab6lpc7c524ppofvnrpyphyicmvwgol774itktqbe6 b/SpecForge-ext/cache/compiled_kernels/fxgraph/b7/fb7yof3yk2k4yeeufmf3rp4g2vv24l2ugw2wy6l6vsmz4q7x37uw/dqu44yvnqab6lpc7c524ppofvnrpyphyicmvwgol774itktqbe6 new file mode 100644 index 0000000000000000000000000000000000000000..c5e4da65451c9fa1978afc0d82f1f01e1a4387da Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/fxgraph/b7/fb7yof3yk2k4yeeufmf3rp4g2vv24l2ugw2wy6l6vsmz4q7x37uw/dqu44yvnqab6lpc7c524ppofvnrpyphyicmvwgol774itktqbe6 differ diff --git a/SpecForge-ext/cache/compiled_kernels/fxgraph/d5/fd5mtfiljkcqso2ovhkqewcgmm352ybgny3jo64kzoxueahy6joc/mdrdlva6q5sia32yf5vu6qd2ly7pmheoa7pyfstdzofb743b37j b/SpecForge-ext/cache/compiled_kernels/fxgraph/d5/fd5mtfiljkcqso2ovhkqewcgmm352ybgny3jo64kzoxueahy6joc/mdrdlva6q5sia32yf5vu6qd2ly7pmheoa7pyfstdzofb743b37j new file mode 100644 index 0000000000000000000000000000000000000000..d212f5a1bc4e0d85acd3ca5d399e960cdf63ec06 Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/fxgraph/d5/fd5mtfiljkcqso2ovhkqewcgmm352ybgny3jo64kzoxueahy6joc/mdrdlva6q5sia32yf5vu6qd2ly7pmheoa7pyfstdzofb743b37j differ diff --git a/SpecForge-ext/cache/compiled_kernels/fxgraph/gy/fgyjasy24lyvf45hmbtxzqve4lgbh5xzxmkza7fmcqqortd67gcc/4bg5eqhunja4mv5ckfxus66wcew7soy42pgbqzxrrjxj7hxkuyi b/SpecForge-ext/cache/compiled_kernels/fxgraph/gy/fgyjasy24lyvf45hmbtxzqve4lgbh5xzxmkza7fmcqqortd67gcc/4bg5eqhunja4mv5ckfxus66wcew7soy42pgbqzxrrjxj7hxkuyi 
new file mode 100644 index 0000000000000000000000000000000000000000..611c3642bdec7024e5745dfe4e9010d3fe351a65 Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/fxgraph/gy/fgyjasy24lyvf45hmbtxzqve4lgbh5xzxmkza7fmcqqortd67gcc/4bg5eqhunja4mv5ckfxus66wcew7soy42pgbqzxrrjxj7hxkuyi differ diff --git a/SpecForge-ext/cache/compiled_kernels/fxgraph/kp/fkp2diorfj5u3lv4yqas3fhord3y5dha4rxjvk6clv6mpo6wq5ts/72shc3jpmfkbxncw3rpaeflm7lrlu74btuqitszauoym3ykbgak b/SpecForge-ext/cache/compiled_kernels/fxgraph/kp/fkp2diorfj5u3lv4yqas3fhord3y5dha4rxjvk6clv6mpo6wq5ts/72shc3jpmfkbxncw3rpaeflm7lrlu74btuqitszauoym3ykbgak new file mode 100644 index 0000000000000000000000000000000000000000..f2c462737b1e3ca5b694e1a793fe76f660b1367d Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/fxgraph/kp/fkp2diorfj5u3lv4yqas3fhord3y5dha4rxjvk6clv6mpo6wq5ts/72shc3jpmfkbxncw3rpaeflm7lrlu74btuqitszauoym3ykbgak differ diff --git a/SpecForge-ext/cache/compiled_kernels/fxgraph/ut/futctst56igpyuhuqwbj7ifo6wjbelbfftnnnzt6mvpdv4laznjz/dhqctmvlhieh3qlw4a5j6y2cphxvtax4r6yljewoxpn6hjg6coj b/SpecForge-ext/cache/compiled_kernels/fxgraph/ut/futctst56igpyuhuqwbj7ifo6wjbelbfftnnnzt6mvpdv4laznjz/dhqctmvlhieh3qlw4a5j6y2cphxvtax4r6yljewoxpn6hjg6coj new file mode 100644 index 0000000000000000000000000000000000000000..d0f2988b17b49a78c25e08fb7c7498de4529e831 Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/fxgraph/ut/futctst56igpyuhuqwbj7ifo6wjbelbfftnnnzt6mvpdv4laznjz/dhqctmvlhieh3qlw4a5j6y2cphxvtax4r6yljewoxpn6hjg6coj differ diff --git a/SpecForge-ext/cache/compiled_kernels/fxgraph/va/fvayrjdgzr3pmbuvfegior263vfw5xzrxpu77pd2o4whnn4i7oe2/r46qg2hi5tlizn4e6hm6gfprjga5kb46ijq7utpaiuhyp7zzokk b/SpecForge-ext/cache/compiled_kernels/fxgraph/va/fvayrjdgzr3pmbuvfegior263vfw5xzrxpu77pd2o4whnn4i7oe2/r46qg2hi5tlizn4e6hm6gfprjga5kb46ijq7utpaiuhyp7zzokk new file mode 100644 index 0000000000000000000000000000000000000000..d675d076b00e49c76c874d2c172589afa86db491 Binary files /dev/null and 
b/SpecForge-ext/cache/compiled_kernels/fxgraph/va/fvayrjdgzr3pmbuvfegior263vfw5xzrxpu77pd2o4whnn4i7oe2/r46qg2hi5tlizn4e6hm6gfprjga5kb46ijq7utpaiuhyp7zzokk differ diff --git a/SpecForge-ext/cache/compiled_kernels/fxgraph/yw/fyw74tihmwdurnkl74w5ng6i55dk7dj65ql2ezo5bq4cxgbcw5p5/qhhy6gvdfgumsjojnqxnbpx2e5yb5fsfqmeafki2x7itryz4zt3 b/SpecForge-ext/cache/compiled_kernels/fxgraph/yw/fyw74tihmwdurnkl74w5ng6i55dk7dj65ql2ezo5bq4cxgbcw5p5/qhhy6gvdfgumsjojnqxnbpx2e5yb5fsfqmeafki2x7itryz4zt3 new file mode 100644 index 0000000000000000000000000000000000000000..1f1e9b8565a7ad8f63d1e6ab6cf21176d19a3d87 Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/fxgraph/yw/fyw74tihmwdurnkl74w5ng6i55dk7dj65ql2ezo5bq4cxgbcw5p5/qhhy6gvdfgumsjojnqxnbpx2e5yb5fsfqmeafki2x7itryz4zt3 differ diff --git a/SpecForge-ext/cache/compiled_kernels/fxgraph/zh/fzh3cbljcdwt76rsppwrcnk6dkxcmc5r6vtprpqx5patcfb3rsuv/z5dccw35nurtwmemllsbeulds4dudef7jdbkz5xya7g52sv36tr b/SpecForge-ext/cache/compiled_kernels/fxgraph/zh/fzh3cbljcdwt76rsppwrcnk6dkxcmc5r6vtprpqx5patcfb3rsuv/z5dccw35nurtwmemllsbeulds4dudef7jdbkz5xya7g52sv36tr new file mode 100644 index 0000000000000000000000000000000000000000..1be7c2863277c947eed1cdd0c9aeb2fa84674997 Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/fxgraph/zh/fzh3cbljcdwt76rsppwrcnk6dkxcmc5r6vtprpqx5patcfb3rsuv/z5dccw35nurtwmemllsbeulds4dudef7jdbkz5xya7g52sv36tr differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/25SMJXR2INGZCZI64NAKGLW77JZOIG6LAES6NHHOQFOTKNXS6PHA/__grp__triton_red_fused_argmax_1.json b/SpecForge-ext/cache/compiled_kernels/triton/7/25SMJXR2INGZCZI64NAKGLW77JZOIG6LAES6NHHOQFOTKNXS6PHA/__grp__triton_red_fused_argmax_1.json new file mode 100644 index 0000000000000000000000000000000000000000..cea4282d3fb1ea06bf8099edb7d3bf62bc273396 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/25SMJXR2INGZCZI64NAKGLW77JZOIG6LAES6NHHOQFOTKNXS6PHA/__grp__triton_red_fused_argmax_1.json @@ -0,0 +1 @@ +{"child_paths": 
{"triton_red_fused_argmax_1.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/25SMJXR2INGZCZI64NAKGLW77JZOIG6LAES6NHHOQFOTKNXS6PHA/triton_red_fused_argmax_1.source", "triton_red_fused_argmax_1.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/25SMJXR2INGZCZI64NAKGLW77JZOIG6LAES6NHHOQFOTKNXS6PHA/triton_red_fused_argmax_1.ttir", "triton_red_fused_argmax_1.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/25SMJXR2INGZCZI64NAKGLW77JZOIG6LAES6NHHOQFOTKNXS6PHA/triton_red_fused_argmax_1.ttgir", "triton_red_fused_argmax_1.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/25SMJXR2INGZCZI64NAKGLW77JZOIG6LAES6NHHOQFOTKNXS6PHA/triton_red_fused_argmax_1.llir", "triton_red_fused_argmax_1.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/25SMJXR2INGZCZI64NAKGLW77JZOIG6LAES6NHHOQFOTKNXS6PHA/triton_red_fused_argmax_1.ptx", "triton_red_fused_argmax_1.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/25SMJXR2INGZCZI64NAKGLW77JZOIG6LAES6NHHOQFOTKNXS6PHA/triton_red_fused_argmax_1.cubin", "triton_red_fused_argmax_1.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/25SMJXR2INGZCZI64NAKGLW77JZOIG6LAES6NHHOQFOTKNXS6PHA/triton_red_fused_argmax_1.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/25SMJXR2INGZCZI64NAKGLW77JZOIG6LAES6NHHOQFOTKNXS6PHA/triton_red_fused_argmax_1.cubin b/SpecForge-ext/cache/compiled_kernels/triton/7/25SMJXR2INGZCZI64NAKGLW77JZOIG6LAES6NHHOQFOTKNXS6PHA/triton_red_fused_argmax_1.cubin new file mode 100644 index 0000000000000000000000000000000000000000..0e3b56d00fa9ef41a3beaec380ff3e3407c40f9f Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/7/25SMJXR2INGZCZI64NAKGLW77JZOIG6LAES6NHHOQFOTKNXS6PHA/triton_red_fused_argmax_1.cubin differ diff --git 
a/SpecForge-ext/cache/compiled_kernels/triton/7/25SMJXR2INGZCZI64NAKGLW77JZOIG6LAES6NHHOQFOTKNXS6PHA/triton_red_fused_argmax_1.json b/SpecForge-ext/cache/compiled_kernels/triton/7/25SMJXR2INGZCZI64NAKGLW77JZOIG6LAES6NHHOQFOTKNXS6PHA/triton_red_fused_argmax_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a7a4b02be5a0a28ef29d5c1011517c72bc3fa6aa --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/25SMJXR2INGZCZI64NAKGLW77JZOIG6LAES6NHHOQFOTKNXS6PHA/triton_red_fused_argmax_1.json @@ -0,0 +1 @@ +{"hash": "d764c4de3a434d91651ee340a32edffa72e41bcb0125e69cee815d3536f2f3ce", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 256, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused_argmax_1"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/25SMJXR2INGZCZI64NAKGLW77JZOIG6LAES6NHHOQFOTKNXS6PHA/triton_red_fused_argmax_1.llir b/SpecForge-ext/cache/compiled_kernels/triton/7/25SMJXR2INGZCZI64NAKGLW77JZOIG6LAES6NHHOQFOTKNXS6PHA/triton_red_fused_argmax_1.llir new file mode 100644 index 
0000000000000000000000000000000000000000..c47964812eca8f083a1f4c654ab198d37030d622 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/25SMJXR2INGZCZI64NAKGLW77JZOIG6LAES6NHHOQFOTKNXS6PHA/triton_red_fused_argmax_1.llir @@ -0,0 +1,206 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8], align 16 + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused_argmax_1(ptr addrspace(1) %0, ptr addrspace(1) %1, i64 %2, i64 %3, i32 %4, i32 %5, ptr addrspace(1) readnone captures(none) %6, ptr addrspace(1) readnone captures(none) %7) local_unnamed_addr #0 !dbg !4 { + %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %10 = shl i32 %9, 6, !dbg !8 + %11 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %12 = and i32 %11, 252, !dbg !9 + %13 = lshr exact i32 %12, 2, !dbg !9 + %14 = or disjoint i32 %13, %10, !dbg !10 + %15 = icmp slt i32 %14, %4, !dbg !11 + %16 = and i32 %11, 3, !dbg !12 + %17 = sext i32 %14 to i64, !dbg !13 + %.frozen = freeze i64 %2, !dbg !14 + %18 = sdiv i64 %17, %.frozen, !dbg !14 + %19 = mul i64 %18, %.frozen, !dbg !13 + %.decomposed = sub i64 %17, %19, !dbg !13 + %20 = mul i64 %18, %3, !dbg !15 + %.idx = mul nsw i64 %.decomposed, 128000 + %21 = getelementptr i8, ptr addrspace(1) %0, i64 %.idx + %invariant.gep = getelementptr float, ptr addrspace(1) %21, i64 %20, !dbg !16 + %.fr = freeze i1 %15 + %22 = zext nneg i32 %16 to i64, !dbg !16 + br i1 %.fr, label %.split.us, label %.split.preheader + +.split.preheader: ; preds = %8 + %invariant.gep11 = getelementptr float, ptr addrspace(1) %invariant.gep, i64 %22, !dbg !16 + br label %.split, !dbg !16 + +.split.us: ; preds = %8, %.split.us + %indvars.iv7 = phi i64 [ %indvars.iv.next8, %.split.us ], [ 0, %8 ] + %23 = phi i32 [ %44, %.split.us ], [ 2147483647, %8 
] + %24 = phi float [ %42, %.split.us ], [ 0xFFF0000000000000, %8 ] + %25 = or disjoint i64 %indvars.iv7, %22, !dbg !17 + %gep.us = getelementptr float, ptr addrspace(1) %invariant.gep, i64 %25, !dbg !18 + %26 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !19 + %27 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %gep.us, i64 %26, i1 true) #4, !dbg !19 + %28 = bitcast i32 %27 to float, !dbg !19 + %29 = fcmp ogt float %24, %28, !dbg !20 + %30 = fcmp oeq float %24, %28, !dbg !24 + %31 = fcmp uno float %24, 0.000000e+00, !dbg !25 + %32 = fcmp uno float %28, 0.000000e+00, !dbg !26 + %33 = xor i1 %32, true, !dbg !27 + %34 = and i1 %31, %33, !dbg !28 + %35 = or i1 %29, %34, !dbg !29 + %36 = and i1 %31, %32, !dbg !30 + %37 = or i1 %30, %36, !dbg !31 + %38 = sext i32 %23 to i64, !dbg !32 + %39 = icmp sgt i64 %25, %38, !dbg !32 + %40 = and i1 %39, %37, !dbg !33 + %41 = or i1 %35, %40, !dbg !34 + %42 = select i1 %41, float %24, float %28, !dbg !35 + %43 = trunc nuw nsw i64 %25 to i32, !dbg !36 + %44 = select i1 %41, i32 %23, i32 %43, !dbg !36 + %indvars.iv.next8 = add nuw nsw i64 %indvars.iv7, 4, !dbg !16 + %45 = icmp samesign ult i64 %indvars.iv7, 31996, !dbg !16 + br i1 %45, label %.split.us, label %.split3.us, !dbg !16 + +.split: ; preds = %.split.preheader, %.split + %indvars.iv = phi i64 [ 0, %.split.preheader ], [ %indvars.iv.next, %.split ] + %gep12 = getelementptr float, ptr addrspace(1) %invariant.gep11, i64 %indvars.iv, !dbg !18 + %46 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !19 + %47 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %gep12, i64 %46, i1 false) #4, !dbg !19 + 
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 4, !dbg !16 + %48 = icmp samesign ult i64 %indvars.iv, 31996, !dbg !16 + br i1 %48, label %.split, label %.split3.us, !dbg !16 + +.split3.us: ; preds = %.split, %.split.us + %.us-phi = phi float [ %42, %.split.us ], [ 0xFFF0000000000000, %.split ], !dbg !9 + %.us-phi4 = phi i32 [ %44, %.split.us ], [ 2147483647, %.split ], !dbg !9 + %49 = and i32 %11, 63, !dbg !9 + %50 = or disjoint i32 %10, %49, !dbg !10 + %51 = icmp slt i32 %50, %4, !dbg !11 + %52 = bitcast float %.us-phi to i32, !dbg !37 + %53 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %52, i32 2, i32 31), !dbg !37 + %54 = bitcast i32 %53 to float, !dbg !37 + %55 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %.us-phi4, i32 2, i32 31), !dbg !37 + %56 = fcmp ogt float %.us-phi, %54, !dbg !39 + %57 = fcmp oeq float %.us-phi, %54, !dbg !40 + %58 = fcmp uno float %.us-phi, 0.000000e+00, !dbg !41 + %59 = fcmp uno float %54, 0.000000e+00, !dbg !42 + %60 = xor i1 %59, true, !dbg !43 + %61 = and i1 %58, %60, !dbg !44 + %62 = or i1 %56, %61, !dbg !45 + %63 = and i1 %58, %59, !dbg !46 + %64 = or i1 %57, %63, !dbg !47 + %65 = icmp slt i32 %.us-phi4, %55, !dbg !48 + %66 = and i1 %65, %64, !dbg !49 + %67 = or i1 %62, %66, !dbg !50 + %68 = select i1 %67, float %.us-phi, float %54, !dbg !51 + %69 = select i1 %67, i32 %.us-phi4, i32 %55, !dbg !52 + %70 = bitcast float %68 to i32, !dbg !37 + %71 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %70, i32 1, i32 31), !dbg !37 + %72 = bitcast i32 %71 to float, !dbg !37 + %73 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %69, i32 1, i32 31), !dbg !37 + %74 = fcmp ogt float %68, %72, !dbg !39 + %75 = fcmp oeq float %68, %72, !dbg !40 + %76 = fcmp uno float %68, 0.000000e+00, !dbg !41 + %77 = fcmp uno float %72, 0.000000e+00, !dbg !42 + %78 = xor i1 %77, true, !dbg !43 + %79 = and i1 %76, %78, !dbg !44 + %80 = or i1 %74, %79, !dbg !45 + %81 = and i1 %77, %76, !dbg !46 + %82 = or i1 %75, %81, 
!dbg !47 + %83 = icmp slt i32 %69, %73, !dbg !48 + %84 = and i1 %83, %82, !dbg !49 + %85 = or i1 %80, %84, !dbg !50 + %86 = select i1 %85, i32 %69, i32 %73, !dbg !52 + %87 = sext i32 %50 to i64, !dbg !53 + %88 = getelementptr i64, ptr addrspace(1) %1, i64 %87, !dbg !53 + %89 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %12, !dbg !54 + %90 = insertelement <1 x i32> poison, i32 %86, i64 0, !dbg !54 + store <1 x i32> %90, ptr addrspace(3) %89, align 4, !dbg !54 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !54 + %91 = shl nuw nsw i32 %49, 2, !dbg !54 + %92 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %91, !dbg !54 + %93 = load i32, ptr addrspace(3) %92, align 4, !dbg !54 + %94 = sext i32 %93 to i64, !dbg !54 + %95 = and i32 %11, 192, !dbg !54 + %96 = icmp eq i32 %95, 0, !dbg !54 + %97 = and i1 %96, %51, !dbg !54 + tail call void asm sideeffect "@$2 st.global.b64 [ $1 + 0 ], { $0 };", "l,l,b"(i64 %94, ptr addrspace(1) %88, i1 %97) #4, !dbg !54 + ret void, !dbg !55 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3 + +attributes #0 = { nounwind "nvvm.reqntid"="256" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { convergent nocallback nounwind } +attributes #4 = 
{ nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_red_fused_argmax_1", linkageName: "triton_red_fused_argmax_1", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 22, column: 28, scope: !4) +!8 = !DILocation(line: 22, column: 33, scope: !4) +!9 = !DILocation(line: 23, column: 44, scope: !4) +!10 = !DILocation(line: 23, column: 23, scope: !4) +!11 = !DILocation(line: 24, column: 21, scope: !4) +!12 = !DILocation(line: 25, column: 37, scope: !4) +!13 = !DILocation(line: 27, column: 19, scope: !4) +!14 = !DILocation(line: 28, column: 19, scope: !4) +!15 = !DILocation(line: 38, column: 56, scope: !4) +!16 = !DILocation(line: 32, column: 40, scope: !4) +!17 = !DILocation(line: 33, column: 31, scope: !4) +!18 = !DILocation(line: 38, column: 34, scope: !4) +!19 = !DILocation(line: 38, column: 61, scope: !4) +!20 = !DILocation(line: 144, column: 21, scope: !21, inlinedAt: !23) +!21 = distinct !DILexicalBlockFile(scope: !4, file: !22, discriminator: 0) +!22 = !DIFile(filename: "triton_helpers.py", directory: "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime") +!23 = !DILocation(line: 41, column: 38, scope: !4) +!24 = !DILocation(line: 145, column: 23, scope: !21, inlinedAt: !23) +!25 = !DILocation(line: 147, column: 29, scope: !21, inlinedAt: !23) +!26 = !DILocation(line: 148, column: 29, scope: !21, inlinedAt: !23) +!27 = !DILocation(line: 149, column: 
31, scope: !21, inlinedAt: !23) +!28 = !DILocation(line: 149, column: 27, scope: !21, inlinedAt: !23) +!29 = !DILocation(line: 149, column: 16, scope: !21, inlinedAt: !23) +!30 = !DILocation(line: 151, column: 27, scope: !21, inlinedAt: !23) +!31 = !DILocation(line: 151, column: 17, scope: !21, inlinedAt: !23) +!32 = !DILocation(line: 154, column: 31, scope: !21, inlinedAt: !23) +!33 = !DILocation(line: 154, column: 21, scope: !21, inlinedAt: !23) +!34 = !DILocation(line: 154, column: 12, scope: !21, inlinedAt: !23) +!35 = !DILocation(line: 155, column: 35, scope: !21, inlinedAt: !23) +!36 = !DILocation(line: 155, column: 69, scope: !21, inlinedAt: !23) +!37 = !DILocation(line: 165, column: 42, scope: !21, inlinedAt: !38) +!38 = !DILocation(line: 45, column: 75, scope: !4) +!39 = !DILocation(line: 144, column: 21, scope: !21, inlinedAt: !38) +!40 = !DILocation(line: 145, column: 23, scope: !21, inlinedAt: !38) +!41 = !DILocation(line: 147, column: 29, scope: !21, inlinedAt: !38) +!42 = !DILocation(line: 148, column: 29, scope: !21, inlinedAt: !38) +!43 = !DILocation(line: 149, column: 31, scope: !21, inlinedAt: !38) +!44 = !DILocation(line: 149, column: 27, scope: !21, inlinedAt: !38) +!45 = !DILocation(line: 149, column: 16, scope: !21, inlinedAt: !38) +!46 = !DILocation(line: 151, column: 27, scope: !21, inlinedAt: !38) +!47 = !DILocation(line: 151, column: 17, scope: !21, inlinedAt: !38) +!48 = !DILocation(line: 154, column: 31, scope: !21, inlinedAt: !38) +!49 = !DILocation(line: 154, column: 21, scope: !21, inlinedAt: !38) +!50 = !DILocation(line: 154, column: 12, scope: !21, inlinedAt: !38) +!51 = !DILocation(line: 155, column: 35, scope: !21, inlinedAt: !38) +!52 = !DILocation(line: 155, column: 69, scope: !21, inlinedAt: !38) +!53 = !DILocation(line: 47, column: 25, scope: !4) +!54 = !DILocation(line: 47, column: 36, scope: !4) +!55 = !DILocation(line: 47, column: 4, scope: !4) diff --git 
a/SpecForge-ext/cache/compiled_kernels/triton/7/25SMJXR2INGZCZI64NAKGLW77JZOIG6LAES6NHHOQFOTKNXS6PHA/triton_red_fused_argmax_1.ptx b/SpecForge-ext/cache/compiled_kernels/triton/7/25SMJXR2INGZCZI64NAKGLW77JZOIG6LAES6NHHOQFOTKNXS6PHA/triton_red_fused_argmax_1.ptx new file mode 100644 index 0000000000000000000000000000000000000000..ec4917c5d38aeb582f47f2eb61f121e4d79243aa --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/25SMJXR2INGZCZI64NAKGLW77JZOIG6LAES6NHHOQFOTKNXS6PHA/triton_red_fused_argmax_1.ptx @@ -0,0 +1,490 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_red_fused_argmax_1 // -- Begin function triton_red_fused_argmax_1 +.extern .shared .align 16 .b8 global_smem[]; + // @triton_red_fused_argmax_1 +.visible .entry triton_red_fused_argmax_1( + .param .u64 .ptr .global .align 1 triton_red_fused_argmax_1_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused_argmax_1_param_1, + .param .u64 triton_red_fused_argmax_1_param_2, + .param .u64 triton_red_fused_argmax_1_param_3, + .param .u32 triton_red_fused_argmax_1_param_4, + .param .u32 triton_red_fused_argmax_1_param_5, + .param .u64 .ptr .global .align 1 triton_red_fused_argmax_1_param_6, + .param .u64 .ptr .global .align 1 triton_red_fused_argmax_1_param_7 +) +.reqntid 256 +{ + .reg .pred %p<39>; + .reg .b32 %r<55>; + .reg .b64 %rd<54>; + .loc 1 18 0 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:18:0 +$L__func_begin0: + .loc 1 18 0 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:18:0 + +// %bb.0: + ld.param.b32 %r12, [triton_red_fused_argmax_1_param_4]; +$L__tmp0: + .loc 1 22 28 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:22:28 + mov.u32 %r13, %ctaid.x; + .loc 1 22 33 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:22:33 + shl.b32 %r1, %r13, 6; + ld.param.b64 %rd20, [triton_red_fused_argmax_1_param_2]; + .loc 1 23 44 // 
c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:23:44 + mov.u32 %r2, %tid.x; + bfe.u32 %r4, %r2, 2, 6; + .loc 1 23 23 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:23:23 + or.b32 %r14, %r4, %r1; + .loc 1 25 37 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:25:37 + and.b32 %r5, %r2, 3; + .loc 1 27 19 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:27:19 + cvt.s64.s32 %rd1, %r14; + .loc 1 28 19 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:28:19 + or.b64 %rd21, %rd1, %rd20; + and.b64 %rd22, %rd21, -4294967296; + setp.ne.b64 %p1, %rd22, 0; + cvt.u32.u64 %r50, %rd1; + @%p1 bra $L__BB0_2; + bra.uni $L__BB0_1; +$L__BB0_2: + div.s64 %rd49, %rd1, %rd20; + bra.uni $L__BB0_3; +$L__BB0_1: + cvt.u32.u64 %r15, %rd20; + div.u32 %r17, %r50, %r15; + cvt.u64.u32 %rd49, %r17; +$L__BB0_3: + .loc 1 0 19 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:0:19 + ld.param.b64 %rd19, [triton_red_fused_argmax_1_param_3]; + ld.param.b64 %rd18, [triton_red_fused_argmax_1_param_1]; + ld.param.b64 %rd17, [triton_red_fused_argmax_1_param_0]; + and.b32 %r3, %r2, 252; + .loc 1 32 40 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:32:40 + cvt.u64.u32 %rd6, %r5; + setp.ge.s32 %p2, %r50, %r12; + @%p2 bra $L__BB0_6; +// %bb.4: // %.split.us.preheader + shl.b64 %rd35, %rd19, 2; + mul.lo.s64 %rd36, %rd20, 128000; + sub.s64 %rd37, %rd35, %rd36; + mul.lo.s64 %rd38, %rd49, %rd37; + add.s32 %r26, %r1, %r4; + mad.wide.s32 %rd39, %r26, 128000, %rd38; + shl.b64 %rd40, %rd6, 2; + add.s64 %rd41, %rd39, %rd40; + add.s64 %rd50, %rd17, %rd41; + mov.b32 %r53, 0fFF800000; + mov.b32 %r54, 2147483647; + mov.b64 %rd51, 0; +$L__BB0_5: // %.split.us + // =>This Inner Loop Header: Depth=1 + .loc 1 38 34 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:38:34 + add.s64 %rd45, %rd6, %rd51; + .loc 1 38 61 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:38:61 + // begin inline asm + mov.u64 %rd42, 0x0; + 
createpolicy.fractional.L2::evict_first.b64 %rd42, 1.0; + // end inline asm + mov.b32 %r28, 0; + mov.pred %p5, -1; + // begin inline asm + mov.u32 %r27, %r28; + @%p5 ld.global.L1::evict_first.L2::cache_hint.b32 { %r27 }, [ %rd50 + 0 ], %rd42; + // end inline asm +$L__tmp1: + .loc 2 144 21 // triton_helpers.py:144:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:41:38 ] + setp.gt.f32 %p6, %r53, %r27; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:41:38 ] + setp.eq.f32 %p7, %r53, %r27; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:41:38 ] + setp.nan.f32 %p8, %r53, %r53; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:41:38 ] + setp.nan.f32 %p9, %r27, %r27; + setp.num.f32 %p10, %r27, %r27; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:41:38 ] + and.pred %p11, %p8, %p10; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:41:38 ] + or.pred %p12, %p6, %p11; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:41:38 ] + and.pred %p13, %p8, %p9; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:41:38 ] + or.pred %p14, %p7, %p13; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:41:38 ] + cvt.s64.s32 %rd46, %r54; + setp.gt.s64 %p15, %rd45, %rd46; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:41:38 ] + and.pred %p16, %p15, %p14; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:41:38 ] + or.pred %p17, %p12, %p16; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:41:38 ] + 
selp.f32 %r53, %r53, %r27, %p17; + cvt.u32.u64 %r29, %rd45; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:41:38 ] + selp.b32 %r54, %r54, %r29, %p17; +$L__tmp2: + .loc 1 32 40 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:32:40 + add.s64 %rd11, %rd51, 4; + add.s64 %rd50, %rd50, 16; + setp.lt.u64 %p18, %rd51, 31996; + mov.b64 %rd51, %rd11; + @%p18 bra $L__BB0_5; + bra.uni $L__BB0_8; +$L__BB0_6: // %.split.preheader + shl.b64 %rd24, %rd19, 2; + mul.lo.s64 %rd25, %rd20, 128000; + sub.s64 %rd26, %rd24, %rd25; + mul.lo.s64 %rd27, %rd49, %rd26; + add.s32 %r19, %r1, %r4; + mad.wide.s32 %rd28, %r19, 128000, %rd27; + shl.b64 %rd29, %rd6, 2; + add.s64 %rd30, %rd28, %rd29; + add.s64 %rd52, %rd17, %rd30; + mov.b64 %rd53, -4; +$L__BB0_7: // %.split + // =>This Inner Loop Header: Depth=1 + .loc 1 38 61 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:38:61 + // begin inline asm + mov.u64 %rd31, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd31, 1.0; + // end inline asm + mov.b32 %r21, 0; + mov.pred %p3, 0; + // begin inline asm + mov.u32 %r20, %r21; + @%p3 ld.global.L1::evict_first.L2::cache_hint.b32 { %r20 }, [ %rd52 + 0 ], %rd31; + // end inline asm + .loc 1 32 40 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:32:40 + add.s64 %rd53, %rd53, 4; + add.s64 %rd52, %rd52, 16; + setp.lt.u64 %p4, %rd53, 31996; + mov.b32 %r54, 2147483647; + mov.b32 %r53, 0fFF800000; + @%p4 bra $L__BB0_7; +$L__BB0_8: // %.split3.us + .loc 1 23 44 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:23:44 + and.b32 %r30, %r2, 63; + .loc 1 23 23 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:23:23 + or.b32 %r31, %r1, %r30; + .loc 1 24 21 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:24:21 + setp.lt.s32 %p20, %r31, %r12; +$L__tmp3: + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + shfl.sync.bfly.b32 %r32, %r53, 2, 31, 
-1; + shfl.sync.bfly.b32 %r33, %r54, 2, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.gt.f32 %p21, %r53, %r32; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.eq.f32 %p22, %r53, %r32; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p23, %r53, %r53; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p24, %r32, %r32; + setp.num.f32 %p25, %r32, %r32; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p26, %p23, %p25; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p27, %p21, %p26; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p28, %p23, %p24; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p29, %p22, %p28; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.lt.s32 %p30, %r54, %r33; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p31, %p30, %p29; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p32, %p27, %p31; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.f32 %r34, %r53, %r32, %p32; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r35, %r54, %r33, %p32; + .loc 2 165 42 // triton_helpers.py:165:42 @[ 
c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + shfl.sync.bfly.b32 %r36, %r34, 1, 31, -1; + shfl.sync.bfly.b32 %r37, %r35, 1, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.gt.f32 %p33, %r34, %r36; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.eq.f32 %p34, %r34, %r36; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p35, %r34, %r34; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p36, %r36, %r36; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.lt.s32 %p37, %r35, %r37; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r38, %r35, %r37, %p35; + selp.b32 %r39, %r38, %r37, %p36; + selp.b32 %r40, %r35, %r39, %p34; + selp.b32 %r41, %r40, %r37, %p37; + selp.b32 %r42, %r41, %r35, %p36; + selp.b32 %r43, %r42, %r41, %p35; + selp.b32 %r44, %r35, %r43, %p33; +$L__tmp4: + .loc 1 47 25 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:47:25 + mad.wide.s32 %rd48, %r31, 8, %rd18; + .loc 1 47 36 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:47:36 + mov.b32 %r45, global_smem; + add.s32 %r46, %r45, %r3; + st.shared.b32 [%r46], %r44; + bar.sync 0; + shl.b32 %r47, %r30, 2; + add.s32 %r48, %r45, %r47; + ld.shared.s32 %rd47, [%r48]; + and.b32 %r49, %r2, 192; + setp.eq.b32 %p38, %r49, 0; + and.pred %p19, %p38, %p20; + // begin inline asm + @%p19 st.global.b64 [ %rd48 + 0 ], { %rd47 }; + // end inline asm + .loc 1 47 4 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:47:4 + ret; +$L__tmp5: +$L__func_end0: + // -- End function +} + .file 1 
"/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 234 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. 
Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0xe3 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 52 +.b8 119 +.b8 100 +.b8 104 +.b8 119 +.b8 108 +.b8 117 +.b8 54 +.b8 121 +.b8 98 +.b8 51 +.b8 119 +.b8 99 +.b8 119 +.b8 97 +.b8 122 +.b8 100 +.b8 110 +.b8 122 +.b8 109 +.b8 103 +.b8 122 +.b8 101 +.b8 119 +.b8 105 +.b8 101 +.b8 109 +.b8 118 +.b8 122 +.b8 110 +.b8 120 +.b8 118 +.b8 114 +.b8 114 +.b8 51 +.b8 53 +.b8 50 +.b8 53 +.b8 101 +.b8 111 +.b8 106 +.b8 117 +.b8 112 +.b8 113 +.b8 106 +.b8 108 +.b8 100 +.b8 111 +.b8 53 +.b8 112 +.b8 116 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 52 +.b8 119 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x1c DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 97 +.b8 114 +.b8 103 +.b8 109 +.b8 97 +.b8 120 +.b8 95 +.b8 49 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0xa7:0x46 DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0xbc:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp2 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 41 // DW_AT_call_line +.b8 38 // DW_AT_call_column +.b8 4 // Abbrev [4] 0xd4:0x18 
DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp3 // DW_AT_low_pc +.b64 $L__tmp4 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 45 // DW_AT_call_line +.b8 75 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/25SMJXR2INGZCZI64NAKGLW77JZOIG6LAES6NHHOQFOTKNXS6PHA/triton_red_fused_argmax_1.source b/SpecForge-ext/cache/compiled_kernels/triton/7/25SMJXR2INGZCZI64NAKGLW77JZOIG6LAES6NHHOQFOTKNXS6PHA/triton_red_fused_argmax_1.source new file mode 100644 index 0000000000000000000000000000000000000000..1d634d8e0bbe57aa9fa3ced025140cebfe928835 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/25SMJXR2INGZCZI64NAKGLW77JZOIG6LAES6NHHOQFOTKNXS6PHA/triton_red_fused_argmax_1.source @@ -0,0 +1,323 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":18:0) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":143:0) +#loc47 = loc(unknown) +#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":86:0) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":63:0) +#loc68 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":164:0) +#loc72 = loc("in_ptr0"(#loc)) +#loc73 = loc("out_ptr0"(#loc)) +#loc74 = loc("ks0"(#loc)) +#loc75 = loc("ks1"(#loc)) +#loc76 = loc("xnumel"(#loc)) +#loc77 = loc("r0_numel"(#loc)) +#loc106 = loc("a_value"(#loc35)) +#loc107 = loc("a_index"(#loc35)) +#loc108 = loc("b_value"(#loc35)) +#loc109 = loc("b_index"(#loc35)) +#loc122 = loc("x"(#loc55)) +#loc123 = loc("x"(#loc59)) +#loc124 = loc("value"(#loc68)) +#loc125 = loc("index"(#loc68)) +module { + tt.func public @triton_red_fused_argmax_1(%in_ptr0: !tt.ptr {tt.divisibility 
= 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %r0_numel_0 = arith.constant 32000 : i32 loc(#loc78) + %xoffset = tt.get_program_id x : i32 loc(#loc79) + %xoffset_1 = arith.constant 64 : i32 loc(#loc80) + %xoffset_2 = arith.constant 64 : i32 loc(#loc80) + %xoffset_3 = arith.muli %xoffset, %xoffset_2 : i32 loc(#loc80) + %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc81) + %xindex_4 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc82) + %xindex_5 = tt.splat %xoffset_3 : i32 -> tensor<64x1xi32> loc(#loc83) + %xindex_6 = arith.addi %xindex_5, %xindex_4 : tensor<64x1xi32> loc(#loc83) + %xmask = tt.splat %xnumel : i32 -> tensor<64x1xi32> loc(#loc84) + %xmask_7 = arith.cmpi slt, %xindex_6, %xmask : tensor<64x1xi32> loc(#loc84) + %r0_base = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32> loc(#loc85) + %r0_base_8 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<4xi32> -> tensor<1x4xi32> loc(#loc86) + %x0 = arith.extsi %xindex_6 : tensor<64x1xi32> to tensor<64x1xi64> loc(#loc87) + %x0_9 = tt.splat %ks0 : i64 -> tensor<64x1xi64> loc(#loc87) + %x0_10 = arith.remsi %x0, %x0_9 : tensor<64x1xi64> loc(#loc87) + %x1 = arith.extsi %xindex_6 : tensor<64x1xi32> to tensor<64x1xi64> loc(#loc88) + %x1_11 = tt.splat %ks0 : i64 -> tensor<64x1xi64> loc(#loc88) + %x1_12 = arith.divsi %x1, %x1_11 : tensor<64x1xi64> loc(#loc88) + %_tmp2 = arith.constant 0xFF800000 : f32 loc(#loc89) + %_tmp2_13 = arith.constant dense<0xFF800000> : tensor<64x4xf32> loc(#loc89) + %_tmp2_index = arith.constant 2147483647 : i32 loc(#loc90) + %_tmp2_index_14 = arith.constant dense<2147483647> : tensor<64x4xi32> loc(#loc90) + %c0_i32 = arith.constant 0 : i32 loc(#loc14) + %c4_i32 = 
arith.constant 4 : i32 loc(#loc14) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc14) + %1 = arith.bitcast %r0_numel_0 : i32 to i32 loc(#loc14) + %2 = arith.bitcast %c4_i32 : i32 to i32 loc(#loc14) + %3 = ub.poison : i32 loc(#loc14) + %_tmp2_index_15:2 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp2_16 = %_tmp2_13, %_tmp2_index_17 = %_tmp2_index_14) -> (tensor<64x4xf32>, tensor<64x4xi32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x4xi32> loc(#loc92) + %r0_index_18 = arith.addi %r0_index, %r0_base_8 : tensor<1x4xi32> loc(#loc92) + %r0_mask = arith.constant dense<32000> : tensor<1x4xi32> loc(#loc93) + %r0_mask_19 = arith.cmpi slt, %r0_index_18, %r0_mask : tensor<1x4xi32> loc(#loc93) + %tmp0 = arith.constant 32000 : i32 loc(#loc94) + %tmp0_20 = arith.constant 32000 : i64 loc(#loc94) + %tmp0_21 = arith.constant dense<32000> : tensor<64x1xi64> loc(#loc94) + %tmp0_22 = arith.muli %tmp0_21, %x0_10 : tensor<64x1xi64> loc(#loc94) + %tmp0_23 = arith.extsi %r0_index_18 : tensor<1x4xi32> to tensor<1x4xi64> loc(#loc95) + %tmp0_24 = tt.broadcast %tmp0_23 : tensor<1x4xi64> -> tensor<64x4xi64> loc(#loc95) + %tmp0_25 = tt.broadcast %tmp0_22 : tensor<64x1xi64> -> tensor<64x4xi64> loc(#loc95) + %tmp0_26 = arith.addi %tmp0_24, %tmp0_25 : tensor<64x4xi64> loc(#loc95) + %tmp0_27 = tt.splat %ks1 : i64 -> tensor<64x1xi64> loc(#loc96) + %tmp0_28 = arith.muli %tmp0_27, %x1_12 : tensor<64x1xi64> loc(#loc96) + %tmp0_29 = tt.broadcast %tmp0_28 : tensor<64x1xi64> -> tensor<64x4xi64> loc(#loc97) + %tmp0_30 = arith.addi %tmp0_26, %tmp0_29 : tensor<64x4xi64> loc(#loc97) + %tmp0_31 = tt.splat %in_ptr0 : !tt.ptr -> tensor<64x4x!tt.ptr> loc(#loc98) + %tmp0_32 = tt.addptr %tmp0_31, %tmp0_30 : tensor<64x4x!tt.ptr>, tensor<64x4xi64> loc(#loc98) + %tmp0_33 = tt.broadcast %r0_mask_19 : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc99) + %tmp0_34 = tt.broadcast %xmask_7 : tensor<64x1xi1> -> tensor<64x4xi1> loc(#loc99) + %tmp0_35 = arith.andi %tmp0_33, %tmp0_34 : tensor<64x4xi1> 
loc(#loc99) + %tmp0_36 = arith.constant 0.000000e+00 : f32 loc(#loc100) + %tmp0_37 = arith.constant dense<0.000000e+00> : tensor<64x4xf32> loc(#loc100) + %tmp0_38 = tt.load %tmp0_32, %tmp0_35, %tmp0_37 evictionPolicy = evict_first : tensor<64x4x!tt.ptr> loc(#loc100) + %8:2 = tt.call @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32S64_4S_i32S64_4S_fp32S64_4S_i32S1_4S__(%_tmp2_16, %_tmp2_index_17, %tmp0_38, %r0_index_18) : (tensor<64x4xf32>, tensor<64x4xi32>, tensor<64x4xf32>, tensor<1x4xi32>) -> (tensor<64x4xf32>, tensor<64x4xi32>) loc(#loc24) + %_tmp2_39 = tt.broadcast %r0_mask_19 : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc101) + %_tmp2_40 = tt.broadcast %xmask_7 : tensor<64x1xi1> -> tensor<64x4xi1> loc(#loc101) + %_tmp2_41 = arith.andi %_tmp2_39, %_tmp2_40 : tensor<64x4xi1> loc(#loc101) + %_tmp2_42 = arith.select %_tmp2_41, %8#0, %_tmp2_16 : tensor<64x4xi1>, tensor<64x4xf32> loc(#loc102) + %_tmp2_index_43 = tt.broadcast %r0_mask_19 : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc103) + %_tmp2_index_44 = tt.broadcast %xmask_7 : tensor<64x1xi1> -> tensor<64x4xi1> loc(#loc103) + %_tmp2_index_45 = arith.andi %_tmp2_index_43, %_tmp2_index_44 : tensor<64x4xi1> loc(#loc103) + %_tmp2_index_46 = arith.select %_tmp2_index_45, %8#1, %_tmp2_index_17 : tensor<64x4xi1>, tensor<64x4xi32> loc(#loc104) + scf.yield %_tmp2_42, %_tmp2_index_46 : tensor<64x4xf32>, tensor<64x4xi32> loc(#loc29) + } loc(#loc126) + %4:2 = tt.call @"torch._inductor.runtime.triton_helpers.max_with_index__fp32S64_4S_i32S64_4S__(2,)cconstexpr_1_"(%_tmp2_index_15#0, %_tmp2_index_15#1) : (tensor<64x4xf32>, tensor<64x4xi32>) -> (tensor<64xf32>, tensor<64xi32>) loc(#loc30) + %tmp2 = tt.expand_dims %4#1 {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc105) + %5 = tt.splat %out_ptr0 : !tt.ptr -> tensor<64x1x!tt.ptr> loc(#loc32) + %6 = tt.addptr %5, %xindex_6 : tensor<64x1x!tt.ptr>, tensor<64x1xi32> loc(#loc32) + %7 = arith.extsi %tmp2 : tensor<64x1xi32> to tensor<64x1xi64> loc(#loc33) + 
tt.store %6, %7, %xmask_7 : tensor<64x1x!tt.ptr> loc(#loc33) + tt.return loc(#loc34) + } loc(#loc) + tt.func private @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32S64_4S_i32S64_4S_fp32S64_4S_i32S1_4S__(%a_value: tensor<64x4xf32> loc("a_value"(#loc35)), %a_index: tensor<64x4xi32> loc("a_index"(#loc35)), %b_value: tensor<64x4xf32> loc("b_value"(#loc35)), %b_index: tensor<1x4xi32> loc("b_index"(#loc35))) -> (tensor<64x4xf32>, tensor<64x4xi32>) attributes {noinline = false} { + %mask = arith.cmpf ogt, %a_value, %b_value : tensor<64x4xf32> loc(#loc127) + %equal = arith.cmpf oeq, %a_value, %b_value : tensor<64x4xf32> loc(#loc128) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__fp32S64_4S__(%a_value) : (tensor<64x4xf32>) -> i1 loc(#loc38) + %1:2 = scf.if %0 -> (tensor<64x4xi1>, tensor<64x4xi1>) { + %a_isnan = arith.cmpf une, %a_value, %a_value : tensor<64x4xf32> loc(#loc112) + %b_isnan = arith.cmpf une, %b_value, %b_value : tensor<64x4xf32> loc(#loc113) + %mask_4 = arith.constant true loc(#loc114) + %mask_5 = arith.constant dense : tensor<64x4xi1> loc(#loc114) + %mask_6 = arith.xori %b_isnan, %mask_5 : tensor<64x4xi1> loc(#loc114) + %mask_7 = arith.andi %a_isnan, %mask_6 : tensor<64x4xi1> loc(#loc115) + %mask_8 = arith.ori %mask, %mask_7 : tensor<64x4xi1> loc(#loc129) + %equal_9 = arith.andi %a_isnan, %b_isnan : tensor<64x4xi1> loc(#loc117) + %equal_10 = arith.ori %equal, %equal_9 : tensor<64x4xi1> loc(#loc130) + scf.yield %mask_8, %equal_10 : tensor<64x4xi1>, tensor<64x4xi1> loc(#loc130) + } else { + scf.yield %mask, %equal : tensor<64x4xi1>, tensor<64x4xi1> loc(#loc47) + } loc(#loc39) + %mask_0 = tt.broadcast %b_index : tensor<1x4xi32> -> tensor<64x4xi32> loc(#loc119) + %mask_1 = arith.cmpi slt, %a_index, %mask_0 : tensor<64x4xi32> loc(#loc119) + %mask_2 = arith.andi %1#1, %mask_1 : tensor<64x4xi1> loc(#loc120) + %mask_3 = arith.ori %1#0, %mask_2 : tensor<64x4xi1> loc(#loc121) + %2 = arith.select %mask_3, %a_value, %b_value : 
tensor<64x4xi1>, tensor<64x4xf32> loc(#loc51) + %3 = tt.broadcast %b_index : tensor<1x4xi32> -> tensor<64x4xi32> loc(#loc52) + %4 = arith.select %mask_3, %a_index, %3 : tensor<64x4xi1>, tensor<64x4xi32> loc(#loc52) + tt.return %2, %4 : tensor<64x4xf32>, tensor<64x4xi32> loc(#loc53) + ^bb1: // no predecessors + %5 = ub.poison : tensor<64x4xf32> loc(#loc54) + %6 = ub.poison : tensor<64x4xi32> loc(#loc54) + tt.return %5, %6 : tensor<64x4xf32>, tensor<64x4xi32> loc(#loc54) + } loc(#loc35) + tt.func private @torch._inductor.runtime.triton_helpers.is_floating__fp32S64_4S__(%x: tensor<64x4xf32> loc("x"(#loc55))) -> i1 attributes {noinline = false} { + %0 = tt.call @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32S64_4S__(%x) : (tensor<64x4xf32>) -> tensor<64x4xf32> loc(#loc56) + %true = arith.constant true loc(#loc57) + tt.return %true : i1 loc(#loc57) + ^bb1: // no predecessors + %1 = ub.poison : i1 loc(#loc58) + tt.return %1 : i1 loc(#loc58) + } loc(#loc55) + tt.func private @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32S64_4S__(%x: tensor<64x4xf32> loc("x"(#loc59))) -> tensor<64x4xf32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc60) + %1 = arith.uitofp %0 : tensor<1xi1> to tensor<1xf32> loc(#loc61) + %2 = tt.expand_dims %1 {axis = 0 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc61) + %3 = tt.broadcast %2 : tensor<1x1xf32> -> tensor<64x4xf32> loc(#loc61) + %4 = arith.addf %x, %3 : tensor<64x4xf32> loc(#loc61) + tt.return %4 : tensor<64x4xf32> loc(#loc62) + ^bb1: // no predecessors + %5 = ub.poison : tensor<64x4xf32> loc(#loc63) + tt.return %5 : tensor<64x4xf32> loc(#loc63) + } loc(#loc59) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() -> tensor<1xi1> attributes {noinline = false} { + %false = arith.constant false loc(#loc65) + %cst = arith.constant dense : tensor<1xi1> 
loc(#loc65) + tt.return %cst : tensor<1xi1> loc(#loc66) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1xi1> loc(#loc67) + tt.return %0 : tensor<1xi1> loc(#loc67) + } loc(#loc64) + tt.func private @"torch._inductor.runtime.triton_helpers.max_with_index__fp32S64_4S_i32S64_4S__(2,)cconstexpr_1_"(%value: tensor<64x4xf32> loc("value"(#loc68)), %index: tensor<64x4xi32> loc("index"(#loc68))) -> (tensor<64xf32>, tensor<64xi32>) attributes {noinline = false} { + %0:2 = "tt.reduce"(%value, %index) <{axis = 1 : i32}> ({ + ^bb0(%arg2: f32 loc(unknown), %arg3: i32 loc(unknown), %arg4: f32 loc(unknown), %arg5: i32 loc(unknown)): + %3:2 = tt.call @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32_i32_fp32_i32__(%arg2, %arg3, %arg4, %arg5) : (f32, i32, f32, i32) -> (f32, i32) loc(#loc69) + tt.reduce.return %3#0, %3#1 : f32, i32 loc(#loc69) + }) : (tensor<64x4xf32>, tensor<64x4xi32>) -> (tensor<64xf32>, tensor<64xi32>) loc(#loc69) + tt.return %0#0, %0#1 : tensor<64xf32>, tensor<64xi32> loc(#loc70) + ^bb1: // no predecessors + %1 = ub.poison : tensor<64xf32> loc(#loc71) + %2 = ub.poison : tensor<64xi32> loc(#loc71) + tt.return %1, %2 : tensor<64xf32>, tensor<64xi32> loc(#loc71) + } loc(#loc68) + tt.func private @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32_i32_fp32_i32__(%a_value: f32 loc("a_value"(#loc35)), %a_index: i32 loc("a_index"(#loc35)), %b_value: f32 loc("b_value"(#loc35)), %b_index: i32 loc("b_index"(#loc35))) -> (f32, i32) attributes {noinline = false} { + %mask = arith.cmpf ogt, %a_value, %b_value : f32 loc(#loc127) + %equal = arith.cmpf oeq, %a_value, %b_value : f32 loc(#loc128) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__fp32__(%a_value) : (f32) -> i1 loc(#loc38) + %1:2 = scf.if %0 -> (i1, i1) { + %a_isnan = arith.cmpf une, %a_value, %a_value : f32 loc(#loc112) + %b_isnan = arith.cmpf une, %b_value, %b_value : f32 loc(#loc113) + %mask_3 = arith.constant true loc(#loc114) + %mask_4 = arith.xori %b_isnan, 
%mask_3 : i1 loc(#loc114) + %mask_5 = arith.andi %a_isnan, %mask_4 : i1 loc(#loc115) + %mask_6 = arith.ori %mask, %mask_5 : i1 loc(#loc129) + %equal_7 = arith.andi %a_isnan, %b_isnan : i1 loc(#loc117) + %equal_8 = arith.ori %equal, %equal_7 : i1 loc(#loc130) + scf.yield %mask_6, %equal_8 : i1, i1 loc(#loc130) + } else { + scf.yield %mask, %equal : i1, i1 loc(#loc47) + } loc(#loc39) + %mask_0 = arith.cmpi slt, %a_index, %b_index : i32 loc(#loc119) + %mask_1 = arith.andi %1#1, %mask_0 : i1 loc(#loc120) + %mask_2 = arith.ori %1#0, %mask_1 : i1 loc(#loc121) + %2 = arith.select %mask_2, %a_value, %b_value : f32 loc(#loc51) + %3 = arith.select %mask_2, %a_index, %b_index : i32 loc(#loc52) + tt.return %2, %3 : f32, i32 loc(#loc53) + ^bb1: // no predecessors + %4 = ub.poison : f32 loc(#loc54) + %5 = ub.poison : i32 loc(#loc54) + tt.return %4, %5 : f32, i32 loc(#loc54) + } loc(#loc35) + tt.func private @torch._inductor.runtime.triton_helpers.is_floating__fp32__(%x: f32 loc("x"(#loc55))) -> i1 attributes {noinline = false} { + %0 = tt.call @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32__(%x) : (f32) -> tensor<1xf32> loc(#loc56) + %true = arith.constant true loc(#loc57) + tt.return %true : i1 loc(#loc57) + ^bb1: // no predecessors + %1 = ub.poison : i1 loc(#loc58) + tt.return %1 : i1 loc(#loc58) + } loc(#loc55) + tt.func private @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32__(%x: f32 loc("x"(#loc59))) -> tensor<1xf32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc60) + %1 = arith.uitofp %0 : tensor<1xi1> to tensor<1xf32> loc(#loc61) + %2 = tt.splat %x : f32 -> tensor<1xf32> loc(#loc61) + %3 = arith.addf %2, %1 : tensor<1xf32> loc(#loc61) + tt.return %3 : tensor<1xf32> loc(#loc62) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1xf32> loc(#loc63) + tt.return %4 : tensor<1xf32> loc(#loc63) + } loc(#loc59) +} loc(#loc) +#loc1 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":19:15) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":22:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":22:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":23:36) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":23:44) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":23:23) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":24:21) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":25:27) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":25:37) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":27:19) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":28:19) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":29:55) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":30:58) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":32:40) +#loc15 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":33:31) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":34:29) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:47) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:41) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:56) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:52) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:34) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:71) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:61) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":41:38) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":43:35) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":43:54) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":44:41) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":44:66) +#loc29 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":44:8) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":45:75) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":46:20) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":47:25) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":47:36) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":47:4) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":144:21) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":145:23) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":146:19) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":146:7) +#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":147:29) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":148:29) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:31) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:27) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:16) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:27) +#loc46 = 
loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:17) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:31) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:21) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:12) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:35) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:69) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:11) +#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:4) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:29) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:11) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:4) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:30) +#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:15) +#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:11) +#loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:4) +#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":118:0) +#loc65 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:31) +#loc66 = 
loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:11) +#loc67 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:4) +#loc69 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:42) +#loc70 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:11) +#loc71 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:4) +#loc78 = loc("r0_numel"(#loc1)) +#loc79 = loc("xoffset"(#loc2)) +#loc80 = loc("xoffset"(#loc3)) +#loc81 = loc("xindex"(#loc4)) +#loc82 = loc("xindex"(#loc5)) +#loc83 = loc("xindex"(#loc6)) +#loc84 = loc("xmask"(#loc7)) +#loc85 = loc("r0_base"(#loc8)) +#loc86 = loc("r0_base"(#loc9)) +#loc87 = loc("x0"(#loc10)) +#loc88 = loc("x1"(#loc11)) +#loc89 = loc("_tmp2"(#loc12)) +#loc90 = loc("_tmp2_index"(#loc13)) +#loc91 = loc("_tmp2"(#loc14)) +#loc92 = loc("r0_index"(#loc15)) +#loc93 = loc("r0_mask"(#loc16)) +#loc94 = loc("tmp0"(#loc17)) +#loc95 = loc("tmp0"(#loc18)) +#loc96 = loc("tmp0"(#loc19)) +#loc97 = loc("tmp0"(#loc20)) +#loc98 = loc("tmp0"(#loc21)) +#loc99 = loc("tmp0"(#loc22)) +#loc100 = loc("tmp0"(#loc23)) +#loc101 = loc("_tmp2"(#loc25)) +#loc102 = loc("_tmp2"(#loc26)) +#loc103 = loc("_tmp2_index"(#loc27)) +#loc104 = loc("_tmp2_index"(#loc28)) +#loc105 = loc("tmp2"(#loc31)) +#loc110 = loc("mask"(#loc36)) +#loc111 = loc("equal"(#loc37)) +#loc112 = loc("a_isnan"(#loc40)) +#loc113 = loc("b_isnan"(#loc41)) +#loc114 = loc("mask"(#loc42)) +#loc115 = loc("mask"(#loc43)) +#loc116 = loc("mask"(#loc44)) +#loc117 = loc("equal"(#loc45)) +#loc118 = loc("equal"(#loc46)) +#loc119 = loc("mask"(#loc48)) +#loc120 = loc("mask"(#loc49)) +#loc121 = loc("mask"(#loc50)) +#loc126 = loc("_tmp2_index"(#loc91)) +#loc127 = loc("mask"(#loc110)) +#loc128 = loc("equal"(#loc111)) +#loc129 = loc("mask"(#loc116)) +#loc130 = loc("equal"(#loc118)) diff --git 
a/SpecForge-ext/cache/compiled_kernels/triton/7/25SMJXR2INGZCZI64NAKGLW77JZOIG6LAES6NHHOQFOTKNXS6PHA/triton_red_fused_argmax_1.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/7/25SMJXR2INGZCZI64NAKGLW77JZOIG6LAES6NHHOQFOTKNXS6PHA/triton_red_fused_argmax_1.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..b72d92f23462e89b8e5b9fdec578521cfa0fef73 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/25SMJXR2INGZCZI64NAKGLW77JZOIG6LAES6NHHOQFOTKNXS6PHA/triton_red_fused_argmax_1.ttgir @@ -0,0 +1,218 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [8, 4], warpsPerCTA = [8, 1], order = [1, 0]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 4], order = [0, 1]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":18:0) +#loc1 = loc(unknown) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":45:75) +#loc44 = loc("in_ptr0"(#loc)) +#loc45 = loc("out_ptr0"(#loc)) +#loc46 = loc("ks0"(#loc)) +#loc47 = loc("ks1"(#loc)) +#loc48 = loc("xnumel"(#loc)) +#loc49 = loc("r0_numel"(#loc)) +#loc85 = loc(callsite(#loc1 at #loc39)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused_argmax_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<32000> : tensor<64x1xi64, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<64x4xf32, #blocked> loc(#loc1) + %c0_i32 = arith.constant 0 : i32 
loc(#loc1) + %c32000_i32 = arith.constant 32000 : i32 loc(#loc1) + %c4_i32 = arith.constant 4 : i32 loc(#loc1) + %cst_1 = arith.constant dense : tensor<64x4xi1, #blocked> loc(#loc1) + %true = arith.constant true loc(#loc1) + %cst_2 = arith.constant dense<32000> : tensor<1x4xi32, #blocked> loc(#loc1) + %cst_3 = arith.constant dense<2147483647> : tensor<64x4xi32, #blocked> loc(#loc1) + %cst_4 = arith.constant dense<0xFF800000> : tensor<64x4xf32, #blocked> loc(#loc1) + %c64_i32 = arith.constant 64 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc50) + %xoffset_5 = arith.muli %xoffset, %c64_i32 : i32 loc(#loc51) + %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc52) + %xindex_6 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc52) + %xindex_7 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc52) + %xindex_8 = tt.expand_dims %xindex_6 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<64x1xi32, #blocked1> loc(#loc52) + %xindex_9 = tt.splat %xoffset_5 : i32 -> tensor<64x1xi32, #blocked> loc(#loc53) + %xindex_10 = tt.splat %xoffset_5 : i32 -> tensor<64x1xi32, #blocked1> loc(#loc53) + %xindex_11 = arith.addi %xindex_9, %xindex_7 : tensor<64x1xi32, #blocked> loc(#loc53) + %xindex_12 = arith.addi %xindex_10, %xindex_8 : tensor<64x1xi32, #blocked1> loc(#loc53) + %xmask = tt.splat %xnumel : i32 -> tensor<64x1xi32, #blocked> loc(#loc54) + %xmask_13 = tt.splat %xnumel : i32 -> tensor<64x1xi32, #blocked1> loc(#loc54) + %xmask_14 = arith.cmpi slt, %xindex_11, %xmask : tensor<64x1xi32, #blocked> loc(#loc54) + %xmask_15 = arith.cmpi slt, %xindex_12, %xmask_13 : tensor<64x1xi32, #blocked1> loc(#loc54) + %r0_base = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #ttg.slice<{dim = 0, parent = 
#blocked}>> loc(#loc55) + %r0_base_16 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<4xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x4xi32, #blocked> loc(#loc55) + %x0 = arith.extsi %xindex_11 : tensor<64x1xi32, #blocked> to tensor<64x1xi64, #blocked> loc(#loc56) + %x0_17 = tt.splat %ks0 : i64 -> tensor<64x1xi64, #blocked> loc(#loc56) + %x0_18 = arith.remsi %x0, %x0_17 : tensor<64x1xi64, #blocked> loc(#loc56) + %x1 = arith.divsi %x0, %x0_17 : tensor<64x1xi64, #blocked> loc(#loc57) + %tmp0 = arith.muli %x0_18, %cst : tensor<64x1xi64, #blocked> loc(#loc58) + %tmp0_19 = tt.broadcast %tmp0 : tensor<64x1xi64, #blocked> -> tensor<64x4xi64, #blocked> loc(#loc59) + %tmp0_20 = tt.splat %ks1 : i64 -> tensor<64x1xi64, #blocked> loc(#loc60) + %tmp0_21 = arith.muli %tmp0_20, %x1 : tensor<64x1xi64, #blocked> loc(#loc60) + %tmp0_22 = tt.broadcast %tmp0_21 : tensor<64x1xi64, #blocked> -> tensor<64x4xi64, #blocked> loc(#loc61) + %tmp0_23 = tt.splat %in_ptr0 : !tt.ptr -> tensor<64x4x!tt.ptr, #blocked> loc(#loc62) + %tmp0_24 = tt.broadcast %xmask_14 : tensor<64x1xi1, #blocked> -> tensor<64x4xi1, #blocked> loc(#loc63) + %_tmp2_index:2 = scf.for %r0_offset = %c0_i32 to %c32000_i32 step %c4_i32 iter_args(%_tmp2 = %cst_4, %_tmp2_index_25 = %cst_3) -> (tensor<64x4xf32, #blocked>, tensor<64x4xi32, #blocked>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x4xi32, #blocked> loc(#loc65) + %r0_index_26 = arith.addi %r0_index, %r0_base_16 : tensor<1x4xi32, #blocked> loc(#loc65) + %r0_mask = arith.cmpi slt, %r0_index_26, %cst_2 : tensor<1x4xi32, #blocked> loc(#loc66) + %tmp0_27 = arith.extsi %r0_index_26 : tensor<1x4xi32, #blocked> to tensor<1x4xi64, #blocked> loc(#loc59) + %tmp0_28 = tt.broadcast %tmp0_27 : tensor<1x4xi64, #blocked> -> tensor<64x4xi64, #blocked> loc(#loc59) + %tmp0_29 = arith.addi %tmp0_28, %tmp0_19 : tensor<64x4xi64, #blocked> loc(#loc59) + %tmp0_30 = arith.addi %tmp0_29, %tmp0_22 : tensor<64x4xi64, #blocked> loc(#loc61) + %tmp0_31 = tt.addptr 
%tmp0_23, %tmp0_30 : tensor<64x4x!tt.ptr, #blocked>, tensor<64x4xi64, #blocked> loc(#loc62) + %tmp0_32 = tt.broadcast %r0_mask : tensor<1x4xi1, #blocked> -> tensor<64x4xi1, #blocked> loc(#loc63) + %tmp0_33 = arith.andi %tmp0_32, %tmp0_24 : tensor<64x4xi1, #blocked> loc(#loc63) + %tmp0_34 = tt.load %tmp0_31, %tmp0_33, %cst_0 evictionPolicy = evict_first : tensor<64x4x!tt.ptr, #blocked> loc(#loc67) + %mask = arith.cmpf ogt, %_tmp2, %tmp0_34 : tensor<64x4xf32, #blocked> loc(#loc110) + %equal = arith.cmpf oeq, %_tmp2, %tmp0_34 : tensor<64x4xf32, #blocked> loc(#loc111) + %a_isnan = arith.cmpf une, %_tmp2, %_tmp2 : tensor<64x4xf32, #blocked> loc(#loc90) + %b_isnan = arith.cmpf une, %tmp0_34, %tmp0_34 : tensor<64x4xf32, #blocked> loc(#loc91) + %mask_35 = arith.xori %b_isnan, %cst_1 : tensor<64x4xi1, #blocked> loc(#loc92) + %mask_36 = arith.andi %a_isnan, %mask_35 : tensor<64x4xi1, #blocked> loc(#loc93) + %mask_37 = arith.ori %mask, %mask_36 : tensor<64x4xi1, #blocked> loc(#loc112) + %equal_38 = arith.andi %a_isnan, %b_isnan : tensor<64x4xi1, #blocked> loc(#loc95) + %equal_39 = arith.ori %equal, %equal_38 : tensor<64x4xi1, #blocked> loc(#loc113) + %mask_40 = tt.broadcast %r0_index_26 : tensor<1x4xi32, #blocked> -> tensor<64x4xi32, #blocked> loc(#loc97) + %mask_41 = arith.cmpi slt, %_tmp2_index_25, %mask_40 : tensor<64x4xi32, #blocked> loc(#loc97) + %mask_42 = arith.andi %equal_39, %mask_41 : tensor<64x4xi1, #blocked> loc(#loc98) + %mask_43 = arith.ori %mask_37, %mask_42 : tensor<64x4xi1, #blocked> loc(#loc99) + %5 = arith.select %mask_43, %_tmp2, %tmp0_34 : tensor<64x4xi1, #blocked>, tensor<64x4xf32, #blocked> loc(#loc80) + %6 = arith.select %mask_43, %_tmp2_index_25, %mask_40 : tensor<64x4xi1, #blocked>, tensor<64x4xi32, #blocked> loc(#loc81) + %_tmp2_44 = arith.select %tmp0_33, %5, %_tmp2 : tensor<64x4xi1, #blocked>, tensor<64x4xf32, #blocked> loc(#loc82) + %_tmp2_index_45 = arith.select %tmp0_33, %6, %_tmp2_index_25 : tensor<64x4xi1, #blocked>, tensor<64x4xi32, 
#blocked> loc(#loc83) + scf.yield %_tmp2_44, %_tmp2_index_45 : tensor<64x4xf32, #blocked>, tensor<64x4xi32, #blocked> loc(#loc37) + } loc(#loc87) + %0:2 = "tt.reduce"(%_tmp2_index#0, %_tmp2_index#1) <{axis = 1 : i32}> ({ + ^bb0(%arg6: f32 loc(callsite(#loc1 at #loc39)), %arg7: i32 loc(callsite(#loc1 at #loc39)), %arg8: f32 loc(callsite(#loc1 at #loc39)), %arg9: i32 loc(callsite(#loc1 at #loc39))): + %mask = arith.cmpf ogt, %arg6, %arg8 : f32 loc(#loc114) + %equal = arith.cmpf oeq, %arg6, %arg8 : f32 loc(#loc115) + %a_isnan = arith.cmpf une, %arg6, %arg6 : f32 loc(#loc100) + %b_isnan = arith.cmpf une, %arg8, %arg8 : f32 loc(#loc101) + %mask_25 = arith.xori %b_isnan, %true : i1 loc(#loc102) + %mask_26 = arith.andi %a_isnan, %mask_25 : i1 loc(#loc103) + %mask_27 = arith.ori %mask, %mask_26 : i1 loc(#loc116) + %equal_28 = arith.andi %a_isnan, %b_isnan : i1 loc(#loc104) + %equal_29 = arith.ori %equal, %equal_28 : i1 loc(#loc117) + %mask_30 = arith.cmpi slt, %arg7, %arg9 : i32 loc(#loc105) + %mask_31 = arith.andi %equal_29, %mask_30 : i1 loc(#loc106) + %mask_32 = arith.ori %mask_27, %mask_31 : i1 loc(#loc107) + %5 = arith.select %mask_32, %arg6, %arg8 : f32 loc(#loc108) + %6 = arith.select %mask_32, %arg7, %arg9 : i32 loc(#loc109) + tt.reduce.return %5, %6 : f32, i32 loc(#loc84) + }) : (tensor<64x4xf32, #blocked>, tensor<64x4xi32, #blocked>) -> (tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked}>>, tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>>) loc(#loc84) + %tmp2 = tt.expand_dims %0#1 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc86) + %1 = tt.splat %out_ptr0 : !tt.ptr -> tensor<64x1x!tt.ptr, #blocked1> loc(#loc41) + %2 = tt.addptr %1, %xindex_12 : tensor<64x1x!tt.ptr, #blocked1>, tensor<64x1xi32, #blocked1> loc(#loc41) + %3 = ttg.convert_layout %tmp2 : tensor<64x1xi32, #blocked> -> tensor<64x1xi32, #blocked1> loc(#loc42) + %4 = arith.extsi %3 : tensor<64x1xi32, #blocked1> to 
tensor<64x1xi64, #blocked1> loc(#loc42) + tt.store %2, %4, %xmask_15 : tensor<64x1x!tt.ptr, #blocked1> loc(#loc42) + tt.return loc(#loc43) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":22:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":22:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":23:44) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":23:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":24:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":25:37) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":27:19) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":28:19) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:47) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:41) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:56) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:52) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:34) +#loc15 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:71) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":32:40) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":33:31) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":34:29) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:61) +#loc20 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":144:21) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":41:38) +#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":145:23) +#loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":147:29) +#loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":148:29) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:31) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:27) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:16) +#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:27) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:17) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:31) +#loc31 = 
loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:21) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:12) +#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:35) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:69) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":43:54) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":44:66) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":44:8) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:42) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":46:20) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":47:25) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":47:36) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":47:4) +#loc50 = loc("xoffset"(#loc2)) +#loc51 = loc("xoffset"(#loc3)) +#loc52 = loc("xindex"(#loc4)) +#loc53 = loc("xindex"(#loc5)) +#loc54 = loc("xmask"(#loc6)) +#loc55 = loc("r0_base"(#loc7)) +#loc56 = loc("x0"(#loc8)) +#loc57 = loc("x1"(#loc9)) +#loc58 = loc("tmp0"(#loc10)) +#loc59 = loc("tmp0"(#loc11)) +#loc60 = loc("tmp0"(#loc12)) +#loc61 = loc("tmp0"(#loc13)) +#loc62 = loc("tmp0"(#loc14)) +#loc63 = loc("tmp0"(#loc15)) +#loc64 = loc("_tmp2"(#loc16)) +#loc65 = 
loc("r0_index"(#loc17)) +#loc66 = loc("r0_mask"(#loc18)) +#loc67 = loc("tmp0"(#loc19)) +#loc68 = loc("mask"(#loc20)) +#loc69 = loc("equal"(#loc22)) +#loc70 = loc("a_isnan"(#loc23)) +#loc71 = loc("b_isnan"(#loc24)) +#loc72 = loc("mask"(#loc25)) +#loc73 = loc("mask"(#loc26)) +#loc74 = loc("mask"(#loc27)) +#loc75 = loc("equal"(#loc28)) +#loc76 = loc("equal"(#loc29)) +#loc77 = loc("mask"(#loc30)) +#loc78 = loc("mask"(#loc31)) +#loc79 = loc("mask"(#loc32)) +#loc80 = loc(callsite(#loc33 at #loc21)) +#loc81 = loc(callsite(#loc34 at #loc21)) +#loc82 = loc("_tmp2"(#loc35)) +#loc83 = loc("_tmp2_index"(#loc36)) +#loc84 = loc(callsite(#loc38 at #loc39)) +#loc86 = loc("tmp2"(#loc40)) +#loc87 = loc("_tmp2_index"(#loc64)) +#loc88 = loc("mask"(#loc68)) +#loc89 = loc("equal"(#loc69)) +#loc90 = loc(callsite(#loc70 at #loc21)) +#loc91 = loc(callsite(#loc71 at #loc21)) +#loc92 = loc(callsite(#loc72 at #loc21)) +#loc93 = loc(callsite(#loc73 at #loc21)) +#loc94 = loc("mask"(#loc74)) +#loc95 = loc(callsite(#loc75 at #loc21)) +#loc96 = loc("equal"(#loc76)) +#loc97 = loc(callsite(#loc77 at #loc21)) +#loc98 = loc(callsite(#loc78 at #loc21)) +#loc99 = loc(callsite(#loc79 at #loc21)) +#loc100 = loc(callsite(#loc70 at #loc84)) +#loc101 = loc(callsite(#loc71 at #loc84)) +#loc102 = loc(callsite(#loc72 at #loc84)) +#loc103 = loc(callsite(#loc73 at #loc84)) +#loc104 = loc(callsite(#loc75 at #loc84)) +#loc105 = loc(callsite(#loc77 at #loc84)) +#loc106 = loc(callsite(#loc78 at #loc84)) +#loc107 = loc(callsite(#loc79 at #loc84)) +#loc108 = loc(callsite(#loc33 at #loc84)) +#loc109 = loc(callsite(#loc34 at #loc84)) +#loc110 = loc(callsite(#loc88 at #loc21)) +#loc111 = loc(callsite(#loc89 at #loc21)) +#loc112 = loc(callsite(#loc94 at #loc21)) +#loc113 = loc(callsite(#loc96 at #loc21)) +#loc114 = loc(callsite(#loc88 at #loc84)) +#loc115 = loc(callsite(#loc89 at #loc84)) +#loc116 = loc(callsite(#loc94 at #loc84)) +#loc117 = loc(callsite(#loc96 at #loc84)) diff --git 
a/SpecForge-ext/cache/compiled_kernels/triton/7/25SMJXR2INGZCZI64NAKGLW77JZOIG6LAES6NHHOQFOTKNXS6PHA/triton_red_fused_argmax_1.ttir b/SpecForge-ext/cache/compiled_kernels/triton/7/25SMJXR2INGZCZI64NAKGLW77JZOIG6LAES6NHHOQFOTKNXS6PHA/triton_red_fused_argmax_1.ttir new file mode 100644 index 0000000000000000000000000000000000000000..0a6ac86584dd65d9ab16abb4d55b45dff1c163af --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/25SMJXR2INGZCZI64NAKGLW77JZOIG6LAES6NHHOQFOTKNXS6PHA/triton_red_fused_argmax_1.ttir @@ -0,0 +1,217 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":18:0) +#loc1 = loc(unknown) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":45:75) +#loc48 = loc("in_ptr0"(#loc)) +#loc49 = loc("out_ptr0"(#loc)) +#loc50 = loc("ks0"(#loc)) +#loc51 = loc("ks1"(#loc)) +#loc52 = loc("xnumel"(#loc)) +#loc53 = loc("r0_numel"(#loc)) +#loc54 = loc(callsite(#loc1 at #loc2)) +module { + tt.func public @triton_red_fused_argmax_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %true = arith.constant true loc(#loc54) + %cst = arith.constant dense : tensor<64x4xi1> loc(#loc1) + %c4_i32 = arith.constant 4 : i32 loc(#loc3) + %c32000_i32 = arith.constant 32000 : i32 loc(#loc3) + %c0_i32 = arith.constant 0 : i32 loc(#loc3) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<64x4xf32> loc(#loc1) + %cst_1 = arith.constant dense<32000> : tensor<64x1xi64> loc(#loc1) + %cst_2 = arith.constant dense<32000> : tensor<1x4xi32> loc(#loc1) + %_tmp2_index = arith.constant dense<2147483647> : tensor<64x4xi32> loc(#loc55) + %_tmp2 = 
arith.constant dense<0xFF800000> : tensor<64x4xf32> loc(#loc56) + %c64_i32 = arith.constant 64 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc57) + %xoffset_3 = arith.muli %xoffset, %c64_i32 : i32 loc(#loc58) + %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc59) + %xindex_4 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc60) + %xindex_5 = tt.splat %xoffset_3 : i32 -> tensor<64x1xi32> loc(#loc61) + %xindex_6 = arith.addi %xindex_5, %xindex_4 : tensor<64x1xi32> loc(#loc61) + %xmask = tt.splat %xnumel : i32 -> tensor<64x1xi32> loc(#loc62) + %xmask_7 = arith.cmpi slt, %xindex_6, %xmask : tensor<64x1xi32> loc(#loc62) + %r0_base = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32> loc(#loc63) + %r0_base_8 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<4xi32> -> tensor<1x4xi32> loc(#loc64) + %x0 = arith.extsi %xindex_6 : tensor<64x1xi32> to tensor<64x1xi64> loc(#loc65) + %x0_9 = tt.splat %ks0 : i64 -> tensor<64x1xi64> loc(#loc65) + %x0_10 = arith.remsi %x0, %x0_9 : tensor<64x1xi64> loc(#loc65) + %x1 = arith.divsi %x0, %x0_9 : tensor<64x1xi64> loc(#loc66) + %_tmp2_index_11:2 = scf.for %r0_offset = %c0_i32 to %c32000_i32 step %c4_i32 iter_args(%_tmp2_12 = %_tmp2, %_tmp2_index_13 = %_tmp2_index) -> (tensor<64x4xf32>, tensor<64x4xi32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x4xi32> loc(#loc68) + %r0_index_14 = arith.addi %r0_index, %r0_base_8 : tensor<1x4xi32> loc(#loc68) + %r0_mask = arith.cmpi slt, %r0_index_14, %cst_2 : tensor<1x4xi32> loc(#loc69) + %tmp0 = arith.muli %x0_10, %cst_1 : tensor<64x1xi64> loc(#loc70) + %tmp0_15 = arith.extsi %r0_index_14 : tensor<1x4xi32> to tensor<1x4xi64> loc(#loc71) + %tmp0_16 = tt.broadcast %tmp0_15 : tensor<1x4xi64> -> tensor<64x4xi64> loc(#loc71) + %tmp0_17 = tt.broadcast %tmp0 : tensor<64x1xi64> -> tensor<64x4xi64> loc(#loc71) + %tmp0_18 = arith.addi %tmp0_16, %tmp0_17 : tensor<64x4xi64> loc(#loc71) + %tmp0_19 
= tt.splat %ks1 : i64 -> tensor<64x1xi64> loc(#loc72) + %tmp0_20 = arith.muli %tmp0_19, %x1 : tensor<64x1xi64> loc(#loc72) + %tmp0_21 = tt.broadcast %tmp0_20 : tensor<64x1xi64> -> tensor<64x4xi64> loc(#loc73) + %tmp0_22 = arith.addi %tmp0_18, %tmp0_21 : tensor<64x4xi64> loc(#loc73) + %tmp0_23 = tt.splat %in_ptr0 : !tt.ptr -> tensor<64x4x!tt.ptr> loc(#loc74) + %tmp0_24 = tt.addptr %tmp0_23, %tmp0_22 : tensor<64x4x!tt.ptr>, tensor<64x4xi64> loc(#loc74) + %tmp0_25 = tt.broadcast %r0_mask : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc75) + %tmp0_26 = tt.broadcast %xmask_7 : tensor<64x1xi1> -> tensor<64x4xi1> loc(#loc75) + %tmp0_27 = arith.andi %tmp0_25, %tmp0_26 : tensor<64x4xi1> loc(#loc75) + %tmp0_28 = tt.load %tmp0_24, %tmp0_27, %cst_0 evictionPolicy = evict_first : tensor<64x4x!tt.ptr> loc(#loc76) + %mask = arith.cmpf ogt, %_tmp2_12, %tmp0_28 : tensor<64x4xf32> loc(#loc118) + %equal = arith.cmpf oeq, %_tmp2_12, %tmp0_28 : tensor<64x4xf32> loc(#loc119) + %a_isnan = arith.cmpf une, %_tmp2_12, %_tmp2_12 : tensor<64x4xf32> loc(#loc98) + %b_isnan = arith.cmpf une, %tmp0_28, %tmp0_28 : tensor<64x4xf32> loc(#loc99) + %mask_29 = arith.xori %b_isnan, %cst : tensor<64x4xi1> loc(#loc100) + %mask_30 = arith.andi %a_isnan, %mask_29 : tensor<64x4xi1> loc(#loc101) + %mask_31 = arith.ori %mask, %mask_30 : tensor<64x4xi1> loc(#loc120) + %equal_32 = arith.andi %a_isnan, %b_isnan : tensor<64x4xi1> loc(#loc103) + %equal_33 = arith.ori %equal, %equal_32 : tensor<64x4xi1> loc(#loc121) + %mask_34 = tt.broadcast %r0_index_14 : tensor<1x4xi32> -> tensor<64x4xi32> loc(#loc105) + %mask_35 = arith.cmpi slt, %_tmp2_index_13, %mask_34 : tensor<64x4xi32> loc(#loc105) + %mask_36 = arith.andi %equal_33, %mask_35 : tensor<64x4xi1> loc(#loc106) + %mask_37 = arith.ori %mask_31, %mask_36 : tensor<64x4xi1> loc(#loc107) + %4 = arith.select %mask_37, %_tmp2_12, %tmp0_28 : tensor<64x4xi1>, tensor<64x4xf32> loc(#loc89) + %5 = arith.select %mask_37, %_tmp2_index_13, %mask_34 : tensor<64x4xi1>, 
tensor<64x4xi32> loc(#loc90) + %_tmp2_38 = arith.select %tmp0_27, %4, %_tmp2_12 : tensor<64x4xi1>, tensor<64x4xf32> loc(#loc91) + %_tmp2_index_39 = arith.select %tmp0_27, %5, %_tmp2_index_13 : tensor<64x4xi1>, tensor<64x4xi32> loc(#loc92) + scf.yield %_tmp2_38, %_tmp2_index_39 : tensor<64x4xf32>, tensor<64x4xi32> loc(#loc42) + } loc(#loc95) + %0:2 = "tt.reduce"(%_tmp2_index_11#0, %_tmp2_index_11#1) <{axis = 1 : i32}> ({ + ^bb0(%arg6: f32 loc(callsite(#loc1 at #loc2)), %arg7: i32 loc(callsite(#loc1 at #loc2)), %arg8: f32 loc(callsite(#loc1 at #loc2)), %arg9: i32 loc(callsite(#loc1 at #loc2))): + %mask = arith.cmpf ogt, %arg6, %arg8 : f32 loc(#loc122) + %equal = arith.cmpf oeq, %arg6, %arg8 : f32 loc(#loc123) + %a_isnan = arith.cmpf une, %arg6, %arg6 : f32 loc(#loc108) + %b_isnan = arith.cmpf une, %arg8, %arg8 : f32 loc(#loc109) + %mask_12 = arith.xori %b_isnan, %true : i1 loc(#loc110) + %mask_13 = arith.andi %a_isnan, %mask_12 : i1 loc(#loc111) + %mask_14 = arith.ori %mask, %mask_13 : i1 loc(#loc124) + %equal_15 = arith.andi %a_isnan, %b_isnan : i1 loc(#loc112) + %equal_16 = arith.ori %equal, %equal_15 : i1 loc(#loc125) + %mask_17 = arith.cmpi slt, %arg7, %arg9 : i32 loc(#loc113) + %mask_18 = arith.andi %equal_16, %mask_17 : i1 loc(#loc114) + %mask_19 = arith.ori %mask_14, %mask_18 : i1 loc(#loc115) + %4 = arith.select %mask_19, %arg6, %arg8 : f32 loc(#loc116) + %5 = arith.select %mask_19, %arg7, %arg9 : i32 loc(#loc117) + tt.reduce.return %4, %5 : f32, i32 loc(#loc93) + }) : (tensor<64x4xf32>, tensor<64x4xi32>) -> (tensor<64xf32>, tensor<64xi32>) loc(#loc93) + %tmp2 = tt.expand_dims %0#1 {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc94) + %1 = tt.splat %out_ptr0 : !tt.ptr -> tensor<64x1x!tt.ptr> loc(#loc45) + %2 = tt.addptr %1, %xindex_6 : tensor<64x1x!tt.ptr>, tensor<64x1xi32> loc(#loc45) + %3 = arith.extsi %tmp2 : tensor<64x1xi32> to tensor<64x1xi64> loc(#loc46) + tt.store %2, %3, %xmask_7 : tensor<64x1x!tt.ptr> loc(#loc46) + tt.return loc(#loc47) 
+ } loc(#loc) +} loc(#loc) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":32:40) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":30:58) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":29:55) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":22:28) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":22:33) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":23:36) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":23:44) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":23:23) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":24:21) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":25:27) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":25:37) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":27:19) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":28:19) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":33:31) +#loc17 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":34:29) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:47) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:41) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:56) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:52) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:34) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:71) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:61) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":144:21) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":41:38) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":145:23) +#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":147:29) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":148:29) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:31) +#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:27) +#loc32 = 
loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:16) +#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:27) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:17) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:31) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:21) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:12) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:35) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:69) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":43:54) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":44:66) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":44:8) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:42) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":46:20) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":47:25) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":47:36) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":47:4) +#loc55 
= loc("_tmp2_index"(#loc4)) +#loc56 = loc("_tmp2"(#loc5)) +#loc57 = loc("xoffset"(#loc6)) +#loc58 = loc("xoffset"(#loc7)) +#loc59 = loc("xindex"(#loc8)) +#loc60 = loc("xindex"(#loc9)) +#loc61 = loc("xindex"(#loc10)) +#loc62 = loc("xmask"(#loc11)) +#loc63 = loc("r0_base"(#loc12)) +#loc64 = loc("r0_base"(#loc13)) +#loc65 = loc("x0"(#loc14)) +#loc66 = loc("x1"(#loc15)) +#loc67 = loc("_tmp2"(#loc3)) +#loc68 = loc("r0_index"(#loc16)) +#loc69 = loc("r0_mask"(#loc17)) +#loc70 = loc("tmp0"(#loc18)) +#loc71 = loc("tmp0"(#loc19)) +#loc72 = loc("tmp0"(#loc20)) +#loc73 = loc("tmp0"(#loc21)) +#loc74 = loc("tmp0"(#loc22)) +#loc75 = loc("tmp0"(#loc23)) +#loc76 = loc("tmp0"(#loc24)) +#loc77 = loc("mask"(#loc25)) +#loc78 = loc("equal"(#loc27)) +#loc79 = loc("a_isnan"(#loc28)) +#loc80 = loc("b_isnan"(#loc29)) +#loc81 = loc("mask"(#loc30)) +#loc82 = loc("mask"(#loc31)) +#loc83 = loc("mask"(#loc32)) +#loc84 = loc("equal"(#loc33)) +#loc85 = loc("equal"(#loc34)) +#loc86 = loc("mask"(#loc35)) +#loc87 = loc("mask"(#loc36)) +#loc88 = loc("mask"(#loc37)) +#loc89 = loc(callsite(#loc38 at #loc26)) +#loc90 = loc(callsite(#loc39 at #loc26)) +#loc91 = loc("_tmp2"(#loc40)) +#loc92 = loc("_tmp2_index"(#loc41)) +#loc93 = loc(callsite(#loc43 at #loc2)) +#loc94 = loc("tmp2"(#loc44)) +#loc95 = loc("_tmp2_index"(#loc67)) +#loc96 = loc("mask"(#loc77)) +#loc97 = loc("equal"(#loc78)) +#loc98 = loc(callsite(#loc79 at #loc26)) +#loc99 = loc(callsite(#loc80 at #loc26)) +#loc100 = loc(callsite(#loc81 at #loc26)) +#loc101 = loc(callsite(#loc82 at #loc26)) +#loc102 = loc("mask"(#loc83)) +#loc103 = loc(callsite(#loc84 at #loc26)) +#loc104 = loc("equal"(#loc85)) +#loc105 = loc(callsite(#loc86 at #loc26)) +#loc106 = loc(callsite(#loc87 at #loc26)) +#loc107 = loc(callsite(#loc88 at #loc26)) +#loc108 = loc(callsite(#loc79 at #loc93)) +#loc109 = loc(callsite(#loc80 at #loc93)) +#loc110 = loc(callsite(#loc81 at #loc93)) +#loc111 = loc(callsite(#loc82 at #loc93)) +#loc112 = loc(callsite(#loc84 at #loc93)) +#loc113 = 
loc(callsite(#loc86 at #loc93)) +#loc114 = loc(callsite(#loc87 at #loc93)) +#loc115 = loc(callsite(#loc88 at #loc93)) +#loc116 = loc(callsite(#loc38 at #loc93)) +#loc117 = loc(callsite(#loc39 at #loc93)) +#loc118 = loc(callsite(#loc96 at #loc26)) +#loc119 = loc(callsite(#loc97 at #loc26)) +#loc120 = loc(callsite(#loc102 at #loc26)) +#loc121 = loc(callsite(#loc104 at #loc26)) +#loc122 = loc(callsite(#loc96 at #loc93)) +#loc123 = loc(callsite(#loc97 at #loc93)) +#loc124 = loc(callsite(#loc102 at #loc93)) +#loc125 = loc(callsite(#loc104 at #loc93)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/__grp__triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.json b/SpecForge-ext/cache/compiled_kernels/triton/7/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/__grp__triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.json new file mode 100644 index 0000000000000000000000000000000000000000..4a395d92862260514c867f6809c88723486b4cb8 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/__grp__triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.source", "triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ttir": 
"/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ttir", "triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ttgir", "triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.llir", "triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ptx", "triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.cubin", "triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.json"}} \ No newline at end of file diff --git 
a/SpecForge-ext/cache/compiled_kernels/triton/7/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.cubin b/SpecForge-ext/cache/compiled_kernels/triton/7/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.cubin new file mode 100644 index 0000000000000000000000000000000000000000..5f65e88dac0e64f9e184dddc25ffc80932a4d68c Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/7/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.json b/SpecForge-ext/cache/compiled_kernels/triton/7/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.json new file mode 100644 index 0000000000000000000000000000000000000000..d4787571dad5bed9532fbcbab0e4c45a9b958637 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.json @@ -0,0 +1 @@ +{"hash": "fe376ba41d2f05ff2bb7c13c37b09b85f38cb341d26f28c38925cd0f032bd098", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 2, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], 
"deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.llir b/SpecForge-ext/cache/compiled_kernels/triton/7/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.llir new file mode 100644 index 0000000000000000000000000000000000000000..0dab98a4977699d57d754e2d65b2cfea4519628c --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.llir @@ -0,0 +1,266 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@assertFunc_0 = internal constant [8 x i8] c"unknown\00" +@assertFile_0 = internal constant [114 x i8] c"/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py\00" +@assertMessage_0 = internal constant [90 x i8] c"index out of 
bounds: 0 <= tmp15 < 1 + (triton_helpers.div_floor_integer(127 + ks1, 128))\00" + +; Function Attrs: noreturn +declare !dbg !5 void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr #0 + +define ptx_kernel void @triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, i64 %5, i64 %6, i32 %7, i32 %8, ptr addrspace(1) readnone captures(none) %9, ptr addrspace(1) readnone captures(none) %10) local_unnamed_addr #1 !dbg !9 { + %12 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !10 + %13 = icmp samesign ult i32 %12, 32, !dbg !11 + %14 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !12 + %15 = and i32 %14, 31, !dbg !12 + %16 = zext nneg i32 %12 to i64, !dbg !13 + %17 = mul i64 %5, %16, !dbg !13 + %18 = icmp sgt i32 %8, 0, !dbg !14 + br i1 %18, label %.lr.ph, label %._crit_edge, !dbg !14 + +.lr.ph: ; preds = %11 + %19 = getelementptr i32, ptr addrspace(1) %0, i64 %17 + br i1 %13, label %.lr.ph.split, label %.lr.ph.split.us + +.lr.ph.split.us: ; preds = %.lr.ph, %.lr.ph.split.us + %20 = phi i32 [ %26, %.lr.ph.split.us ], [ 0, %.lr.ph ] + %21 = or disjoint i32 %20, %15, !dbg !15 + %22 = sext i32 %21 to i64, !dbg !16 + %23 = getelementptr i32, ptr addrspace(1) %19, i64 %22, !dbg !17 + %24 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #5, !dbg !18 + %25 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_first.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %23, i64 %24, i1 false) #5, !dbg !18 + %26 = add i32 %20, 32, !dbg !14 + %27 = icmp slt i32 %26, %8, !dbg !14 + br i1 %27, label %.lr.ph.split.us, label %._crit_edge, !dbg !14 + +.lr.ph.split: ; preds = %.lr.ph, %.lr.ph.split + %28 = phi i64 [ %36, %.lr.ph.split ], [ 0, %.lr.ph ] + %29 = phi i32 [ %37, %.lr.ph.split ], [ 0, %.lr.ph ] + 
%30 = or disjoint i32 %29, %15, !dbg !15 + %31 = icmp slt i32 %30, %8, !dbg !19 + %32 = sext i32 %30 to i64, !dbg !16 + %33 = getelementptr i32, ptr addrspace(1) %19, i64 %32, !dbg !17 + %34 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #5, !dbg !18 + %35 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_first.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %33, i64 %34, i1 %31) #5, !dbg !18 + %narrow16 = select i1 %31, i32 %35, i32 0, !dbg !20 + %spec.select = sext i32 %narrow16 to i64, !dbg !20 + %36 = add i64 %28, %spec.select, !dbg !20 + %37 = add i32 %29, 32, !dbg !14 + %38 = icmp slt i32 %37, %8, !dbg !14 + br i1 %38, label %.lr.ph.split, label %._crit_edge, !dbg !14 + +._crit_edge: ; preds = %.lr.ph.split.us, %.lr.ph.split, %11 + %.lcssa = phi i64 [ 0, %11 ], [ %36, %.lr.ph.split ], [ 0, %.lr.ph.split.us ], !dbg !21 + %extelt.offset = lshr i64 %.lcssa, 32, !dbg !22 + %39 = trunc nuw i64 %extelt.offset to i32, !dbg !22 + %40 = trunc i64 %.lcssa to i32, !dbg !22 + %41 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %40, i32 16, i32 31), !dbg !22 + %42 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %39, i32 16, i32 31), !dbg !22 + %43 = insertelement <2 x i32> poison, i32 %41, i64 0, !dbg !22 + %44 = insertelement <2 x i32> %43, i32 %42, i64 1, !dbg !22 + %45 = bitcast <2 x i32> %44 to i64, !dbg !22 + %46 = add i64 %.lcssa, %45, !dbg !26 + %extelt.offset3 = lshr i64 %46, 32, !dbg !22 + %47 = trunc nuw i64 %extelt.offset3 to i32, !dbg !22 + %48 = trunc i64 %46 to i32, !dbg !22 + %49 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %48, i32 8, i32 31), !dbg !22 + %50 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %47, i32 8, i32 31), !dbg !22 + %51 = insertelement <2 x i32> poison, i32 %49, i64 0, !dbg !22 + %52 = insertelement <2 x i32> %51, i32 %50, i64 1, !dbg !22 + %53 = bitcast <2 x i32> %52 to 
i64, !dbg !22 + %54 = add i64 %46, %53, !dbg !26 + %extelt.offset4 = lshr i64 %54, 32, !dbg !22 + %55 = trunc nuw i64 %extelt.offset4 to i32, !dbg !22 + %56 = trunc i64 %54 to i32, !dbg !22 + %57 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %56, i32 4, i32 31), !dbg !22 + %58 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %55, i32 4, i32 31), !dbg !22 + %59 = insertelement <2 x i32> poison, i32 %57, i64 0, !dbg !22 + %60 = insertelement <2 x i32> %59, i32 %58, i64 1, !dbg !22 + %61 = bitcast <2 x i32> %60 to i64, !dbg !22 + %62 = add i64 %54, %61, !dbg !26 + %extelt.offset5 = lshr i64 %62, 32, !dbg !22 + %63 = trunc nuw i64 %extelt.offset5 to i32, !dbg !22 + %64 = trunc i64 %62 to i32, !dbg !22 + %65 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %64, i32 2, i32 31), !dbg !22 + %66 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %63, i32 2, i32 31), !dbg !22 + %67 = insertelement <2 x i32> poison, i32 %65, i64 0, !dbg !22 + %68 = insertelement <2 x i32> %67, i32 %66, i64 1, !dbg !22 + %69 = bitcast <2 x i32> %68 to i64, !dbg !22 + %70 = add i64 %62, %69, !dbg !26 + %extelt.offset6 = lshr i64 %70, 32, !dbg !22 + %71 = trunc nuw i64 %extelt.offset6 to i32, !dbg !22 + %72 = trunc i64 %70 to i32, !dbg !22 + %73 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %72, i32 1, i32 31), !dbg !22 + %74 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %71, i32 1, i32 31), !dbg !22 + %75 = insertelement <2 x i32> poison, i32 %73, i64 0, !dbg !22 + %76 = insertelement <2 x i32> %75, i32 %74, i64 1, !dbg !22 + %77 = bitcast <2 x i32> %76 to i64, !dbg !22 + %78 = add i64 %70, %77, !dbg !26 + %79 = trunc i64 %78 to i32, !dbg !27 + %80 = getelementptr i32, ptr addrspace(1) %2, i64 %16, !dbg !28 + %81 = and i32 %14, 32, !dbg !29 + %82 = icmp eq i32 %81, 0, !dbg !29 + %83 = and i32 %14, 63, !dbg !29 + %84 = icmp eq i32 %83, 0, !dbg !29 + %85 = and i1 %13, %84, !dbg !29 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 
+ 0 ], { $0 };", "r,l,b"(i32 %79, ptr addrspace(1) %80, i1 %85) #5, !dbg !29 + %86 = icmp slt i64 %5, 2, !dbg !30 + %87 = icmp sgt i64 %5, 1, !dbg !31 + %88 = select i1 %87, i64 %5, i64 0, !dbg !32 + %89 = zext i1 %86 to i64, !dbg !33 + %90 = add i64 %88, %89, !dbg !34 + %91 = mul i64 %90, %16, !dbg !35 + %92 = add i64 %5, 1, !dbg !36 + %93 = add i64 %6, 127, !dbg !37 + %94 = sdiv i64 %93, 128, !dbg !38 + %95 = and i64 %93, 127, !dbg !42 + %.not = icmp ne i64 %95, 0, !dbg !42 + %96 = icmp slt i64 %93, 0, !dbg !43 + %narrow = and i1 %96, %.not, !dbg !44 + %97 = sext i1 %narrow to i64, !dbg !44 + %98 = add nsw i64 %94, %97, !dbg !44 + br i1 %18, label %.lr.ph14, label %._crit_edge15, !dbg !45 + +.lr.ph14: ; preds = %._crit_edge, %119 + %99 = phi i32 [ %131, %119 ], [ 0, %._crit_edge ] + %100 = or disjoint i32 %99, %15, !dbg !46 + %101 = icmp slt i32 %100, %8, !dbg !47 + %102 = sext i32 %100 to i64, !dbg !48 + %103 = add i64 %91, %102, !dbg !48 + %104 = getelementptr i64, ptr addrspace(1) %1, i64 %103, !dbg !49 + %105 = and i1 %13, %101, !dbg !50 + %106 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #5, !dbg !51 + %107 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_first.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %104, i64 %106, i1 %105) #5, !dbg !51 + %108 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #5, !dbg !51 + %109 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_first.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %104, i64 %108, i1 %105) #5, !dbg !51 + %110 = icmp slt i32 %100, %79, !dbg !52 + %sext7 = shl i64 %109, 32, !dbg !53 + %111 = ashr exact i64 %sext7, 32, !dbg !53 + %112 = select i1 %110, i64 %111, i64 %5, !dbg !53 + %113 = icmp slt i64 %112, 0, !dbg !54 + %114 = select i1 %113, i64 %92, i64 0, !dbg 
!55 + %115 = add i64 %114, %112, !dbg !55 + %116 = icmp slt i64 %115, 0, !dbg !56 + %117 = icmp sgt i64 %115, %98, !dbg !57 + %.not12 = or i1 %116, %117, !dbg !58 + %.not9 = and i1 %105, %.not12, !dbg !59 + br i1 %.not9, label %118, label %119, !dbg !59 + +118: ; preds = %.lr.ph14 + tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 59, ptr nonnull @assertFunc_0, i64 1), !dbg !59 + unreachable, !dbg !59 + +119: ; preds = %.lr.ph14 + %sext = shl i64 %107, 32, !dbg !53 + %120 = ashr exact i64 %sext, 32, !dbg !53 + %121 = select i1 %110, i64 %120, i64 %5, !dbg !53 + %122 = icmp slt i64 %121, 0, !dbg !54 + %123 = select i1 %122, i64 %92, i64 0, !dbg !55 + %124 = trunc i64 %109 to i32, !dbg !60 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !59 + %125 = getelementptr i32, ptr addrspace(1) %3, i64 %103, !dbg !61 + %126 = and i1 %82, %105, !dbg !62 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %124, ptr addrspace(1) %125, i1 %126) #5, !dbg !62 + %127 = getelementptr i32, ptr addrspace(1) %4, i64 %121, !dbg !63 + %128 = getelementptr i32, ptr addrspace(1) %127, i64 %123, !dbg !63 + %129 = getelementptr i32, ptr addrspace(1) %128, i64 %16, !dbg !63 + %130 = getelementptr i32, ptr addrspace(1) %129, i64 %17, !dbg !63 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 1, ptr addrspace(1) %130, i1 %126) #5, !dbg !64 + %131 = add i32 %99, 32, !dbg !45 + %132 = icmp slt i32 %131, %8, !dbg !45 + br i1 %132, label %.lr.ph14, label %._crit_edge15, !dbg !45 + +._crit_edge15: ; preds = %119, %._crit_edge + ret void, !dbg !65 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #2 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) 
i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #3 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #4 + +attributes #0 = { noreturn } +attributes #1 = { "nvvm.reqntid"="64" } +attributes #2 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #3 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #4 = { convergent nocallback nounwind } +attributes #5 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} +!llvm.ident = !{!4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!5 = !DISubprogram(name: "__assertfail", linkageName: "__assertfail", scope: !6, file: !6, type: !7, spFlags: DISPFlagOptimized) +!6 = !DIFile(filename: "", directory: "") +!7 = !DISubroutineType(cc: DW_CC_normal, types: !8) +!8 = !{} +!9 = distinct !DISubprogram(name: "triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2", linkageName: "triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2", scope: !1, file: !1, line: 18, type: !7, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!10 = !DILocation(line: 22, column: 28, scope: !9) +!11 = !DILocation(line: 24, column: 21, scope: !9) +!12 = !DILocation(line: 25, column: 37, scope: !9) +!13 = !DILocation(line: 35, column: 45, scope: !9) +!14 = 
!DILocation(line: 29, column: 40, scope: !9) +!15 = !DILocation(line: 30, column: 31, scope: !9) +!16 = !DILocation(line: 35, column: 41, scope: !9) +!17 = !DILocation(line: 35, column: 34, scope: !9) +!18 = !DILocation(line: 35, column: 50, scope: !9) +!19 = !DILocation(line: 31, column: 29, scope: !9) +!20 = !DILocation(line: 39, column: 48, scope: !9) +!21 = !DILocation(line: 28, column: 43, scope: !9) +!22 = !DILocation(line: 291, column: 36, scope: !23, inlinedAt: !25) +!23 = distinct !DILexicalBlockFile(scope: !9, file: !24, discriminator: 0) +!24 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language") +!25 = !DILocation(line: 40, column: 25, scope: !9) +!26 = !DILocation(line: 261, column: 15, scope: !23, inlinedAt: !25) +!27 = !DILocation(line: 41, column: 19, scope: !9) +!28 = !DILocation(line: 42, column: 25, scope: !9) +!29 = !DILocation(line: 42, column: 36, scope: !9) +!30 = !DILocation(line: 49, column: 60, scope: !9) +!31 = !DILocation(line: 49, column: 86, scope: !9) +!32 = !DILocation(line: 49, column: 77, scope: !9) +!33 = !DILocation(line: 49, scope: !9) +!34 = !DILocation(line: 49, column: 68, scope: !9) +!35 = !DILocation(line: 49, column: 45, scope: !9) +!36 = !DILocation(line: 55, column: 20, scope: !9) +!37 = !DILocation(line: 59, column: 94, scope: !9) +!38 = !DILocation(line: 72, column: 16, scope: !39, inlinedAt: !41) +!39 = distinct !DILexicalBlockFile(scope: !9, file: !40, discriminator: 0) +!40 = !DIFile(filename: "triton_helpers.py", directory: "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime") +!41 = !DILocation(line: 59, column: 100, scope: !9) +!42 = !DILocation(line: 74, column: 34, scope: !39, inlinedAt: !41) +!43 = !DILocation(line: 75, column: 25, scope: !39, inlinedAt: !41) +!44 = !DILocation(line: 75, column: 47, scope: !39, inlinedAt: !41) +!45 = !DILocation(line: 43, column: 40, scope: !9) +!46 = !DILocation(line: 44, column: 31, scope: !9) 
+!47 = !DILocation(line: 45, column: 29, scope: !9) +!48 = !DILocation(line: 49, column: 41, scope: !9) +!49 = !DILocation(line: 49, column: 34, scope: !9) +!50 = !DILocation(line: 49, column: 103, scope: !9) +!51 = !DILocation(line: 49, column: 93, scope: !9) +!52 = !DILocation(line: 52, column: 22, scope: !9) +!53 = !DILocation(line: 54, column: 37, scope: !9) +!54 = !DILocation(line: 57, column: 24, scope: !9) +!55 = !DILocation(line: 58, column: 39, scope: !9) +!56 = !DILocation(line: 59, column: 32, scope: !9) +!57 = !DILocation(line: 59, column: 50, scope: !9) +!58 = !DILocation(line: 59, column: 112, scope: !9) +!59 = !DILocation(line: 59, column: 130, scope: !9) +!60 = !DILocation(line: 50, column: 23, scope: !9) +!61 = !DILocation(line: 61, column: 29, scope: !9) +!62 = !DILocation(line: 61, column: 94, scope: !9) +!63 = !DILocation(line: 62, column: 29, scope: !9) +!64 = !DILocation(line: 62, column: 95, scope: !9) +!65 = !DILocation(line: 43, column: 4, scope: !9) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ptx b/SpecForge-ext/cache/compiled_kernels/triton/7/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ptx new file mode 100644 index 0000000000000000000000000000000000000000..e09aad5f426cef037307507f327d35731a98ee93 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ptx @@ -0,0 +1,640 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2 // -- Begin function 
triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2 +.extern .func __assertfail +( + .param .b64 __assertfail_param_0, + .param .b64 __assertfail_param_1, + .param .b32 __assertfail_param_2, + .param .b64 __assertfail_param_3, + .param .b64 __assertfail_param_4 +) +.noreturn; +.global .align 1 .b8 assertFunc_0[8] = {117, 110, 107, 110, 111, 119, 110}; +.global .align 1 .b8 assertFile_0[114] = {47, 119, 111, 114, 107, 115, 112, 97, 99, 101, 47, 104, 97, 110, 114, 117, 105, 47, 83, 112, 101, 99, 70, 111, 114, 103, 101, 45, 101, 120, 116, 47, 99, 97, 99, 104, 101, 47, 99, 111, 109, 112, 105, 108, 101, 100, 95, 107, 101, 114, 110, 101, 108, 115, 47, 103, 101, 47, 99, 103, 101, 55, 112, 112, 118, 118, 54, 53, 104, 97, 115, 114, 119, 113, 51, 51, 97, 97, 53, 121, 106, 110, 116, 52, 116, 119, 100, 122, 118, 119, 51, 112, 97, 107, 50, 120, 52, 98, 117, 55, 110, 55, 120, 50, 104, 121, 101, 120, 109, 118, 46, 112, 121}; +.global .align 1 .b8 assertMessage_0[90] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 49, 53, 32, 60, 32, 49, 32, 43, 32, 40, 116, 114, 105, 116, 111, 110, 95, 104, 101, 108, 112, 101, 114, 115, 46, 100, 105, 118, 95, 102, 108, 111, 111, 114, 95, 105, 110, 116, 101, 103, 101, 114, 40, 49, 50, 55, 32, 43, 32, 107, 115, 49, 44, 32, 32, 49, 50, 56, 41, 41}; + // @triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2 +.visible .entry triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2( + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_1, + .param .u64 .ptr .global .align 1 
triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_2, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_3, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_4, + .param .u64 triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_5, + .param .u64 triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_6, + .param .u32 triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_7, + .param .u32 triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_8, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_9, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_10 +) +.reqntid 64 +{ + .reg .pred %p<32>; + .reg .b32 %r<53>; + .reg .b64 %rd<103>; + .loc 1 18 0 // cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:18:0 + +// %bb.0: + ld.param.b32 %r12, [triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_8]; + ld.param.b64 %rd18, [triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_5]; + ld.param.b64 %rd15, [triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_2]; +$L__tmp0: + .loc 1 22 28 // cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:22:28 + mov.u32 %r13, %ctaid.x; + .loc 1 25 37 // 
cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:25:37 + mov.u32 %r1, %tid.x; + and.b32 %r2, %r1, 31; + .loc 1 35 45 // cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:35:45 + cvt.u64.u32 %rd1, %r13; + mul.lo.s64 %rd2, %rd18, %rd1; + .loc 1 29 40 // cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:29:40 + setp.lt.s32 %p2, %r12, 1; + mov.b64 %rd102, 0; + cvt.u32.u64 %r49, %rd1; + shl.b64 %rd100, %rd2, 2; + @%p2 bra $L__BB0_6; +// %bb.1: // %.lr.ph + .loc 1 0 40 // cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:0:40 + ld.param.b64 %rd13, [triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_0]; + .loc 1 24 21 // cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:24:21 + setp.lt.u32 %p3, %r49, 32; + add.s64 %rd3, %rd13, %rd100; + @%p3 bra $L__BB0_4; + bra.uni $L__BB0_2; +$L__BB0_4: // %.lr.ph.split.preheader + .loc 1 0 21 // cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:0:21 + mov.b32 %r51, 0; + mov.b64 %rd102, 0; +$L__BB0_5: // %.lr.ph.split + // =>This Inner Loop Header: Depth=1 + .loc 1 31 29 // cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:31:29 + add.s32 %r20, %r2, %r51; + setp.lt.s32 %p6, %r20, %r12; + .loc 1 35 34 // cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:35:34 + mad.wide.s32 %rd28, %r20, 4, %rd3; + .loc 1 35 50 // cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:35:50 + // begin inline asm + mov.u64 %rd27, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd27, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r19, 0x0; + @%p6 ld.global.L1::evict_first.L2::cache_hint.b32 { %r19 }, [ %rd28 + 0 ], %rd27; + // end inline asm + .loc 1 39 48 // cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:39:48 + selp.b32 %r21, %r19, 0, %p6; + cvt.s64.s32 %rd30, %r21; + add.s64 %rd102, %rd102, %rd30; + .loc 1 29 40 // cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:29:40 + add.s32 %r51, %r51, 32; + setp.lt.s32 %p7, %r51, 
%r12; + @%p7 bra $L__BB0_5; + bra.uni $L__BB0_6; +$L__BB0_2: // %.lr.ph.split.us.preheader + .loc 1 0 40 // cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:0:40 + mov.b32 %r50, 0; +$L__BB0_3: // %.lr.ph.split.us + // =>This Inner Loop Header: Depth=1 + .loc 1 35 41 // cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:35:41 + add.s32 %r17, %r2, %r50; + .loc 1 35 34 // cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:35:34 + mad.wide.s32 %rd23, %r17, 4, %rd3; + .loc 1 35 50 // cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:35:50 + // begin inline asm + mov.u64 %rd22, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd22, 1.0; + // end inline asm + mov.pred %p4, 0; + // begin inline asm + mov.u32 %r16, 0x0; + @%p4 ld.global.L1::evict_first.L2::cache_hint.b32 { %r16 }, [ %rd23 + 0 ], %rd22; + // end inline asm + .loc 1 29 40 // cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:29:40 + add.s32 %r50, %r50, 32; + setp.lt.s32 %p5, %r50, %r12; + @%p5 bra $L__BB0_3; +$L__BB0_6: // %._crit_edge + .loc 1 24 21 // cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:24:21 + setp.lt.u32 %p10, %r49, 32; +$L__tmp1: + .loc 2 291 36 // standard.py:291:36 @[ cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:40:25 ] + mov.b64 {_, %r24}, %rd102; + cvt.u32.u64 %r25, %rd102; + shfl.sync.bfly.b32 %r26, %r25, 16, 31, -1; + shfl.sync.bfly.b32 %r27, %r24, 16, 31, -1; + cvt.u64.u32 %rd32, %r26; + cvt.u64.u32 %rd33, %r27; + shl.b64 %rd34, %rd33, 32; + or.b64 %rd35, %rd32, %rd34; + .loc 2 261 15 // standard.py:261:15 @[ cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:40:25 ] + add.s64 %rd36, %rd102, %rd35; + .loc 2 291 36 // standard.py:291:36 @[ cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:40:25 ] + mov.b64 {_, %r28}, %rd36; + cvt.u32.u64 %r29, %rd36; + shfl.sync.bfly.b32 %r30, %r29, 8, 31, -1; + shfl.sync.bfly.b32 %r31, %r28, 8, 31, -1; + cvt.u64.u32 %rd37, %r30; + cvt.u64.u32 %rd38, %r31; + shl.b64 %rd39, %rd38, 32; + or.b64 
%rd40, %rd37, %rd39; + .loc 2 261 15 // standard.py:261:15 @[ cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:40:25 ] + add.s64 %rd41, %rd36, %rd40; + .loc 2 291 36 // standard.py:291:36 @[ cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:40:25 ] + mov.b64 {_, %r32}, %rd41; + cvt.u32.u64 %r33, %rd41; + shfl.sync.bfly.b32 %r34, %r33, 4, 31, -1; + shfl.sync.bfly.b32 %r35, %r32, 4, 31, -1; + cvt.u64.u32 %rd42, %r34; + cvt.u64.u32 %rd43, %r35; + shl.b64 %rd44, %rd43, 32; + or.b64 %rd45, %rd42, %rd44; + .loc 2 261 15 // standard.py:261:15 @[ cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:40:25 ] + add.s64 %rd46, %rd41, %rd45; + .loc 2 291 36 // standard.py:291:36 @[ cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:40:25 ] + mov.b64 {_, %r36}, %rd46; + cvt.u32.u64 %r37, %rd46; + shfl.sync.bfly.b32 %r38, %r37, 2, 31, -1; + shfl.sync.bfly.b32 %r39, %r36, 2, 31, -1; + cvt.u64.u32 %rd47, %r38; + cvt.u64.u32 %rd48, %r39; + shl.b64 %rd49, %rd48, 32; + or.b64 %rd50, %rd47, %rd49; + .loc 2 261 15 // standard.py:261:15 @[ cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:40:25 ] + add.s64 %rd51, %rd46, %rd50; + .loc 2 291 36 // standard.py:291:36 @[ cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:40:25 ] + mov.b64 {_, %r40}, %rd51; + cvt.u32.u64 %r41, %rd51; + shfl.sync.bfly.b32 %r42, %r41, 1, 31, -1; + shfl.sync.bfly.b32 %r43, %r40, 1, 31, -1; + cvt.u64.u32 %rd52, %r42; + .loc 2 261 15 // standard.py:261:15 @[ cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:40:25 ] + add.s64 %rd53, %rd51, %rd52; +$L__tmp2: + .loc 1 41 19 // cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:41:19 + cvt.u32.u64 %r22, %rd53; + .loc 1 42 25 // cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:42:25 + shl.b64 %rd54, %rd1, 2; + add.s64 %rd31, %rd15, %rd54; + .loc 1 42 36 // cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:42:36 + and.b32 %r44, %r1, 63; + setp.eq.b32 %p11, %r44, 0; + and.pred %p8, %p10, %p11; + // begin inline asm 
+ @%p8 st.global.b32 [ %rd31 + 0 ], { %r22 }; + // end inline asm + .loc 1 43 40 // cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:43:40 + @%p2 bra $L__BB0_11; +// %bb.7: // %.lr.ph14.preheader + .loc 1 0 40 // cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:0:40 + ld.param.b64 %rd19, [triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_6]; + ld.param.b64 %rd17, [triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_4]; + ld.param.b64 %rd16, [triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_3]; + ld.param.b64 %rd14, [triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_1]; + and.b32 %r8, %r1, 32; + setp.lt.s64 %p12, %rd18, 2; + setp.gt.s64 %p13, %rd18, 1; + selp.b64 %rd55, %rd18, 0, %p13; + selp.b64 %rd56, 1, 0, %p12; + add.s64 %rd57, %rd55, %rd56; + mul.lo.s64 %rd7, %rd57, %rd1; + add.s64 %rd8, %rd18, 1; + add.s64 %rd58, %rd19, 127; + shr.s64 %rd59, %rd58, 63; + shr.u64 %rd60, %rd59, 57; + add.s64 %rd61, %rd58, %rd60; + shr.s64 %rd62, %rd61, 7; + and.b64 %rd63, %rd58, 127; + setp.ne.b64 %p14, %rd63, 0; + setp.lt.s64 %p15, %rd58, 0; + and.pred %p16, %p15, %p14; + selp.b64 %rd64, -1, 0, %p16; + add.s64 %rd9, %rd62, %rd64; + mov.b32 %r52, 0; +$L__BB0_8: // %.lr.ph14 + // =>This Inner Loop Header: Depth=1 + .loc 1 45 29 // cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:45:29 + add.s32 %r10, %r2, %r52; + setp.lt.s32 %p20, %r10, %r12; + .loc 1 49 41 // cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:49:41 + cvt.s64.s32 %rd73, %r10; + add.s64 %rd10, %rd7, %rd73; + .loc 1 49 34 // cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:49:34 + shl.b64 %rd74, %rd10, 3; + add.s64 %rd67, %rd14, %rd74; + .loc 1 49 103 // cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:49:103 + and.pred %p18, %p10, %p20; + .loc 1 49 93 // 
cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:49:93 + // begin inline asm + mov.u64 %rd65, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd65, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd66, 0x0; + @%p18 ld.global.L1::evict_first.L2::cache_hint.b64 { %rd66 }, [ %rd67 + 0 ], %rd65; + // end inline asm + // begin inline asm + mov.u64 %rd69, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd69, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd70, 0x0; + @%p18 ld.global.L1::evict_first.L2::cache_hint.b64 { %rd70 }, [ %rd67 + 0 ], %rd69; + // end inline asm + .loc 1 52 22 // cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:52:22 + setp.lt.s32 %p21, %r10, %r22; + .loc 1 54 37 // cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:54:37 + cvt.s64.s32 %rd75, %rd70; + selp.b64 %rd76, %rd75, %rd18, %p21; + .loc 1 58 39 // cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:58:39 + shr.s64 %rd77, %rd76, 63; + and.b64 %rd78, %rd77, %rd8; + add.s64 %rd79, %rd78, %rd76; + .loc 1 59 32 // cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:59:32 + setp.lt.s64 %p22, %rd79, 0; + .loc 1 59 50 // cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:59:50 + setp.gt.s64 %p23, %rd79, %rd9; + .loc 1 59 112 // cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:59:112 + or.pred %p24, %p22, %p23; + .loc 1 59 130 // cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:59:130 + and.pred %p25, %p18, %p24; + not.pred %p26, %p25; + @%p26 bra $L__BB0_10; + bra.uni $L__BB0_9; +$L__BB0_10: // in Loop: Header=BB0_8 Depth=1 + .loc 1 42 36 // cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:42:36 + setp.eq.b32 %p30, %r8, 0; + .loc 1 54 37 // cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:54:37 + cvt.s64.s32 %rd82, %rd66; + selp.b64 %rd83, %rd82, %rd18, %p21; + .loc 1 58 39 // cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:58:39 + shr.s64 %rd84, %rd83, 63; + and.b64 %rd85, %rd84, %rd8; + .loc 1 50 23 
// cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:50:23 + cvt.u32.u64 %r47, %rd70; + .loc 1 59 130 // cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:59:130 + bar.sync 0; + .loc 1 61 29 // cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:61:29 + shl.b64 %rd86, %rd10, 2; + add.s64 %rd80, %rd16, %rd86; + .loc 1 61 94 // cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:61:94 + and.pred %p27, %p30, %p18; + // begin inline asm + @%p27 st.global.b32 [ %rd80 + 0 ], { %r47 }; + // end inline asm + .loc 1 62 29 // cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:62:29 + shl.b64 %rd87, %rd83, 2; + add.s64 %rd88, %rd17, %rd87; + shl.b64 %rd89, %rd85, 2; + add.s64 %rd90, %rd88, %rd89; + add.s64 %rd92, %rd90, %rd54; + add.s64 %rd81, %rd92, %rd100; + mov.b32 %r48, 1; + .loc 1 62 95 // cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:62:95 + // begin inline asm + @%p27 st.global.b32 [ %rd81 + 0 ], { %r48 }; + // end inline asm + .loc 1 43 40 // cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:43:40 + add.s32 %r52, %r52, 32; + setp.lt.s32 %p31, %r52, %r12; + @%p31 bra $L__BB0_8; +$L__BB0_11: // %._crit_edge15 + .loc 1 43 4 // cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:43:4 + ret; +$L__BB0_9: + .loc 1 59 130 // cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:59:130 + { // callseq 0, 0 + .param .b64 param0; + .param .b64 param1; + .param .b32 param2; + .param .b64 param3; + .param .b64 param4; + mov.b64 %rd94, assertFunc_0; + cvta.global.u64 %rd95, %rd94; + st.param.b64 [param3], %rd95; + mov.b64 %rd96, assertFile_0; + cvta.global.u64 %rd97, %rd96; + st.param.b64 [param1], %rd97; + mov.b64 %rd98, assertMessage_0; + cvta.global.u64 %rd99, %rd98; + st.param.b64 [param0], %rd99; + st.param.b64 [param4], 1; + st.param.b32 [param2], 59; + call.uni __assertfail, (param0, param1, param2, param3, param4); + } // callseq 0 + trap; +$L__tmp3: +$L__func_end0: + // -- End function +} + .file 1 
"/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 281 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. 
Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x112 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 103 +.b8 101 +.b8 55 +.b8 112 +.b8 112 +.b8 118 +.b8 118 +.b8 54 +.b8 53 +.b8 104 +.b8 97 +.b8 115 +.b8 114 +.b8 119 +.b8 113 +.b8 51 +.b8 51 +.b8 97 +.b8 97 +.b8 53 +.b8 121 +.b8 106 +.b8 110 +.b8 116 +.b8 52 +.b8 116 +.b8 119 +.b8 100 +.b8 122 +.b8 118 +.b8 119 +.b8 51 +.b8 112 +.b8 97 +.b8 107 +.b8 50 +.b8 120 +.b8 52 +.b8 98 +.b8 117 +.b8 55 +.b8 110 +.b8 55 +.b8 120 +.b8 50 +.b8 104 +.b8 121 +.b8 101 +.b8 120 +.b8 109 +.b8 118 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 103 +.b8 101 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x63 DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 95 +.b8 116 +.b8 111 +.b8 95 +.b8 99 +.b8 111 +.b8 112 +.b8 121 +.b8 95 +.b8 97 +.b8 114 +.b8 97 +.b8 110 +.b8 103 +.b8 101 +.b8 95 +.b8 105 +.b8 110 +.b8 100 +.b8 101 +.b8 120 +.b8 95 +.b8 112 +.b8 117 +.b8 116 +.b8 95 +.b8 108 +.b8 116 +.b8 95 +.b8 110 +.b8 101 +.b8 119 +.b8 95 +.b8 122 +.b8 101 +.b8 114 +.b8 111 +.b8 115 +.b8 95 +.b8 115 +.b8 99 +.b8 97 +.b8 108 +.b8 97 +.b8 114 +.b8 95 +.b8 116 +.b8 101 +.b8 110 +.b8 115 +.b8 111 +.b8 114 +.b8 95 +.b8 115 +.b8 117 +.b8 109 +.b8 95 +.b8 117 +.b8 110 +.b8 115 +.b8 
113 +.b8 117 +.b8 101 +.b8 101 +.b8 122 +.b8 101 +.b8 95 +.b8 118 +.b8 105 +.b8 101 +.b8 119 +.b8 95 +.b8 119 +.b8 104 +.b8 101 +.b8 114 +.b8 101 +.b8 95 +.b8 50 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0xee:0x2e DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0x103:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp2 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 40 // DW_AT_call_line +.b8 25 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.source b/SpecForge-ext/cache/compiled_kernels/triton/7/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.source new file mode 100644 index 0000000000000000000000000000000000000000..2660fdc5fd7b67d75efa2de2b1c1f7478c1a391e --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.source @@ -0,0 +1,379 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":18:0) +#loc77 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0) +#loc79 = loc(unknown) +#loc82 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0) +#loc86 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":69:0) +#loc97 = loc("in_ptr0"(#loc)) +#loc98 = loc("in_ptr1"(#loc)) +#loc99 
= loc("out_ptr1"(#loc)) +#loc100 = loc("out_ptr2"(#loc)) +#loc101 = loc("out_ptr3"(#loc)) +#loc102 = loc("ks0"(#loc)) +#loc103 = loc("ks1"(#loc)) +#loc104 = loc("xnumel"(#loc)) +#loc105 = loc("r0_numel"(#loc)) +#loc151 = loc("input"(#loc77)) +#loc152 = loc("a"(#loc82)) +#loc153 = loc("b"(#loc82)) +#loc154 = loc("a"(#loc86)) +module { + tt.func public @triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %out_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr3"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 loc("r0_numel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 32 : i32 loc(#loc106) + %xoffset = tt.get_program_id x : i32 loc(#loc107) + %xoffset_1 = arith.constant 1 : i32 loc(#loc108) + %xoffset_2 = arith.constant 1 : i32 loc(#loc108) + %xoffset_3 = arith.muli %xoffset, %xoffset_2 : i32 loc(#loc108) + %xindex = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc109) + %xindex_4 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc110) + %xindex_5 = tt.splat %xoffset_3 : i32 -> tensor<1x1xi32> loc(#loc111) + %xindex_6 = arith.addi %xindex_5, %xindex_4 : tensor<1x1xi32> loc(#loc111) + %xmask = arith.constant dense<32> : tensor<1x1xi32> loc(#loc112) + %xmask_7 = arith.cmpi slt, %xindex_6, %xmask : tensor<1x1xi32> loc(#loc112) + %r0_base = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc113) + %r0_base_8 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<32xi32> -> tensor<1x32xi32> loc(#loc114) + %_tmp3 = arith.constant 0 : i64 loc(#loc115) + 
%_tmp3_9 = arith.constant dense<0> : tensor<1x32xi64> loc(#loc115) + %c0_i32 = arith.constant 0 : i32 loc(#loc11) + %c32_i32 = arith.constant 32 : i32 loc(#loc11) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc11) + %1 = arith.bitcast %r0_numel : i32 to i32 loc(#loc11) + %2 = arith.bitcast %c32_i32 : i32 to i32 loc(#loc11) + %3 = ub.poison : i32 loc(#loc11) + %_tmp3_10 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp3_14 = %_tmp3_9) -> (tensor<1x32xi64>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x32xi32> loc(#loc117) + %r0_index_15 = arith.addi %r0_index, %r0_base_8 : tensor<1x32xi32> loc(#loc117) + %r0_mask = tt.splat %r0_numel : i32 -> tensor<1x32xi32> loc(#loc118) + %r0_mask_16 = arith.cmpi slt, %r0_index_15, %r0_mask : tensor<1x32xi32> loc(#loc118) + %tmp0 = arith.extsi %xindex_6 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc119) + %tmp0_17 = tt.splat %ks0 : i64 -> tensor<1x1xi64> loc(#loc119) + %tmp0_18 = arith.muli %tmp0_17, %tmp0 : tensor<1x1xi64> loc(#loc119) + %tmp0_19 = arith.extsi %r0_index_15 : tensor<1x32xi32> to tensor<1x32xi64> loc(#loc120) + %tmp0_20 = tt.broadcast %tmp0_18 : tensor<1x1xi64> -> tensor<1x32xi64> loc(#loc120) + %tmp0_21 = arith.addi %tmp0_19, %tmp0_20 : tensor<1x32xi64> loc(#loc120) + %tmp0_22 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x32x!tt.ptr> loc(#loc121) + %tmp0_23 = tt.addptr %tmp0_22, %tmp0_21 : tensor<1x32x!tt.ptr>, tensor<1x32xi64> loc(#loc121) + %tmp0_24 = tt.broadcast %xmask_7 : tensor<1x1xi1> -> tensor<1x32xi1> loc(#loc122) + %tmp0_25 = arith.andi %r0_mask_16, %tmp0_24 : tensor<1x32xi1> loc(#loc122) + %tmp0_26 = arith.constant 0.000000e+00 : f32 loc(#loc123) + %tmp0_27 = arith.constant dense<0.000000e+00> : tensor<1x32xf32> loc(#loc123) + %tmp0_28 = arith.fptosi %tmp0_27 : tensor<1x32xf32> to tensor<1x32xi32> loc(#loc123) + %tmp0_29 = tt.load %tmp0_23, %tmp0_25, %tmp0_28 evictionPolicy = evict_first : tensor<1x32x!tt.ptr> loc(#loc123) + %tmp1 = arith.extsi %tmp0_29 : tensor<1x32xi32> to 
tensor<1x32xi64> loc(#loc124) + %tmp4 = arith.addi %_tmp3_14, %tmp1 : tensor<1x32xi64> loc(#loc125) + %_tmp3_30 = tt.broadcast %xmask_7 : tensor<1x1xi1> -> tensor<1x32xi1> loc(#loc126) + %_tmp3_31 = arith.andi %r0_mask_16, %_tmp3_30 : tensor<1x32xi1> loc(#loc126) + %_tmp3_32 = arith.select %_tmp3_31, %tmp4, %_tmp3_14 : tensor<1x32xi1>, tensor<1x32xi64> loc(#loc127) + scf.yield %_tmp3_32 : tensor<1x32xi64> loc(#loc23) + } loc(#loc116) + %tmp3 = tt.call @"triton.language.standard.sum__i64S1_32S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp3_10) : (tensor<1x32xi64>) -> tensor<1xi64> loc(#loc128) + %tmp3_11 = tt.expand_dims %tmp3 {axis = 1 : i32} : tensor<1xi64> -> tensor<1x1xi64> loc(#loc129) + %tmp5 = arith.trunci %tmp3_11 : tensor<1x1xi64> to tensor<1x1xi32> loc(#loc130) + %4 = tt.splat %out_ptr1 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc27) + %5 = tt.addptr %4, %xindex_6 : tensor<1x1x!tt.ptr>, tensor<1x1xi32> loc(#loc27) + tt.store %5, %tmp5, %xmask_7 : tensor<1x1x!tt.ptr> loc(#loc28) + %c0_i32_12 = arith.constant 0 : i32 loc(#loc29) + %c32_i32_13 = arith.constant 32 : i32 loc(#loc29) + %6 = arith.bitcast %c0_i32_12 : i32 to i32 loc(#loc29) + %7 = arith.bitcast %r0_numel : i32 to i32 loc(#loc29) + %8 = arith.bitcast %c32_i32_13 : i32 to i32 loc(#loc29) + %9 = ub.poison : i32 loc(#loc29) + scf.for %r0_offset = %6 to %7 step %8 : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x32xi32> loc(#loc131) + %r0_index_14 = arith.addi %r0_index, %r0_base_8 : tensor<1x32xi32> loc(#loc131) + %r0_mask = tt.splat %r0_numel : i32 -> tensor<1x32xi32> loc(#loc132) + %r0_mask_15 = arith.cmpi slt, %r0_index_14, %r0_mask : tensor<1x32xi32> loc(#loc132) + %tmp6 = arith.constant 1 : i32 loc(#loc133) + %tmp6_16 = arith.extsi %tmp6 : i32 to i64 loc(#loc133) + %tmp6_17 = arith.cmpi sge, %tmp6_16, %ks0 : i64 loc(#loc133) + %tmp6_18 = arith.constant 1 : i32 loc(#loc134) + %tmp6_19 = arith.constant 1 : i32 loc(#loc134) + %tmp6_20 = arith.extui %tmp6_17 : i1 to i32 
loc(#loc134) + %tmp6_21 = arith.muli %tmp6_19, %tmp6_20 : i32 loc(#loc134) + %tmp6_22 = arith.constant 1 : i32 loc(#loc135) + %tmp6_23 = arith.extsi %tmp6_22 : i32 to i64 loc(#loc135) + %tmp6_24 = arith.cmpi sgt, %ks0, %tmp6_23 : i64 loc(#loc135) + %tmp6_25 = arith.extui %tmp6_24 : i1 to i64 loc(#loc136) + %tmp6_26 = arith.muli %ks0, %tmp6_25 : i64 loc(#loc136) + %tmp6_27 = arith.extsi %tmp6_21 : i32 to i64 loc(#loc137) + %tmp6_28 = arith.addi %tmp6_27, %tmp6_26 : i64 loc(#loc137) + %tmp6_29 = arith.extsi %xindex_6 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc138) + %tmp6_30 = tt.splat %tmp6_28 : i64 -> tensor<1x1xi64> loc(#loc138) + %tmp6_31 = arith.muli %tmp6_29, %tmp6_30 : tensor<1x1xi64> loc(#loc138) + %tmp6_32 = arith.extsi %r0_index_14 : tensor<1x32xi32> to tensor<1x32xi64> loc(#loc139) + %tmp6_33 = tt.broadcast %tmp6_31 : tensor<1x1xi64> -> tensor<1x32xi64> loc(#loc139) + %tmp6_34 = arith.addi %tmp6_32, %tmp6_33 : tensor<1x32xi64> loc(#loc139) + %tmp6_35 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x32x!tt.ptr> loc(#loc140) + %tmp6_36 = tt.addptr %tmp6_35, %tmp6_34 : tensor<1x32x!tt.ptr>, tensor<1x32xi64> loc(#loc140) + %tmp6_37 = tt.broadcast %xmask_7 : tensor<1x1xi1> -> tensor<1x32xi1> loc(#loc141) + %tmp6_38 = arith.andi %r0_mask_15, %tmp6_37 : tensor<1x32xi1> loc(#loc141) + %tmp6_39 = arith.constant 0.000000e+00 : f32 loc(#loc142) + %tmp6_40 = arith.constant dense<0.000000e+00> : tensor<1x32xf32> loc(#loc142) + %tmp6_41 = arith.fptosi %tmp6_40 : tensor<1x32xf32> to tensor<1x32xi64> loc(#loc142) + %tmp6_42 = tt.load %tmp6_36, %tmp6_38, %tmp6_41 evictionPolicy = evict_first : tensor<1x32x!tt.ptr> loc(#loc142) + %tmp7 = arith.trunci %tmp6_42 : tensor<1x32xi64> to tensor<1x32xi32> loc(#loc143) + %tmp9 = tt.broadcast %tmp5 : tensor<1x1xi32> -> tensor<1x32xi32> loc(#loc144) + %tmp9_43 = arith.cmpi slt, %r0_index_14, %tmp9 : tensor<1x32xi32> loc(#loc144) + %tmp11 = arith.extsi %tmp7 : tensor<1x32xi32> to tensor<1x32xi64> loc(#loc145) + %tmp11_44 = tt.splat %ks0 : 
i64 -> tensor<1x32xi64> loc(#loc145) + %tmp11_45 = arith.select %tmp9_43, %tmp11, %tmp11_44 : tensor<1x32xi1>, tensor<1x32xi64> loc(#loc145) + %tmp12 = arith.constant 1 : i32 loc(#loc146) + %tmp12_46 = arith.constant 1 : i64 loc(#loc146) + %tmp12_47 = arith.addi %tmp12_46, %ks0 : i64 loc(#loc146) + %tmp13 = tt.splat %tmp12_47 : i64 -> tensor<1x32xi64> loc(#loc147) + %tmp13_48 = arith.addi %tmp11_45, %tmp13 : tensor<1x32xi64> loc(#loc147) + %tmp14 = arith.constant 0 : i32 loc(#loc148) + %tmp14_49 = arith.extsi %tmp14 : i32 to i64 loc(#loc148) + %tmp14_50 = tt.splat %tmp14_49 : i64 -> tensor<1x32xi64> loc(#loc148) + %tmp14_51 = arith.cmpi slt, %tmp11_45, %tmp14_50 : tensor<1x32xi64> loc(#loc148) + %tmp15 = arith.select %tmp14_51, %tmp13_48, %tmp11_45 : tensor<1x32xi1>, tensor<1x32xi64> loc(#loc149) + %c0_i32_52 = arith.constant 0 : i32 loc(#loc49) + %10 = arith.extsi %c0_i32_52 : i32 to i64 loc(#loc49) + %11 = tt.splat %10 : i64 -> tensor<1x32xi64> loc(#loc49) + %12 = arith.cmpi sle, %11, %tmp15 : tensor<1x32xi64> loc(#loc49) + %c127_i32 = arith.constant 127 : i32 loc(#loc50) + %c127_i64 = arith.constant 127 : i64 loc(#loc50) + %13 = arith.addi %c127_i64, %ks1 : i64 loc(#loc50) + %14 = tt.call @"torch._inductor.runtime.triton_helpers.div_floor_integer__i64__(1,)cconstexpr_128_"(%13) : (i64) -> i64 loc(#loc51) + %c1_i32 = arith.constant 1 : i32 loc(#loc52) + %c1_i64 = arith.constant 1 : i64 loc(#loc52) + %15 = arith.addi %c1_i64, %14 : i64 loc(#loc52) + %16 = tt.splat %15 : i64 -> tensor<1x32xi64> loc(#loc53) + %17 = arith.cmpi slt, %tmp15, %16 : tensor<1x32xi64> loc(#loc53) + %18 = arith.andi %12, %17 : tensor<1x32xi1> loc(#loc54) + %19 = tt.broadcast %xmask_7 : tensor<1x1xi1> -> tensor<1x32xi1> loc(#loc55) + %20 = arith.andi %r0_mask_15, %19 : tensor<1x32xi1> loc(#loc55) + %true = arith.constant true loc(#loc56) + %cst = arith.constant dense : tensor<1x32xi1> loc(#loc56) + %21 = arith.xori %20, %cst : tensor<1x32xi1> loc(#loc56) + %22 = arith.ori %18, %21 : 
tensor<1x32xi1> loc(#loc57) + tt.assert %22, "index out of bounds: 0 <= tmp15 < 1 + (triton_helpers.div_floor_integer(127 + ks1, 128))" : tensor<1x32xi1> loc(#loc58) + %tmp17 = arith.constant 1 : i32 loc(#loc150) + %tmp17_53 = arith.constant dense<1> : tensor<1x1xi32> loc(#loc150) + %c1_i32_54 = arith.constant 1 : i32 loc(#loc60) + %23 = arith.extsi %c1_i32_54 : i32 to i64 loc(#loc60) + %24 = arith.cmpi sge, %23, %ks0 : i64 loc(#loc60) + %c1_i32_55 = arith.constant 1 : i32 loc(#loc61) + %c1_i32_56 = arith.constant 1 : i32 loc(#loc61) + %25 = arith.extui %24 : i1 to i32 loc(#loc61) + %26 = arith.muli %c1_i32_56, %25 : i32 loc(#loc61) + %c1_i32_57 = arith.constant 1 : i32 loc(#loc62) + %27 = arith.extsi %c1_i32_57 : i32 to i64 loc(#loc62) + %28 = arith.cmpi sgt, %ks0, %27 : i64 loc(#loc62) + %29 = arith.extui %28 : i1 to i64 loc(#loc63) + %30 = arith.muli %ks0, %29 : i64 loc(#loc63) + %31 = arith.extsi %26 : i32 to i64 loc(#loc64) + %32 = arith.addi %31, %30 : i64 loc(#loc64) + %33 = arith.extsi %xindex_6 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc65) + %34 = tt.splat %32 : i64 -> tensor<1x1xi64> loc(#loc65) + %35 = arith.muli %33, %34 : tensor<1x1xi64> loc(#loc65) + %36 = arith.extsi %r0_index_14 : tensor<1x32xi32> to tensor<1x32xi64> loc(#loc66) + %37 = tt.broadcast %35 : tensor<1x1xi64> -> tensor<1x32xi64> loc(#loc66) + %38 = arith.addi %36, %37 : tensor<1x32xi64> loc(#loc66) + %39 = tt.splat %out_ptr2 : !tt.ptr -> tensor<1x32x!tt.ptr> loc(#loc67) + %40 = tt.addptr %39, %38 : tensor<1x32x!tt.ptr>, tensor<1x32xi64> loc(#loc67) + %41 = tt.broadcast %xmask_7 : tensor<1x1xi1> -> tensor<1x32xi1> loc(#loc68) + %42 = arith.andi %r0_mask_15, %41 : tensor<1x32xi1> loc(#loc68) + tt.store %40, %tmp7, %42 : tensor<1x32x!tt.ptr> loc(#loc69) + %43 = arith.extsi %xindex_6 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc70) + %44 = tt.broadcast %43 : tensor<1x1xi64> -> tensor<1x32xi64> loc(#loc70) + %45 = arith.addi %tmp15, %44 : tensor<1x32xi64> loc(#loc70) + %46 = arith.extsi 
%xindex_6 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc71) + %47 = tt.splat %ks0 : i64 -> tensor<1x1xi64> loc(#loc71) + %48 = arith.muli %47, %46 : tensor<1x1xi64> loc(#loc71) + %49 = tt.broadcast %48 : tensor<1x1xi64> -> tensor<1x32xi64> loc(#loc72) + %50 = arith.addi %45, %49 : tensor<1x32xi64> loc(#loc72) + %51 = tt.splat %out_ptr3 : !tt.ptr -> tensor<1x32x!tt.ptr> loc(#loc73) + %52 = tt.addptr %51, %50 : tensor<1x32x!tt.ptr>, tensor<1x32xi64> loc(#loc73) + %53 = tt.broadcast %xmask_7 : tensor<1x1xi1> -> tensor<1x32xi1> loc(#loc74) + %54 = arith.andi %r0_mask_15, %53 : tensor<1x32xi1> loc(#loc74) + %cst_58 = arith.constant dense<1> : tensor<1x32xi32> loc(#loc75) + tt.store %52, %cst_58, %54 : tensor<1x32x!tt.ptr> loc(#loc75) + } loc(#loc29) + tt.return loc(#loc76) + } loc(#loc) + tt.func private @"triton.language.standard.sum__i64S1_32S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<1x32xi64> loc("input"(#loc77))) -> tensor<1xi64> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i64 loc(unknown), %arg2: i64 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i64_i64__(%arg1, %arg2) : (i64, i64) -> i64 loc(#loc78) + tt.reduce.return %2 : i64 loc(#loc78) + }) : (tensor<1x32xi64>) -> tensor<1xi64> loc(#loc78) + tt.return %0 : tensor<1xi64> loc(#loc80) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1xi64> loc(#loc81) + tt.return %1 : tensor<1xi64> loc(#loc81) + } loc(#loc77) + tt.func private @triton.language.standard._sum_combine__i64_i64__(%a: i64 loc("a"(#loc82)), %b: i64 loc("b"(#loc82))) -> i64 attributes {noinline = false} { + %0 = arith.addi %a, %b : i64 loc(#loc83) + tt.return %0 : i64 loc(#loc84) + ^bb1: // no predecessors + %1 = ub.poison : i64 loc(#loc85) + tt.return %1 : i64 loc(#loc85) + } loc(#loc82) + tt.func private @"torch._inductor.runtime.triton_helpers.div_floor_integer__i64__(1,)cconstexpr_128_"(%a: i64 loc("a"(#loc86))) -> i64 attributes {noinline = false} 
{ + %quot = arith.constant 128 : i32 loc(#loc155) + %quot_0 = arith.constant 128 : i64 loc(#loc155) + %quot_1 = arith.divsi %a, %quot_0 : i64 loc(#loc155) + %remainder = arith.constant 128 : i32 loc(#loc156) + %remainder_2 = arith.constant 128 : i64 loc(#loc156) + %remainder_3 = arith.remsi %a, %remainder_2 : i64 loc(#loc156) + %fixed = arith.constant 0 : i32 loc(#loc157) + %fixed_4 = arith.extsi %fixed : i32 to i64 loc(#loc157) + %fixed_5 = arith.cmpi ne, %remainder_3, %fixed_4 : i64 loc(#loc157) + %fixed_6 = arith.constant 1 : i32 loc(#loc158) + %fixed_7 = arith.constant 1 : i64 loc(#loc158) + %fixed_8 = arith.subi %quot_1, %fixed_7 : i64 loc(#loc158) + %fixed_9 = arith.select %fixed_5, %fixed_8, %quot_1 : i64 loc(#loc159) + %c0_i32 = arith.constant 0 : i32 loc(#loc92) + %0 = arith.extsi %c0_i32 : i32 to i64 loc(#loc92) + %1 = arith.cmpi slt, %a, %0 : i64 loc(#loc92) + %false = arith.constant false loc(#loc93) + %2 = arith.cmpi ne, %1, %false : i1 loc(#loc93) + %3 = arith.select %2, %fixed_9, %quot_1 : i64 loc(#loc94) + tt.return %3 : i64 loc(#loc95) + ^bb1: // no predecessors + %4 = ub.poison : i64 loc(#loc96) + tt.return %4 : i64 loc(#loc96) + } loc(#loc86) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":19:13) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":22:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":22:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":23:36) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":23:44) +#loc6 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":23:23) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":24:21) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":25:27) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":25:37) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":28:43) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":29:40) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":30:31) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":31:29) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":35:45) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":35:41) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":35:34) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":35:60) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":35:50) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":36:23) +#loc20 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":38:23) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":39:35) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":39:48) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":39:8) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":40:25) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":40:28) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":41:19) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":42:25) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":42:36) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":43:40) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":44:31) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":45:29) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":49:60) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":49:52) +#loc34 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":49:86) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":49:77) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":49:68) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":49:45) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":49:41) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":49:34) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":49:103) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":49:93) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":50:23) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":52:22) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":54:37) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":55:20) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":56:24) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":57:24) +#loc48 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":58:39) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":59:32) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":59:94) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":59:100) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":59:55) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":59:50) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":59:42) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":59:122) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":59:112) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":59:110) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":59:130) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":60:35) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":61:55) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":61:47) +#loc62 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":61:81) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":61:72) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":61:63) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":61:40) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":61:36) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":61:29) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":61:104) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":61:94) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":62:53) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":62:62) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":62:58) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":62:29) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":62:105) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":62:95) +#loc76 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":43:4) +#loc78 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc80 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11) +#loc81 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4) +#loc83 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc84 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11) +#loc85 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4) +#loc87 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":72:16) +#loc88 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":73:20) +#loc89 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:34) +#loc90 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:44) +#loc91 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:47) +#loc92 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:25) +#loc93 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:32) +#loc94 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:47) +#loc95 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:11) +#loc96 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:4) +#loc106 = loc("xnumel"(#loc1)) +#loc107 = loc("xoffset"(#loc2)) +#loc108 = loc("xoffset"(#loc3)) +#loc109 = 
loc("xindex"(#loc4)) +#loc110 = loc("xindex"(#loc5)) +#loc111 = loc("xindex"(#loc6)) +#loc112 = loc("xmask"(#loc7)) +#loc113 = loc("r0_base"(#loc8)) +#loc114 = loc("r0_base"(#loc9)) +#loc115 = loc("_tmp3"(#loc10)) +#loc116 = loc("_tmp3"(#loc11)) +#loc117 = loc("r0_index"(#loc12)) +#loc118 = loc("r0_mask"(#loc13)) +#loc119 = loc("tmp0"(#loc14)) +#loc120 = loc("tmp0"(#loc15)) +#loc121 = loc("tmp0"(#loc16)) +#loc122 = loc("tmp0"(#loc17)) +#loc123 = loc("tmp0"(#loc18)) +#loc124 = loc("tmp1"(#loc19)) +#loc125 = loc("tmp4"(#loc20)) +#loc126 = loc("_tmp3"(#loc21)) +#loc127 = loc("_tmp3"(#loc22)) +#loc128 = loc("tmp3"(#loc24)) +#loc129 = loc("tmp3"(#loc25)) +#loc130 = loc("tmp5"(#loc26)) +#loc131 = loc("r0_index"(#loc30)) +#loc132 = loc("r0_mask"(#loc31)) +#loc133 = loc("tmp6"(#loc32)) +#loc134 = loc("tmp6"(#loc33)) +#loc135 = loc("tmp6"(#loc34)) +#loc136 = loc("tmp6"(#loc35)) +#loc137 = loc("tmp6"(#loc36)) +#loc138 = loc("tmp6"(#loc37)) +#loc139 = loc("tmp6"(#loc38)) +#loc140 = loc("tmp6"(#loc39)) +#loc141 = loc("tmp6"(#loc40)) +#loc142 = loc("tmp6"(#loc41)) +#loc143 = loc("tmp7"(#loc42)) +#loc144 = loc("tmp9"(#loc43)) +#loc145 = loc("tmp11"(#loc44)) +#loc146 = loc("tmp12"(#loc45)) +#loc147 = loc("tmp13"(#loc46)) +#loc148 = loc("tmp14"(#loc47)) +#loc149 = loc("tmp15"(#loc48)) +#loc150 = loc("tmp17"(#loc59)) +#loc155 = loc("quot"(#loc87)) +#loc156 = loc("remainder"(#loc88)) +#loc157 = loc("fixed"(#loc89)) +#loc158 = loc("fixed"(#loc90)) +#loc159 = loc("fixed"(#loc91)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/7/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ttgir new file mode 100644 index 
0000000000000000000000000000000000000000..f079a259b2a21d85fc5a386dfa80e494b812e588 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ttgir @@ -0,0 +1,270 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 2], order = [0, 1]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [2, 1], order = [1, 0]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":18:0) +#loc1 = loc(unknown) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":40:25) +#loc68 = loc("in_ptr0"(#loc)) +#loc69 = loc("in_ptr1"(#loc)) +#loc70 = loc("out_ptr1"(#loc)) +#loc71 = loc("out_ptr2"(#loc)) +#loc72 = loc("out_ptr3"(#loc)) +#loc73 = loc("ks0"(#loc)) +#loc74 = loc("ks1"(#loc)) +#loc75 = loc("xnumel"(#loc)) +#loc76 = loc("r0_numel"(#loc)) +#loc91 = loc("tmp3"(#loc18)) +#loc124 = loc(callsite(#loc1 at #loc91)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 2 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %out_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr3"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = 
arith.constant dense<0> : tensor<1x32xi64, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<0> : tensor<1x32xi64, #blocked1> loc(#loc1) + %c1_i64 = arith.constant 1 : i64 loc(#loc1) + %c127_i64 = arith.constant 127 : i64 loc(#loc1) + %cst_1 = arith.constant dense : tensor<1x32xi1, #blocked1> loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %c32_i32 = arith.constant 32 : i32 loc(#loc1) + %cst_2 = arith.constant dense<0> : tensor<1x32xi32, #blocked1> loc(#loc1) + %c0_i64 = arith.constant 0 : i64 loc(#loc1) + %c128_i64 = arith.constant 128 : i64 loc(#loc1) + %cst_3 = arith.constant dense<1> : tensor<1x32xi32, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc77) + %xmask = arith.cmpi slt, %xoffset, %c32_i32 : i32 loc(#loc78) + %r0_base = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc79) + %r0_base_4 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc79) + %r0_base_5 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<32xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x32xi32, #blocked> loc(#loc79) + %r0_base_6 = tt.expand_dims %r0_base_4 {axis = 0 : i32} : tensor<32xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x32xi32, #blocked1> loc(#loc79) + %r0_mask = tt.splat %r0_numel : i32 -> tensor<1x32xi32, #blocked1> loc(#loc80) + %tmp0 = arith.extsi %xoffset : i32 to i64 loc(#loc81) + %tmp0_7 = arith.muli %ks0, %tmp0 : i64 loc(#loc81) + %tmp0_8 = tt.splat %tmp0_7 : i64 -> tensor<1x32xi64, #blocked1> loc(#loc121) + %tmp0_9 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x32x!tt.ptr, #blocked1> loc(#loc83) + %tmp0_10 = tt.splat %xmask : i1 -> tensor<1x32xi1, #blocked1> loc(#loc122) + %_tmp3 = scf.for %r0_offset = %c0_i32 to %r0_numel step %c32_i32 iter_args(%_tmp3_31 = %cst_0) -> (tensor<1x32xi64, #blocked1>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x32xi32, #blocked1> 
loc(#loc86) + %r0_index_32 = arith.addi %r0_index, %r0_base_6 : tensor<1x32xi32, #blocked1> loc(#loc86) + %r0_mask_33 = arith.cmpi slt, %r0_index_32, %r0_mask : tensor<1x32xi32, #blocked1> loc(#loc80) + %tmp0_34 = arith.extsi %r0_index_32 : tensor<1x32xi32, #blocked1> to tensor<1x32xi64, #blocked1> loc(#loc82) + %tmp0_35 = arith.addi %tmp0_34, %tmp0_8 : tensor<1x32xi64, #blocked1> loc(#loc82) + %tmp0_36 = tt.addptr %tmp0_9, %tmp0_35 : tensor<1x32x!tt.ptr, #blocked1>, tensor<1x32xi64, #blocked1> loc(#loc83) + %tmp0_37 = arith.andi %r0_mask_33, %tmp0_10 : tensor<1x32xi1, #blocked1> loc(#loc84) + %tmp0_38 = tt.load %tmp0_36, %tmp0_37, %cst_2 evictionPolicy = evict_first : tensor<1x32x!tt.ptr, #blocked1> loc(#loc87) + %tmp1 = arith.extsi %tmp0_38 : tensor<1x32xi32, #blocked1> to tensor<1x32xi64, #blocked1> loc(#loc88) + %tmp4 = arith.addi %_tmp3_31, %tmp1 : tensor<1x32xi64, #blocked1> loc(#loc89) + %_tmp3_39 = arith.select %tmp0_37, %tmp4, %_tmp3_31 : tensor<1x32xi1, #blocked1>, tensor<1x32xi64, #blocked1> loc(#loc90) + scf.yield %_tmp3_39 : tensor<1x32xi64, #blocked1> loc(#loc16) + } loc(#loc85) + %tmp3 = "tt.reduce"(%_tmp3) <{axis = 1 : i32}> ({ + ^bb0(%tmp3_31: i64 loc(callsite(#loc1 at #loc91)), %tmp3_32: i64 loc(callsite(#loc1 at #loc91))): + %tmp3_33 = arith.addi %tmp3_31, %tmp3_32 : i64 loc(#loc133) + tt.reduce.return %tmp3_33 : i64 loc(#loc123) + }) : (tensor<1x32xi64, #blocked1>) -> tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc123) + %0 = ttg.convert_layout %tmp3 : tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc20) + %tmp3_11 = tt.expand_dims %0 {axis = 1 : i32} : tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1x1xi64, #blocked> loc(#loc92) + %tmp3_12 = tt.expand_dims %tmp3 {axis = 1 : i32} : tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<1x1xi64, #blocked1> loc(#loc92) + %tmp5 = arith.trunci %tmp3_11 : tensor<1x1xi64, 
#blocked> to tensor<1x1xi32, #blocked> loc(#loc93) + %tmp5_13 = arith.trunci %tmp3_12 : tensor<1x1xi64, #blocked1> to tensor<1x1xi32, #blocked1> loc(#loc93) + %1 = tt.addptr %out_ptr1, %xoffset : !tt.ptr, i32 loc(#loc23) + %2 = tt.splat %1 : !tt.ptr -> tensor<1x1x!tt.ptr, #blocked> loc(#loc24) + %3 = tt.splat %xmask : i1 -> tensor<1x1xi1, #blocked> loc(#loc24) + tt.store %2, %tmp5, %3 : tensor<1x1x!tt.ptr, #blocked> loc(#loc24) + %r0_mask_14 = tt.splat %r0_numel : i32 -> tensor<1x32xi32, #blocked> loc(#loc94) + %tmp6 = arith.cmpi sle, %ks0, %c1_i64 : i64 loc(#loc95) + %tmp6_15 = arith.cmpi sgt, %ks0, %c1_i64 : i64 loc(#loc96) + %tmp6_16 = arith.extui %tmp6_15 : i1 to i64 loc(#loc97) + %tmp6_17 = arith.muli %ks0, %tmp6_16 : i64 loc(#loc97) + %tmp6_18 = arith.extui %tmp6 : i1 to i64 loc(#loc125) + %tmp6_19 = arith.addi %tmp6_18, %tmp6_17 : i64 loc(#loc98) + %tmp6_20 = arith.muli %tmp0, %tmp6_19 : i64 loc(#loc100) + %tmp6_21 = tt.splat %tmp6_20 : i64 -> tensor<1x32xi64, #blocked> loc(#loc126) + %tmp6_22 = tt.splat %tmp6_20 : i64 -> tensor<1x32xi64, #blocked1> loc(#loc126) + %tmp6_23 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x32x!tt.ptr, #blocked> loc(#loc102) + %tmp6_24 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x32x!tt.ptr, #blocked1> loc(#loc102) + %tmp6_25 = tt.splat %xmask : i1 -> tensor<1x32xi1, #blocked> loc(#loc127) + %tmp9 = tt.broadcast %tmp5 : tensor<1x1xi32, #blocked> -> tensor<1x32xi32, #blocked> loc(#loc104) + %tmp9_26 = tt.broadcast %tmp5_13 : tensor<1x1xi32, #blocked1> -> tensor<1x32xi32, #blocked1> loc(#loc104) + %tmp11 = tt.splat %ks0 : i64 -> tensor<1x32xi64, #blocked> loc(#loc105) + %tmp11_27 = tt.splat %ks0 : i64 -> tensor<1x32xi64, #blocked1> loc(#loc105) + %tmp12 = arith.addi %ks0, %c1_i64 : i64 loc(#loc106) + %tmp13 = tt.splat %tmp12 : i64 -> tensor<1x32xi64, #blocked> loc(#loc107) + %tmp13_28 = tt.splat %tmp12 : i64 -> tensor<1x32xi64, #blocked1> loc(#loc107) + %4 = arith.addi %ks1, %c127_i64 : i64 loc(#loc39) + %quot = arith.divsi %4, %c128_i64 : 
i64 loc(#loc128) + %remainder = arith.remsi %4, %c128_i64 : i64 loc(#loc129) + %fixed = arith.cmpi ne, %remainder, %c0_i64 : i64 loc(#loc130) + %fixed_29 = arith.subi %quot, %c1_i64 : i64 loc(#loc131) + %fixed_30 = arith.select %fixed, %fixed_29, %quot : i64 loc(#loc132) + %5 = arith.cmpi slt, %4, %c0_i64 : i64 loc(#loc113) + %6 = arith.select %5, %fixed_30, %quot : i64 loc(#loc114) + %7 = arith.addi %6, %c1_i64 : i64 loc(#loc48) + %8 = tt.splat %7 : i64 -> tensor<1x32xi64, #blocked1> loc(#loc49) + %9 = tt.splat %out_ptr2 : !tt.ptr -> tensor<1x32x!tt.ptr, #blocked1> loc(#loc50) + %10 = tt.splat %tmp0 : i64 -> tensor<1x32xi64, #blocked> loc(#loc51) + %11 = tt.splat %tmp0_7 : i64 -> tensor<1x32xi64, #blocked> loc(#loc115) + %12 = tt.splat %out_ptr3 : !tt.ptr -> tensor<1x32x!tt.ptr, #blocked> loc(#loc54) + scf.for %r0_offset = %c0_i32 to %r0_numel step %c32_i32 : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x32xi32, #blocked> loc(#loc116) + %r0_index_31 = tt.splat %r0_offset : i32 -> tensor<1x32xi32, #blocked1> loc(#loc116) + %r0_index_32 = arith.addi %r0_index, %r0_base_5 : tensor<1x32xi32, #blocked> loc(#loc116) + %r0_index_33 = arith.addi %r0_index_31, %r0_base_6 : tensor<1x32xi32, #blocked1> loc(#loc116) + %r0_mask_34 = arith.cmpi slt, %r0_index_32, %r0_mask_14 : tensor<1x32xi32, #blocked> loc(#loc94) + %r0_mask_35 = arith.cmpi slt, %r0_index_33, %r0_mask : tensor<1x32xi32, #blocked1> loc(#loc94) + %tmp6_36 = arith.extsi %r0_index_32 : tensor<1x32xi32, #blocked> to tensor<1x32xi64, #blocked> loc(#loc101) + %tmp6_37 = arith.extsi %r0_index_33 : tensor<1x32xi32, #blocked1> to tensor<1x32xi64, #blocked1> loc(#loc101) + %tmp6_38 = arith.addi %tmp6_36, %tmp6_21 : tensor<1x32xi64, #blocked> loc(#loc101) + %tmp6_39 = arith.addi %tmp6_37, %tmp6_22 : tensor<1x32xi64, #blocked1> loc(#loc101) + %tmp6_40 = tt.addptr %tmp6_23, %tmp6_38 : tensor<1x32x!tt.ptr, #blocked>, tensor<1x32xi64, #blocked> loc(#loc102) + %tmp6_41 = tt.addptr %tmp6_24, %tmp6_39 : 
tensor<1x32x!tt.ptr, #blocked1>, tensor<1x32xi64, #blocked1> loc(#loc102) + %tmp6_42 = arith.andi %r0_mask_34, %tmp6_25 : tensor<1x32xi1, #blocked> loc(#loc103) + %tmp6_43 = arith.andi %r0_mask_35, %tmp0_10 : tensor<1x32xi1, #blocked1> loc(#loc103) + %tmp6_44 = tt.load %tmp6_40, %tmp6_42, %cst evictionPolicy = evict_first : tensor<1x32x!tt.ptr, #blocked> loc(#loc117) + %tmp6_45 = tt.load %tmp6_41, %tmp6_43, %cst_0 evictionPolicy = evict_first : tensor<1x32x!tt.ptr, #blocked1> loc(#loc117) + %tmp7 = arith.trunci %tmp6_44 : tensor<1x32xi64, #blocked> to tensor<1x32xi32, #blocked> loc(#loc118) + %tmp7_46 = arith.trunci %tmp6_45 : tensor<1x32xi64, #blocked1> to tensor<1x32xi32, #blocked1> loc(#loc118) + %tmp9_47 = arith.cmpi slt, %r0_index_32, %tmp9 : tensor<1x32xi32, #blocked> loc(#loc104) + %tmp9_48 = arith.cmpi slt, %r0_index_33, %tmp9_26 : tensor<1x32xi32, #blocked1> loc(#loc104) + %tmp11_49 = arith.extsi %tmp7 : tensor<1x32xi32, #blocked> to tensor<1x32xi64, #blocked> loc(#loc105) + %tmp11_50 = arith.extsi %tmp7_46 : tensor<1x32xi32, #blocked1> to tensor<1x32xi64, #blocked1> loc(#loc105) + %tmp11_51 = arith.select %tmp9_47, %tmp11_49, %tmp11 : tensor<1x32xi1, #blocked>, tensor<1x32xi64, #blocked> loc(#loc105) + %tmp11_52 = arith.select %tmp9_48, %tmp11_50, %tmp11_27 : tensor<1x32xi1, #blocked1>, tensor<1x32xi64, #blocked1> loc(#loc105) + %tmp13_53 = arith.addi %tmp11_51, %tmp13 : tensor<1x32xi64, #blocked> loc(#loc107) + %tmp13_54 = arith.addi %tmp11_52, %tmp13_28 : tensor<1x32xi64, #blocked1> loc(#loc107) + %tmp14 = arith.cmpi slt, %tmp11_51, %cst : tensor<1x32xi64, #blocked> loc(#loc119) + %tmp14_55 = arith.cmpi slt, %tmp11_52, %cst_0 : tensor<1x32xi64, #blocked1> loc(#loc119) + %tmp15 = arith.select %tmp14, %tmp13_53, %tmp11_51 : tensor<1x32xi1, #blocked>, tensor<1x32xi64, #blocked> loc(#loc120) + %tmp15_56 = arith.select %tmp14_55, %tmp13_54, %tmp11_52 : tensor<1x32xi1, #blocked1>, tensor<1x32xi64, #blocked1> loc(#loc120) + %13 = arith.cmpi sge, %tmp15_56, 
%cst_0 : tensor<1x32xi64, #blocked1> loc(#loc61) + %14 = arith.cmpi slt, %tmp15_56, %8 : tensor<1x32xi64, #blocked1> loc(#loc49) + %15 = arith.andi %13, %14 : tensor<1x32xi1, #blocked1> loc(#loc62) + %16 = arith.xori %tmp6_43, %cst_1 : tensor<1x32xi1, #blocked1> loc(#loc63) + %17 = arith.ori %15, %16 : tensor<1x32xi1, #blocked1> loc(#loc64) + tt.assert %17, "index out of bounds: 0 <= tmp15 < 1 + (triton_helpers.div_floor_integer(127 + ks1, 128))" : tensor<1x32xi1, #blocked1> loc(#loc65) + %18 = tt.addptr %9, %tmp6_39 : tensor<1x32x!tt.ptr, #blocked1>, tensor<1x32xi64, #blocked1> loc(#loc50) + tt.store %18, %tmp7_46, %tmp6_43 : tensor<1x32x!tt.ptr, #blocked1> loc(#loc66) + %19 = arith.addi %tmp15, %10 : tensor<1x32xi64, #blocked> loc(#loc51) + %20 = arith.addi %19, %11 : tensor<1x32xi64, #blocked> loc(#loc52) + %21 = tt.addptr %12, %20 : tensor<1x32x!tt.ptr, #blocked>, tensor<1x32xi64, #blocked> loc(#loc54) + tt.store %21, %cst_3, %tmp6_42 : tensor<1x32x!tt.ptr, #blocked> loc(#loc20) + } loc(#loc55) + tt.return loc(#loc67) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":22:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":24:21) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":25:37) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":31:29) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":35:45) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":35:41) +#loc8 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":35:34) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":35:60) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":29:40) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":30:31) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":35:50) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":36:23) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":38:23) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":39:48) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":39:8) +#loc17 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc19 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":62:95) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":40:28) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":41:19) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":42:25) +#loc24 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":42:36) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":45:29) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":49:60) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":49:86) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":49:77) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":49:68) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":49:52) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":49:45) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":49:41) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":49:34) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":49:103) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":52:22) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":54:37) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":55:20) +#loc38 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":56:24) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":59:94) +#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":72:16) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":59:100) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":73:20) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:34) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:44) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:47) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:25) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:47) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":59:55) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":59:50) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":61:29) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":62:53) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":62:58) +#loc53 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":62:62) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":62:29) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":43:40) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":44:31) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":49:93) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":50:23) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":57:24) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":58:39) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":59:32) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":59:42) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":59:112) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":59:110) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":59:130) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":61:94) +#loc67 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":43:4) +#loc77 = loc("xoffset"(#loc2)) +#loc78 = loc("xmask"(#loc3)) +#loc79 = loc("r0_base"(#loc4)) +#loc80 = loc("r0_mask"(#loc5)) +#loc81 = loc("tmp0"(#loc6)) +#loc82 = loc("tmp0"(#loc7)) +#loc83 = loc("tmp0"(#loc8)) +#loc84 = loc("tmp0"(#loc9)) +#loc85 = loc("_tmp3"(#loc10)) +#loc86 = loc("r0_index"(#loc11)) +#loc87 = loc("tmp0"(#loc12)) +#loc88 = loc("tmp1"(#loc13)) +#loc89 = loc("tmp4"(#loc14)) +#loc90 = loc("_tmp3"(#loc15)) +#loc92 = loc("tmp3"(#loc21)) +#loc93 = loc("tmp5"(#loc22)) +#loc94 = loc("r0_mask"(#loc25)) +#loc95 = loc("tmp6"(#loc26)) +#loc96 = loc("tmp6"(#loc27)) +#loc97 = loc("tmp6"(#loc28)) +#loc98 = loc("tmp6"(#loc29)) +#loc99 = loc("tmp6"(#loc30)) +#loc100 = loc("tmp6"(#loc31)) +#loc101 = loc("tmp6"(#loc32)) +#loc102 = loc("tmp6"(#loc33)) +#loc103 = loc("tmp6"(#loc34)) +#loc104 = loc("tmp9"(#loc35)) +#loc105 = loc("tmp11"(#loc36)) +#loc106 = loc("tmp12"(#loc37)) +#loc107 = loc("tmp13"(#loc38)) +#loc108 = loc("quot"(#loc40)) +#loc109 = loc("remainder"(#loc42)) +#loc110 = loc("fixed"(#loc43)) +#loc111 = loc("fixed"(#loc44)) +#loc112 = loc("fixed"(#loc45)) +#loc113 = loc(callsite(#loc46 at #loc41)) +#loc114 = loc(callsite(#loc47 at #loc41)) +#loc115 = loc(fused[#loc52, #loc53]) +#loc116 = loc("r0_index"(#loc56)) +#loc117 = loc("tmp6"(#loc57)) +#loc118 = loc("tmp7"(#loc58)) +#loc119 = loc("tmp14"(#loc59)) +#loc120 = loc("tmp15"(#loc60)) +#loc121 = loc(fused[#loc82, #loc81]) +#loc122 = loc(fused[#loc84, #loc78]) +#loc123 = loc(callsite(#loc17 at #loc91)) +#loc125 = loc(fused[#loc98, #loc99]) +#loc126 = loc(fused[#loc101, #loc100]) +#loc127 = loc(fused[#loc103, #loc78]) +#loc128 = loc(callsite(#loc108 at #loc41)) +#loc129 = loc(callsite(#loc109 at #loc41)) +#loc130 = loc(callsite(#loc110 at #loc41)) +#loc131 = loc(callsite(#loc111 at #loc41)) +#loc132 = loc(callsite(#loc112 at #loc41)) +#loc133 = loc(callsite(#loc19 at #loc123)) 
diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ttir b/SpecForge-ext/cache/compiled_kernels/triton/7/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ttir new file mode 100644 index 0000000000000000000000000000000000000000..b174366245915df75ad564edf8888fa19061eb75 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ttir @@ -0,0 +1,246 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":18:0) +#loc1 = loc(unknown) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":40:25) +#loc69 = loc("in_ptr0"(#loc)) +#loc70 = loc("in_ptr1"(#loc)) +#loc71 = loc("out_ptr1"(#loc)) +#loc72 = loc("out_ptr2"(#loc)) +#loc73 = loc("out_ptr3"(#loc)) +#loc74 = loc("ks0"(#loc)) +#loc75 = loc("ks1"(#loc)) +#loc76 = loc("xnumel"(#loc)) +#loc77 = loc("r0_numel"(#loc)) +#loc93 = loc("tmp3"(#loc19)) +#loc126 = loc(callsite(#loc1 at #loc93)) +module { + tt.func public @triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %out_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr3"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), 
%r0_numel: i32 loc("r0_numel"(#loc))) attributes {noinline = false} { + %c128_i64 = arith.constant 128 : i64 loc(#loc1) + %c0_i64 = arith.constant 0 : i64 loc(#loc1) + %cst = arith.constant dense<0> : tensor<1x32xi32> loc(#loc1) + %c32_i32 = arith.constant 32 : i32 loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %cst_0 = arith.constant dense<1> : tensor<1x32xi32> loc(#loc1) + %cst_1 = arith.constant dense : tensor<1x32xi1> loc(#loc1) + %c127_i64 = arith.constant 127 : i64 loc(#loc1) + %c1_i64 = arith.constant 1 : i64 loc(#loc1) + %cst_2 = arith.constant dense<0> : tensor<1x32xi64> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc78) + %xmask = arith.cmpi slt, %xoffset, %c32_i32 : i32 loc(#loc79) + %xmask_3 = tt.splat %xmask : i1 -> tensor<1x1xi1> loc(#loc79) + %r0_base = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc80) + %r0_base_4 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<32xi32> -> tensor<1x32xi32> loc(#loc81) + %_tmp3 = scf.for %r0_offset = %c0_i32 to %r0_numel step %c32_i32 iter_args(%_tmp3_6 = %cst_2) -> (tensor<1x32xi64>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x32xi32> loc(#loc83) + %r0_index_7 = arith.addi %r0_index, %r0_base_4 : tensor<1x32xi32> loc(#loc83) + %r0_mask = tt.splat %r0_numel : i32 -> tensor<1x32xi32> loc(#loc84) + %r0_mask_8 = arith.cmpi slt, %r0_index_7, %r0_mask : tensor<1x32xi32> loc(#loc84) + %tmp0 = arith.extsi %xoffset : i32 to i64 loc(#loc85) + %tmp0_9 = arith.muli %ks0, %tmp0 : i64 loc(#loc85) + %tmp0_10 = arith.extsi %r0_index_7 : tensor<1x32xi32> to tensor<1x32xi64> loc(#loc86) + %tmp0_11 = tt.splat %tmp0_9 : i64 -> tensor<1x32xi64> loc(#loc123) + %tmp0_12 = arith.addi %tmp0_10, %tmp0_11 : tensor<1x32xi64> loc(#loc86) + %tmp0_13 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x32x!tt.ptr> loc(#loc87) + %tmp0_14 = tt.addptr %tmp0_13, %tmp0_12 : tensor<1x32x!tt.ptr>, tensor<1x32xi64> loc(#loc87) + %tmp0_15 = tt.splat %xmask : i1 -> tensor<1x32xi1> loc(#loc124) + 
%tmp0_16 = arith.andi %r0_mask_8, %tmp0_15 : tensor<1x32xi1> loc(#loc88) + %tmp0_17 = tt.load %tmp0_14, %tmp0_16, %cst evictionPolicy = evict_first : tensor<1x32x!tt.ptr> loc(#loc89) + %tmp1 = arith.extsi %tmp0_17 : tensor<1x32xi32> to tensor<1x32xi64> loc(#loc90) + %tmp4 = arith.addi %_tmp3_6, %tmp1 : tensor<1x32xi64> loc(#loc91) + %_tmp3_18 = arith.select %tmp0_16, %tmp4, %_tmp3_6 : tensor<1x32xi1>, tensor<1x32xi64> loc(#loc92) + scf.yield %_tmp3_18 : tensor<1x32xi64> loc(#loc17) + } loc(#loc82) + %tmp3 = "tt.reduce"(%_tmp3) <{axis = 1 : i32}> ({ + ^bb0(%tmp3_6: i64 loc(callsite(#loc1 at #loc93)), %tmp3_7: i64 loc(callsite(#loc1 at #loc93))): + %tmp3_8 = arith.addi %tmp3_6, %tmp3_7 : i64 loc(#loc135) + tt.reduce.return %tmp3_8 : i64 loc(#loc125) + }) : (tensor<1x32xi64>) -> tensor<1xi64> loc(#loc125) + %tmp3_5 = tt.expand_dims %tmp3 {axis = 1 : i32} : tensor<1xi64> -> tensor<1x1xi64> loc(#loc94) + %tmp5 = arith.trunci %tmp3_5 : tensor<1x1xi64> to tensor<1x1xi32> loc(#loc95) + %0 = tt.addptr %out_ptr1, %xoffset : !tt.ptr, i32 loc(#loc23) + %1 = tt.splat %0 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc23) + tt.store %1, %tmp5, %xmask_3 : tensor<1x1x!tt.ptr> loc(#loc24) + scf.for %r0_offset = %c0_i32 to %r0_numel step %c32_i32 : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x32xi32> loc(#loc96) + %r0_index_6 = arith.addi %r0_index, %r0_base_4 : tensor<1x32xi32> loc(#loc96) + %r0_mask = tt.splat %r0_numel : i32 -> tensor<1x32xi32> loc(#loc97) + %r0_mask_7 = arith.cmpi slt, %r0_index_6, %r0_mask : tensor<1x32xi32> loc(#loc97) + %tmp6 = arith.cmpi sle, %ks0, %c1_i64 : i64 loc(#loc98) + %tmp6_8 = arith.cmpi sgt, %ks0, %c1_i64 : i64 loc(#loc99) + %tmp6_9 = arith.extui %tmp6_8 : i1 to i64 loc(#loc100) + %tmp6_10 = arith.muli %ks0, %tmp6_9 : i64 loc(#loc100) + %tmp6_11 = arith.extui %tmp6 : i1 to i64 loc(#loc127) + %tmp6_12 = arith.addi %tmp6_11, %tmp6_10 : i64 loc(#loc101) + %tmp6_13 = arith.extsi %xoffset : i32 to i64 loc(#loc103) + %tmp6_14 = arith.muli %tmp6_13, 
%tmp6_12 : i64 loc(#loc103) + %tmp6_15 = arith.extsi %r0_index_6 : tensor<1x32xi32> to tensor<1x32xi64> loc(#loc104) + %tmp6_16 = tt.splat %tmp6_14 : i64 -> tensor<1x32xi64> loc(#loc128) + %tmp6_17 = arith.addi %tmp6_15, %tmp6_16 : tensor<1x32xi64> loc(#loc104) + %tmp6_18 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x32x!tt.ptr> loc(#loc105) + %tmp6_19 = tt.addptr %tmp6_18, %tmp6_17 : tensor<1x32x!tt.ptr>, tensor<1x32xi64> loc(#loc105) + %tmp6_20 = tt.splat %xmask : i1 -> tensor<1x32xi1> loc(#loc129) + %tmp6_21 = arith.andi %r0_mask_7, %tmp6_20 : tensor<1x32xi1> loc(#loc106) + %tmp6_22 = tt.load %tmp6_19, %tmp6_21, %cst_2 evictionPolicy = evict_first : tensor<1x32x!tt.ptr> loc(#loc107) + %tmp7 = arith.trunci %tmp6_22 : tensor<1x32xi64> to tensor<1x32xi32> loc(#loc108) + %tmp9 = tt.broadcast %tmp5 : tensor<1x1xi32> -> tensor<1x32xi32> loc(#loc109) + %tmp9_23 = arith.cmpi slt, %r0_index_6, %tmp9 : tensor<1x32xi32> loc(#loc109) + %tmp11 = arith.extsi %tmp7 : tensor<1x32xi32> to tensor<1x32xi64> loc(#loc110) + %tmp11_24 = tt.splat %ks0 : i64 -> tensor<1x32xi64> loc(#loc110) + %tmp11_25 = arith.select %tmp9_23, %tmp11, %tmp11_24 : tensor<1x32xi1>, tensor<1x32xi64> loc(#loc110) + %tmp12 = arith.addi %ks0, %c1_i64 : i64 loc(#loc111) + %tmp13 = tt.splat %tmp12 : i64 -> tensor<1x32xi64> loc(#loc112) + %tmp13_26 = arith.addi %tmp11_25, %tmp13 : tensor<1x32xi64> loc(#loc112) + %tmp14 = arith.cmpi slt, %tmp11_25, %cst_2 : tensor<1x32xi64> loc(#loc113) + %tmp15 = arith.select %tmp14, %tmp13_26, %tmp11_25 : tensor<1x32xi1>, tensor<1x32xi64> loc(#loc114) + %2 = arith.cmpi sge, %tmp15, %cst_2 : tensor<1x32xi64> loc(#loc45) + %3 = arith.addi %ks1, %c127_i64 : i64 loc(#loc46) + %quot = arith.divsi %3, %c128_i64 : i64 loc(#loc130) + %remainder = arith.remsi %3, %c128_i64 : i64 loc(#loc131) + %fixed = arith.cmpi ne, %remainder, %c0_i64 : i64 loc(#loc132) + %fixed_27 = arith.subi %quot, %c1_i64 : i64 loc(#loc133) + %fixed_28 = arith.select %fixed, %fixed_27, %quot : i64 loc(#loc134) + %4 = 
arith.cmpi slt, %3, %c0_i64 : i64 loc(#loc120) + %5 = arith.select %4, %fixed_28, %quot : i64 loc(#loc121) + %6 = arith.addi %5, %c1_i64 : i64 loc(#loc55) + %7 = tt.splat %6 : i64 -> tensor<1x32xi64> loc(#loc56) + %8 = arith.cmpi slt, %tmp15, %7 : tensor<1x32xi64> loc(#loc56) + %9 = arith.andi %2, %8 : tensor<1x32xi1> loc(#loc57) + %10 = arith.xori %tmp6_21, %cst_1 : tensor<1x32xi1> loc(#loc58) + %11 = arith.ori %9, %10 : tensor<1x32xi1> loc(#loc59) + tt.assert %11, "index out of bounds: 0 <= tmp15 < 1 + (triton_helpers.div_floor_integer(127 + ks1, 128))" : tensor<1x32xi1> loc(#loc60) + %12 = tt.splat %out_ptr2 : !tt.ptr -> tensor<1x32x!tt.ptr> loc(#loc61) + %13 = tt.addptr %12, %tmp6_17 : tensor<1x32x!tt.ptr>, tensor<1x32xi64> loc(#loc61) + tt.store %13, %tmp7, %tmp6_21 : tensor<1x32x!tt.ptr> loc(#loc62) + %14 = tt.splat %tmp6_13 : i64 -> tensor<1x32xi64> loc(#loc63) + %15 = arith.addi %tmp15, %14 : tensor<1x32xi64> loc(#loc63) + %16 = arith.muli %ks0, %tmp6_13 : i64 loc(#loc64) + %17 = tt.splat %16 : i64 -> tensor<1x32xi64> loc(#loc122) + %18 = arith.addi %15, %17 : tensor<1x32xi64> loc(#loc65) + %19 = tt.splat %out_ptr3 : !tt.ptr -> tensor<1x32x!tt.ptr> loc(#loc66) + %20 = tt.addptr %19, %18 : tensor<1x32x!tt.ptr>, tensor<1x32xi64> loc(#loc66) + tt.store %20, %cst_0, %tmp6_21 : tensor<1x32x!tt.ptr> loc(#loc67) + } loc(#loc25) + tt.return loc(#loc68) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":22:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":24:21) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":25:27) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":25:37) +#loc6 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":29:40) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":30:31) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":31:29) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":35:45) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":35:41) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":35:34) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":35:60) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":35:50) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":36:23) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":38:23) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":39:48) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":39:8) +#loc18 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc20 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":40:28) +#loc22 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":41:19) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":42:25) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":42:36) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":43:40) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":44:31) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":45:29) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":49:60) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":49:86) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":49:77) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":49:68) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":49:52) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":49:45) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":49:41) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":49:34) +#loc36 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":49:103) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":49:93) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":50:23) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":52:22) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":54:37) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":55:20) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":56:24) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":57:24) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":58:39) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":59:32) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":59:94) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":72:16) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":59:100) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":73:20) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:34) +#loc51 = 
loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:44) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:47) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:25) +#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:47) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":59:55) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":59:50) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":59:42) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":59:112) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":59:110) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":59:130) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":61:29) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":61:94) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":62:53) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":62:62) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":62:58) +#loc66 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":62:29) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":62:95) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":43:4) +#loc78 = loc("xoffset"(#loc2)) +#loc79 = loc("xmask"(#loc3)) +#loc80 = loc("r0_base"(#loc4)) +#loc81 = loc("r0_base"(#loc5)) +#loc82 = loc("_tmp3"(#loc6)) +#loc83 = loc("r0_index"(#loc7)) +#loc84 = loc("r0_mask"(#loc8)) +#loc85 = loc("tmp0"(#loc9)) +#loc86 = loc("tmp0"(#loc10)) +#loc87 = loc("tmp0"(#loc11)) +#loc88 = loc("tmp0"(#loc12)) +#loc89 = loc("tmp0"(#loc13)) +#loc90 = loc("tmp1"(#loc14)) +#loc91 = loc("tmp4"(#loc15)) +#loc92 = loc("_tmp3"(#loc16)) +#loc94 = loc("tmp3"(#loc21)) +#loc95 = loc("tmp5"(#loc22)) +#loc96 = loc("r0_index"(#loc26)) +#loc97 = loc("r0_mask"(#loc27)) +#loc98 = loc("tmp6"(#loc28)) +#loc99 = loc("tmp6"(#loc29)) +#loc100 = loc("tmp6"(#loc30)) +#loc101 = loc("tmp6"(#loc31)) +#loc102 = loc("tmp6"(#loc32)) +#loc103 = loc("tmp6"(#loc33)) +#loc104 = loc("tmp6"(#loc34)) +#loc105 = loc("tmp6"(#loc35)) +#loc106 = loc("tmp6"(#loc36)) +#loc107 = loc("tmp6"(#loc37)) +#loc108 = loc("tmp7"(#loc38)) +#loc109 = loc("tmp9"(#loc39)) +#loc110 = loc("tmp11"(#loc40)) +#loc111 = loc("tmp12"(#loc41)) +#loc112 = loc("tmp13"(#loc42)) +#loc113 = loc("tmp14"(#loc43)) +#loc114 = loc("tmp15"(#loc44)) +#loc115 = loc("quot"(#loc47)) +#loc116 = loc("remainder"(#loc49)) +#loc117 = loc("fixed"(#loc50)) +#loc118 = loc("fixed"(#loc51)) +#loc119 = loc("fixed"(#loc52)) +#loc120 = loc(callsite(#loc53 at #loc48)) +#loc121 = loc(callsite(#loc54 at #loc48)) +#loc122 = loc(fused[#loc65, #loc64]) +#loc123 = loc(fused[#loc86, #loc85]) +#loc124 = loc(fused[#loc88, #loc79]) +#loc125 = loc(callsite(#loc18 at #loc93)) +#loc127 = loc(fused[#loc101, #loc102]) +#loc128 = loc(fused[#loc104, 
#loc103]) +#loc129 = loc(fused[#loc106, #loc79]) +#loc130 = loc(callsite(#loc115 at #loc48)) +#loc131 = loc(callsite(#loc116 at #loc48)) +#loc132 = loc(callsite(#loc117 at #loc48)) +#loc133 = loc(callsite(#loc118 at #loc48)) +#loc134 = loc(callsite(#loc119 at #loc48)) +#loc135 = loc(callsite(#loc20 at #loc125)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/A7DYCXJM4X5DHYLAIRTU6BFB3S5UCV3W4C27BWQBJGXYAG3NWQWA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json b/SpecForge-ext/cache/compiled_kernels/triton/7/A7DYCXJM4X5DHYLAIRTU6BFB3S5UCV3W4C27BWQBJGXYAG3NWQWA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json new file mode 100644 index 0000000000000000000000000000000000000000..beec3fa5a32eff013a0bab0800bb387fd59bbb0f --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/A7DYCXJM4X5DHYLAIRTU6BFB3S5UCV3W4C27BWQBJGXYAG3NWQWA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json @@ -0,0 +1 @@ +{"hash": "07c7815d2ce5fa33e16044674f04a1dcbb415776e0b5f0da0149af801b6db42c", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 2048, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, 
"profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/A7DYCXJM4X5DHYLAIRTU6BFB3S5UCV3W4C27BWQBJGXYAG3NWQWA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.source b/SpecForge-ext/cache/compiled_kernels/triton/7/A7DYCXJM4X5DHYLAIRTU6BFB3S5UCV3W4C27BWQBJGXYAG3NWQWA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.source new file mode 100644 index 0000000000000000000000000000000000000000..8cb0fa7aa5263c084fbdf8a7610ae35580124a5a --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/A7DYCXJM4X5DHYLAIRTU6BFB3S5UCV3W4C27BWQBJGXYAG3NWQWA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.source @@ -0,0 +1,1221 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":18:0) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":640:0) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":607:0) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":518:0) +#loc88 = loc(unknown) +#loc113 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0) +#loc117 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0) +#loc122 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":86:0) +#loc126 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":63:0) +#loc135 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":131:0) +#loc140 = loc("in_ptr0"(#loc)) +#loc141 = loc("out_ptr2"(#loc)) +#loc142 = loc("out_ptr3"(#loc)) +#loc143 = loc("xnumel"(#loc)) +#loc144 = 
loc("r0_numel"(#loc)) +#loc174 = loc("x"(#loc38)) +#loc175 = loc("idxs"(#loc38)) +#loc176 = loc("x"(#loc42)) +#loc177 = loc("idxs"(#loc42)) +#loc182 = loc("x"(#loc50)) +#loc183 = loc("idxs"(#loc50)) +#loc184 = loc("flip"(#loc50)) +#loc240 = loc("input"(#loc113)) +#loc241 = loc("a"(#loc117)) +#loc242 = loc("b"(#loc117)) +#loc244 = loc("x"(#loc122)) +#loc245 = loc("x"(#loc126)) +#loc246 = loc("input"(#loc135)) +module { + tt.func public @triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %out_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr3"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 32 : i32 loc(#loc145) + %r0_numel_1 = arith.constant 16 : i32 loc(#loc146) + %xoffset = tt.get_program_id x : i32 loc(#loc147) + %xoffset_2 = arith.constant 32 : i32 loc(#loc148) + %xoffset_3 = arith.constant 32 : i32 loc(#loc148) + %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc148) + %xindex = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc149) + %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<32xi32> -> tensor<32x1xi32> loc(#loc150) + %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<32x1xi32> loc(#loc151) + %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<32x1xi32> loc(#loc151) + %xmask = arith.constant dense<32> : tensor<32x1xi32> loc(#loc152) + %xmask_8 = arith.cmpi slt, %xindex_7, %xmask : tensor<32x1xi32> loc(#loc152) + %r0_index = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32> loc(#loc153) + %r0_index_9 = tt.expand_dims %r0_index {axis = 0 : i32} : tensor<16xi32> -> tensor<1x16xi32> loc(#loc154) + %r0_offset = arith.constant 0 : i32 loc(#loc155) + %r0_mask = arith.constant true loc(#loc156) + 
%r0_mask_10 = arith.constant dense : tensor<32x16xi1> loc(#loc156) + %x0 = arith.constant 16 : i32 loc(#loc157) + %x0_11 = arith.constant 16 : i32 loc(#loc157) + %x0_12 = arith.constant dense<16> : tensor<32x1xi32> loc(#loc157) + %x0_13 = arith.remsi %xindex_7, %x0_12 : tensor<32x1xi32> loc(#loc157) + %x1 = arith.constant 16 : i32 loc(#loc158) + %x1_14 = arith.constant 16 : i32 loc(#loc158) + %x1_15 = arith.constant dense<16> : tensor<32x1xi32> loc(#loc158) + %x1_16 = arith.divsi %xindex_7, %x1_15 : tensor<32x1xi32> loc(#loc158) + %tmp0 = arith.constant 17 : i32 loc(#loc159) + %tmp0_17 = arith.constant 17 : i32 loc(#loc159) + %tmp0_18 = arith.constant dense<17> : tensor<1x16xi32> loc(#loc159) + %tmp0_19 = arith.muli %tmp0_18, %r0_index_9 : tensor<1x16xi32> loc(#loc159) + %tmp0_20 = tt.broadcast %x0_13 : tensor<32x1xi32> -> tensor<32x16xi32> loc(#loc160) + %tmp0_21 = tt.broadcast %tmp0_19 : tensor<1x16xi32> -> tensor<32x16xi32> loc(#loc160) + %tmp0_22 = arith.addi %tmp0_20, %tmp0_21 : tensor<32x16xi32> loc(#loc160) + %tmp0_23 = arith.constant 272 : i32 loc(#loc161) + %tmp0_24 = arith.constant 272 : i32 loc(#loc161) + %tmp0_25 = arith.constant dense<272> : tensor<32x1xi32> loc(#loc161) + %tmp0_26 = arith.muli %tmp0_25, %x1_16 : tensor<32x1xi32> loc(#loc161) + %tmp0_27 = tt.broadcast %tmp0_26 : tensor<32x1xi32> -> tensor<32x16xi32> loc(#loc162) + %tmp0_28 = arith.addi %tmp0_22, %tmp0_27 : tensor<32x16xi32> loc(#loc162) + %tmp0_29 = tt.splat %in_ptr0 : !tt.ptr -> tensor<32x16x!tt.ptr> loc(#loc163) + %tmp0_30 = tt.addptr %tmp0_29, %tmp0_28 : tensor<32x16x!tt.ptr>, tensor<32x16xi32> loc(#loc163) + %tmp0_31 = arith.constant 0.000000e+00 : f32 loc(#loc164) + %tmp0_32 = tt.broadcast %xmask_8 : tensor<32x1xi1> -> tensor<32x16xi1> loc(#loc164) + %tmp0_33 = arith.constant dense<0.000000e+00> : tensor<32x16xf32> loc(#loc164) + %tmp0_34 = arith.fptosi %tmp0_33 : tensor<32x16xf32> to tensor<32x16xi32> loc(#loc164) + %tmp0_35 = tt.load %tmp0_30, %tmp0_32, %tmp0_34 : 
tensor<32x16x!tt.ptr> loc(#loc164) + %tmp2 = arith.trunci %r0_index_9 : tensor<1x16xi32> to tensor<1x16xi16> loc(#loc165) + %tmp4 = tt.broadcast %tmp2 : tensor<1x16xi16> -> tensor<32x16xi16> loc(#loc166) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers.sort_with_index__i32S32_16S_i16S32_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_True_"(%tmp0_35, %tmp4) : (tensor<32x16xi32>, tensor<32x16xi16>) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc23) + %tmp7 = arith.extsi %tmp0_35 : tensor<32x16xi32> to tensor<32x16xi64> loc(#loc167) + %tmp10 = arith.constant 0 : i32 loc(#loc168) + %tmp10_36 = arith.constant 0 : i64 loc(#loc168) + %tmp10_37 = arith.constant dense<0> : tensor<32x16xi64> loc(#loc168) + %tmp10_38 = tt.broadcast %xmask_8 : tensor<32x1xi1> -> tensor<32x16xi1> loc(#loc168) + %tmp10_39 = arith.select %tmp10_38, %tmp7, %tmp10_37 : tensor<32x16xi1>, tensor<32x16xi64> loc(#loc168) + %tmp11 = tt.call @"triton.language.standard.sum__i64S32_16S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%tmp10_39) : (tensor<32x16xi64>) -> tensor<32xi64> loc(#loc169) + %tmp11_40 = tt.expand_dims %tmp11 {axis = 1 : i32} : tensor<32xi64> -> tensor<32x1xi64> loc(#loc170) + %tmp12 = arith.extsi %0#1 : tensor<32x16xi32> to tensor<32x16xi64> loc(#loc171) + %tmp13 = arith.trunci %tmp12 : tensor<32x16xi64> to tensor<32x16xi32> loc(#loc172) + %tmp14 = arith.trunci %tmp11_40 : tensor<32x1xi64> to tensor<32x1xi32> loc(#loc173) + %c16_i32 = arith.constant 16 : i32 loc(#loc31) + %c16_i32_41 = arith.constant 16 : i32 loc(#loc31) + %cst = arith.constant dense<16> : tensor<32x1xi32> loc(#loc31) + %1 = arith.muli %cst, %xindex_7 : tensor<32x1xi32> loc(#loc31) + %2 = tt.broadcast %r0_index_9 : tensor<1x16xi32> -> tensor<32x16xi32> loc(#loc32) + %3 = tt.broadcast %1 : tensor<32x1xi32> -> tensor<32x16xi32> loc(#loc32) + %4 = arith.addi %2, %3 : tensor<32x16xi32> loc(#loc32) + %5 = tt.splat %out_ptr2 : !tt.ptr -> tensor<32x16x!tt.ptr> loc(#loc33) + 
%6 = tt.addptr %5, %4 : tensor<32x16x!tt.ptr>, tensor<32x16xi32> loc(#loc33) + %7 = tt.broadcast %xmask_8 : tensor<32x1xi1> -> tensor<32x16xi1> loc(#loc34) + tt.store %6, %tmp13, %7 : tensor<32x16x!tt.ptr> loc(#loc34) + %8 = tt.splat %out_ptr3 : !tt.ptr -> tensor<32x1x!tt.ptr> loc(#loc35) + %9 = tt.addptr %8, %xindex_7 : tensor<32x1x!tt.ptr>, tensor<32x1xi32> loc(#loc35) + tt.store %9, %tmp14, %xmask_8 : tensor<32x1x!tt.ptr> loc(#loc36) + tt.return loc(#loc37) + } loc(#loc) + tt.func private @"torch._inductor.runtime.triton_helpers.sort_with_index__i32S32_16S_i16S32_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_True_"(%x: tensor<32x16xi32> loc("x"(#loc38)), %idxs: tensor<32x16xi16> loc("idxs"(#loc38))) -> (tensor<32x16xi32>, tensor<32x16xi32>) attributes {noinline = false} { + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S32_16S_i16S32_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs) : (tensor<32x16xi32>, tensor<32x16xi16>) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc39) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S32_16S_i32S32_16S__(2,)cconstexpr_None__(3,)cconstexpr_2__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1) : (tensor<32x16xi32>, tensor<32x16xi32>) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc39) + %2:2 = tt.call @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S32_16S_i32S32_16S__(2,)cconstexpr_None__(3,)cconstexpr_3__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%1#0, %1#1) : (tensor<32x16xi32>, tensor<32x16xi32>) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc39) + %3:2 = tt.call 
@"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S32_16S_i32S32_16S__(2,)cconstexpr_None__(3,)cconstexpr_4__(4,)cconstexpr_False__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%2#0, %2#1) : (tensor<32x16xi32>, tensor<32x16xi32>) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc39) + tt.return %3#0, %3#1 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc40) + ^bb1: // no predecessors + %4 = ub.poison : tensor<32x16xi32> loc(#loc41) + %5 = ub.poison : tensor<32x16xi32> loc(#loc41) + tt.return %4, %5 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc41) + } loc(#loc38) + tt.func private @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S32_16S_i16S32_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<32x16xi32> loc("x"(#loc42)), %idxs: tensor<32x16xi16> loc("idxs"(#loc42))) -> (tensor<32x16xi32>, tensor<32x16xi32>) attributes {noinline = false} { + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc178) + %flip_0 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc179) + %flip_1 = tt.expand_dims %flip_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc179) + %flip_2 = tt.broadcast %flip_1 : tensor<1x2x1xi32> -> tensor<128x2x2xi32> loc(#loc180) + %flip_3 = tt.reshape %flip_2 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc181) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i16S32_16S_i32S32_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip_3) : (tensor<32x16xi32>, tensor<32x16xi16>, tensor<32x16xi32>) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc47) + tt.return %0#0, %0#1 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc48) + ^bb1: // no predecessors + %1 = ub.poison : tensor<32x16xi32> loc(#loc49) + %2 = ub.poison : 
tensor<32x16xi32> loc(#loc49) + tt.return %1, %2 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc49) + } loc(#loc42) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i16S32_16S_i32S32_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<32x16xi32> loc("x"(#loc50)), %idxs: tensor<32x16xi16> loc("idxs"(#loc50)), %flip: tensor<32x16xi32> loc("flip"(#loc50))) -> (tensor<32x16xi32>, tensor<32x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc185) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc186) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc187) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc187) + %left_mask = arith.constant 1 : i32 loc(#loc188) + %left_mask_2 = arith.constant 1 : i32 loc(#loc188) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc188) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc188) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<256x2x1xi32> loc(#loc189) + %ileft_5 = arith.muli %y, %ileft : tensor<256x2x1xi32> loc(#loc189) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S256_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc190) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc191) + %ileft_8 = tt.broadcast %ileft_7 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc192) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<256x2x1xi32> loc(#loc193) + %iright_9 = arith.muli %y, %iright : tensor<256x2x1xi32> loc(#loc193) + %iright_10 = tt.call 
@"triton.language.standard.sum__i32S256_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc194) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc195) + %iright_12 = tt.broadcast %iright_11 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc196) + %ileft_13 = tt.reshape %ileft_8 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc197) + %iright_14 = tt.reshape %iright_12 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc198) + %y_idx = tt.reshape %idxs : tensor<32x16xi16> -> tensor<256x2x1xi16> loc(#loc199) + %left_idx = arith.trunci %left_mask_4 : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc200) + %left_idx_15 = tt.broadcast %left_idx : tensor<1x2x1xi16> -> tensor<256x2x1xi16> loc(#loc201) + %left_idx_16 = arith.muli %y_idx, %left_idx_15 : tensor<256x2x1xi16> loc(#loc201) + %left_idx_17 = tt.call @"triton.language.standard.sum__i16S256_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_16) : (tensor<256x2x1xi16>) -> tensor<256x1xi32> loc(#loc202) + %left_idx_18 = tt.expand_dims %left_idx_17 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc203) + %left_idx_19 = tt.broadcast %left_idx_18 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc204) + %right_idx = arith.trunci %right_mask_1 : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc205) + %right_idx_20 = tt.broadcast %right_idx : tensor<1x2x1xi16> -> tensor<256x2x1xi16> loc(#loc206) + %right_idx_21 = arith.muli %y_idx, %right_idx_20 : tensor<256x2x1xi16> loc(#loc206) + %right_idx_22 = tt.call @"triton.language.standard.sum__i16S256_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_21) : (tensor<256x2x1xi16>) -> tensor<256x1xi32> loc(#loc207) + %right_idx_23 = tt.expand_dims %right_idx_22 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc208) + %right_idx_24 = tt.broadcast %right_idx_23 : tensor<256x1x1xi32> -> 
tensor<256x2x1xi32> loc(#loc209) + %left_idx_25 = tt.reshape %left_idx_19 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc210) + %right_idx_26 = tt.reshape %right_idx_24 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc211) + %left_valid_mask = arith.constant true loc(#loc212) + %left_valid_mask_27 = arith.constant dense : tensor<32x16xi1> loc(#loc212) + %right_valid_mask = arith.constant true loc(#loc213) + %right_valid_mask_28 = arith.constant dense : tensor<32x16xi1> loc(#loc213) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<32x16xi32> loc(#loc214) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<32x16xi32> loc(#loc215) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc248) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc83) + %1 = scf.if %0 -> (tensor<32x16xi1>) { + %cond_49 = arith.constant true loc(#loc217) + %cond_50 = arith.constant dense : tensor<32x16xi1> loc(#loc217) + %cond_51 = arith.xori %left_isnan, %cond_50 : tensor<32x16xi1> loc(#loc217) + %cond_52 = arith.andi %right_isnan, %cond_51 : tensor<32x16xi1> loc(#loc218) + %cond_53 = arith.ori %cond, %cond_52 : tensor<32x16xi1> loc(#loc249) + scf.yield %cond_53 : tensor<32x16xi1> loc(#loc249) + } else { + scf.yield %cond : tensor<32x16xi1> loc(#loc88) + } loc(#loc84) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc250) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc90) + %3 = scf.if %2 -> (tensor<32x16xi1>) { + %eq_49 = arith.andi %left_isnan, %right_isnan : tensor<32x16xi1> loc(#loc221) + %eq_50 = arith.ori %eq, %eq_49 : tensor<32x16xi1> loc(#loc251) + scf.yield %eq_50 : tensor<32x16xi1> loc(#loc251) + } else { + scf.yield %eq : tensor<32x16xi1> loc(#loc88) + } loc(#loc91) + %cond_29 = arith.cmpi sgt, %left_idx_25, %right_idx_26 : tensor<32x16xi32> loc(#loc223) 
+ %cond_30 = arith.andi %3, %cond_29 : tensor<32x16xi1> loc(#loc224) + %cond_31 = arith.ori %1, %cond_30 : tensor<32x16xi1> loc(#loc225) + %cond_32 = arith.cmpi ugt, %right_valid_mask_28, %left_valid_mask_27 : tensor<32x16xi1> loc(#loc226) + %cond_33 = arith.cmpi eq, %right_valid_mask_28, %left_valid_mask_27 : tensor<32x16xi1> loc(#loc227) + %cond_34 = arith.andi %cond_33, %cond_31 : tensor<32x16xi1> loc(#loc228) + %cond_35 = arith.ori %cond_32, %cond_34 : tensor<32x16xi1> loc(#loc229) + %cond_36 = arith.extui %cond_35 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc230) + %cond_37 = arith.xori %cond_36, %flip : tensor<32x16xi32> loc(#loc230) + %cond_38 = arith.constant 0 : i32 loc(#loc231) + %cond_39 = arith.constant dense<0> : tensor<32x16xi32> loc(#loc231) + %cond_40 = arith.cmpi ne, %cond_37, %cond_39 : tensor<32x16xi32> loc(#loc231) + %ret = arith.xori %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc232) + %ret_41 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%x) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc233) + %ret_42 = arith.select %cond_40, %ret, %ret_41 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc234) + %ret_43 = arith.xori %x, %ret_42 : tensor<32x16xi32> loc(#loc235) + %new_idxs = arith.xori %left_idx_25, %right_idx_26 : tensor<32x16xi32> loc(#loc236) + %new_idxs_44 = tt.call @triton.language.standard.zeros_like__i16S32_16S__(%idxs) : (tensor<32x16xi16>) -> tensor<32x16xi16> loc(#loc237) + %new_idxs_45 = arith.extsi %new_idxs_44 : tensor<32x16xi16> to tensor<32x16xi32> loc(#loc238) + %new_idxs_46 = arith.select %cond_40, %new_idxs, %new_idxs_45 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc238) + %new_idxs_47 = arith.extsi %idxs : tensor<32x16xi16> to tensor<32x16xi32> loc(#loc239) + %new_idxs_48 = arith.xori %new_idxs_47, %new_idxs_46 : tensor<32x16xi32> loc(#loc239) + tt.return %ret_43, %new_idxs_48 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc111) + ^bb1: // no predecessors + %4 = ub.poison : tensor<32x16xi32> loc(#loc112) 
+ %5 = ub.poison : tensor<32x16xi32> loc(#loc112) + tt.return %4, %5 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc112) + } loc(#loc50) + tt.func private @"triton.language.standard.sum__i32S256_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<256x2x1xi32> loc("input"(#loc113))) -> tensor<256x1xi32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc114) + tt.reduce.return %2 : i32 loc(#loc114) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc114) + tt.return %0 : tensor<256x1xi32> loc(#loc115) + ^bb1: // no predecessors + %1 = ub.poison : tensor<256x1xi32> loc(#loc116) + tt.return %1 : tensor<256x1xi32> loc(#loc116) + } loc(#loc113) + tt.func private @triton.language.standard._sum_combine__i32_i32__(%a: i32 loc("a"(#loc117)), %b: i32 loc("b"(#loc117))) -> i32 attributes {noinline = false} { + %0 = arith.addi %a, %b : i32 loc(#loc118) + tt.return %0 : i32 loc(#loc119) + ^bb1: // no predecessors + %1 = ub.poison : i32 loc(#loc120) + tt.return %1 : i32 loc(#loc120) + } loc(#loc117) + tt.func private @"triton.language.standard.sum__i16S256_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<256x2x1xi16> loc("input"(#loc113))) -> tensor<256x1xi32> attributes {noinline = false} { + %input_0 = arith.extsi %input : tensor<256x2x1xi16> to tensor<256x2x1xi32> loc(#loc243) + %0 = "tt.reduce"(%input_0) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc114) + tt.reduce.return %2 : i32 loc(#loc114) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc114) + tt.return %0 : tensor<256x1xi32> loc(#loc115) + ^bb1: // no predecessors + %1 = ub.poison : tensor<256x1xi32> loc(#loc116) + tt.return %1 : 
tensor<256x1xi32> loc(#loc116) + } loc(#loc113) + tt.func private @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%x: tensor<32x16xi32> loc("x"(#loc122))) -> i1 attributes {noinline = false} { + %0 = tt.call @torch._inductor.runtime.triton_helpers.promote_to_tensor__i32S32_16S__(%x) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc123) + %false = arith.constant false loc(#loc124) + tt.return %false : i1 loc(#loc124) + ^bb1: // no predecessors + %1 = ub.poison : i1 loc(#loc125) + tt.return %1 : i1 loc(#loc125) + } loc(#loc122) + tt.func private @torch._inductor.runtime.triton_helpers.promote_to_tensor__i32S32_16S__(%x: tensor<32x16xi32> loc("x"(#loc126))) -> tensor<32x16xi32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc127) + %1 = arith.extui %0 : tensor<1xi1> to tensor<1xi32> loc(#loc128) + %2 = tt.expand_dims %1 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc128) + %3 = tt.broadcast %2 : tensor<1x1xi32> -> tensor<32x16xi32> loc(#loc128) + %4 = arith.addi %x, %3 : tensor<32x16xi32> loc(#loc128) + tt.return %4 : tensor<32x16xi32> loc(#loc129) + ^bb1: // no predecessors + %5 = ub.poison : tensor<32x16xi32> loc(#loc130) + tt.return %5 : tensor<32x16xi32> loc(#loc130) + } loc(#loc126) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() -> tensor<1xi1> attributes {noinline = false} { + %false = arith.constant false loc(#loc132) + %cst = arith.constant dense : tensor<1xi1> loc(#loc132) + tt.return %cst : tensor<1xi1> loc(#loc133) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1xi1> loc(#loc134) + tt.return %0 : tensor<1xi1> loc(#loc134) + } loc(#loc131) + tt.func private @triton.language.standard.zeros_like__i32S32_16S__(%input: tensor<32x16xi32> loc("input"(#loc135))) -> tensor<32x16xi32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 
0)cconstexpr_32__(0, 1)cconstexpr_16__(1,)cconstexpr_int32_"() : () -> tensor<32x16xi32> loc(#loc136) + tt.return %0 : tensor<32x16xi32> loc(#loc137) + ^bb1: // no predecessors + %1 = ub.poison : tensor<32x16xi32> loc(#loc138) + tt.return %1 : tensor<32x16xi32> loc(#loc138) + } loc(#loc135) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_32__(0, 1)cconstexpr_16__(1,)cconstexpr_int32_"() -> tensor<32x16xi32> attributes {noinline = false} { + %c0_i32 = arith.constant 0 : i32 loc(#loc132) + %cst = arith.constant dense<0> : tensor<32x16xi32> loc(#loc132) + tt.return %cst : tensor<32x16xi32> loc(#loc133) + ^bb1: // no predecessors + %0 = ub.poison : tensor<32x16xi32> loc(#loc134) + tt.return %0 : tensor<32x16xi32> loc(#loc134) + } loc(#loc131) + tt.func private @triton.language.standard.zeros_like__i16S32_16S__(%input: tensor<32x16xi16> loc("input"(#loc135))) -> tensor<32x16xi16> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_32__(0, 1)cconstexpr_16__(1,)cconstexpr_int16_"() : () -> tensor<32x16xi16> loc(#loc136) + tt.return %0 : tensor<32x16xi16> loc(#loc137) + ^bb1: // no predecessors + %1 = ub.poison : tensor<32x16xi16> loc(#loc138) + tt.return %1 : tensor<32x16xi16> loc(#loc138) + } loc(#loc135) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_32__(0, 1)cconstexpr_16__(1,)cconstexpr_int16_"() -> tensor<32x16xi16> attributes {noinline = false} { + %c0_i16 = arith.constant 0 : i16 loc(#loc132) + %cst = arith.constant dense<0> : tensor<32x16xi16> loc(#loc132) + tt.return %cst : tensor<32x16xi16> loc(#loc133) + ^bb1: // no predecessors + %0 = ub.poison : tensor<32x16xi16> loc(#loc134) + tt.return %0 : tensor<32x16xi16> loc(#loc134) + } loc(#loc131) + tt.func private 
@"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S32_16S_i32S32_16S__(2,)cconstexpr_None__(3,)cconstexpr_2__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<32x16xi32> loc("x"(#loc42)), %idxs: tensor<32x16xi32> loc("idxs"(#loc42))) -> (tensor<32x16xi32>, tensor<32x16xi32>) attributes {noinline = false} { + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc178) + %flip_0 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc179) + %flip_1 = tt.expand_dims %flip_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc179) + %flip_2 = tt.broadcast %flip_1 : tensor<1x2x1xi32> -> tensor<64x2x4xi32> loc(#loc180) + %flip_3 = tt.reshape %flip_2 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc181) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_i32S32_16S__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip_3) : (tensor<32x16xi32>, tensor<32x16xi32>, tensor<32x16xi32>) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc47) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_i32S32_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1, %flip_3) : (tensor<32x16xi32>, tensor<32x16xi32>, tensor<32x16xi32>) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc47) + tt.return %1#0, %1#1 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc48) + ^bb1: // no predecessors + %2 = ub.poison : tensor<32x16xi32> loc(#loc49) + %3 = ub.poison : tensor<32x16xi32> loc(#loc49) + tt.return %2, %3 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc49) + } loc(#loc42) + tt.func private 
@"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_i32S32_16S__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<32x16xi32> loc("x"(#loc50)), %idxs: tensor<32x16xi32> loc("idxs"(#loc50)), %flip: tensor<32x16xi32> loc("flip"(#loc50))) -> (tensor<32x16xi32>, tensor<32x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc185) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc186) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc187) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc187) + %left_mask = arith.constant 1 : i32 loc(#loc188) + %left_mask_2 = arith.constant 1 : i32 loc(#loc188) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc188) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc188) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<128x2x2xi32> loc(#loc189) + %ileft_5 = arith.muli %y, %ileft : tensor<128x2x2xi32> loc(#loc189) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S128_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc190) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc191) + %ileft_8 = tt.broadcast %ileft_7 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc192) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<128x2x2xi32> loc(#loc193) + %iright_9 = arith.muli %y, %iright : tensor<128x2x2xi32> loc(#loc193) + %iright_10 = tt.call @"triton.language.standard.sum__i32S128_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc194) + %iright_11 = 
tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc195) + %iright_12 = tt.broadcast %iright_11 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc196) + %ileft_13 = tt.reshape %ileft_8 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc197) + %iright_14 = tt.reshape %iright_12 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc198) + %y_idx = tt.reshape %idxs : tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc199) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<128x2x2xi32> loc(#loc201) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<128x2x2xi32> loc(#loc201) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S128_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc202) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc203) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc204) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<128x2x2xi32> loc(#loc206) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<128x2x2xi32> loc(#loc206) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S128_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc207) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc208) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc209) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc210) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc211) + %left_valid_mask = arith.constant true loc(#loc212) + %left_valid_mask_25 = arith.constant dense : tensor<32x16xi1> loc(#loc212) + %right_valid_mask 
= arith.constant true loc(#loc213) + %right_valid_mask_26 = arith.constant dense : tensor<32x16xi1> loc(#loc213) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<32x16xi32> loc(#loc214) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<32x16xi32> loc(#loc215) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc248) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc83) + %1 = scf.if %0 -> (tensor<32x16xi1>) { + %cond_45 = arith.constant true loc(#loc217) + %cond_46 = arith.constant dense : tensor<32x16xi1> loc(#loc217) + %cond_47 = arith.xori %left_isnan, %cond_46 : tensor<32x16xi1> loc(#loc217) + %cond_48 = arith.andi %right_isnan, %cond_47 : tensor<32x16xi1> loc(#loc218) + %cond_49 = arith.ori %cond, %cond_48 : tensor<32x16xi1> loc(#loc249) + scf.yield %cond_49 : tensor<32x16xi1> loc(#loc249) + } else { + scf.yield %cond : tensor<32x16xi1> loc(#loc88) + } loc(#loc84) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc250) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc90) + %3 = scf.if %2 -> (tensor<32x16xi1>) { + %eq_45 = arith.andi %left_isnan, %right_isnan : tensor<32x16xi1> loc(#loc221) + %eq_46 = arith.ori %eq, %eq_45 : tensor<32x16xi1> loc(#loc251) + scf.yield %eq_46 : tensor<32x16xi1> loc(#loc251) + } else { + scf.yield %eq : tensor<32x16xi1> loc(#loc88) + } loc(#loc91) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<32x16xi32> loc(#loc223) + %cond_28 = arith.andi %3, %cond_27 : tensor<32x16xi1> loc(#loc224) + %cond_29 = arith.ori %1, %cond_28 : tensor<32x16xi1> loc(#loc225) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<32x16xi1> loc(#loc226) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<32x16xi1> loc(#loc227) + %cond_32 = arith.andi %cond_31, 
%cond_29 : tensor<32x16xi1> loc(#loc228) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<32x16xi1> loc(#loc229) + %cond_34 = arith.extui %cond_33 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc230) + %cond_35 = arith.xori %cond_34, %flip : tensor<32x16xi32> loc(#loc230) + %cond_36 = arith.constant 0 : i32 loc(#loc231) + %cond_37 = arith.constant dense<0> : tensor<32x16xi32> loc(#loc231) + %cond_38 = arith.cmpi ne, %cond_35, %cond_37 : tensor<32x16xi32> loc(#loc231) + %ret = arith.xori %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc232) + %ret_39 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%x) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc233) + %ret_40 = arith.select %cond_38, %ret, %ret_39 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc234) + %ret_41 = arith.xori %x, %ret_40 : tensor<32x16xi32> loc(#loc235) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<32x16xi32> loc(#loc236) + %new_idxs_42 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%idxs) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc237) + %new_idxs_43 = arith.select %cond_38, %new_idxs, %new_idxs_42 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc238) + %new_idxs_44 = arith.xori %idxs, %new_idxs_43 : tensor<32x16xi32> loc(#loc239) + tt.return %ret_41, %new_idxs_44 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc111) + ^bb1: // no predecessors + %4 = ub.poison : tensor<32x16xi32> loc(#loc112) + %5 = ub.poison : tensor<32x16xi32> loc(#loc112) + tt.return %4, %5 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc112) + } loc(#loc50) + tt.func private @"triton.language.standard.sum__i32S128_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<128x2x2xi32> loc("input"(#loc113))) -> tensor<128x2xi32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 
loc(#loc114) + tt.reduce.return %2 : i32 loc(#loc114) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc114) + tt.return %0 : tensor<128x2xi32> loc(#loc115) + ^bb1: // no predecessors + %1 = ub.poison : tensor<128x2xi32> loc(#loc116) + tt.return %1 : tensor<128x2xi32> loc(#loc116) + } loc(#loc113) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_i32S32_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<32x16xi32> loc("x"(#loc50)), %idxs: tensor<32x16xi32> loc("idxs"(#loc50)), %flip: tensor<32x16xi32> loc("flip"(#loc50))) -> (tensor<32x16xi32>, tensor<32x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc185) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc186) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc187) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc187) + %left_mask = arith.constant 1 : i32 loc(#loc188) + %left_mask_2 = arith.constant 1 : i32 loc(#loc188) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc188) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc188) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<256x2x1xi32> loc(#loc189) + %ileft_5 = arith.muli %y, %ileft : tensor<256x2x1xi32> loc(#loc189) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S256_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc190) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc191) + %ileft_8 = tt.broadcast %ileft_7 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc192) + %iright = tt.broadcast %right_mask_1 : 
tensor<1x2x1xi32> -> tensor<256x2x1xi32> loc(#loc193) + %iright_9 = arith.muli %y, %iright : tensor<256x2x1xi32> loc(#loc193) + %iright_10 = tt.call @"triton.language.standard.sum__i32S256_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc194) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc195) + %iright_12 = tt.broadcast %iright_11 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc196) + %ileft_13 = tt.reshape %ileft_8 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc197) + %iright_14 = tt.reshape %iright_12 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc198) + %y_idx = tt.reshape %idxs : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc199) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<256x2x1xi32> loc(#loc201) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<256x2x1xi32> loc(#loc201) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S256_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc202) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc203) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc204) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<256x2x1xi32> loc(#loc206) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<256x2x1xi32> loc(#loc206) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S256_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc207) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc208) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc209) + %left_idx_23 = 
tt.reshape %left_idx_18 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc210) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc211) + %left_valid_mask = arith.constant true loc(#loc212) + %left_valid_mask_25 = arith.constant dense : tensor<32x16xi1> loc(#loc212) + %right_valid_mask = arith.constant true loc(#loc213) + %right_valid_mask_26 = arith.constant dense : tensor<32x16xi1> loc(#loc213) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<32x16xi32> loc(#loc214) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<32x16xi32> loc(#loc215) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc248) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc83) + %1 = scf.if %0 -> (tensor<32x16xi1>) { + %cond_45 = arith.constant true loc(#loc217) + %cond_46 = arith.constant dense : tensor<32x16xi1> loc(#loc217) + %cond_47 = arith.xori %left_isnan, %cond_46 : tensor<32x16xi1> loc(#loc217) + %cond_48 = arith.andi %right_isnan, %cond_47 : tensor<32x16xi1> loc(#loc218) + %cond_49 = arith.ori %cond, %cond_48 : tensor<32x16xi1> loc(#loc249) + scf.yield %cond_49 : tensor<32x16xi1> loc(#loc249) + } else { + scf.yield %cond : tensor<32x16xi1> loc(#loc88) + } loc(#loc84) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc250) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc90) + %3 = scf.if %2 -> (tensor<32x16xi1>) { + %eq_45 = arith.andi %left_isnan, %right_isnan : tensor<32x16xi1> loc(#loc221) + %eq_46 = arith.ori %eq, %eq_45 : tensor<32x16xi1> loc(#loc251) + scf.yield %eq_46 : tensor<32x16xi1> loc(#loc251) + } else { + scf.yield %eq : tensor<32x16xi1> loc(#loc88) + } loc(#loc91) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<32x16xi32> loc(#loc223) + %cond_28 = arith.andi %3, %cond_27 : 
tensor<32x16xi1> loc(#loc224) + %cond_29 = arith.ori %1, %cond_28 : tensor<32x16xi1> loc(#loc225) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<32x16xi1> loc(#loc226) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<32x16xi1> loc(#loc227) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<32x16xi1> loc(#loc228) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<32x16xi1> loc(#loc229) + %cond_34 = arith.extui %cond_33 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc230) + %cond_35 = arith.xori %cond_34, %flip : tensor<32x16xi32> loc(#loc230) + %cond_36 = arith.constant 0 : i32 loc(#loc231) + %cond_37 = arith.constant dense<0> : tensor<32x16xi32> loc(#loc231) + %cond_38 = arith.cmpi ne, %cond_35, %cond_37 : tensor<32x16xi32> loc(#loc231) + %ret = arith.xori %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc232) + %ret_39 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%x) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc233) + %ret_40 = arith.select %cond_38, %ret, %ret_39 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc234) + %ret_41 = arith.xori %x, %ret_40 : tensor<32x16xi32> loc(#loc235) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<32x16xi32> loc(#loc236) + %new_idxs_42 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%idxs) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc237) + %new_idxs_43 = arith.select %cond_38, %new_idxs, %new_idxs_42 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc238) + %new_idxs_44 = arith.xori %idxs, %new_idxs_43 : tensor<32x16xi32> loc(#loc239) + tt.return %ret_41, %new_idxs_44 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc111) + ^bb1: // no predecessors + %4 = ub.poison : tensor<32x16xi32> loc(#loc112) + %5 = ub.poison : tensor<32x16xi32> loc(#loc112) + tt.return %4, %5 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc112) + } loc(#loc50) + tt.func private 
@"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S32_16S_i32S32_16S__(2,)cconstexpr_None__(3,)cconstexpr_3__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<32x16xi32> loc("x"(#loc42)), %idxs: tensor<32x16xi32> loc("idxs"(#loc42))) -> (tensor<32x16xi32>, tensor<32x16xi32>) attributes {noinline = false} { + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc178) + %flip_0 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc179) + %flip_1 = tt.expand_dims %flip_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc179) + %flip_2 = tt.broadcast %flip_1 : tensor<1x2x1xi32> -> tensor<32x2x8xi32> loc(#loc180) + %flip_3 = tt.reshape %flip_2 : tensor<32x2x8xi32> -> tensor<32x16xi32> loc(#loc181) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_i32S32_16S__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip_3) : (tensor<32x16xi32>, tensor<32x16xi32>, tensor<32x16xi32>) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc47) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_i32S32_16S__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1, %flip_3) : (tensor<32x16xi32>, tensor<32x16xi32>, tensor<32x16xi32>) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc47) + %2:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_i32S32_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%1#0, %1#1, %flip_3) : (tensor<32x16xi32>, tensor<32x16xi32>, tensor<32x16xi32>) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc47) + tt.return %2#0, %2#1 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc48) + ^bb1: // no 
predecessors + %3 = ub.poison : tensor<32x16xi32> loc(#loc49) + %4 = ub.poison : tensor<32x16xi32> loc(#loc49) + tt.return %3, %4 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc49) + } loc(#loc42) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_i32S32_16S__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<32x16xi32> loc("x"(#loc50)), %idxs: tensor<32x16xi32> loc("idxs"(#loc50)), %flip: tensor<32x16xi32> loc("flip"(#loc50))) -> (tensor<32x16xi32>, tensor<32x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<32x16xi32> -> tensor<64x2x4xi32> loc(#loc185) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc186) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc187) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc187) + %left_mask = arith.constant 1 : i32 loc(#loc188) + %left_mask_2 = arith.constant 1 : i32 loc(#loc188) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc188) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc188) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<64x2x4xi32> loc(#loc189) + %ileft_5 = arith.muli %y, %ileft : tensor<64x2x4xi32> loc(#loc189) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S64_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc190) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc191) + %ileft_8 = tt.broadcast %ileft_7 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc192) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<64x2x4xi32> loc(#loc193) + %iright_9 = arith.muli %y, %iright : tensor<64x2x4xi32> loc(#loc193) + 
%iright_10 = tt.call @"triton.language.standard.sum__i32S64_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc194) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc195) + %iright_12 = tt.broadcast %iright_11 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc196) + %ileft_13 = tt.reshape %ileft_8 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc197) + %iright_14 = tt.reshape %iright_12 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc198) + %y_idx = tt.reshape %idxs : tensor<32x16xi32> -> tensor<64x2x4xi32> loc(#loc199) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<64x2x4xi32> loc(#loc201) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<64x2x4xi32> loc(#loc201) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S64_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc202) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc203) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc204) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<64x2x4xi32> loc(#loc206) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<64x2x4xi32> loc(#loc206) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S64_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc207) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc208) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc209) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc210) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<64x2x4xi32> -> 
tensor<32x16xi32> loc(#loc211) + %left_valid_mask = arith.constant true loc(#loc212) + %left_valid_mask_25 = arith.constant dense : tensor<32x16xi1> loc(#loc212) + %right_valid_mask = arith.constant true loc(#loc213) + %right_valid_mask_26 = arith.constant dense : tensor<32x16xi1> loc(#loc213) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<32x16xi32> loc(#loc214) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<32x16xi32> loc(#loc215) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc248) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc83) + %1 = scf.if %0 -> (tensor<32x16xi1>) { + %cond_45 = arith.constant true loc(#loc217) + %cond_46 = arith.constant dense : tensor<32x16xi1> loc(#loc217) + %cond_47 = arith.xori %left_isnan, %cond_46 : tensor<32x16xi1> loc(#loc217) + %cond_48 = arith.andi %right_isnan, %cond_47 : tensor<32x16xi1> loc(#loc218) + %cond_49 = arith.ori %cond, %cond_48 : tensor<32x16xi1> loc(#loc249) + scf.yield %cond_49 : tensor<32x16xi1> loc(#loc249) + } else { + scf.yield %cond : tensor<32x16xi1> loc(#loc88) + } loc(#loc84) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc250) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc90) + %3 = scf.if %2 -> (tensor<32x16xi1>) { + %eq_45 = arith.andi %left_isnan, %right_isnan : tensor<32x16xi1> loc(#loc221) + %eq_46 = arith.ori %eq, %eq_45 : tensor<32x16xi1> loc(#loc251) + scf.yield %eq_46 : tensor<32x16xi1> loc(#loc251) + } else { + scf.yield %eq : tensor<32x16xi1> loc(#loc88) + } loc(#loc91) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<32x16xi32> loc(#loc223) + %cond_28 = arith.andi %3, %cond_27 : tensor<32x16xi1> loc(#loc224) + %cond_29 = arith.ori %1, %cond_28 : tensor<32x16xi1> loc(#loc225) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, 
%left_valid_mask_25 : tensor<32x16xi1> loc(#loc226) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<32x16xi1> loc(#loc227) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<32x16xi1> loc(#loc228) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<32x16xi1> loc(#loc229) + %cond_34 = arith.extui %cond_33 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc230) + %cond_35 = arith.xori %cond_34, %flip : tensor<32x16xi32> loc(#loc230) + %cond_36 = arith.constant 0 : i32 loc(#loc231) + %cond_37 = arith.constant dense<0> : tensor<32x16xi32> loc(#loc231) + %cond_38 = arith.cmpi ne, %cond_35, %cond_37 : tensor<32x16xi32> loc(#loc231) + %ret = arith.xori %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc232) + %ret_39 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%x) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc233) + %ret_40 = arith.select %cond_38, %ret, %ret_39 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc234) + %ret_41 = arith.xori %x, %ret_40 : tensor<32x16xi32> loc(#loc235) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<32x16xi32> loc(#loc236) + %new_idxs_42 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%idxs) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc237) + %new_idxs_43 = arith.select %cond_38, %new_idxs, %new_idxs_42 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc238) + %new_idxs_44 = arith.xori %idxs, %new_idxs_43 : tensor<32x16xi32> loc(#loc239) + tt.return %ret_41, %new_idxs_44 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc111) + ^bb1: // no predecessors + %4 = ub.poison : tensor<32x16xi32> loc(#loc112) + %5 = ub.poison : tensor<32x16xi32> loc(#loc112) + tt.return %4, %5 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc112) + } loc(#loc50) + tt.func private @"triton.language.standard.sum__i32S64_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<64x2x4xi32> loc("input"(#loc113))) -> tensor<64x4xi32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) 
<{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc114) + tt.reduce.return %2 : i32 loc(#loc114) + }) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc114) + tt.return %0 : tensor<64x4xi32> loc(#loc115) + ^bb1: // no predecessors + %1 = ub.poison : tensor<64x4xi32> loc(#loc116) + tt.return %1 : tensor<64x4xi32> loc(#loc116) + } loc(#loc113) + tt.func private @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S32_16S_i32S32_16S__(2,)cconstexpr_None__(3,)cconstexpr_4__(4,)cconstexpr_False__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<32x16xi32> loc("x"(#loc42)), %idxs: tensor<32x16xi32> loc("idxs"(#loc42))) -> (tensor<32x16xi32>, tensor<32x16xi32>) attributes {noinline = false} { + %flip = arith.constant false loc(#loc247) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_0__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip) : (tensor<32x16xi32>, tensor<32x16xi32>, i1) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc47) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1, %flip) : (tensor<32x16xi32>, tensor<32x16xi32>, i1) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc47) + %2:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%1#0, %1#1, %flip) : (tensor<32x16xi32>, tensor<32x16xi32>, i1) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc47) + %3:2 = tt.call 
@"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%2#0, %2#1, %flip) : (tensor<32x16xi32>, tensor<32x16xi32>, i1) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc47) + tt.return %3#0, %3#1 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc48) + ^bb1: // no predecessors + %4 = ub.poison : tensor<32x16xi32> loc(#loc49) + %5 = ub.poison : tensor<32x16xi32> loc(#loc49) + tt.return %4, %5 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc49) + } loc(#loc42) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_0__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<32x16xi32> loc("x"(#loc50)), %idxs: tensor<32x16xi32> loc("idxs"(#loc50)), %flip: i1 loc("flip"(#loc50))) -> (tensor<32x16xi32>, tensor<32x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<32x16xi32> -> tensor<32x2x8xi32> loc(#loc185) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc186) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc187) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc187) + %left_mask = arith.constant 1 : i32 loc(#loc188) + %left_mask_2 = arith.constant 1 : i32 loc(#loc188) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc188) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc188) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<32x2x8xi32> loc(#loc189) + %ileft_5 = arith.muli %y, %ileft : tensor<32x2x8xi32> loc(#loc189) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S32_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<32x2x8xi32>) -> tensor<32x8xi32> 
loc(#loc190) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<32x8xi32> -> tensor<32x1x8xi32> loc(#loc191) + %ileft_8 = tt.broadcast %ileft_7 : tensor<32x1x8xi32> -> tensor<32x2x8xi32> loc(#loc192) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<32x2x8xi32> loc(#loc193) + %iright_9 = arith.muli %y, %iright : tensor<32x2x8xi32> loc(#loc193) + %iright_10 = tt.call @"triton.language.standard.sum__i32S32_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<32x2x8xi32>) -> tensor<32x8xi32> loc(#loc194) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<32x8xi32> -> tensor<32x1x8xi32> loc(#loc195) + %iright_12 = tt.broadcast %iright_11 : tensor<32x1x8xi32> -> tensor<32x2x8xi32> loc(#loc196) + %ileft_13 = tt.reshape %ileft_8 : tensor<32x2x8xi32> -> tensor<32x16xi32> loc(#loc197) + %iright_14 = tt.reshape %iright_12 : tensor<32x2x8xi32> -> tensor<32x16xi32> loc(#loc198) + %y_idx = tt.reshape %idxs : tensor<32x16xi32> -> tensor<32x2x8xi32> loc(#loc199) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<32x2x8xi32> loc(#loc201) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<32x2x8xi32> loc(#loc201) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S32_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<32x2x8xi32>) -> tensor<32x8xi32> loc(#loc202) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<32x8xi32> -> tensor<32x1x8xi32> loc(#loc203) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<32x1x8xi32> -> tensor<32x2x8xi32> loc(#loc204) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<32x2x8xi32> loc(#loc206) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<32x2x8xi32> loc(#loc206) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S32_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<32x2x8xi32>) -> tensor<32x8xi32> loc(#loc207) + 
%right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<32x8xi32> -> tensor<32x1x8xi32> loc(#loc208) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<32x1x8xi32> -> tensor<32x2x8xi32> loc(#loc209) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<32x2x8xi32> -> tensor<32x16xi32> loc(#loc210) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<32x2x8xi32> -> tensor<32x16xi32> loc(#loc211) + %left_valid_mask = arith.constant true loc(#loc212) + %left_valid_mask_25 = arith.constant dense : tensor<32x16xi1> loc(#loc212) + %right_valid_mask = arith.constant true loc(#loc213) + %right_valid_mask_26 = arith.constant dense : tensor<32x16xi1> loc(#loc213) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<32x16xi32> loc(#loc214) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<32x16xi32> loc(#loc215) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc248) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc83) + %1 = scf.if %0 -> (tensor<32x16xi1>) { + %cond_42 = arith.constant true loc(#loc217) + %cond_43 = arith.constant dense : tensor<32x16xi1> loc(#loc217) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<32x16xi1> loc(#loc217) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<32x16xi1> loc(#loc218) + %cond_46 = arith.ori %cond, %cond_45 : tensor<32x16xi1> loc(#loc249) + scf.yield %cond_46 : tensor<32x16xi1> loc(#loc249) + } else { + scf.yield %cond : tensor<32x16xi1> loc(#loc88) + } loc(#loc84) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc250) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc90) + %3 = scf.if %2 -> (tensor<32x16xi1>) { + %eq_42 = arith.andi %left_isnan, %right_isnan : tensor<32x16xi1> loc(#loc221) + %eq_43 = arith.ori %eq, %eq_42 : tensor<32x16xi1> loc(#loc251) + scf.yield %eq_43 : 
tensor<32x16xi1> loc(#loc251) + } else { + scf.yield %eq : tensor<32x16xi1> loc(#loc88) + } loc(#loc91) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<32x16xi32> loc(#loc223) + %cond_28 = arith.andi %3, %cond_27 : tensor<32x16xi1> loc(#loc224) + %cond_29 = arith.ori %1, %cond_28 : tensor<32x16xi1> loc(#loc225) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<32x16xi1> loc(#loc226) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<32x16xi1> loc(#loc227) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<32x16xi1> loc(#loc228) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<32x16xi1> loc(#loc229) + %cond_34 = tt.splat %flip : i1 -> tensor<32x16xi1> loc(#loc230) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<32x16xi1> loc(#loc230) + %ret = arith.xori %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc232) + %ret_36 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%x) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc233) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc234) + %ret_38 = arith.xori %x, %ret_37 : tensor<32x16xi32> loc(#loc235) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<32x16xi32> loc(#loc236) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%idxs) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc237) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc238) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<32x16xi32> loc(#loc239) + tt.return %ret_38, %new_idxs_41 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc111) + ^bb1: // no predecessors + %4 = ub.poison : tensor<32x16xi32> loc(#loc112) + %5 = ub.poison : tensor<32x16xi32> loc(#loc112) + tt.return %4, %5 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc112) + } loc(#loc50) + tt.func private 
@"triton.language.standard.sum__i32S32_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<32x2x8xi32> loc("input"(#loc113))) -> tensor<32x8xi32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc114) + tt.reduce.return %2 : i32 loc(#loc114) + }) : (tensor<32x2x8xi32>) -> tensor<32x8xi32> loc(#loc114) + tt.return %0 : tensor<32x8xi32> loc(#loc115) + ^bb1: // no predecessors + %1 = ub.poison : tensor<32x8xi32> loc(#loc116) + tt.return %1 : tensor<32x8xi32> loc(#loc116) + } loc(#loc113) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<32x16xi32> loc("x"(#loc50)), %idxs: tensor<32x16xi32> loc("idxs"(#loc50)), %flip: i1 loc("flip"(#loc50))) -> (tensor<32x16xi32>, tensor<32x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<32x16xi32> -> tensor<64x2x4xi32> loc(#loc185) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc186) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc187) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc187) + %left_mask = arith.constant 1 : i32 loc(#loc188) + %left_mask_2 = arith.constant 1 : i32 loc(#loc188) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc188) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc188) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<64x2x4xi32> loc(#loc189) + %ileft_5 = arith.muli %y, %ileft : tensor<64x2x4xi32> loc(#loc189) + %ileft_6 = tt.call 
@"triton.language.standard.sum__i32S64_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc190) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc191) + %ileft_8 = tt.broadcast %ileft_7 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc192) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<64x2x4xi32> loc(#loc193) + %iright_9 = arith.muli %y, %iright : tensor<64x2x4xi32> loc(#loc193) + %iright_10 = tt.call @"triton.language.standard.sum__i32S64_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc194) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc195) + %iright_12 = tt.broadcast %iright_11 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc196) + %ileft_13 = tt.reshape %ileft_8 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc197) + %iright_14 = tt.reshape %iright_12 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc198) + %y_idx = tt.reshape %idxs : tensor<32x16xi32> -> tensor<64x2x4xi32> loc(#loc199) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<64x2x4xi32> loc(#loc201) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<64x2x4xi32> loc(#loc201) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S64_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc202) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc203) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc204) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<64x2x4xi32> loc(#loc206) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<64x2x4xi32> loc(#loc206) + %right_idx_20 = tt.call 
@"triton.language.standard.sum__i32S64_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc207) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc208) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc209) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc210) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc211) + %left_valid_mask = arith.constant true loc(#loc212) + %left_valid_mask_25 = arith.constant dense : tensor<32x16xi1> loc(#loc212) + %right_valid_mask = arith.constant true loc(#loc213) + %right_valid_mask_26 = arith.constant dense : tensor<32x16xi1> loc(#loc213) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<32x16xi32> loc(#loc214) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<32x16xi32> loc(#loc215) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc248) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc83) + %1 = scf.if %0 -> (tensor<32x16xi1>) { + %cond_42 = arith.constant true loc(#loc217) + %cond_43 = arith.constant dense : tensor<32x16xi1> loc(#loc217) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<32x16xi1> loc(#loc217) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<32x16xi1> loc(#loc218) + %cond_46 = arith.ori %cond, %cond_45 : tensor<32x16xi1> loc(#loc249) + scf.yield %cond_46 : tensor<32x16xi1> loc(#loc249) + } else { + scf.yield %cond : tensor<32x16xi1> loc(#loc88) + } loc(#loc84) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc250) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc90) + %3 = scf.if %2 -> (tensor<32x16xi1>) { + %eq_42 
= arith.andi %left_isnan, %right_isnan : tensor<32x16xi1> loc(#loc221) + %eq_43 = arith.ori %eq, %eq_42 : tensor<32x16xi1> loc(#loc251) + scf.yield %eq_43 : tensor<32x16xi1> loc(#loc251) + } else { + scf.yield %eq : tensor<32x16xi1> loc(#loc88) + } loc(#loc91) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<32x16xi32> loc(#loc223) + %cond_28 = arith.andi %3, %cond_27 : tensor<32x16xi1> loc(#loc224) + %cond_29 = arith.ori %1, %cond_28 : tensor<32x16xi1> loc(#loc225) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<32x16xi1> loc(#loc226) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<32x16xi1> loc(#loc227) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<32x16xi1> loc(#loc228) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<32x16xi1> loc(#loc229) + %cond_34 = tt.splat %flip : i1 -> tensor<32x16xi1> loc(#loc230) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<32x16xi1> loc(#loc230) + %ret = arith.xori %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc232) + %ret_36 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%x) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc233) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc234) + %ret_38 = arith.xori %x, %ret_37 : tensor<32x16xi32> loc(#loc235) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<32x16xi32> loc(#loc236) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%idxs) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc237) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc238) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<32x16xi32> loc(#loc239) + tt.return %ret_38, %new_idxs_41 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc111) + ^bb1: // no predecessors + %4 = ub.poison : tensor<32x16xi32> loc(#loc112) + %5 = ub.poison : tensor<32x16xi32> loc(#loc112) + 
tt.return %4, %5 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc112) + } loc(#loc50) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<32x16xi32> loc("x"(#loc50)), %idxs: tensor<32x16xi32> loc("idxs"(#loc50)), %flip: i1 loc("flip"(#loc50))) -> (tensor<32x16xi32>, tensor<32x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc185) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc186) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc187) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc187) + %left_mask = arith.constant 1 : i32 loc(#loc188) + %left_mask_2 = arith.constant 1 : i32 loc(#loc188) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc188) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc188) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<128x2x2xi32> loc(#loc189) + %ileft_5 = arith.muli %y, %ileft : tensor<128x2x2xi32> loc(#loc189) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S128_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc190) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc191) + %ileft_8 = tt.broadcast %ileft_7 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc192) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<128x2x2xi32> loc(#loc193) + %iright_9 = arith.muli %y, %iright : tensor<128x2x2xi32> loc(#loc193) + %iright_10 = tt.call 
@"triton.language.standard.sum__i32S128_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc194) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc195) + %iright_12 = tt.broadcast %iright_11 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc196) + %ileft_13 = tt.reshape %ileft_8 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc197) + %iright_14 = tt.reshape %iright_12 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc198) + %y_idx = tt.reshape %idxs : tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc199) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<128x2x2xi32> loc(#loc201) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<128x2x2xi32> loc(#loc201) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S128_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc202) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc203) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc204) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<128x2x2xi32> loc(#loc206) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<128x2x2xi32> loc(#loc206) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S128_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc207) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc208) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc209) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc210) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<128x2x2xi32> -> 
tensor<32x16xi32> loc(#loc211) + %left_valid_mask = arith.constant true loc(#loc212) + %left_valid_mask_25 = arith.constant dense : tensor<32x16xi1> loc(#loc212) + %right_valid_mask = arith.constant true loc(#loc213) + %right_valid_mask_26 = arith.constant dense : tensor<32x16xi1> loc(#loc213) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<32x16xi32> loc(#loc214) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<32x16xi32> loc(#loc215) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc248) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc83) + %1 = scf.if %0 -> (tensor<32x16xi1>) { + %cond_42 = arith.constant true loc(#loc217) + %cond_43 = arith.constant dense : tensor<32x16xi1> loc(#loc217) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<32x16xi1> loc(#loc217) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<32x16xi1> loc(#loc218) + %cond_46 = arith.ori %cond, %cond_45 : tensor<32x16xi1> loc(#loc249) + scf.yield %cond_46 : tensor<32x16xi1> loc(#loc249) + } else { + scf.yield %cond : tensor<32x16xi1> loc(#loc88) + } loc(#loc84) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc250) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc90) + %3 = scf.if %2 -> (tensor<32x16xi1>) { + %eq_42 = arith.andi %left_isnan, %right_isnan : tensor<32x16xi1> loc(#loc221) + %eq_43 = arith.ori %eq, %eq_42 : tensor<32x16xi1> loc(#loc251) + scf.yield %eq_43 : tensor<32x16xi1> loc(#loc251) + } else { + scf.yield %eq : tensor<32x16xi1> loc(#loc88) + } loc(#loc91) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<32x16xi32> loc(#loc223) + %cond_28 = arith.andi %3, %cond_27 : tensor<32x16xi1> loc(#loc224) + %cond_29 = arith.ori %1, %cond_28 : tensor<32x16xi1> loc(#loc225) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, 
%left_valid_mask_25 : tensor<32x16xi1> loc(#loc226) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<32x16xi1> loc(#loc227) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<32x16xi1> loc(#loc228) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<32x16xi1> loc(#loc229) + %cond_34 = tt.splat %flip : i1 -> tensor<32x16xi1> loc(#loc230) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<32x16xi1> loc(#loc230) + %ret = arith.xori %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc232) + %ret_36 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%x) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc233) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc234) + %ret_38 = arith.xori %x, %ret_37 : tensor<32x16xi32> loc(#loc235) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<32x16xi32> loc(#loc236) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%idxs) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc237) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc238) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<32x16xi32> loc(#loc239) + tt.return %ret_38, %new_idxs_41 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc111) + ^bb1: // no predecessors + %4 = ub.poison : tensor<32x16xi32> loc(#loc112) + %5 = ub.poison : tensor<32x16xi32> loc(#loc112) + tt.return %4, %5 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc112) + } loc(#loc50) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<32x16xi32> loc("x"(#loc50)), %idxs: tensor<32x16xi32> loc("idxs"(#loc50)), %flip: i1 loc("flip"(#loc50))) -> (tensor<32x16xi32>, tensor<32x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<32x16xi32> -> 
tensor<256x2x1xi32> loc(#loc185) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc186) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc187) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc187) + %left_mask = arith.constant 1 : i32 loc(#loc188) + %left_mask_2 = arith.constant 1 : i32 loc(#loc188) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc188) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc188) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<256x2x1xi32> loc(#loc189) + %ileft_5 = arith.muli %y, %ileft : tensor<256x2x1xi32> loc(#loc189) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S256_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc190) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc191) + %ileft_8 = tt.broadcast %ileft_7 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc192) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<256x2x1xi32> loc(#loc193) + %iright_9 = arith.muli %y, %iright : tensor<256x2x1xi32> loc(#loc193) + %iright_10 = tt.call @"triton.language.standard.sum__i32S256_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc194) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc195) + %iright_12 = tt.broadcast %iright_11 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc196) + %ileft_13 = tt.reshape %ileft_8 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc197) + %iright_14 = tt.reshape %iright_12 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc198) + %y_idx = tt.reshape %idxs : tensor<32x16xi32> -> tensor<256x2x1xi32> 
loc(#loc199) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<256x2x1xi32> loc(#loc201) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<256x2x1xi32> loc(#loc201) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S256_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc202) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc203) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc204) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<256x2x1xi32> loc(#loc206) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<256x2x1xi32> loc(#loc206) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S256_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc207) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc208) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc209) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc210) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc211) + %left_valid_mask = arith.constant true loc(#loc212) + %left_valid_mask_25 = arith.constant dense : tensor<32x16xi1> loc(#loc212) + %right_valid_mask = arith.constant true loc(#loc213) + %right_valid_mask_26 = arith.constant dense : tensor<32x16xi1> loc(#loc213) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<32x16xi32> loc(#loc214) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<32x16xi32> loc(#loc215) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc248) + %0 = tt.call 
@torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc83) + %1 = scf.if %0 -> (tensor<32x16xi1>) { + %cond_42 = arith.constant true loc(#loc217) + %cond_43 = arith.constant dense : tensor<32x16xi1> loc(#loc217) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<32x16xi1> loc(#loc217) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<32x16xi1> loc(#loc218) + %cond_46 = arith.ori %cond, %cond_45 : tensor<32x16xi1> loc(#loc249) + scf.yield %cond_46 : tensor<32x16xi1> loc(#loc249) + } else { + scf.yield %cond : tensor<32x16xi1> loc(#loc88) + } loc(#loc84) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc250) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc90) + %3 = scf.if %2 -> (tensor<32x16xi1>) { + %eq_42 = arith.andi %left_isnan, %right_isnan : tensor<32x16xi1> loc(#loc221) + %eq_43 = arith.ori %eq, %eq_42 : tensor<32x16xi1> loc(#loc251) + scf.yield %eq_43 : tensor<32x16xi1> loc(#loc251) + } else { + scf.yield %eq : tensor<32x16xi1> loc(#loc88) + } loc(#loc91) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<32x16xi32> loc(#loc223) + %cond_28 = arith.andi %3, %cond_27 : tensor<32x16xi1> loc(#loc224) + %cond_29 = arith.ori %1, %cond_28 : tensor<32x16xi1> loc(#loc225) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<32x16xi1> loc(#loc226) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<32x16xi1> loc(#loc227) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<32x16xi1> loc(#loc228) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<32x16xi1> loc(#loc229) + %cond_34 = tt.splat %flip : i1 -> tensor<32x16xi1> loc(#loc230) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<32x16xi1> loc(#loc230) + %ret = arith.xori %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc232) + %ret_36 = tt.call 
@triton.language.standard.zeros_like__i32S32_16S__(%x) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc233) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc234) + %ret_38 = arith.xori %x, %ret_37 : tensor<32x16xi32> loc(#loc235) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<32x16xi32> loc(#loc236) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%idxs) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc237) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc238) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<32x16xi32> loc(#loc239) + tt.return %ret_38, %new_idxs_41 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc111) + ^bb1: // no predecessors + %4 = ub.poison : tensor<32x16xi32> loc(#loc112) + %5 = ub.poison : tensor<32x16xi32> loc(#loc112) + tt.return %4, %5 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc112) + } loc(#loc50) + tt.func private @"triton.language.standard.sum__i64S32_16S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<32x16xi64> loc("input"(#loc113))) -> tensor<32xi64> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i64 loc(unknown), %arg2: i64 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i64_i64__(%arg1, %arg2) : (i64, i64) -> i64 loc(#loc114) + tt.reduce.return %2 : i64 loc(#loc114) + }) : (tensor<32x16xi64>) -> tensor<32xi64> loc(#loc114) + tt.return %0 : tensor<32xi64> loc(#loc115) + ^bb1: // no predecessors + %1 = ub.poison : tensor<32xi64> loc(#loc116) + tt.return %1 : tensor<32xi64> loc(#loc116) + } loc(#loc113) + tt.func private @triton.language.standard._sum_combine__i64_i64__(%a: i64 loc("a"(#loc117)), %b: i64 loc("b"(#loc117))) -> i64 attributes {noinline = false} { + %0 = arith.addi %a, %b : i64 loc(#loc118) + tt.return %0 : i64 loc(#loc119) + ^bb1: // no predecessors + %1 = ub.poison : i64 
loc(#loc120) + tt.return %1 : i64 loc(#loc120) + } loc(#loc117) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":19:13) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":20:15) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":24:28) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":24:33) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":25:36) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":25:44) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":25:23) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":26:21) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":27:28) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":27:38) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":28:16) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":29:48) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":33:19) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":34:19) +#loc15 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":36:38) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":36:35) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":36:49) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":36:45) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":36:30) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":36:54) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":38:19) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":40:33) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":41:67) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":42:19) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":44:34) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":45:26) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":45:29) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":46:20) +#loc29 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":47:21) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":48:21) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":49:35) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":49:32) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":49:25) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":49:47) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":50:25) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":50:37) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":50:4) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":662:12) +#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":668:11) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":668:4) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:41) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:44) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:60) +#loc46 = 
loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:68) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":634:73) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":636:11) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":636:4) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":533:22) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":536:30) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":536:33) +#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":537:21) +#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:40) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:51) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:65) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:78) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:41) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:53) +#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:67) +#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:80) +#loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":540:30) +#loc64 = 
loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":541:32) +#loc65 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":546:29) +#loc66 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:36) +#loc67 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:23) +#loc68 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:50) +#loc69 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:53) +#loc70 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:66) +#loc71 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:37) +#loc72 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:23) +#loc73 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:51) +#loc74 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:54) +#loc75 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:67) +#loc76 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":553:36) +#loc77 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":554:38) +#loc78 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":558:49) +#loc79 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":559:50) +#loc80 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":570:25) +#loc81 = 
loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":571:27) +#loc82 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":574:22) +#loc83 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":575:23) +#loc84 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":575:11) +#loc85 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":579:47) +#loc86 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":579:46) +#loc87 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":579:31) +#loc89 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":591:21) +#loc90 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":592:23) +#loc91 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":592:11) +#loc92 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":593:36) +#loc93 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":593:23) +#loc94 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:40) +#loc95 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:29) +#loc96 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:23) +#loc97 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":596:31) +#loc98 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":597:29) +#loc99 = 
loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":597:48) +#loc100 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":597:8) +#loc101 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:19) +#loc102 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:28) +#loc103 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:38) +#loc104 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:60) +#loc105 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:46) +#loc106 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:15) +#loc107 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:48) +#loc108 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:73) +#loc109 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:59) +#loc110 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:22) +#loc111 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":603:11) +#loc112 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":603:4) +#loc114 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc115 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11) +#loc116 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4) +#loc118 = 
loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc119 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11) +#loc120 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4) +#loc121 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":290:25) +#loc123 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:29) +#loc124 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:11) +#loc125 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:4) +#loc127 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:30) +#loc128 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:15) +#loc129 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:11) +#loc130 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:4) +#loc131 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":118:0) +#loc132 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:31) +#loc133 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:11) +#loc134 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:4) +#loc136 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":138:30) +#loc137 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":138:11) +#loc138 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":138:4) +#loc139 = 
loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":630:15) +#loc145 = loc("xnumel"(#loc1)) +#loc146 = loc("r0_numel"(#loc2)) +#loc147 = loc("xoffset"(#loc3)) +#loc148 = loc("xoffset"(#loc4)) +#loc149 = loc("xindex"(#loc5)) +#loc150 = loc("xindex"(#loc6)) +#loc151 = loc("xindex"(#loc7)) +#loc152 = loc("xmask"(#loc8)) +#loc153 = loc("r0_index"(#loc9)) +#loc154 = loc("r0_index"(#loc10)) +#loc155 = loc("r0_offset"(#loc11)) +#loc156 = loc("r0_mask"(#loc12)) +#loc157 = loc("x0"(#loc13)) +#loc158 = loc("x1"(#loc14)) +#loc159 = loc("tmp0"(#loc15)) +#loc160 = loc("tmp0"(#loc16)) +#loc161 = loc("tmp0"(#loc17)) +#loc162 = loc("tmp0"(#loc18)) +#loc163 = loc("tmp0"(#loc19)) +#loc164 = loc("tmp0"(#loc20)) +#loc165 = loc("tmp2"(#loc21)) +#loc166 = loc("tmp4"(#loc22)) +#loc167 = loc("tmp7"(#loc24)) +#loc168 = loc("tmp10"(#loc25)) +#loc169 = loc("tmp11"(#loc26)) +#loc170 = loc("tmp11"(#loc27)) +#loc171 = loc("tmp12"(#loc28)) +#loc172 = loc("tmp13"(#loc29)) +#loc173 = loc("tmp14"(#loc30)) +#loc178 = loc("flip"(#loc43)) +#loc179 = loc("flip"(#loc44)) +#loc180 = loc("flip"(#loc45)) +#loc181 = loc("flip"(#loc46)) +#loc185 = loc("y"(#loc51)) +#loc186 = loc("right_mask"(#loc52)) +#loc187 = loc("right_mask"(#loc53)) +#loc188 = loc("left_mask"(#loc54)) +#loc189 = loc("ileft"(#loc55)) +#loc190 = loc("ileft"(#loc56)) +#loc191 = loc("ileft"(#loc57)) +#loc192 = loc("ileft"(#loc58)) +#loc193 = loc("iright"(#loc59)) +#loc194 = loc("iright"(#loc60)) +#loc195 = loc("iright"(#loc61)) +#loc196 = loc("iright"(#loc62)) +#loc197 = loc("ileft"(#loc63)) +#loc198 = loc("iright"(#loc64)) +#loc199 = loc("y_idx"(#loc65)) +#loc200 = loc("left_idx"(#loc66)) +#loc201 = loc("left_idx"(#loc67)) +#loc202 = loc("left_idx"(#loc68)) +#loc203 = loc("left_idx"(#loc69)) +#loc204 = loc("left_idx"(#loc70)) +#loc205 = loc("right_idx"(#loc71)) +#loc206 = loc("right_idx"(#loc72)) +#loc207 = loc("right_idx"(#loc73)) +#loc208 = loc("right_idx"(#loc74)) +#loc209 = 
loc("right_idx"(#loc75)) +#loc210 = loc("left_idx"(#loc76)) +#loc211 = loc("right_idx"(#loc77)) +#loc212 = loc("left_valid_mask"(#loc78)) +#loc213 = loc("right_valid_mask"(#loc79)) +#loc214 = loc("left_isnan"(#loc80)) +#loc215 = loc("right_isnan"(#loc81)) +#loc216 = loc("cond"(#loc82)) +#loc217 = loc("cond"(#loc85)) +#loc218 = loc("cond"(#loc86)) +#loc219 = loc("cond"(#loc87)) +#loc220 = loc("eq"(#loc89)) +#loc221 = loc("eq"(#loc92)) +#loc222 = loc("eq"(#loc93)) +#loc223 = loc("cond"(#loc94)) +#loc224 = loc("cond"(#loc95)) +#loc225 = loc("cond"(#loc96)) +#loc226 = loc("cond"(#loc97)) +#loc227 = loc("cond"(#loc98)) +#loc228 = loc("cond"(#loc99)) +#loc229 = loc("cond"(#loc100)) +#loc230 = loc("cond"(#loc101)) +#loc231 = loc("cond"(#loc102)) +#loc232 = loc("ret"(#loc103)) +#loc233 = loc("ret"(#loc104)) +#loc234 = loc("ret"(#loc105)) +#loc235 = loc("ret"(#loc106)) +#loc236 = loc("new_idxs"(#loc107)) +#loc237 = loc("new_idxs"(#loc108)) +#loc238 = loc("new_idxs"(#loc109)) +#loc239 = loc("new_idxs"(#loc110)) +#loc243 = loc("input"(#loc121)) +#loc247 = loc("flip"(#loc139)) +#loc248 = loc("cond"(#loc216)) +#loc249 = loc("cond"(#loc219)) +#loc250 = loc("eq"(#loc220)) +#loc251 = loc("eq"(#loc222)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/A7DYCXJM4X5DHYLAIRTU6BFB3S5UCV3W4C27BWQBJGXYAG3NWQWA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/7/A7DYCXJM4X5DHYLAIRTU6BFB3S5UCV3W4C27BWQBJGXYAG3NWQWA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..5df293f3b60ae12e317a943a2ab9f82e5f600eb9 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/A7DYCXJM4X5DHYLAIRTU6BFB3S5UCV3W4C27BWQBJGXYAG3NWQWA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttgir @@ -0,0 +1,841 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [1, 4], order = [0, 1]}> +#blocked1 = 
#ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [8, 4], warpsPerCTA = [4, 1], order = [1, 0]}> +#linear = #ttg.linear<{register = [[0, 4], [0, 8]], lane = [[1, 0], [2, 0], [4, 0], [8, 0], [16, 0]], warp = [[0, 1], [0, 2]], block = []}> +#linear1 = #ttg.linear<{register = [[2, 0, 0], [4, 0, 0]], lane = [[8, 0, 0], [16, 0, 0], [32, 0, 0], [64, 0, 0], [128, 0, 0]], warp = [[0, 1, 0], [1, 0, 0]], block = []}> +#linear2 = #ttg.linear<{register = [[1, 0, 0], [2, 0, 0]], lane = [[4, 0, 0], [8, 0, 0], [16, 0, 0], [32, 0, 0], [64, 0, 0]], warp = [[0, 0, 1], [0, 1, 0]], block = []}> +#linear3 = #ttg.linear<{register = [[0, 1, 0], [1, 0, 0]], lane = [[2, 0, 0], [4, 0, 0], [8, 0, 0], [16, 0, 0], [32, 0, 0]], warp = [[0, 0, 1], [0, 0, 2]], block = []}> +#linear4 = #ttg.linear<{register = [[0, 0, 4], [0, 1, 0]], lane = [[1, 0, 0], [2, 0, 0], [4, 0, 0], [8, 0, 0], [16, 0, 0]], warp = [[0, 0, 1], [0, 0, 2]], block = []}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":18:0) +#loc1 = loc(unknown) +#loc19 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":662:12) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":41:67) +#loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":634:73) +#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:51) +#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:53) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:50) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:51) +#loc67 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":45:26) +#loc77 = loc("in_ptr0"(#loc)) +#loc78 = loc("out_ptr2"(#loc)) +#loc79 = loc("out_ptr3"(#loc)) +#loc80 = loc("xnumel"(#loc)) +#loc81 = loc("r0_numel"(#loc)) +#loc99 = loc(callsite(#loc19 at #loc20)) +#loc105 = loc("ileft"(#loc28)) +#loc109 = loc("iright"(#loc33)) +#loc118 = loc("left_idx"(#loc42)) +#loc123 = loc("right_idx"(#loc47)) +#loc143 = loc("tmp11"(#loc67)) +#loc149 = loc(callsite(#loc24 at #loc99)) +#loc153 = loc(callsite(#loc1 at #loc143)) +#loc157 = loc(callsite(#loc105 at #loc149)) +#loc161 = loc(callsite(#loc109 at #loc149)) +#loc169 = loc(callsite(#loc118 at #loc149)) +#loc174 = loc(callsite(#loc123 at #loc149)) +#loc194 = loc(callsite(#loc1 at #loc157)) +#loc196 = loc(callsite(#loc1 at #loc161)) +#loc199 = loc(callsite(#loc1 at #loc169)) +#loc202 = loc(callsite(#loc1 at #loc174)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %out_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr3"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0> : tensor<32x16xi32, #linear> loc(#loc1) + %cst_0 = arith.constant dense<0> : tensor<32x16xi64, #blocked> loc(#loc1) + %c32_i32 = arith.constant 32 : i32 loc(#loc1) + %cst_1 = arith.constant dense<32> : tensor<32x1xi32, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<32> : tensor<32x1xi32, #blocked1> loc(#loc1) + %cst_3 = arith.constant dense<16> : tensor<32x1xi32, #blocked> loc(#loc1) + %cst_4 = arith.constant dense<16> : tensor<32x1xi32, 
#blocked1> loc(#loc1) + %cst_5 = arith.constant dense<17> : tensor<1x16xi32, #blocked> loc(#loc1) + %cst_6 = arith.constant dense<272> : tensor<32x1xi32, #blocked> loc(#loc1) + %cst_7 = arith.constant dense<1> : tensor<1x2x1xi32, #linear1> loc(#loc1) + %cst_8 = arith.constant dense<1> : tensor<1x2x1xi32, #linear2> loc(#loc1) + %cst_9 = arith.constant dense<1> : tensor<1x2x1xi32, #linear3> loc(#loc1) + %cst_10 = arith.constant dense<1> : tensor<1x2x1xi32, #linear4> loc(#loc1) + %cst_11 = arith.constant dense<0> : tensor<32x16xi32, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc82) + %xoffset_12 = arith.muli %xoffset, %c32_i32 : i32 loc(#loc83) + %xindex = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc84) + %xindex_13 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc84) + %xindex_14 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<32xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<32x1xi32, #blocked> loc(#loc84) + %xindex_15 = tt.expand_dims %xindex_13 {axis = 1 : i32} : tensor<32xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<32x1xi32, #blocked1> loc(#loc84) + %xindex_16 = tt.splat %xoffset_12 : i32 -> tensor<32x1xi32, #blocked> loc(#loc85) + %xindex_17 = tt.splat %xoffset_12 : i32 -> tensor<32x1xi32, #blocked1> loc(#loc85) + %xindex_18 = arith.addi %xindex_16, %xindex_14 : tensor<32x1xi32, #blocked> loc(#loc85) + %xindex_19 = arith.addi %xindex_17, %xindex_15 : tensor<32x1xi32, #blocked1> loc(#loc85) + %xmask = arith.cmpi slt, %xindex_18, %cst_1 : tensor<32x1xi32, #blocked> loc(#loc86) + %xmask_20 = arith.cmpi slt, %xindex_19, %cst_2 : tensor<32x1xi32, #blocked1> loc(#loc86) + %r0_index = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc87) + %r0_index_21 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, 
#ttg.slice<{dim = 0, parent = #linear}>> loc(#loc87) + %r0_index_22 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc87) + %r0_index_23 = tt.expand_dims %r0_index {axis = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x16xi32, #blocked> loc(#loc87) + %r0_index_24 = tt.expand_dims %r0_index_21 {axis = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #linear}>> -> tensor<1x16xi32, #linear> loc(#loc87) + %r0_index_25 = tt.expand_dims %r0_index_22 {axis = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x16xi32, #blocked1> loc(#loc87) + %x0 = arith.remsi %xindex_18, %cst_3 : tensor<32x1xi32, #blocked> loc(#loc88) + %x1 = arith.divsi %xindex_18, %cst_3 : tensor<32x1xi32, #blocked> loc(#loc89) + %tmp0 = arith.muli %r0_index_23, %cst_5 : tensor<1x16xi32, #blocked> loc(#loc90) + %tmp0_26 = tt.broadcast %x0 : tensor<32x1xi32, #blocked> -> tensor<32x16xi32, #blocked> loc(#loc91) + %tmp0_27 = tt.broadcast %tmp0 : tensor<1x16xi32, #blocked> -> tensor<32x16xi32, #blocked> loc(#loc91) + %tmp0_28 = arith.addi %tmp0_26, %tmp0_27 : tensor<32x16xi32, #blocked> loc(#loc91) + %tmp0_29 = arith.muli %x1, %cst_6 : tensor<32x1xi32, #blocked> loc(#loc92) + %tmp0_30 = tt.broadcast %tmp0_29 : tensor<32x1xi32, #blocked> -> tensor<32x16xi32, #blocked> loc(#loc93) + %tmp0_31 = arith.addi %tmp0_28, %tmp0_30 : tensor<32x16xi32, #blocked> loc(#loc93) + %tmp0_32 = tt.splat %in_ptr0 : !tt.ptr -> tensor<32x16x!tt.ptr, #blocked> loc(#loc94) + %tmp0_33 = tt.addptr %tmp0_32, %tmp0_31 : tensor<32x16x!tt.ptr, #blocked>, tensor<32x16xi32, #blocked> loc(#loc94) + %tmp0_34 = tt.broadcast %xmask : tensor<32x1xi1, #blocked> -> tensor<32x16xi1, #blocked> loc(#loc95) + %tmp0_35 = tt.broadcast %xmask_20 : tensor<32x1xi1, #blocked1> -> tensor<32x16xi1, #blocked1> loc(#loc95) + %tmp0_36 = tt.load %tmp0_33, %tmp0_34, %cst_11 : tensor<32x16x!tt.ptr, #blocked> loc(#loc95) + %tmp2 = 
arith.trunci %r0_index_24 : tensor<1x16xi32, #linear> to tensor<1x16xi16, #linear> loc(#loc96) + %tmp4 = tt.broadcast %tmp2 : tensor<1x16xi16, #linear> -> tensor<32x16xi16, #linear> loc(#loc97) + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #linear2}>}>> loc(#loc146) + %flip_37 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #linear1}>}>> loc(#loc146) + %flip_38 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #linear3}>}>> loc(#loc146) + %flip_39 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #linear4}>}>> loc(#loc146) + %flip_40 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #linear2}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #linear2}>> loc(#loc146) + %flip_41 = tt.expand_dims %flip_37 {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #linear1}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #linear1}>> loc(#loc146) + %flip_42 = tt.expand_dims %flip_38 {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #linear3}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #linear3}>> loc(#loc146) + %flip_43 = tt.expand_dims %flip_39 {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #linear4}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #linear4}>> loc(#loc146) + %flip_44 = tt.expand_dims %flip_40 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #linear2}>> -> tensor<1x2x1xi32, #linear2> loc(#loc146) + %flip_45 = tt.expand_dims %flip_41 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #linear1}>> -> 
tensor<1x2x1xi32, #linear1> loc(#loc146) + %flip_46 = tt.expand_dims %flip_42 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #linear3}>> -> tensor<1x2x1xi32, #linear3> loc(#loc146) + %flip_47 = tt.expand_dims %flip_43 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #linear4}>> -> tensor<1x2x1xi32, #linear4> loc(#loc146) + %flip_48 = tt.broadcast %flip_44 : tensor<1x2x1xi32, #linear2> -> tensor<128x2x2xi32, #linear2> loc(#loc147) + %flip_49 = tt.reshape %flip_48 : tensor<128x2x2xi32, #linear2> -> tensor<32x16xi32, #blocked> loc(#loc148) + %flip_50 = tt.reshape %flip_48 : tensor<128x2x2xi32, #linear2> -> tensor<32x16xi32, #linear> loc(#loc148) + %y = tt.reshape %tmp0_36 : tensor<32x16xi32, #blocked> -> tensor<256x2x1xi32, #linear1> loc(#loc154) + %left_mask = arith.subi %cst_7, %flip_45 : tensor<1x2x1xi32, #linear1> loc(#loc155) + %left_mask_51 = arith.subi %cst_8, %flip_44 : tensor<1x2x1xi32, #linear2> loc(#loc155) + %left_mask_52 = arith.subi %cst_9, %flip_46 : tensor<1x2x1xi32, #linear3> loc(#loc155) + %left_mask_53 = arith.subi %cst_10, %flip_47 : tensor<1x2x1xi32, #linear4> loc(#loc155) + %ileft = tt.broadcast %left_mask : tensor<1x2x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc156) + %ileft_54 = arith.muli %y, %ileft : tensor<256x2x1xi32, #linear1> loc(#loc156) + %ileft_55 = "tt.reduce"(%ileft_54) <{axis = 1 : i32}> ({ + ^bb0(%ileft_419: i32 loc(callsite(#loc1 at #loc157)), %ileft_420: i32 loc(callsite(#loc1 at #loc157))): + %ileft_421 = arith.addi %ileft_419, %ileft_420 : i32 loc(#loc203) + tt.reduce.return %ileft_421 : i32 loc(#loc193) + }) : (tensor<256x2x1xi32, #linear1>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc193) + %ileft_56 = tt.expand_dims %ileft_55 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<256x1x1xi32, #linear1> loc(#loc158) + %ileft_57 = tt.broadcast %ileft_56 : tensor<256x1x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> 
loc(#loc159) + %iright = tt.broadcast %flip_45 : tensor<1x2x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc160) + %iright_58 = arith.muli %y, %iright : tensor<256x2x1xi32, #linear1> loc(#loc160) + %iright_59 = "tt.reduce"(%iright_58) <{axis = 1 : i32}> ({ + ^bb0(%iright_419: i32 loc(callsite(#loc1 at #loc161)), %iright_420: i32 loc(callsite(#loc1 at #loc161))): + %iright_421 = arith.addi %iright_419, %iright_420 : i32 loc(#loc204) + tt.reduce.return %iright_421 : i32 loc(#loc195) + }) : (tensor<256x2x1xi32, #linear1>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc195) + %iright_60 = tt.expand_dims %iright_59 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<256x1x1xi32, #linear1> loc(#loc162) + %iright_61 = tt.broadcast %iright_60 : tensor<256x1x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc163) + %ileft_62 = tt.reshape %ileft_57 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #blocked> loc(#loc164) + %ileft_63 = tt.reshape %ileft_57 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #linear> loc(#loc164) + %iright_64 = tt.reshape %iright_61 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #blocked> loc(#loc165) + %iright_65 = tt.reshape %iright_61 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #linear> loc(#loc165) + %y_idx = tt.reshape %tmp4 : tensor<32x16xi16, #linear> -> tensor<256x2x1xi16, #linear1> loc(#loc166) + %left_idx = arith.trunci %left_mask : tensor<1x2x1xi32, #linear1> to tensor<1x2x1xi16, #linear1> loc(#loc167) + %left_idx_66 = tt.broadcast %left_idx : tensor<1x2x1xi16, #linear1> -> tensor<256x2x1xi16, #linear1> loc(#loc168) + %left_idx_67 = arith.muli %y_idx, %left_idx_66 : tensor<256x2x1xi16, #linear1> loc(#loc168) + %input = arith.extsi %left_idx_67 : tensor<256x2x1xi16, #linear1> to tensor<256x2x1xi32, #linear1> loc(#loc197) + %left_idx_68 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_419: i32 loc(callsite(#loc1 at #loc169)), 
%left_idx_420: i32 loc(callsite(#loc1 at #loc169))): + %left_idx_421 = arith.addi %left_idx_419, %left_idx_420 : i32 loc(#loc205) + tt.reduce.return %left_idx_421 : i32 loc(#loc198) + }) : (tensor<256x2x1xi32, #linear1>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc198) + %left_idx_69 = tt.expand_dims %left_idx_68 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<256x1x1xi32, #linear1> loc(#loc170) + %left_idx_70 = tt.broadcast %left_idx_69 : tensor<256x1x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc171) + %right_idx = arith.trunci %flip_45 : tensor<1x2x1xi32, #linear1> to tensor<1x2x1xi16, #linear1> loc(#loc172) + %right_idx_71 = tt.broadcast %right_idx : tensor<1x2x1xi16, #linear1> -> tensor<256x2x1xi16, #linear1> loc(#loc173) + %right_idx_72 = arith.muli %y_idx, %right_idx_71 : tensor<256x2x1xi16, #linear1> loc(#loc173) + %input_73 = arith.extsi %right_idx_72 : tensor<256x2x1xi16, #linear1> to tensor<256x2x1xi32, #linear1> loc(#loc200) + %right_idx_74 = "tt.reduce"(%input_73) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_419: i32 loc(callsite(#loc1 at #loc174)), %right_idx_420: i32 loc(callsite(#loc1 at #loc174))): + %right_idx_421 = arith.addi %right_idx_419, %right_idx_420 : i32 loc(#loc206) + tt.reduce.return %right_idx_421 : i32 loc(#loc201) + }) : (tensor<256x2x1xi32, #linear1>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc201) + %right_idx_75 = tt.expand_dims %right_idx_74 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<256x1x1xi32, #linear1> loc(#loc175) + %right_idx_76 = tt.broadcast %right_idx_75 : tensor<256x1x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc176) + %left_idx_77 = tt.reshape %left_idx_70 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #blocked> loc(#loc177) + %left_idx_78 = tt.reshape %left_idx_70 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #linear> loc(#loc177) + %right_idx_79 
= tt.reshape %right_idx_76 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #blocked> loc(#loc178) + %right_idx_80 = tt.reshape %right_idx_76 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #linear> loc(#loc178) + %cond = arith.cmpi slt, %ileft_62, %iright_64 : tensor<32x16xi32, #blocked> loc(#loc179) + %cond_81 = arith.cmpi slt, %ileft_63, %iright_65 : tensor<32x16xi32, #linear> loc(#loc179) + %eq = arith.cmpi eq, %ileft_62, %iright_64 : tensor<32x16xi32, #blocked> loc(#loc180) + %eq_82 = arith.cmpi eq, %ileft_63, %iright_65 : tensor<32x16xi32, #linear> loc(#loc180) + %cond_83 = arith.cmpi sgt, %left_idx_77, %right_idx_79 : tensor<32x16xi32, #blocked> loc(#loc181) + %cond_84 = arith.cmpi sgt, %left_idx_78, %right_idx_80 : tensor<32x16xi32, #linear> loc(#loc181) + %cond_85 = arith.andi %eq, %cond_83 : tensor<32x16xi1, #blocked> loc(#loc182) + %cond_86 = arith.andi %eq_82, %cond_84 : tensor<32x16xi1, #linear> loc(#loc182) + %cond_87 = arith.ori %cond, %cond_85 : tensor<32x16xi1, #blocked> loc(#loc183) + %cond_88 = arith.ori %cond_81, %cond_86 : tensor<32x16xi1, #linear> loc(#loc183) + %cond_89 = arith.extui %cond_87 : tensor<32x16xi1, #blocked> to tensor<32x16xi32, #blocked> loc(#loc184) + %cond_90 = arith.extui %cond_88 : tensor<32x16xi1, #linear> to tensor<32x16xi32, #linear> loc(#loc184) + %cond_91 = arith.xori %cond_89, %flip_49 : tensor<32x16xi32, #blocked> loc(#loc184) + %cond_92 = arith.xori %cond_90, %flip_50 : tensor<32x16xi32, #linear> loc(#loc184) + %cond_93 = arith.cmpi ne, %cond_91, %cst_11 : tensor<32x16xi32, #blocked> loc(#loc185) + %cond_94 = arith.cmpi ne, %cond_92, %cst : tensor<32x16xi32, #linear> loc(#loc185) + %ret = arith.xori %ileft_62, %iright_64 : tensor<32x16xi32, #blocked> loc(#loc186) + %ret_95 = arith.select %cond_93, %ret, %cst_11 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc187) + %ret_96 = arith.xori %tmp0_36, %ret_95 : tensor<32x16xi32, #blocked> loc(#loc188) + %ret_97 = ttg.convert_layout %ret_96 : 
tensor<32x16xi32, #blocked> -> tensor<32x16xi32, #linear> loc(#loc188) + %new_idxs = arith.xori %left_idx_78, %right_idx_80 : tensor<32x16xi32, #linear> loc(#loc189) + %new_idxs_98 = arith.select %cond_94, %new_idxs, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc190) + %new_idxs_99 = arith.extsi %tmp2 : tensor<1x16xi16, #linear> to tensor<1x16xi32, #linear> loc(#loc191) + %new_idxs_100 = tt.broadcast %new_idxs_99 : tensor<1x16xi32, #linear> -> tensor<32x16xi32, #linear> loc(#loc191) + %new_idxs_101 = arith.xori %new_idxs_100, %new_idxs_98 : tensor<32x16xi32, #linear> loc(#loc191) + %flip_102 = tt.broadcast %flip_46 : tensor<1x2x1xi32, #linear3> -> tensor<64x2x4xi32, #linear3> loc(#loc147) + %flip_103 = tt.reshape %flip_102 : tensor<64x2x4xi32, #linear3> -> tensor<32x16xi32, #linear> loc(#loc148) + %y_104 = tt.reshape %ret_96 : tensor<32x16xi32, #blocked> -> tensor<128x2x2xi32, #linear2> loc(#loc154) + %ileft_105 = tt.broadcast %left_mask_51 : tensor<1x2x1xi32, #linear2> -> tensor<128x2x2xi32, #linear2> loc(#loc156) + %ileft_106 = arith.muli %y_104, %ileft_105 : tensor<128x2x2xi32, #linear2> loc(#loc156) + %ileft_107 = "tt.reduce"(%ileft_106) <{axis = 1 : i32}> ({ + ^bb0(%ileft_419: i32 loc(callsite(#loc1 at #loc157)), %ileft_420: i32 loc(callsite(#loc1 at #loc157))): + %ileft_421 = arith.addi %ileft_419, %ileft_420 : i32 loc(#loc203) + tt.reduce.return %ileft_421 : i32 loc(#loc193) + }) : (tensor<128x2x2xi32, #linear2>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc193) + %ileft_108 = tt.expand_dims %ileft_107 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<128x1x2xi32, #linear2> loc(#loc158) + %ileft_109 = tt.broadcast %ileft_108 : tensor<128x1x2xi32, #linear2> -> tensor<128x2x2xi32, #linear2> loc(#loc159) + %iright_110 = arith.muli %y_104, %flip_48 : tensor<128x2x2xi32, #linear2> loc(#loc160) + %iright_111 = "tt.reduce"(%iright_110) <{axis = 1 : i32}> ({ + ^bb0(%iright_419: 
i32 loc(callsite(#loc1 at #loc161)), %iright_420: i32 loc(callsite(#loc1 at #loc161))): + %iright_421 = arith.addi %iright_419, %iright_420 : i32 loc(#loc204) + tt.reduce.return %iright_421 : i32 loc(#loc195) + }) : (tensor<128x2x2xi32, #linear2>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc195) + %iright_112 = tt.expand_dims %iright_111 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<128x1x2xi32, #linear2> loc(#loc162) + %iright_113 = tt.broadcast %iright_112 : tensor<128x1x2xi32, #linear2> -> tensor<128x2x2xi32, #linear2> loc(#loc163) + %ileft_114 = tt.reshape %ileft_109 : tensor<128x2x2xi32, #linear2> -> tensor<32x16xi32, #linear> loc(#loc164) + %iright_115 = tt.reshape %iright_113 : tensor<128x2x2xi32, #linear2> -> tensor<32x16xi32, #linear> loc(#loc165) + %y_idx_116 = tt.reshape %new_idxs_101 : tensor<32x16xi32, #linear> -> tensor<128x2x2xi32, #linear2> loc(#loc166) + %left_idx_117 = arith.muli %y_idx_116, %ileft_105 : tensor<128x2x2xi32, #linear2> loc(#loc168) + %left_idx_118 = "tt.reduce"(%left_idx_117) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_419: i32 loc(callsite(#loc1 at #loc169)), %left_idx_420: i32 loc(callsite(#loc1 at #loc169))): + %left_idx_421 = arith.addi %left_idx_419, %left_idx_420 : i32 loc(#loc205) + tt.reduce.return %left_idx_421 : i32 loc(#loc198) + }) : (tensor<128x2x2xi32, #linear2>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc198) + %left_idx_119 = tt.expand_dims %left_idx_118 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<128x1x2xi32, #linear2> loc(#loc170) + %left_idx_120 = tt.broadcast %left_idx_119 : tensor<128x1x2xi32, #linear2> -> tensor<128x2x2xi32, #linear2> loc(#loc171) + %right_idx_121 = arith.muli %y_idx_116, %flip_48 : tensor<128x2x2xi32, #linear2> loc(#loc173) + %right_idx_122 = "tt.reduce"(%right_idx_121) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_419: i32 loc(callsite(#loc1 at #loc174)), 
%right_idx_420: i32 loc(callsite(#loc1 at #loc174))): + %right_idx_421 = arith.addi %right_idx_419, %right_idx_420 : i32 loc(#loc206) + tt.reduce.return %right_idx_421 : i32 loc(#loc201) + }) : (tensor<128x2x2xi32, #linear2>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc201) + %right_idx_123 = tt.expand_dims %right_idx_122 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<128x1x2xi32, #linear2> loc(#loc175) + %right_idx_124 = tt.broadcast %right_idx_123 : tensor<128x1x2xi32, #linear2> -> tensor<128x2x2xi32, #linear2> loc(#loc176) + %left_idx_125 = tt.reshape %left_idx_120 : tensor<128x2x2xi32, #linear2> -> tensor<32x16xi32, #linear> loc(#loc177) + %right_idx_126 = tt.reshape %right_idx_124 : tensor<128x2x2xi32, #linear2> -> tensor<32x16xi32, #linear> loc(#loc178) + %cond_127 = arith.cmpi slt, %ileft_114, %iright_115 : tensor<32x16xi32, #linear> loc(#loc179) + %eq_128 = arith.cmpi eq, %ileft_114, %iright_115 : tensor<32x16xi32, #linear> loc(#loc180) + %cond_129 = arith.cmpi sgt, %left_idx_125, %right_idx_126 : tensor<32x16xi32, #linear> loc(#loc181) + %cond_130 = arith.andi %eq_128, %cond_129 : tensor<32x16xi1, #linear> loc(#loc182) + %cond_131 = arith.ori %cond_127, %cond_130 : tensor<32x16xi1, #linear> loc(#loc183) + %cond_132 = arith.extui %cond_131 : tensor<32x16xi1, #linear> to tensor<32x16xi32, #linear> loc(#loc184) + %cond_133 = arith.xori %cond_132, %flip_103 : tensor<32x16xi32, #linear> loc(#loc184) + %cond_134 = arith.cmpi ne, %cond_133, %cst : tensor<32x16xi32, #linear> loc(#loc185) + %ret_135 = arith.xori %ileft_114, %iright_115 : tensor<32x16xi32, #linear> loc(#loc186) + %ret_136 = arith.select %cond_134, %ret_135, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc187) + %ret_137 = arith.xori %ret_97, %ret_136 : tensor<32x16xi32, #linear> loc(#loc188) + %new_idxs_138 = arith.xori %left_idx_125, %right_idx_126 : tensor<32x16xi32, #linear> loc(#loc189) + %new_idxs_139 = 
arith.select %cond_134, %new_idxs_138, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc190) + %new_idxs_140 = arith.xori %new_idxs_101, %new_idxs_139 : tensor<32x16xi32, #linear> loc(#loc191) + %y_141 = tt.reshape %ret_137 : tensor<32x16xi32, #linear> -> tensor<256x2x1xi32, #linear1> loc(#loc154) + %ileft_142 = arith.muli %y_141, %ileft : tensor<256x2x1xi32, #linear1> loc(#loc156) + %ileft_143 = "tt.reduce"(%ileft_142) <{axis = 1 : i32}> ({ + ^bb0(%ileft_419: i32 loc(callsite(#loc1 at #loc157)), %ileft_420: i32 loc(callsite(#loc1 at #loc157))): + %ileft_421 = arith.addi %ileft_419, %ileft_420 : i32 loc(#loc203) + tt.reduce.return %ileft_421 : i32 loc(#loc193) + }) : (tensor<256x2x1xi32, #linear1>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc193) + %ileft_144 = tt.expand_dims %ileft_143 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<256x1x1xi32, #linear1> loc(#loc158) + %ileft_145 = tt.broadcast %ileft_144 : tensor<256x1x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc159) + %iright_146 = arith.muli %y_141, %iright : tensor<256x2x1xi32, #linear1> loc(#loc160) + %iright_147 = "tt.reduce"(%iright_146) <{axis = 1 : i32}> ({ + ^bb0(%iright_419: i32 loc(callsite(#loc1 at #loc161)), %iright_420: i32 loc(callsite(#loc1 at #loc161))): + %iright_421 = arith.addi %iright_419, %iright_420 : i32 loc(#loc204) + tt.reduce.return %iright_421 : i32 loc(#loc195) + }) : (tensor<256x2x1xi32, #linear1>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc195) + %iright_148 = tt.expand_dims %iright_147 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<256x1x1xi32, #linear1> loc(#loc162) + %iright_149 = tt.broadcast %iright_148 : tensor<256x1x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc163) + %ileft_150 = tt.reshape %ileft_145 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #linear> loc(#loc164) + %iright_151 = 
tt.reshape %iright_149 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #linear> loc(#loc165) + %y_idx_152 = tt.reshape %new_idxs_140 : tensor<32x16xi32, #linear> -> tensor<256x2x1xi32, #linear1> loc(#loc166) + %left_idx_153 = arith.muli %y_idx_152, %ileft : tensor<256x2x1xi32, #linear1> loc(#loc168) + %left_idx_154 = "tt.reduce"(%left_idx_153) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_419: i32 loc(callsite(#loc1 at #loc169)), %left_idx_420: i32 loc(callsite(#loc1 at #loc169))): + %left_idx_421 = arith.addi %left_idx_419, %left_idx_420 : i32 loc(#loc205) + tt.reduce.return %left_idx_421 : i32 loc(#loc198) + }) : (tensor<256x2x1xi32, #linear1>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc198) + %left_idx_155 = tt.expand_dims %left_idx_154 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<256x1x1xi32, #linear1> loc(#loc170) + %left_idx_156 = tt.broadcast %left_idx_155 : tensor<256x1x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc171) + %right_idx_157 = arith.muli %y_idx_152, %iright : tensor<256x2x1xi32, #linear1> loc(#loc173) + %right_idx_158 = "tt.reduce"(%right_idx_157) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_419: i32 loc(callsite(#loc1 at #loc174)), %right_idx_420: i32 loc(callsite(#loc1 at #loc174))): + %right_idx_421 = arith.addi %right_idx_419, %right_idx_420 : i32 loc(#loc206) + tt.reduce.return %right_idx_421 : i32 loc(#loc201) + }) : (tensor<256x2x1xi32, #linear1>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc201) + %right_idx_159 = tt.expand_dims %right_idx_158 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<256x1x1xi32, #linear1> loc(#loc175) + %right_idx_160 = tt.broadcast %right_idx_159 : tensor<256x1x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc176) + %left_idx_161 = tt.reshape %left_idx_156 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #linear> loc(#loc177) + %right_idx_162 = tt.reshape 
%right_idx_160 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #linear> loc(#loc178) + %cond_163 = arith.cmpi slt, %ileft_150, %iright_151 : tensor<32x16xi32, #linear> loc(#loc179) + %eq_164 = arith.cmpi eq, %ileft_150, %iright_151 : tensor<32x16xi32, #linear> loc(#loc180) + %cond_165 = arith.cmpi sgt, %left_idx_161, %right_idx_162 : tensor<32x16xi32, #linear> loc(#loc181) + %cond_166 = arith.andi %eq_164, %cond_165 : tensor<32x16xi1, #linear> loc(#loc182) + %cond_167 = arith.ori %cond_163, %cond_166 : tensor<32x16xi1, #linear> loc(#loc183) + %cond_168 = arith.extui %cond_167 : tensor<32x16xi1, #linear> to tensor<32x16xi32, #linear> loc(#loc184) + %cond_169 = arith.xori %cond_168, %flip_103 : tensor<32x16xi32, #linear> loc(#loc184) + %cond_170 = arith.cmpi ne, %cond_169, %cst : tensor<32x16xi32, #linear> loc(#loc185) + %ret_171 = arith.xori %ileft_150, %iright_151 : tensor<32x16xi32, #linear> loc(#loc186) + %ret_172 = arith.select %cond_170, %ret_171, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc187) + %ret_173 = arith.xori %ret_137, %ret_172 : tensor<32x16xi32, #linear> loc(#loc188) + %new_idxs_174 = arith.xori %left_idx_161, %right_idx_162 : tensor<32x16xi32, #linear> loc(#loc189) + %new_idxs_175 = arith.select %cond_170, %new_idxs_174, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc190) + %new_idxs_176 = arith.xori %new_idxs_140, %new_idxs_175 : tensor<32x16xi32, #linear> loc(#loc191) + %flip_177 = tt.broadcast %flip_47 : tensor<1x2x1xi32, #linear4> -> tensor<32x2x8xi32, #linear4> loc(#loc147) + %flip_178 = tt.reshape %flip_177 : tensor<32x2x8xi32, #linear4> -> tensor<32x16xi32, #linear> loc(#loc148) + %y_179 = tt.reshape %ret_173 : tensor<32x16xi32, #linear> -> tensor<64x2x4xi32, #linear3> loc(#loc154) + %ileft_180 = tt.broadcast %left_mask_52 : tensor<1x2x1xi32, #linear3> -> tensor<64x2x4xi32, #linear3> loc(#loc156) + %ileft_181 = arith.muli %y_179, %ileft_180 : tensor<64x2x4xi32, #linear3> loc(#loc156) + 
%ileft_182 = "tt.reduce"(%ileft_181) <{axis = 1 : i32}> ({ + ^bb0(%ileft_419: i32 loc(callsite(#loc1 at #loc157)), %ileft_420: i32 loc(callsite(#loc1 at #loc157))): + %ileft_421 = arith.addi %ileft_419, %ileft_420 : i32 loc(#loc203) + tt.reduce.return %ileft_421 : i32 loc(#loc193) + }) : (tensor<64x2x4xi32, #linear3>) -> tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> loc(#loc193) + %ileft_183 = tt.expand_dims %ileft_182 {axis = 1 : i32} : tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> -> tensor<64x1x4xi32, #linear3> loc(#loc158) + %ileft_184 = tt.broadcast %ileft_183 : tensor<64x1x4xi32, #linear3> -> tensor<64x2x4xi32, #linear3> loc(#loc159) + %iright_185 = arith.muli %y_179, %flip_102 : tensor<64x2x4xi32, #linear3> loc(#loc160) + %iright_186 = "tt.reduce"(%iright_185) <{axis = 1 : i32}> ({ + ^bb0(%iright_419: i32 loc(callsite(#loc1 at #loc161)), %iright_420: i32 loc(callsite(#loc1 at #loc161))): + %iright_421 = arith.addi %iright_419, %iright_420 : i32 loc(#loc204) + tt.reduce.return %iright_421 : i32 loc(#loc195) + }) : (tensor<64x2x4xi32, #linear3>) -> tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> loc(#loc195) + %iright_187 = tt.expand_dims %iright_186 {axis = 1 : i32} : tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> -> tensor<64x1x4xi32, #linear3> loc(#loc162) + %iright_188 = tt.broadcast %iright_187 : tensor<64x1x4xi32, #linear3> -> tensor<64x2x4xi32, #linear3> loc(#loc163) + %ileft_189 = tt.reshape %ileft_184 : tensor<64x2x4xi32, #linear3> -> tensor<32x16xi32, #linear> loc(#loc164) + %iright_190 = tt.reshape %iright_188 : tensor<64x2x4xi32, #linear3> -> tensor<32x16xi32, #linear> loc(#loc165) + %y_idx_191 = tt.reshape %new_idxs_176 : tensor<32x16xi32, #linear> -> tensor<64x2x4xi32, #linear3> loc(#loc166) + %left_idx_192 = arith.muli %y_idx_191, %ileft_180 : tensor<64x2x4xi32, #linear3> loc(#loc168) + %left_idx_193 = "tt.reduce"(%left_idx_192) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_419: i32 loc(callsite(#loc1 at 
#loc169)), %left_idx_420: i32 loc(callsite(#loc1 at #loc169))): + %left_idx_421 = arith.addi %left_idx_419, %left_idx_420 : i32 loc(#loc205) + tt.reduce.return %left_idx_421 : i32 loc(#loc198) + }) : (tensor<64x2x4xi32, #linear3>) -> tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> loc(#loc198) + %left_idx_194 = tt.expand_dims %left_idx_193 {axis = 1 : i32} : tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> -> tensor<64x1x4xi32, #linear3> loc(#loc170) + %left_idx_195 = tt.broadcast %left_idx_194 : tensor<64x1x4xi32, #linear3> -> tensor<64x2x4xi32, #linear3> loc(#loc171) + %right_idx_196 = arith.muli %y_idx_191, %flip_102 : tensor<64x2x4xi32, #linear3> loc(#loc173) + %right_idx_197 = "tt.reduce"(%right_idx_196) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_419: i32 loc(callsite(#loc1 at #loc174)), %right_idx_420: i32 loc(callsite(#loc1 at #loc174))): + %right_idx_421 = arith.addi %right_idx_419, %right_idx_420 : i32 loc(#loc206) + tt.reduce.return %right_idx_421 : i32 loc(#loc201) + }) : (tensor<64x2x4xi32, #linear3>) -> tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> loc(#loc201) + %right_idx_198 = tt.expand_dims %right_idx_197 {axis = 1 : i32} : tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> -> tensor<64x1x4xi32, #linear3> loc(#loc175) + %right_idx_199 = tt.broadcast %right_idx_198 : tensor<64x1x4xi32, #linear3> -> tensor<64x2x4xi32, #linear3> loc(#loc176) + %left_idx_200 = tt.reshape %left_idx_195 : tensor<64x2x4xi32, #linear3> -> tensor<32x16xi32, #linear> loc(#loc177) + %right_idx_201 = tt.reshape %right_idx_199 : tensor<64x2x4xi32, #linear3> -> tensor<32x16xi32, #linear> loc(#loc178) + %cond_202 = arith.cmpi slt, %ileft_189, %iright_190 : tensor<32x16xi32, #linear> loc(#loc179) + %eq_203 = arith.cmpi eq, %ileft_189, %iright_190 : tensor<32x16xi32, #linear> loc(#loc180) + %cond_204 = arith.cmpi sgt, %left_idx_200, %right_idx_201 : tensor<32x16xi32, #linear> loc(#loc181) + %cond_205 = arith.andi %eq_203, %cond_204 : 
tensor<32x16xi1, #linear> loc(#loc182) + %cond_206 = arith.ori %cond_202, %cond_205 : tensor<32x16xi1, #linear> loc(#loc183) + %cond_207 = arith.extui %cond_206 : tensor<32x16xi1, #linear> to tensor<32x16xi32, #linear> loc(#loc184) + %cond_208 = arith.xori %cond_207, %flip_178 : tensor<32x16xi32, #linear> loc(#loc184) + %cond_209 = arith.cmpi ne, %cond_208, %cst : tensor<32x16xi32, #linear> loc(#loc185) + %ret_210 = arith.xori %ileft_189, %iright_190 : tensor<32x16xi32, #linear> loc(#loc186) + %ret_211 = arith.select %cond_209, %ret_210, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc187) + %ret_212 = arith.xori %ret_173, %ret_211 : tensor<32x16xi32, #linear> loc(#loc188) + %new_idxs_213 = arith.xori %left_idx_200, %right_idx_201 : tensor<32x16xi32, #linear> loc(#loc189) + %new_idxs_214 = arith.select %cond_209, %new_idxs_213, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc190) + %new_idxs_215 = arith.xori %new_idxs_176, %new_idxs_214 : tensor<32x16xi32, #linear> loc(#loc191) + %y_216 = tt.reshape %ret_212 : tensor<32x16xi32, #linear> -> tensor<128x2x2xi32, #linear2> loc(#loc154) + %ileft_217 = arith.muli %y_216, %ileft_105 : tensor<128x2x2xi32, #linear2> loc(#loc156) + %ileft_218 = "tt.reduce"(%ileft_217) <{axis = 1 : i32}> ({ + ^bb0(%ileft_419: i32 loc(callsite(#loc1 at #loc157)), %ileft_420: i32 loc(callsite(#loc1 at #loc157))): + %ileft_421 = arith.addi %ileft_419, %ileft_420 : i32 loc(#loc203) + tt.reduce.return %ileft_421 : i32 loc(#loc193) + }) : (tensor<128x2x2xi32, #linear2>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc193) + %ileft_219 = tt.expand_dims %ileft_218 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<128x1x2xi32, #linear2> loc(#loc158) + %ileft_220 = tt.broadcast %ileft_219 : tensor<128x1x2xi32, #linear2> -> tensor<128x2x2xi32, #linear2> loc(#loc159) + %iright_221 = arith.muli %y_216, %flip_48 : tensor<128x2x2xi32, #linear2> 
loc(#loc160) + %iright_222 = "tt.reduce"(%iright_221) <{axis = 1 : i32}> ({ + ^bb0(%iright_419: i32 loc(callsite(#loc1 at #loc161)), %iright_420: i32 loc(callsite(#loc1 at #loc161))): + %iright_421 = arith.addi %iright_419, %iright_420 : i32 loc(#loc204) + tt.reduce.return %iright_421 : i32 loc(#loc195) + }) : (tensor<128x2x2xi32, #linear2>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc195) + %iright_223 = tt.expand_dims %iright_222 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<128x1x2xi32, #linear2> loc(#loc162) + %iright_224 = tt.broadcast %iright_223 : tensor<128x1x2xi32, #linear2> -> tensor<128x2x2xi32, #linear2> loc(#loc163) + %ileft_225 = tt.reshape %ileft_220 : tensor<128x2x2xi32, #linear2> -> tensor<32x16xi32, #linear> loc(#loc164) + %iright_226 = tt.reshape %iright_224 : tensor<128x2x2xi32, #linear2> -> tensor<32x16xi32, #linear> loc(#loc165) + %y_idx_227 = tt.reshape %new_idxs_215 : tensor<32x16xi32, #linear> -> tensor<128x2x2xi32, #linear2> loc(#loc166) + %left_idx_228 = arith.muli %y_idx_227, %ileft_105 : tensor<128x2x2xi32, #linear2> loc(#loc168) + %left_idx_229 = "tt.reduce"(%left_idx_228) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_419: i32 loc(callsite(#loc1 at #loc169)), %left_idx_420: i32 loc(callsite(#loc1 at #loc169))): + %left_idx_421 = arith.addi %left_idx_419, %left_idx_420 : i32 loc(#loc205) + tt.reduce.return %left_idx_421 : i32 loc(#loc198) + }) : (tensor<128x2x2xi32, #linear2>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc198) + %left_idx_230 = tt.expand_dims %left_idx_229 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<128x1x2xi32, #linear2> loc(#loc170) + %left_idx_231 = tt.broadcast %left_idx_230 : tensor<128x1x2xi32, #linear2> -> tensor<128x2x2xi32, #linear2> loc(#loc171) + %right_idx_232 = arith.muli %y_idx_227, %flip_48 : tensor<128x2x2xi32, #linear2> loc(#loc173) + %right_idx_233 = 
"tt.reduce"(%right_idx_232) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_419: i32 loc(callsite(#loc1 at #loc174)), %right_idx_420: i32 loc(callsite(#loc1 at #loc174))): + %right_idx_421 = arith.addi %right_idx_419, %right_idx_420 : i32 loc(#loc206) + tt.reduce.return %right_idx_421 : i32 loc(#loc201) + }) : (tensor<128x2x2xi32, #linear2>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc201) + %right_idx_234 = tt.expand_dims %right_idx_233 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<128x1x2xi32, #linear2> loc(#loc175) + %right_idx_235 = tt.broadcast %right_idx_234 : tensor<128x1x2xi32, #linear2> -> tensor<128x2x2xi32, #linear2> loc(#loc176) + %left_idx_236 = tt.reshape %left_idx_231 : tensor<128x2x2xi32, #linear2> -> tensor<32x16xi32, #linear> loc(#loc177) + %right_idx_237 = tt.reshape %right_idx_235 : tensor<128x2x2xi32, #linear2> -> tensor<32x16xi32, #linear> loc(#loc178) + %cond_238 = arith.cmpi slt, %ileft_225, %iright_226 : tensor<32x16xi32, #linear> loc(#loc179) + %eq_239 = arith.cmpi eq, %ileft_225, %iright_226 : tensor<32x16xi32, #linear> loc(#loc180) + %cond_240 = arith.cmpi sgt, %left_idx_236, %right_idx_237 : tensor<32x16xi32, #linear> loc(#loc181) + %cond_241 = arith.andi %eq_239, %cond_240 : tensor<32x16xi1, #linear> loc(#loc182) + %cond_242 = arith.ori %cond_238, %cond_241 : tensor<32x16xi1, #linear> loc(#loc183) + %cond_243 = arith.extui %cond_242 : tensor<32x16xi1, #linear> to tensor<32x16xi32, #linear> loc(#loc184) + %cond_244 = arith.xori %cond_243, %flip_178 : tensor<32x16xi32, #linear> loc(#loc184) + %cond_245 = arith.cmpi ne, %cond_244, %cst : tensor<32x16xi32, #linear> loc(#loc185) + %ret_246 = arith.xori %ileft_225, %iright_226 : tensor<32x16xi32, #linear> loc(#loc186) + %ret_247 = arith.select %cond_245, %ret_246, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc187) + %ret_248 = arith.xori %ret_212, %ret_247 : tensor<32x16xi32, #linear> loc(#loc188) + 
%new_idxs_249 = arith.xori %left_idx_236, %right_idx_237 : tensor<32x16xi32, #linear> loc(#loc189) + %new_idxs_250 = arith.select %cond_245, %new_idxs_249, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc190) + %new_idxs_251 = arith.xori %new_idxs_215, %new_idxs_250 : tensor<32x16xi32, #linear> loc(#loc191) + %y_252 = tt.reshape %ret_248 : tensor<32x16xi32, #linear> -> tensor<256x2x1xi32, #linear1> loc(#loc154) + %ileft_253 = arith.muli %y_252, %ileft : tensor<256x2x1xi32, #linear1> loc(#loc156) + %ileft_254 = "tt.reduce"(%ileft_253) <{axis = 1 : i32}> ({ + ^bb0(%ileft_419: i32 loc(callsite(#loc1 at #loc157)), %ileft_420: i32 loc(callsite(#loc1 at #loc157))): + %ileft_421 = arith.addi %ileft_419, %ileft_420 : i32 loc(#loc203) + tt.reduce.return %ileft_421 : i32 loc(#loc193) + }) : (tensor<256x2x1xi32, #linear1>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc193) + %ileft_255 = tt.expand_dims %ileft_254 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<256x1x1xi32, #linear1> loc(#loc158) + %ileft_256 = tt.broadcast %ileft_255 : tensor<256x1x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc159) + %iright_257 = arith.muli %y_252, %iright : tensor<256x2x1xi32, #linear1> loc(#loc160) + %iright_258 = "tt.reduce"(%iright_257) <{axis = 1 : i32}> ({ + ^bb0(%iright_419: i32 loc(callsite(#loc1 at #loc161)), %iright_420: i32 loc(callsite(#loc1 at #loc161))): + %iright_421 = arith.addi %iright_419, %iright_420 : i32 loc(#loc204) + tt.reduce.return %iright_421 : i32 loc(#loc195) + }) : (tensor<256x2x1xi32, #linear1>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc195) + %iright_259 = tt.expand_dims %iright_258 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<256x1x1xi32, #linear1> loc(#loc162) + %iright_260 = tt.broadcast %iright_259 : tensor<256x1x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc163) + %ileft_261 
= tt.reshape %ileft_256 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #linear> loc(#loc164) + %iright_262 = tt.reshape %iright_260 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #linear> loc(#loc165) + %y_idx_263 = tt.reshape %new_idxs_251 : tensor<32x16xi32, #linear> -> tensor<256x2x1xi32, #linear1> loc(#loc166) + %left_idx_264 = arith.muli %y_idx_263, %ileft : tensor<256x2x1xi32, #linear1> loc(#loc168) + %left_idx_265 = "tt.reduce"(%left_idx_264) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_419: i32 loc(callsite(#loc1 at #loc169)), %left_idx_420: i32 loc(callsite(#loc1 at #loc169))): + %left_idx_421 = arith.addi %left_idx_419, %left_idx_420 : i32 loc(#loc205) + tt.reduce.return %left_idx_421 : i32 loc(#loc198) + }) : (tensor<256x2x1xi32, #linear1>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc198) + %left_idx_266 = tt.expand_dims %left_idx_265 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<256x1x1xi32, #linear1> loc(#loc170) + %left_idx_267 = tt.broadcast %left_idx_266 : tensor<256x1x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc171) + %right_idx_268 = arith.muli %y_idx_263, %iright : tensor<256x2x1xi32, #linear1> loc(#loc173) + %right_idx_269 = "tt.reduce"(%right_idx_268) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_419: i32 loc(callsite(#loc1 at #loc174)), %right_idx_420: i32 loc(callsite(#loc1 at #loc174))): + %right_idx_421 = arith.addi %right_idx_419, %right_idx_420 : i32 loc(#loc206) + tt.reduce.return %right_idx_421 : i32 loc(#loc201) + }) : (tensor<256x2x1xi32, #linear1>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc201) + %right_idx_270 = tt.expand_dims %right_idx_269 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<256x1x1xi32, #linear1> loc(#loc175) + %right_idx_271 = tt.broadcast %right_idx_270 : tensor<256x1x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc176) + %left_idx_272 = tt.reshape 
%left_idx_267 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #linear> loc(#loc177) + %right_idx_273 = tt.reshape %right_idx_271 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #linear> loc(#loc178) + %cond_274 = arith.cmpi slt, %ileft_261, %iright_262 : tensor<32x16xi32, #linear> loc(#loc179) + %eq_275 = arith.cmpi eq, %ileft_261, %iright_262 : tensor<32x16xi32, #linear> loc(#loc180) + %cond_276 = arith.cmpi sgt, %left_idx_272, %right_idx_273 : tensor<32x16xi32, #linear> loc(#loc181) + %cond_277 = arith.andi %eq_275, %cond_276 : tensor<32x16xi1, #linear> loc(#loc182) + %cond_278 = arith.ori %cond_274, %cond_277 : tensor<32x16xi1, #linear> loc(#loc183) + %cond_279 = arith.extui %cond_278 : tensor<32x16xi1, #linear> to tensor<32x16xi32, #linear> loc(#loc184) + %cond_280 = arith.xori %cond_279, %flip_178 : tensor<32x16xi32, #linear> loc(#loc184) + %cond_281 = arith.cmpi ne, %cond_280, %cst : tensor<32x16xi32, #linear> loc(#loc185) + %ret_282 = arith.xori %ileft_261, %iright_262 : tensor<32x16xi32, #linear> loc(#loc186) + %ret_283 = arith.select %cond_281, %ret_282, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc187) + %ret_284 = arith.xori %ret_248, %ret_283 : tensor<32x16xi32, #linear> loc(#loc188) + %new_idxs_285 = arith.xori %left_idx_272, %right_idx_273 : tensor<32x16xi32, #linear> loc(#loc189) + %new_idxs_286 = arith.select %cond_281, %new_idxs_285, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc190) + %new_idxs_287 = arith.xori %new_idxs_251, %new_idxs_286 : tensor<32x16xi32, #linear> loc(#loc191) + %y_288 = tt.reshape %ret_284 : tensor<32x16xi32, #linear> -> tensor<32x2x8xi32, #linear4> loc(#loc154) + %ileft_289 = tt.broadcast %left_mask_53 : tensor<1x2x1xi32, #linear4> -> tensor<32x2x8xi32, #linear4> loc(#loc156) + %ileft_290 = arith.muli %y_288, %ileft_289 : tensor<32x2x8xi32, #linear4> loc(#loc156) + %ileft_291 = "tt.reduce"(%ileft_290) <{axis = 1 : i32}> ({ + ^bb0(%ileft_419: i32 loc(callsite(#loc1 at 
#loc157)), %ileft_420: i32 loc(callsite(#loc1 at #loc157))): + %ileft_421 = arith.addi %ileft_419, %ileft_420 : i32 loc(#loc203) + tt.reduce.return %ileft_421 : i32 loc(#loc193) + }) : (tensor<32x2x8xi32, #linear4>) -> tensor<32x8xi32, #ttg.slice<{dim = 1, parent = #linear4}>> loc(#loc193) + %ileft_292 = tt.expand_dims %ileft_291 {axis = 1 : i32} : tensor<32x8xi32, #ttg.slice<{dim = 1, parent = #linear4}>> -> tensor<32x1x8xi32, #linear4> loc(#loc158) + %ileft_293 = tt.broadcast %ileft_292 : tensor<32x1x8xi32, #linear4> -> tensor<32x2x8xi32, #linear4> loc(#loc159) + %iright_294 = arith.muli %y_288, %flip_177 : tensor<32x2x8xi32, #linear4> loc(#loc160) + %iright_295 = "tt.reduce"(%iright_294) <{axis = 1 : i32}> ({ + ^bb0(%iright_419: i32 loc(callsite(#loc1 at #loc161)), %iright_420: i32 loc(callsite(#loc1 at #loc161))): + %iright_421 = arith.addi %iright_419, %iright_420 : i32 loc(#loc204) + tt.reduce.return %iright_421 : i32 loc(#loc195) + }) : (tensor<32x2x8xi32, #linear4>) -> tensor<32x8xi32, #ttg.slice<{dim = 1, parent = #linear4}>> loc(#loc195) + %iright_296 = tt.expand_dims %iright_295 {axis = 1 : i32} : tensor<32x8xi32, #ttg.slice<{dim = 1, parent = #linear4}>> -> tensor<32x1x8xi32, #linear4> loc(#loc162) + %iright_297 = tt.broadcast %iright_296 : tensor<32x1x8xi32, #linear4> -> tensor<32x2x8xi32, #linear4> loc(#loc163) + %ileft_298 = tt.reshape %ileft_293 : tensor<32x2x8xi32, #linear4> -> tensor<32x16xi32, #linear> loc(#loc164) + %iright_299 = tt.reshape %iright_297 : tensor<32x2x8xi32, #linear4> -> tensor<32x16xi32, #linear> loc(#loc165) + %y_idx_300 = tt.reshape %new_idxs_287 : tensor<32x16xi32, #linear> -> tensor<32x2x8xi32, #linear4> loc(#loc166) + %left_idx_301 = arith.muli %y_idx_300, %ileft_289 : tensor<32x2x8xi32, #linear4> loc(#loc168) + %left_idx_302 = "tt.reduce"(%left_idx_301) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_419: i32 loc(callsite(#loc1 at #loc169)), %left_idx_420: i32 loc(callsite(#loc1 at #loc169))): + %left_idx_421 = arith.addi 
%left_idx_419, %left_idx_420 : i32 loc(#loc205) + tt.reduce.return %left_idx_421 : i32 loc(#loc198) + }) : (tensor<32x2x8xi32, #linear4>) -> tensor<32x8xi32, #ttg.slice<{dim = 1, parent = #linear4}>> loc(#loc198) + %left_idx_303 = tt.expand_dims %left_idx_302 {axis = 1 : i32} : tensor<32x8xi32, #ttg.slice<{dim = 1, parent = #linear4}>> -> tensor<32x1x8xi32, #linear4> loc(#loc170) + %left_idx_304 = tt.broadcast %left_idx_303 : tensor<32x1x8xi32, #linear4> -> tensor<32x2x8xi32, #linear4> loc(#loc171) + %right_idx_305 = arith.muli %y_idx_300, %flip_177 : tensor<32x2x8xi32, #linear4> loc(#loc173) + %right_idx_306 = "tt.reduce"(%right_idx_305) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_419: i32 loc(callsite(#loc1 at #loc174)), %right_idx_420: i32 loc(callsite(#loc1 at #loc174))): + %right_idx_421 = arith.addi %right_idx_419, %right_idx_420 : i32 loc(#loc206) + tt.reduce.return %right_idx_421 : i32 loc(#loc201) + }) : (tensor<32x2x8xi32, #linear4>) -> tensor<32x8xi32, #ttg.slice<{dim = 1, parent = #linear4}>> loc(#loc201) + %right_idx_307 = tt.expand_dims %right_idx_306 {axis = 1 : i32} : tensor<32x8xi32, #ttg.slice<{dim = 1, parent = #linear4}>> -> tensor<32x1x8xi32, #linear4> loc(#loc175) + %right_idx_308 = tt.broadcast %right_idx_307 : tensor<32x1x8xi32, #linear4> -> tensor<32x2x8xi32, #linear4> loc(#loc176) + %left_idx_309 = tt.reshape %left_idx_304 : tensor<32x2x8xi32, #linear4> -> tensor<32x16xi32, #linear> loc(#loc177) + %right_idx_310 = tt.reshape %right_idx_308 : tensor<32x2x8xi32, #linear4> -> tensor<32x16xi32, #linear> loc(#loc178) + %cond_311 = arith.cmpi slt, %ileft_298, %iright_299 : tensor<32x16xi32, #linear> loc(#loc179) + %eq_312 = arith.cmpi eq, %ileft_298, %iright_299 : tensor<32x16xi32, #linear> loc(#loc180) + %cond_313 = arith.cmpi sgt, %left_idx_309, %right_idx_310 : tensor<32x16xi32, #linear> loc(#loc181) + %cond_314 = arith.andi %eq_312, %cond_313 : tensor<32x16xi1, #linear> loc(#loc182) + %cond_315 = arith.ori %cond_311, %cond_314 : tensor<32x16xi1, 
#linear> loc(#loc183) + %ret_316 = arith.xori %ileft_298, %iright_299 : tensor<32x16xi32, #linear> loc(#loc186) + %ret_317 = arith.select %cond_315, %ret_316, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc187) + %ret_318 = arith.xori %ret_284, %ret_317 : tensor<32x16xi32, #linear> loc(#loc188) + %new_idxs_319 = arith.xori %left_idx_309, %right_idx_310 : tensor<32x16xi32, #linear> loc(#loc189) + %new_idxs_320 = arith.select %cond_315, %new_idxs_319, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc190) + %new_idxs_321 = arith.xori %new_idxs_287, %new_idxs_320 : tensor<32x16xi32, #linear> loc(#loc191) + %y_322 = tt.reshape %ret_318 : tensor<32x16xi32, #linear> -> tensor<64x2x4xi32, #linear3> loc(#loc154) + %ileft_323 = arith.muli %y_322, %ileft_180 : tensor<64x2x4xi32, #linear3> loc(#loc156) + %ileft_324 = "tt.reduce"(%ileft_323) <{axis = 1 : i32}> ({ + ^bb0(%ileft_419: i32 loc(callsite(#loc1 at #loc157)), %ileft_420: i32 loc(callsite(#loc1 at #loc157))): + %ileft_421 = arith.addi %ileft_419, %ileft_420 : i32 loc(#loc203) + tt.reduce.return %ileft_421 : i32 loc(#loc193) + }) : (tensor<64x2x4xi32, #linear3>) -> tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> loc(#loc193) + %ileft_325 = tt.expand_dims %ileft_324 {axis = 1 : i32} : tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> -> tensor<64x1x4xi32, #linear3> loc(#loc158) + %ileft_326 = tt.broadcast %ileft_325 : tensor<64x1x4xi32, #linear3> -> tensor<64x2x4xi32, #linear3> loc(#loc159) + %iright_327 = arith.muli %y_322, %flip_102 : tensor<64x2x4xi32, #linear3> loc(#loc160) + %iright_328 = "tt.reduce"(%iright_327) <{axis = 1 : i32}> ({ + ^bb0(%iright_419: i32 loc(callsite(#loc1 at #loc161)), %iright_420: i32 loc(callsite(#loc1 at #loc161))): + %iright_421 = arith.addi %iright_419, %iright_420 : i32 loc(#loc204) + tt.reduce.return %iright_421 : i32 loc(#loc195) + }) : (tensor<64x2x4xi32, #linear3>) -> tensor<64x4xi32, #ttg.slice<{dim = 1, parent = 
#linear3}>> loc(#loc195) + %iright_329 = tt.expand_dims %iright_328 {axis = 1 : i32} : tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> -> tensor<64x1x4xi32, #linear3> loc(#loc162) + %iright_330 = tt.broadcast %iright_329 : tensor<64x1x4xi32, #linear3> -> tensor<64x2x4xi32, #linear3> loc(#loc163) + %ileft_331 = tt.reshape %ileft_326 : tensor<64x2x4xi32, #linear3> -> tensor<32x16xi32, #linear> loc(#loc164) + %iright_332 = tt.reshape %iright_330 : tensor<64x2x4xi32, #linear3> -> tensor<32x16xi32, #linear> loc(#loc165) + %y_idx_333 = tt.reshape %new_idxs_321 : tensor<32x16xi32, #linear> -> tensor<64x2x4xi32, #linear3> loc(#loc166) + %left_idx_334 = arith.muli %y_idx_333, %ileft_180 : tensor<64x2x4xi32, #linear3> loc(#loc168) + %left_idx_335 = "tt.reduce"(%left_idx_334) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_419: i32 loc(callsite(#loc1 at #loc169)), %left_idx_420: i32 loc(callsite(#loc1 at #loc169))): + %left_idx_421 = arith.addi %left_idx_419, %left_idx_420 : i32 loc(#loc205) + tt.reduce.return %left_idx_421 : i32 loc(#loc198) + }) : (tensor<64x2x4xi32, #linear3>) -> tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> loc(#loc198) + %left_idx_336 = tt.expand_dims %left_idx_335 {axis = 1 : i32} : tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> -> tensor<64x1x4xi32, #linear3> loc(#loc170) + %left_idx_337 = tt.broadcast %left_idx_336 : tensor<64x1x4xi32, #linear3> -> tensor<64x2x4xi32, #linear3> loc(#loc171) + %right_idx_338 = arith.muli %y_idx_333, %flip_102 : tensor<64x2x4xi32, #linear3> loc(#loc173) + %right_idx_339 = "tt.reduce"(%right_idx_338) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_419: i32 loc(callsite(#loc1 at #loc174)), %right_idx_420: i32 loc(callsite(#loc1 at #loc174))): + %right_idx_421 = arith.addi %right_idx_419, %right_idx_420 : i32 loc(#loc206) + tt.reduce.return %right_idx_421 : i32 loc(#loc201) + }) : (tensor<64x2x4xi32, #linear3>) -> tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> loc(#loc201) + %right_idx_340 = 
tt.expand_dims %right_idx_339 {axis = 1 : i32} : tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> -> tensor<64x1x4xi32, #linear3> loc(#loc175) + %right_idx_341 = tt.broadcast %right_idx_340 : tensor<64x1x4xi32, #linear3> -> tensor<64x2x4xi32, #linear3> loc(#loc176) + %left_idx_342 = tt.reshape %left_idx_337 : tensor<64x2x4xi32, #linear3> -> tensor<32x16xi32, #linear> loc(#loc177) + %right_idx_343 = tt.reshape %right_idx_341 : tensor<64x2x4xi32, #linear3> -> tensor<32x16xi32, #linear> loc(#loc178) + %cond_344 = arith.cmpi slt, %ileft_331, %iright_332 : tensor<32x16xi32, #linear> loc(#loc179) + %eq_345 = arith.cmpi eq, %ileft_331, %iright_332 : tensor<32x16xi32, #linear> loc(#loc180) + %cond_346 = arith.cmpi sgt, %left_idx_342, %right_idx_343 : tensor<32x16xi32, #linear> loc(#loc181) + %cond_347 = arith.andi %eq_345, %cond_346 : tensor<32x16xi1, #linear> loc(#loc182) + %cond_348 = arith.ori %cond_344, %cond_347 : tensor<32x16xi1, #linear> loc(#loc183) + %ret_349 = arith.xori %ileft_331, %iright_332 : tensor<32x16xi32, #linear> loc(#loc186) + %ret_350 = arith.select %cond_348, %ret_349, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc187) + %ret_351 = arith.xori %ret_318, %ret_350 : tensor<32x16xi32, #linear> loc(#loc188) + %new_idxs_352 = arith.xori %left_idx_342, %right_idx_343 : tensor<32x16xi32, #linear> loc(#loc189) + %new_idxs_353 = arith.select %cond_348, %new_idxs_352, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc190) + %new_idxs_354 = arith.xori %new_idxs_321, %new_idxs_353 : tensor<32x16xi32, #linear> loc(#loc191) + %y_355 = tt.reshape %ret_351 : tensor<32x16xi32, #linear> -> tensor<128x2x2xi32, #linear2> loc(#loc154) + %ileft_356 = arith.muli %y_355, %ileft_105 : tensor<128x2x2xi32, #linear2> loc(#loc156) + %ileft_357 = "tt.reduce"(%ileft_356) <{axis = 1 : i32}> ({ + ^bb0(%ileft_419: i32 loc(callsite(#loc1 at #loc157)), %ileft_420: i32 loc(callsite(#loc1 at #loc157))): + %ileft_421 = arith.addi 
%ileft_419, %ileft_420 : i32 loc(#loc203) + tt.reduce.return %ileft_421 : i32 loc(#loc193) + }) : (tensor<128x2x2xi32, #linear2>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc193) + %ileft_358 = tt.expand_dims %ileft_357 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<128x1x2xi32, #linear2> loc(#loc158) + %ileft_359 = tt.broadcast %ileft_358 : tensor<128x1x2xi32, #linear2> -> tensor<128x2x2xi32, #linear2> loc(#loc159) + %iright_360 = arith.muli %y_355, %flip_48 : tensor<128x2x2xi32, #linear2> loc(#loc160) + %iright_361 = "tt.reduce"(%iright_360) <{axis = 1 : i32}> ({ + ^bb0(%iright_419: i32 loc(callsite(#loc1 at #loc161)), %iright_420: i32 loc(callsite(#loc1 at #loc161))): + %iright_421 = arith.addi %iright_419, %iright_420 : i32 loc(#loc204) + tt.reduce.return %iright_421 : i32 loc(#loc195) + }) : (tensor<128x2x2xi32, #linear2>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc195) + %iright_362 = tt.expand_dims %iright_361 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<128x1x2xi32, #linear2> loc(#loc162) + %iright_363 = tt.broadcast %iright_362 : tensor<128x1x2xi32, #linear2> -> tensor<128x2x2xi32, #linear2> loc(#loc163) + %ileft_364 = tt.reshape %ileft_359 : tensor<128x2x2xi32, #linear2> -> tensor<32x16xi32, #linear> loc(#loc164) + %iright_365 = tt.reshape %iright_363 : tensor<128x2x2xi32, #linear2> -> tensor<32x16xi32, #linear> loc(#loc165) + %y_idx_366 = tt.reshape %new_idxs_354 : tensor<32x16xi32, #linear> -> tensor<128x2x2xi32, #linear2> loc(#loc166) + %left_idx_367 = arith.muli %y_idx_366, %ileft_105 : tensor<128x2x2xi32, #linear2> loc(#loc168) + %left_idx_368 = "tt.reduce"(%left_idx_367) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_419: i32 loc(callsite(#loc1 at #loc169)), %left_idx_420: i32 loc(callsite(#loc1 at #loc169))): + %left_idx_421 = arith.addi %left_idx_419, %left_idx_420 : i32 loc(#loc205) + tt.reduce.return %left_idx_421 
: i32 loc(#loc198) + }) : (tensor<128x2x2xi32, #linear2>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc198) + %left_idx_369 = tt.expand_dims %left_idx_368 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<128x1x2xi32, #linear2> loc(#loc170) + %left_idx_370 = tt.broadcast %left_idx_369 : tensor<128x1x2xi32, #linear2> -> tensor<128x2x2xi32, #linear2> loc(#loc171) + %right_idx_371 = arith.muli %y_idx_366, %flip_48 : tensor<128x2x2xi32, #linear2> loc(#loc173) + %right_idx_372 = "tt.reduce"(%right_idx_371) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_419: i32 loc(callsite(#loc1 at #loc174)), %right_idx_420: i32 loc(callsite(#loc1 at #loc174))): + %right_idx_421 = arith.addi %right_idx_419, %right_idx_420 : i32 loc(#loc206) + tt.reduce.return %right_idx_421 : i32 loc(#loc201) + }) : (tensor<128x2x2xi32, #linear2>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc201) + %right_idx_373 = tt.expand_dims %right_idx_372 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<128x1x2xi32, #linear2> loc(#loc175) + %right_idx_374 = tt.broadcast %right_idx_373 : tensor<128x1x2xi32, #linear2> -> tensor<128x2x2xi32, #linear2> loc(#loc176) + %left_idx_375 = tt.reshape %left_idx_370 : tensor<128x2x2xi32, #linear2> -> tensor<32x16xi32, #linear> loc(#loc177) + %right_idx_376 = tt.reshape %right_idx_374 : tensor<128x2x2xi32, #linear2> -> tensor<32x16xi32, #linear> loc(#loc178) + %cond_377 = arith.cmpi slt, %ileft_364, %iright_365 : tensor<32x16xi32, #linear> loc(#loc179) + %eq_378 = arith.cmpi eq, %ileft_364, %iright_365 : tensor<32x16xi32, #linear> loc(#loc180) + %cond_379 = arith.cmpi sgt, %left_idx_375, %right_idx_376 : tensor<32x16xi32, #linear> loc(#loc181) + %cond_380 = arith.andi %eq_378, %cond_379 : tensor<32x16xi1, #linear> loc(#loc182) + %cond_381 = arith.ori %cond_377, %cond_380 : tensor<32x16xi1, #linear> loc(#loc183) + %ret_382 = arith.xori %ileft_364, 
%iright_365 : tensor<32x16xi32, #linear> loc(#loc186) + %ret_383 = arith.select %cond_381, %ret_382, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc187) + %ret_384 = arith.xori %ret_351, %ret_383 : tensor<32x16xi32, #linear> loc(#loc188) + %new_idxs_385 = arith.xori %left_idx_375, %right_idx_376 : tensor<32x16xi32, #linear> loc(#loc189) + %new_idxs_386 = arith.select %cond_381, %new_idxs_385, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc190) + %new_idxs_387 = arith.xori %new_idxs_354, %new_idxs_386 : tensor<32x16xi32, #linear> loc(#loc191) + %y_388 = tt.reshape %ret_384 : tensor<32x16xi32, #linear> -> tensor<256x2x1xi32, #linear1> loc(#loc154) + %ileft_389 = arith.muli %y_388, %ileft : tensor<256x2x1xi32, #linear1> loc(#loc156) + %ileft_390 = "tt.reduce"(%ileft_389) <{axis = 1 : i32}> ({ + ^bb0(%ileft_419: i32 loc(callsite(#loc1 at #loc157)), %ileft_420: i32 loc(callsite(#loc1 at #loc157))): + %ileft_421 = arith.addi %ileft_419, %ileft_420 : i32 loc(#loc203) + tt.reduce.return %ileft_421 : i32 loc(#loc193) + }) : (tensor<256x2x1xi32, #linear1>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc193) + %ileft_391 = tt.expand_dims %ileft_390 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<256x1x1xi32, #linear1> loc(#loc158) + %ileft_392 = tt.broadcast %ileft_391 : tensor<256x1x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc159) + %iright_393 = arith.muli %y_388, %iright : tensor<256x2x1xi32, #linear1> loc(#loc160) + %iright_394 = "tt.reduce"(%iright_393) <{axis = 1 : i32}> ({ + ^bb0(%iright_419: i32 loc(callsite(#loc1 at #loc161)), %iright_420: i32 loc(callsite(#loc1 at #loc161))): + %iright_421 = arith.addi %iright_419, %iright_420 : i32 loc(#loc204) + tt.reduce.return %iright_421 : i32 loc(#loc195) + }) : (tensor<256x2x1xi32, #linear1>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc195) + %iright_395 = tt.expand_dims 
%iright_394 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<256x1x1xi32, #linear1> loc(#loc162) + %iright_396 = tt.broadcast %iright_395 : tensor<256x1x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc163) + %ileft_397 = tt.reshape %ileft_392 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #linear> loc(#loc164) + %iright_398 = tt.reshape %iright_396 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #linear> loc(#loc165) + %y_idx_399 = tt.reshape %new_idxs_387 : tensor<32x16xi32, #linear> -> tensor<256x2x1xi32, #linear1> loc(#loc166) + %left_idx_400 = arith.muli %y_idx_399, %ileft : tensor<256x2x1xi32, #linear1> loc(#loc168) + %left_idx_401 = "tt.reduce"(%left_idx_400) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_419: i32 loc(callsite(#loc1 at #loc169)), %left_idx_420: i32 loc(callsite(#loc1 at #loc169))): + %left_idx_421 = arith.addi %left_idx_419, %left_idx_420 : i32 loc(#loc205) + tt.reduce.return %left_idx_421 : i32 loc(#loc198) + }) : (tensor<256x2x1xi32, #linear1>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc198) + %left_idx_402 = tt.expand_dims %left_idx_401 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<256x1x1xi32, #linear1> loc(#loc170) + %left_idx_403 = tt.broadcast %left_idx_402 : tensor<256x1x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc171) + %right_idx_404 = arith.muli %y_idx_399, %iright : tensor<256x2x1xi32, #linear1> loc(#loc173) + %right_idx_405 = "tt.reduce"(%right_idx_404) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_419: i32 loc(callsite(#loc1 at #loc174)), %right_idx_420: i32 loc(callsite(#loc1 at #loc174))): + %right_idx_421 = arith.addi %right_idx_419, %right_idx_420 : i32 loc(#loc206) + tt.reduce.return %right_idx_421 : i32 loc(#loc201) + }) : (tensor<256x2x1xi32, #linear1>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc201) + %right_idx_406 = tt.expand_dims %right_idx_405 {axis = 1 : i32} 
: tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<256x1x1xi32, #linear1> loc(#loc175) + %right_idx_407 = tt.broadcast %right_idx_406 : tensor<256x1x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc176) + %left_idx_408 = tt.reshape %left_idx_403 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #linear> loc(#loc177) + %right_idx_409 = tt.reshape %right_idx_407 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #linear> loc(#loc178) + %cond_410 = arith.cmpi slt, %ileft_397, %iright_398 : tensor<32x16xi32, #linear> loc(#loc179) + %eq_411 = arith.cmpi eq, %ileft_397, %iright_398 : tensor<32x16xi32, #linear> loc(#loc180) + %cond_412 = arith.cmpi sgt, %left_idx_408, %right_idx_409 : tensor<32x16xi32, #linear> loc(#loc181) + %cond_413 = arith.andi %eq_411, %cond_412 : tensor<32x16xi1, #linear> loc(#loc182) + %cond_414 = arith.ori %cond_410, %cond_413 : tensor<32x16xi1, #linear> loc(#loc183) + %new_idxs_415 = arith.xori %left_idx_408, %right_idx_409 : tensor<32x16xi32, #linear> loc(#loc189) + %new_idxs_416 = arith.select %cond_414, %new_idxs_415, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc190) + %new_idxs_417 = arith.xori %new_idxs_387, %new_idxs_416 : tensor<32x16xi32, #linear> loc(#loc191) + %tmp7 = arith.extsi %tmp0_36 : tensor<32x16xi32, #blocked> to tensor<32x16xi64, #blocked> loc(#loc141) + %tmp10 = arith.select %tmp0_34, %tmp7, %cst_0 : tensor<32x16xi1, #blocked>, tensor<32x16xi64, #blocked> loc(#loc142) + %tmp11 = "tt.reduce"(%tmp10) <{axis = 1 : i32}> ({ + ^bb0(%tmp11_419: i64 loc(callsite(#loc1 at #loc143)), %tmp11_420: i64 loc(callsite(#loc1 at #loc143))): + %tmp11_421 = arith.addi %tmp11_419, %tmp11_420 : i64 loc(#loc192) + tt.reduce.return %tmp11_421 : i64 loc(#loc152) + }) : (tensor<32x16xi64, #blocked>) -> tensor<32xi64, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc152) + %tmp11_418 = tt.expand_dims %tmp11 {axis = 1 : i32} : tensor<32xi64, #ttg.slice<{dim = 1, parent = #blocked}>> -> 
tensor<32x1xi64, #blocked> loc(#loc144) + %tmp14 = arith.trunci %tmp11_418 : tensor<32x1xi64, #blocked> to tensor<32x1xi32, #blocked> loc(#loc145) + %0 = arith.muli %xindex_19, %cst_4 : tensor<32x1xi32, #blocked1> loc(#loc70) + %1 = tt.broadcast %r0_index_25 : tensor<1x16xi32, #blocked1> -> tensor<32x16xi32, #blocked1> loc(#loc71) + %2 = tt.broadcast %0 : tensor<32x1xi32, #blocked1> -> tensor<32x16xi32, #blocked1> loc(#loc71) + %3 = arith.addi %1, %2 : tensor<32x16xi32, #blocked1> loc(#loc71) + %4 = tt.splat %out_ptr2 : !tt.ptr -> tensor<32x16x!tt.ptr, #blocked1> loc(#loc72) + %5 = tt.addptr %4, %3 : tensor<32x16x!tt.ptr, #blocked1>, tensor<32x16xi32, #blocked1> loc(#loc72) + %6 = ttg.convert_layout %new_idxs_417 : tensor<32x16xi32, #linear> -> tensor<32x16xi32, #blocked1> loc(#loc73) + tt.store %5, %6, %tmp0_35 : tensor<32x16x!tt.ptr, #blocked1> loc(#loc73) + %7 = tt.splat %out_ptr3 : !tt.ptr -> tensor<32x1x!tt.ptr, #blocked> loc(#loc74) + %8 = tt.addptr %7, %xindex_18 : tensor<32x1x!tt.ptr, #blocked>, tensor<32x1xi32, #blocked> loc(#loc74) + tt.store %8, %tmp14, %xmask : tensor<32x1x!tt.ptr, #blocked> loc(#loc75) + tt.return loc(#loc76) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":24:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":24:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":25:44) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":25:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":26:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":27:38) 
+#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":33:19) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":34:19) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":36:38) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":36:35) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":36:49) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":36:45) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":36:30) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":36:54) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":38:19) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":40:33) +#loc18 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:44) +#loc21 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:60) +#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:68) +#loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":533:22) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":537:21) +#loc26 = 
loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:40) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:65) +#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:78) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:41) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:67) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:80) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":540:30) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":541:32) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":546:29) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:36) +#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:23) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":290:25) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:53) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:66) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:37) +#loc46 = 
loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:23) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:54) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:67) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":553:36) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":554:38) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":574:22) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":591:21) +#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:40) +#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:29) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:23) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:19) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:28) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:38) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:46) +#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:15) +#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:48) +#loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:59) +#loc64 = 
loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:22) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":42:19) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":44:34) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":45:29) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":48:21) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":49:35) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":49:32) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":49:25) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":49:47) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":50:25) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":50:37) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":50:4) +#loc82 = loc("xoffset"(#loc2)) +#loc83 = loc("xoffset"(#loc3)) +#loc84 = loc("xindex"(#loc4)) +#loc85 = loc("xindex"(#loc5)) +#loc86 = loc("xmask"(#loc6)) +#loc87 = loc("r0_index"(#loc7)) +#loc88 = loc("x0"(#loc8)) +#loc89 = loc("x1"(#loc9)) +#loc90 = loc("tmp0"(#loc10)) +#loc91 = loc("tmp0"(#loc11)) +#loc92 = loc("tmp0"(#loc12)) +#loc93 = loc("tmp0"(#loc13)) +#loc94 = 
loc("tmp0"(#loc14)) +#loc95 = loc("tmp0"(#loc15)) +#loc96 = loc("tmp2"(#loc16)) +#loc97 = loc("tmp4"(#loc17)) +#loc98 = loc("flip"(#loc18)) +#loc100 = loc("flip"(#loc21)) +#loc101 = loc("flip"(#loc22)) +#loc102 = loc("y"(#loc23)) +#loc103 = loc("left_mask"(#loc25)) +#loc104 = loc("ileft"(#loc26)) +#loc106 = loc("ileft"(#loc30)) +#loc107 = loc("ileft"(#loc31)) +#loc108 = loc("iright"(#loc32)) +#loc110 = loc("iright"(#loc34)) +#loc111 = loc("iright"(#loc35)) +#loc112 = loc("ileft"(#loc36)) +#loc113 = loc("iright"(#loc37)) +#loc114 = loc("y_idx"(#loc38)) +#loc115 = loc("left_idx"(#loc39)) +#loc116 = loc("left_idx"(#loc40)) +#loc117 = loc("input"(#loc41)) +#loc119 = loc("left_idx"(#loc43)) +#loc120 = loc("left_idx"(#loc44)) +#loc121 = loc("right_idx"(#loc45)) +#loc122 = loc("right_idx"(#loc46)) +#loc124 = loc("right_idx"(#loc48)) +#loc125 = loc("right_idx"(#loc49)) +#loc126 = loc("left_idx"(#loc50)) +#loc127 = loc("right_idx"(#loc51)) +#loc128 = loc("cond"(#loc52)) +#loc129 = loc("eq"(#loc53)) +#loc130 = loc("cond"(#loc54)) +#loc131 = loc("cond"(#loc55)) +#loc132 = loc("cond"(#loc56)) +#loc133 = loc("cond"(#loc57)) +#loc134 = loc("cond"(#loc58)) +#loc135 = loc("ret"(#loc59)) +#loc136 = loc("ret"(#loc60)) +#loc137 = loc("ret"(#loc61)) +#loc138 = loc("new_idxs"(#loc62)) +#loc139 = loc("new_idxs"(#loc63)) +#loc140 = loc("new_idxs"(#loc64)) +#loc141 = loc("tmp7"(#loc65)) +#loc142 = loc("tmp10"(#loc66)) +#loc144 = loc("tmp11"(#loc68)) +#loc145 = loc("tmp14"(#loc69)) +#loc146 = loc(callsite(#loc98 at #loc99)) +#loc147 = loc(callsite(#loc100 at #loc99)) +#loc148 = loc(callsite(#loc101 at #loc99)) +#loc150 = loc("cond"(#loc128)) +#loc151 = loc("eq"(#loc129)) +#loc152 = loc(callsite(#loc27 at #loc143)) +#loc154 = loc(callsite(#loc102 at #loc149)) +#loc155 = loc(callsite(#loc103 at #loc149)) +#loc156 = loc(callsite(#loc104 at #loc149)) +#loc158 = loc(callsite(#loc106 at #loc149)) +#loc159 = loc(callsite(#loc107 at #loc149)) +#loc160 = loc(callsite(#loc108 at #loc149)) +#loc162 = 
loc(callsite(#loc110 at #loc149)) +#loc163 = loc(callsite(#loc111 at #loc149)) +#loc164 = loc(callsite(#loc112 at #loc149)) +#loc165 = loc(callsite(#loc113 at #loc149)) +#loc166 = loc(callsite(#loc114 at #loc149)) +#loc167 = loc(callsite(#loc115 at #loc149)) +#loc168 = loc(callsite(#loc116 at #loc149)) +#loc170 = loc(callsite(#loc119 at #loc149)) +#loc171 = loc(callsite(#loc120 at #loc149)) +#loc172 = loc(callsite(#loc121 at #loc149)) +#loc173 = loc(callsite(#loc122 at #loc149)) +#loc175 = loc(callsite(#loc124 at #loc149)) +#loc176 = loc(callsite(#loc125 at #loc149)) +#loc177 = loc(callsite(#loc126 at #loc149)) +#loc178 = loc(callsite(#loc127 at #loc149)) +#loc179 = loc(callsite(#loc150 at #loc149)) +#loc180 = loc(callsite(#loc151 at #loc149)) +#loc181 = loc(callsite(#loc130 at #loc149)) +#loc182 = loc(callsite(#loc131 at #loc149)) +#loc183 = loc(callsite(#loc132 at #loc149)) +#loc184 = loc(callsite(#loc133 at #loc149)) +#loc185 = loc(callsite(#loc134 at #loc149)) +#loc186 = loc(callsite(#loc135 at #loc149)) +#loc187 = loc(callsite(#loc136 at #loc149)) +#loc188 = loc(callsite(#loc137 at #loc149)) +#loc189 = loc(callsite(#loc138 at #loc149)) +#loc190 = loc(callsite(#loc139 at #loc149)) +#loc191 = loc(callsite(#loc140 at #loc149)) +#loc192 = loc(callsite(#loc29 at #loc152)) +#loc193 = loc(callsite(#loc27 at #loc157)) +#loc195 = loc(callsite(#loc27 at #loc161)) +#loc197 = loc(callsite(#loc117 at #loc169)) +#loc198 = loc(callsite(#loc27 at #loc169)) +#loc200 = loc(callsite(#loc117 at #loc174)) +#loc201 = loc(callsite(#loc27 at #loc174)) +#loc203 = loc(callsite(#loc29 at #loc193)) +#loc204 = loc(callsite(#loc29 at #loc195)) +#loc205 = loc(callsite(#loc29 at #loc198)) +#loc206 = loc(callsite(#loc29 at #loc201)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/A7DYCXJM4X5DHYLAIRTU6BFB3S5UCV3W4C27BWQBJGXYAG3NWQWA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttir 
b/SpecForge-ext/cache/compiled_kernels/triton/7/A7DYCXJM4X5DHYLAIRTU6BFB3S5UCV3W4C27BWQBJGXYAG3NWQWA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttir new file mode 100644 index 0000000000000000000000000000000000000000..e126905132585632762429c115da3c5521de9136 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/A7DYCXJM4X5DHYLAIRTU6BFB3S5UCV3W4C27BWQBJGXYAG3NWQWA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttir @@ -0,0 +1,799 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":18:0) +#loc1 = loc(unknown) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":41:67) +#loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":662:12) +#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":634:73) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:51) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:53) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:50) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:51) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":45:26) +#loc80 = loc("in_ptr0"(#loc)) +#loc81 = loc("out_ptr2"(#loc)) +#loc82 = loc("out_ptr3"(#loc)) +#loc83 = loc("xnumel"(#loc)) +#loc84 = loc("r0_numel"(#loc)) +#loc106 = loc(callsite(#loc23 at #loc2)) +#loc113 = loc("ileft"(#loc32)) +#loc117 = loc("iright"(#loc37)) +#loc126 = loc("left_idx"(#loc46)) +#loc131 = loc("right_idx"(#loc51)) +#loc150 = loc("tmp11"(#loc70)) +#loc157 = loc(callsite(#loc28 
at #loc106)) +#loc161 = loc(callsite(#loc1 at #loc150)) +#loc165 = loc(callsite(#loc113 at #loc157)) +#loc169 = loc(callsite(#loc117 at #loc157)) +#loc177 = loc(callsite(#loc126 at #loc157)) +#loc182 = loc(callsite(#loc131 at #loc157)) +#loc202 = loc(callsite(#loc1 at #loc165)) +#loc204 = loc(callsite(#loc1 at #loc169)) +#loc207 = loc(callsite(#loc1 at #loc177)) +#loc210 = loc(callsite(#loc1 at #loc182)) +module { + tt.func public @triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %out_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr3"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc85) + %cst_0 = arith.constant dense<0> : tensor<32x16xi32> loc(#loc1) + %tmp10 = arith.constant dense<0> : tensor<32x16xi64> loc(#loc86) + %tmp0 = arith.constant dense<272> : tensor<32x1xi32> loc(#loc87) + %tmp0_1 = arith.constant dense<17> : tensor<1x16xi32> loc(#loc88) + %cst_2 = arith.constant dense<16> : tensor<32x1xi32> loc(#loc1) + %xmask = arith.constant dense<32> : tensor<32x1xi32> loc(#loc89) + %c32_i32 = arith.constant 32 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc90) + %xoffset_3 = arith.muli %xoffset, %c32_i32 : i32 loc(#loc91) + %xindex = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc92) + %xindex_4 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<32xi32> -> tensor<32x1xi32> loc(#loc93) + %xindex_5 = tt.splat %xoffset_3 : i32 -> tensor<32x1xi32> loc(#loc94) + %xindex_6 = arith.addi %xindex_5, %xindex_4 : tensor<32x1xi32> loc(#loc94) + %xmask_7 = arith.cmpi slt, %xindex_6, %xmask : tensor<32x1xi32> loc(#loc89) + %r0_index = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32> 
loc(#loc95) + %r0_index_8 = tt.expand_dims %r0_index {axis = 0 : i32} : tensor<16xi32> -> tensor<1x16xi32> loc(#loc96) + %x0 = arith.remsi %xindex_6, %cst_2 : tensor<32x1xi32> loc(#loc97) + %x1 = arith.divsi %xindex_6, %cst_2 : tensor<32x1xi32> loc(#loc98) + %tmp0_9 = arith.muli %r0_index_8, %tmp0_1 : tensor<1x16xi32> loc(#loc88) + %tmp0_10 = tt.broadcast %x0 : tensor<32x1xi32> -> tensor<32x16xi32> loc(#loc99) + %tmp0_11 = tt.broadcast %tmp0_9 : tensor<1x16xi32> -> tensor<32x16xi32> loc(#loc99) + %tmp0_12 = arith.addi %tmp0_10, %tmp0_11 : tensor<32x16xi32> loc(#loc99) + %tmp0_13 = arith.muli %x1, %tmp0 : tensor<32x1xi32> loc(#loc87) + %tmp0_14 = tt.broadcast %tmp0_13 : tensor<32x1xi32> -> tensor<32x16xi32> loc(#loc100) + %tmp0_15 = arith.addi %tmp0_12, %tmp0_14 : tensor<32x16xi32> loc(#loc100) + %tmp0_16 = tt.splat %in_ptr0 : !tt.ptr -> tensor<32x16x!tt.ptr> loc(#loc101) + %tmp0_17 = tt.addptr %tmp0_16, %tmp0_15 : tensor<32x16x!tt.ptr>, tensor<32x16xi32> loc(#loc101) + %tmp0_18 = tt.broadcast %xmask_7 : tensor<32x1xi1> -> tensor<32x16xi1> loc(#loc102) + %tmp0_19 = tt.load %tmp0_17, %tmp0_18, %cst_0 : tensor<32x16x!tt.ptr> loc(#loc102) + %tmp2 = arith.trunci %r0_index_8 : tensor<1x16xi32> to tensor<1x16xi16> loc(#loc103) + %tmp4 = tt.broadcast %tmp2 : tensor<1x16xi16> -> tensor<32x16xi16> loc(#loc104) + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc153) + %flip_20 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc154) + %flip_21 = tt.expand_dims %flip_20 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc154) + %flip_22 = tt.broadcast %flip_21 : tensor<1x2x1xi32> -> tensor<128x2x2xi32> loc(#loc155) + %flip_23 = tt.reshape %flip_22 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc156) + %y = tt.reshape %tmp0_19 : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc162) + %left_mask = arith.subi %cst, %flip_21 : tensor<1x2x1xi32> loc(#loc163) + %ileft = tt.broadcast %left_mask : 
tensor<1x2x1xi32> -> tensor<256x2x1xi32> loc(#loc164) + %ileft_24 = arith.muli %y, %ileft : tensor<256x2x1xi32> loc(#loc164) + %ileft_25 = "tt.reduce"(%ileft_24) <{axis = 1 : i32}> ({ + ^bb0(%ileft_377: i32 loc(callsite(#loc1 at #loc165)), %ileft_378: i32 loc(callsite(#loc1 at #loc165))): + %ileft_379 = arith.addi %ileft_377, %ileft_378 : i32 loc(#loc211) + tt.reduce.return %ileft_379 : i32 loc(#loc201) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc201) + %ileft_26 = tt.expand_dims %ileft_25 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc166) + %ileft_27 = tt.broadcast %ileft_26 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc167) + %iright = tt.broadcast %flip_21 : tensor<1x2x1xi32> -> tensor<256x2x1xi32> loc(#loc168) + %iright_28 = arith.muli %y, %iright : tensor<256x2x1xi32> loc(#loc168) + %iright_29 = "tt.reduce"(%iright_28) <{axis = 1 : i32}> ({ + ^bb0(%iright_377: i32 loc(callsite(#loc1 at #loc169)), %iright_378: i32 loc(callsite(#loc1 at #loc169))): + %iright_379 = arith.addi %iright_377, %iright_378 : i32 loc(#loc212) + tt.reduce.return %iright_379 : i32 loc(#loc203) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc203) + %iright_30 = tt.expand_dims %iright_29 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc170) + %iright_31 = tt.broadcast %iright_30 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc171) + %ileft_32 = tt.reshape %ileft_27 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc172) + %iright_33 = tt.reshape %iright_31 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc173) + %y_idx = tt.reshape %tmp4 : tensor<32x16xi16> -> tensor<256x2x1xi16> loc(#loc174) + %left_idx = arith.trunci %left_mask : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc175) + %left_idx_34 = tt.broadcast %left_idx : tensor<1x2x1xi16> -> tensor<256x2x1xi16> loc(#loc176) + %left_idx_35 = arith.muli %y_idx, %left_idx_34 : tensor<256x2x1xi16> loc(#loc176) + %input = arith.extsi %left_idx_35 : 
tensor<256x2x1xi16> to tensor<256x2x1xi32> loc(#loc205) + %left_idx_36 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_377: i32 loc(callsite(#loc1 at #loc177)), %left_idx_378: i32 loc(callsite(#loc1 at #loc177))): + %left_idx_379 = arith.addi %left_idx_377, %left_idx_378 : i32 loc(#loc213) + tt.reduce.return %left_idx_379 : i32 loc(#loc206) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc206) + %left_idx_37 = tt.expand_dims %left_idx_36 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc178) + %left_idx_38 = tt.broadcast %left_idx_37 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc179) + %right_idx = arith.trunci %flip_21 : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc180) + %right_idx_39 = tt.broadcast %right_idx : tensor<1x2x1xi16> -> tensor<256x2x1xi16> loc(#loc181) + %right_idx_40 = arith.muli %y_idx, %right_idx_39 : tensor<256x2x1xi16> loc(#loc181) + %input_41 = arith.extsi %right_idx_40 : tensor<256x2x1xi16> to tensor<256x2x1xi32> loc(#loc208) + %right_idx_42 = "tt.reduce"(%input_41) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_377: i32 loc(callsite(#loc1 at #loc182)), %right_idx_378: i32 loc(callsite(#loc1 at #loc182))): + %right_idx_379 = arith.addi %right_idx_377, %right_idx_378 : i32 loc(#loc214) + tt.reduce.return %right_idx_379 : i32 loc(#loc209) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc209) + %right_idx_43 = tt.expand_dims %right_idx_42 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc183) + %right_idx_44 = tt.broadcast %right_idx_43 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc184) + %left_idx_45 = tt.reshape %left_idx_38 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc185) + %right_idx_46 = tt.reshape %right_idx_44 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc186) + %cond = arith.cmpi slt, %ileft_32, %iright_33 : tensor<32x16xi32> loc(#loc187) + %eq = arith.cmpi eq, %ileft_32, %iright_33 : tensor<32x16xi32> loc(#loc188) + %cond_47 = arith.cmpi sgt, 
%left_idx_45, %right_idx_46 : tensor<32x16xi32> loc(#loc189) + %cond_48 = arith.andi %eq, %cond_47 : tensor<32x16xi1> loc(#loc190) + %cond_49 = arith.ori %cond, %cond_48 : tensor<32x16xi1> loc(#loc191) + %cond_50 = arith.extui %cond_49 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc192) + %cond_51 = arith.xori %cond_50, %flip_23 : tensor<32x16xi32> loc(#loc192) + %cond_52 = arith.cmpi ne, %cond_51, %cst_0 : tensor<32x16xi32> loc(#loc193) + %ret = arith.xori %ileft_32, %iright_33 : tensor<32x16xi32> loc(#loc194) + %ret_53 = arith.select %cond_52, %ret, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc195) + %ret_54 = arith.xori %tmp0_19, %ret_53 : tensor<32x16xi32> loc(#loc196) + %new_idxs = arith.xori %left_idx_45, %right_idx_46 : tensor<32x16xi32> loc(#loc197) + %new_idxs_55 = arith.select %cond_52, %new_idxs, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc198) + %new_idxs_56 = arith.extsi %tmp2 : tensor<1x16xi16> to tensor<1x16xi32> loc(#loc199) + %new_idxs_57 = tt.broadcast %new_idxs_56 : tensor<1x16xi32> -> tensor<32x16xi32> loc(#loc199) + %new_idxs_58 = arith.xori %new_idxs_57, %new_idxs_55 : tensor<32x16xi32> loc(#loc199) + %flip_59 = tt.broadcast %flip_21 : tensor<1x2x1xi32> -> tensor<64x2x4xi32> loc(#loc155) + %flip_60 = tt.reshape %flip_59 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc156) + %y_61 = tt.reshape %ret_54 : tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc162) + %ileft_62 = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<128x2x2xi32> loc(#loc164) + %ileft_63 = arith.muli %y_61, %ileft_62 : tensor<128x2x2xi32> loc(#loc164) + %ileft_64 = "tt.reduce"(%ileft_63) <{axis = 1 : i32}> ({ + ^bb0(%ileft_377: i32 loc(callsite(#loc1 at #loc165)), %ileft_378: i32 loc(callsite(#loc1 at #loc165))): + %ileft_379 = arith.addi %ileft_377, %ileft_378 : i32 loc(#loc211) + tt.reduce.return %ileft_379 : i32 loc(#loc201) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc201) + %ileft_65 = tt.expand_dims %ileft_64 {axis = 1 : i32} : 
tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc166) + %ileft_66 = tt.broadcast %ileft_65 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc167) + %iright_67 = arith.muli %y_61, %flip_22 : tensor<128x2x2xi32> loc(#loc168) + %iright_68 = "tt.reduce"(%iright_67) <{axis = 1 : i32}> ({ + ^bb0(%iright_377: i32 loc(callsite(#loc1 at #loc169)), %iright_378: i32 loc(callsite(#loc1 at #loc169))): + %iright_379 = arith.addi %iright_377, %iright_378 : i32 loc(#loc212) + tt.reduce.return %iright_379 : i32 loc(#loc203) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc203) + %iright_69 = tt.expand_dims %iright_68 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc170) + %iright_70 = tt.broadcast %iright_69 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc171) + %ileft_71 = tt.reshape %ileft_66 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc172) + %iright_72 = tt.reshape %iright_70 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc173) + %y_idx_73 = tt.reshape %new_idxs_58 : tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc174) + %left_idx_74 = arith.muli %y_idx_73, %ileft_62 : tensor<128x2x2xi32> loc(#loc176) + %left_idx_75 = "tt.reduce"(%left_idx_74) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_377: i32 loc(callsite(#loc1 at #loc177)), %left_idx_378: i32 loc(callsite(#loc1 at #loc177))): + %left_idx_379 = arith.addi %left_idx_377, %left_idx_378 : i32 loc(#loc213) + tt.reduce.return %left_idx_379 : i32 loc(#loc206) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc206) + %left_idx_76 = tt.expand_dims %left_idx_75 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc178) + %left_idx_77 = tt.broadcast %left_idx_76 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc179) + %right_idx_78 = arith.muli %y_idx_73, %flip_22 : tensor<128x2x2xi32> loc(#loc181) + %right_idx_79 = "tt.reduce"(%right_idx_78) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_377: i32 loc(callsite(#loc1 at #loc182)), %right_idx_378: i32 loc(callsite(#loc1 at 
#loc182))): + %right_idx_379 = arith.addi %right_idx_377, %right_idx_378 : i32 loc(#loc214) + tt.reduce.return %right_idx_379 : i32 loc(#loc209) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc209) + %right_idx_80 = tt.expand_dims %right_idx_79 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc183) + %right_idx_81 = tt.broadcast %right_idx_80 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc184) + %left_idx_82 = tt.reshape %left_idx_77 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc185) + %right_idx_83 = tt.reshape %right_idx_81 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc186) + %cond_84 = arith.cmpi slt, %ileft_71, %iright_72 : tensor<32x16xi32> loc(#loc187) + %eq_85 = arith.cmpi eq, %ileft_71, %iright_72 : tensor<32x16xi32> loc(#loc188) + %cond_86 = arith.cmpi sgt, %left_idx_82, %right_idx_83 : tensor<32x16xi32> loc(#loc189) + %cond_87 = arith.andi %eq_85, %cond_86 : tensor<32x16xi1> loc(#loc190) + %cond_88 = arith.ori %cond_84, %cond_87 : tensor<32x16xi1> loc(#loc191) + %cond_89 = arith.extui %cond_88 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc192) + %cond_90 = arith.xori %cond_89, %flip_60 : tensor<32x16xi32> loc(#loc192) + %cond_91 = arith.cmpi ne, %cond_90, %cst_0 : tensor<32x16xi32> loc(#loc193) + %ret_92 = arith.xori %ileft_71, %iright_72 : tensor<32x16xi32> loc(#loc194) + %ret_93 = arith.select %cond_91, %ret_92, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc195) + %ret_94 = arith.xori %ret_54, %ret_93 : tensor<32x16xi32> loc(#loc196) + %new_idxs_95 = arith.xori %left_idx_82, %right_idx_83 : tensor<32x16xi32> loc(#loc197) + %new_idxs_96 = arith.select %cond_91, %new_idxs_95, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc198) + %new_idxs_97 = arith.xori %new_idxs_58, %new_idxs_96 : tensor<32x16xi32> loc(#loc199) + %y_98 = tt.reshape %ret_94 : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc162) + %ileft_99 = arith.muli %y_98, %ileft : tensor<256x2x1xi32> loc(#loc164) + %ileft_100 = 
"tt.reduce"(%ileft_99) <{axis = 1 : i32}> ({ + ^bb0(%ileft_377: i32 loc(callsite(#loc1 at #loc165)), %ileft_378: i32 loc(callsite(#loc1 at #loc165))): + %ileft_379 = arith.addi %ileft_377, %ileft_378 : i32 loc(#loc211) + tt.reduce.return %ileft_379 : i32 loc(#loc201) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc201) + %ileft_101 = tt.expand_dims %ileft_100 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc166) + %ileft_102 = tt.broadcast %ileft_101 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc167) + %iright_103 = arith.muli %y_98, %iright : tensor<256x2x1xi32> loc(#loc168) + %iright_104 = "tt.reduce"(%iright_103) <{axis = 1 : i32}> ({ + ^bb0(%iright_377: i32 loc(callsite(#loc1 at #loc169)), %iright_378: i32 loc(callsite(#loc1 at #loc169))): + %iright_379 = arith.addi %iright_377, %iright_378 : i32 loc(#loc212) + tt.reduce.return %iright_379 : i32 loc(#loc203) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc203) + %iright_105 = tt.expand_dims %iright_104 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc170) + %iright_106 = tt.broadcast %iright_105 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc171) + %ileft_107 = tt.reshape %ileft_102 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc172) + %iright_108 = tt.reshape %iright_106 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc173) + %y_idx_109 = tt.reshape %new_idxs_97 : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc174) + %left_idx_110 = arith.muli %y_idx_109, %ileft : tensor<256x2x1xi32> loc(#loc176) + %left_idx_111 = "tt.reduce"(%left_idx_110) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_377: i32 loc(callsite(#loc1 at #loc177)), %left_idx_378: i32 loc(callsite(#loc1 at #loc177))): + %left_idx_379 = arith.addi %left_idx_377, %left_idx_378 : i32 loc(#loc213) + tt.reduce.return %left_idx_379 : i32 loc(#loc206) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc206) + %left_idx_112 = tt.expand_dims %left_idx_111 {axis = 1 : i32} : 
tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc178) + %left_idx_113 = tt.broadcast %left_idx_112 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc179) + %right_idx_114 = arith.muli %y_idx_109, %iright : tensor<256x2x1xi32> loc(#loc181) + %right_idx_115 = "tt.reduce"(%right_idx_114) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_377: i32 loc(callsite(#loc1 at #loc182)), %right_idx_378: i32 loc(callsite(#loc1 at #loc182))): + %right_idx_379 = arith.addi %right_idx_377, %right_idx_378 : i32 loc(#loc214) + tt.reduce.return %right_idx_379 : i32 loc(#loc209) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc209) + %right_idx_116 = tt.expand_dims %right_idx_115 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc183) + %right_idx_117 = tt.broadcast %right_idx_116 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc184) + %left_idx_118 = tt.reshape %left_idx_113 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc185) + %right_idx_119 = tt.reshape %right_idx_117 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc186) + %cond_120 = arith.cmpi slt, %ileft_107, %iright_108 : tensor<32x16xi32> loc(#loc187) + %eq_121 = arith.cmpi eq, %ileft_107, %iright_108 : tensor<32x16xi32> loc(#loc188) + %cond_122 = arith.cmpi sgt, %left_idx_118, %right_idx_119 : tensor<32x16xi32> loc(#loc189) + %cond_123 = arith.andi %eq_121, %cond_122 : tensor<32x16xi1> loc(#loc190) + %cond_124 = arith.ori %cond_120, %cond_123 : tensor<32x16xi1> loc(#loc191) + %cond_125 = arith.extui %cond_124 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc192) + %cond_126 = arith.xori %cond_125, %flip_60 : tensor<32x16xi32> loc(#loc192) + %cond_127 = arith.cmpi ne, %cond_126, %cst_0 : tensor<32x16xi32> loc(#loc193) + %ret_128 = arith.xori %ileft_107, %iright_108 : tensor<32x16xi32> loc(#loc194) + %ret_129 = arith.select %cond_127, %ret_128, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc195) + %ret_130 = arith.xori %ret_94, %ret_129 : tensor<32x16xi32> loc(#loc196) + %new_idxs_131 = 
arith.xori %left_idx_118, %right_idx_119 : tensor<32x16xi32> loc(#loc197) + %new_idxs_132 = arith.select %cond_127, %new_idxs_131, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc198) + %new_idxs_133 = arith.xori %new_idxs_97, %new_idxs_132 : tensor<32x16xi32> loc(#loc199) + %flip_134 = tt.broadcast %flip_21 : tensor<1x2x1xi32> -> tensor<32x2x8xi32> loc(#loc155) + %flip_135 = tt.reshape %flip_134 : tensor<32x2x8xi32> -> tensor<32x16xi32> loc(#loc156) + %y_136 = tt.reshape %ret_130 : tensor<32x16xi32> -> tensor<64x2x4xi32> loc(#loc162) + %ileft_137 = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<64x2x4xi32> loc(#loc164) + %ileft_138 = arith.muli %y_136, %ileft_137 : tensor<64x2x4xi32> loc(#loc164) + %ileft_139 = "tt.reduce"(%ileft_138) <{axis = 1 : i32}> ({ + ^bb0(%ileft_377: i32 loc(callsite(#loc1 at #loc165)), %ileft_378: i32 loc(callsite(#loc1 at #loc165))): + %ileft_379 = arith.addi %ileft_377, %ileft_378 : i32 loc(#loc211) + tt.reduce.return %ileft_379 : i32 loc(#loc201) + }) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc201) + %ileft_140 = tt.expand_dims %ileft_139 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc166) + %ileft_141 = tt.broadcast %ileft_140 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc167) + %iright_142 = arith.muli %y_136, %flip_59 : tensor<64x2x4xi32> loc(#loc168) + %iright_143 = "tt.reduce"(%iright_142) <{axis = 1 : i32}> ({ + ^bb0(%iright_377: i32 loc(callsite(#loc1 at #loc169)), %iright_378: i32 loc(callsite(#loc1 at #loc169))): + %iright_379 = arith.addi %iright_377, %iright_378 : i32 loc(#loc212) + tt.reduce.return %iright_379 : i32 loc(#loc203) + }) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc203) + %iright_144 = tt.expand_dims %iright_143 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc170) + %iright_145 = tt.broadcast %iright_144 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc171) + %ileft_146 = tt.reshape %ileft_141 : tensor<64x2x4xi32> -> tensor<32x16xi32> 
loc(#loc172) + %iright_147 = tt.reshape %iright_145 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc173) + %y_idx_148 = tt.reshape %new_idxs_133 : tensor<32x16xi32> -> tensor<64x2x4xi32> loc(#loc174) + %left_idx_149 = arith.muli %y_idx_148, %ileft_137 : tensor<64x2x4xi32> loc(#loc176) + %left_idx_150 = "tt.reduce"(%left_idx_149) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_377: i32 loc(callsite(#loc1 at #loc177)), %left_idx_378: i32 loc(callsite(#loc1 at #loc177))): + %left_idx_379 = arith.addi %left_idx_377, %left_idx_378 : i32 loc(#loc213) + tt.reduce.return %left_idx_379 : i32 loc(#loc206) + }) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc206) + %left_idx_151 = tt.expand_dims %left_idx_150 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc178) + %left_idx_152 = tt.broadcast %left_idx_151 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc179) + %right_idx_153 = arith.muli %y_idx_148, %flip_59 : tensor<64x2x4xi32> loc(#loc181) + %right_idx_154 = "tt.reduce"(%right_idx_153) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_377: i32 loc(callsite(#loc1 at #loc182)), %right_idx_378: i32 loc(callsite(#loc1 at #loc182))): + %right_idx_379 = arith.addi %right_idx_377, %right_idx_378 : i32 loc(#loc214) + tt.reduce.return %right_idx_379 : i32 loc(#loc209) + }) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc209) + %right_idx_155 = tt.expand_dims %right_idx_154 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc183) + %right_idx_156 = tt.broadcast %right_idx_155 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc184) + %left_idx_157 = tt.reshape %left_idx_152 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc185) + %right_idx_158 = tt.reshape %right_idx_156 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc186) + %cond_159 = arith.cmpi slt, %ileft_146, %iright_147 : tensor<32x16xi32> loc(#loc187) + %eq_160 = arith.cmpi eq, %ileft_146, %iright_147 : tensor<32x16xi32> loc(#loc188) + %cond_161 = arith.cmpi sgt, %left_idx_157, %right_idx_158 : 
tensor<32x16xi32> loc(#loc189) + %cond_162 = arith.andi %eq_160, %cond_161 : tensor<32x16xi1> loc(#loc190) + %cond_163 = arith.ori %cond_159, %cond_162 : tensor<32x16xi1> loc(#loc191) + %cond_164 = arith.extui %cond_163 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc192) + %cond_165 = arith.xori %cond_164, %flip_135 : tensor<32x16xi32> loc(#loc192) + %cond_166 = arith.cmpi ne, %cond_165, %cst_0 : tensor<32x16xi32> loc(#loc193) + %ret_167 = arith.xori %ileft_146, %iright_147 : tensor<32x16xi32> loc(#loc194) + %ret_168 = arith.select %cond_166, %ret_167, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc195) + %ret_169 = arith.xori %ret_130, %ret_168 : tensor<32x16xi32> loc(#loc196) + %new_idxs_170 = arith.xori %left_idx_157, %right_idx_158 : tensor<32x16xi32> loc(#loc197) + %new_idxs_171 = arith.select %cond_166, %new_idxs_170, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc198) + %new_idxs_172 = arith.xori %new_idxs_133, %new_idxs_171 : tensor<32x16xi32> loc(#loc199) + %y_173 = tt.reshape %ret_169 : tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc162) + %ileft_174 = arith.muli %y_173, %ileft_62 : tensor<128x2x2xi32> loc(#loc164) + %ileft_175 = "tt.reduce"(%ileft_174) <{axis = 1 : i32}> ({ + ^bb0(%ileft_377: i32 loc(callsite(#loc1 at #loc165)), %ileft_378: i32 loc(callsite(#loc1 at #loc165))): + %ileft_379 = arith.addi %ileft_377, %ileft_378 : i32 loc(#loc211) + tt.reduce.return %ileft_379 : i32 loc(#loc201) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc201) + %ileft_176 = tt.expand_dims %ileft_175 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc166) + %ileft_177 = tt.broadcast %ileft_176 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc167) + %iright_178 = arith.muli %y_173, %flip_22 : tensor<128x2x2xi32> loc(#loc168) + %iright_179 = "tt.reduce"(%iright_178) <{axis = 1 : i32}> ({ + ^bb0(%iright_377: i32 loc(callsite(#loc1 at #loc169)), %iright_378: i32 loc(callsite(#loc1 at #loc169))): + %iright_379 = arith.addi 
%iright_377, %iright_378 : i32 loc(#loc212) + tt.reduce.return %iright_379 : i32 loc(#loc203) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc203) + %iright_180 = tt.expand_dims %iright_179 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc170) + %iright_181 = tt.broadcast %iright_180 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc171) + %ileft_182 = tt.reshape %ileft_177 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc172) + %iright_183 = tt.reshape %iright_181 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc173) + %y_idx_184 = tt.reshape %new_idxs_172 : tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc174) + %left_idx_185 = arith.muli %y_idx_184, %ileft_62 : tensor<128x2x2xi32> loc(#loc176) + %left_idx_186 = "tt.reduce"(%left_idx_185) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_377: i32 loc(callsite(#loc1 at #loc177)), %left_idx_378: i32 loc(callsite(#loc1 at #loc177))): + %left_idx_379 = arith.addi %left_idx_377, %left_idx_378 : i32 loc(#loc213) + tt.reduce.return %left_idx_379 : i32 loc(#loc206) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc206) + %left_idx_187 = tt.expand_dims %left_idx_186 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc178) + %left_idx_188 = tt.broadcast %left_idx_187 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc179) + %right_idx_189 = arith.muli %y_idx_184, %flip_22 : tensor<128x2x2xi32> loc(#loc181) + %right_idx_190 = "tt.reduce"(%right_idx_189) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_377: i32 loc(callsite(#loc1 at #loc182)), %right_idx_378: i32 loc(callsite(#loc1 at #loc182))): + %right_idx_379 = arith.addi %right_idx_377, %right_idx_378 : i32 loc(#loc214) + tt.reduce.return %right_idx_379 : i32 loc(#loc209) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc209) + %right_idx_191 = tt.expand_dims %right_idx_190 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc183) + %right_idx_192 = tt.broadcast %right_idx_191 : tensor<128x1x2xi32> -> 
tensor<128x2x2xi32> loc(#loc184) + %left_idx_193 = tt.reshape %left_idx_188 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc185) + %right_idx_194 = tt.reshape %right_idx_192 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc186) + %cond_195 = arith.cmpi slt, %ileft_182, %iright_183 : tensor<32x16xi32> loc(#loc187) + %eq_196 = arith.cmpi eq, %ileft_182, %iright_183 : tensor<32x16xi32> loc(#loc188) + %cond_197 = arith.cmpi sgt, %left_idx_193, %right_idx_194 : tensor<32x16xi32> loc(#loc189) + %cond_198 = arith.andi %eq_196, %cond_197 : tensor<32x16xi1> loc(#loc190) + %cond_199 = arith.ori %cond_195, %cond_198 : tensor<32x16xi1> loc(#loc191) + %cond_200 = arith.extui %cond_199 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc192) + %cond_201 = arith.xori %cond_200, %flip_135 : tensor<32x16xi32> loc(#loc192) + %cond_202 = arith.cmpi ne, %cond_201, %cst_0 : tensor<32x16xi32> loc(#loc193) + %ret_203 = arith.xori %ileft_182, %iright_183 : tensor<32x16xi32> loc(#loc194) + %ret_204 = arith.select %cond_202, %ret_203, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc195) + %ret_205 = arith.xori %ret_169, %ret_204 : tensor<32x16xi32> loc(#loc196) + %new_idxs_206 = arith.xori %left_idx_193, %right_idx_194 : tensor<32x16xi32> loc(#loc197) + %new_idxs_207 = arith.select %cond_202, %new_idxs_206, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc198) + %new_idxs_208 = arith.xori %new_idxs_172, %new_idxs_207 : tensor<32x16xi32> loc(#loc199) + %y_209 = tt.reshape %ret_205 : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc162) + %ileft_210 = arith.muli %y_209, %ileft : tensor<256x2x1xi32> loc(#loc164) + %ileft_211 = "tt.reduce"(%ileft_210) <{axis = 1 : i32}> ({ + ^bb0(%ileft_377: i32 loc(callsite(#loc1 at #loc165)), %ileft_378: i32 loc(callsite(#loc1 at #loc165))): + %ileft_379 = arith.addi %ileft_377, %ileft_378 : i32 loc(#loc211) + tt.reduce.return %ileft_379 : i32 loc(#loc201) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc201) + %ileft_212 = tt.expand_dims 
%ileft_211 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc166) + %ileft_213 = tt.broadcast %ileft_212 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc167) + %iright_214 = arith.muli %y_209, %iright : tensor<256x2x1xi32> loc(#loc168) + %iright_215 = "tt.reduce"(%iright_214) <{axis = 1 : i32}> ({ + ^bb0(%iright_377: i32 loc(callsite(#loc1 at #loc169)), %iright_378: i32 loc(callsite(#loc1 at #loc169))): + %iright_379 = arith.addi %iright_377, %iright_378 : i32 loc(#loc212) + tt.reduce.return %iright_379 : i32 loc(#loc203) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc203) + %iright_216 = tt.expand_dims %iright_215 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc170) + %iright_217 = tt.broadcast %iright_216 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc171) + %ileft_218 = tt.reshape %ileft_213 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc172) + %iright_219 = tt.reshape %iright_217 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc173) + %y_idx_220 = tt.reshape %new_idxs_208 : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc174) + %left_idx_221 = arith.muli %y_idx_220, %ileft : tensor<256x2x1xi32> loc(#loc176) + %left_idx_222 = "tt.reduce"(%left_idx_221) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_377: i32 loc(callsite(#loc1 at #loc177)), %left_idx_378: i32 loc(callsite(#loc1 at #loc177))): + %left_idx_379 = arith.addi %left_idx_377, %left_idx_378 : i32 loc(#loc213) + tt.reduce.return %left_idx_379 : i32 loc(#loc206) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc206) + %left_idx_223 = tt.expand_dims %left_idx_222 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc178) + %left_idx_224 = tt.broadcast %left_idx_223 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc179) + %right_idx_225 = arith.muli %y_idx_220, %iright : tensor<256x2x1xi32> loc(#loc181) + %right_idx_226 = "tt.reduce"(%right_idx_225) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_377: i32 loc(callsite(#loc1 at 
#loc182)), %right_idx_378: i32 loc(callsite(#loc1 at #loc182))): + %right_idx_379 = arith.addi %right_idx_377, %right_idx_378 : i32 loc(#loc214) + tt.reduce.return %right_idx_379 : i32 loc(#loc209) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc209) + %right_idx_227 = tt.expand_dims %right_idx_226 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc183) + %right_idx_228 = tt.broadcast %right_idx_227 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc184) + %left_idx_229 = tt.reshape %left_idx_224 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc185) + %right_idx_230 = tt.reshape %right_idx_228 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc186) + %cond_231 = arith.cmpi slt, %ileft_218, %iright_219 : tensor<32x16xi32> loc(#loc187) + %eq_232 = arith.cmpi eq, %ileft_218, %iright_219 : tensor<32x16xi32> loc(#loc188) + %cond_233 = arith.cmpi sgt, %left_idx_229, %right_idx_230 : tensor<32x16xi32> loc(#loc189) + %cond_234 = arith.andi %eq_232, %cond_233 : tensor<32x16xi1> loc(#loc190) + %cond_235 = arith.ori %cond_231, %cond_234 : tensor<32x16xi1> loc(#loc191) + %cond_236 = arith.extui %cond_235 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc192) + %cond_237 = arith.xori %cond_236, %flip_135 : tensor<32x16xi32> loc(#loc192) + %cond_238 = arith.cmpi ne, %cond_237, %cst_0 : tensor<32x16xi32> loc(#loc193) + %ret_239 = arith.xori %ileft_218, %iright_219 : tensor<32x16xi32> loc(#loc194) + %ret_240 = arith.select %cond_238, %ret_239, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc195) + %ret_241 = arith.xori %ret_205, %ret_240 : tensor<32x16xi32> loc(#loc196) + %new_idxs_242 = arith.xori %left_idx_229, %right_idx_230 : tensor<32x16xi32> loc(#loc197) + %new_idxs_243 = arith.select %cond_238, %new_idxs_242, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc198) + %new_idxs_244 = arith.xori %new_idxs_208, %new_idxs_243 : tensor<32x16xi32> loc(#loc199) + %y_245 = tt.reshape %ret_241 : tensor<32x16xi32> -> tensor<32x2x8xi32> loc(#loc162) 
+ %ileft_246 = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<32x2x8xi32> loc(#loc164) + %ileft_247 = arith.muli %y_245, %ileft_246 : tensor<32x2x8xi32> loc(#loc164) + %ileft_248 = "tt.reduce"(%ileft_247) <{axis = 1 : i32}> ({ + ^bb0(%ileft_377: i32 loc(callsite(#loc1 at #loc165)), %ileft_378: i32 loc(callsite(#loc1 at #loc165))): + %ileft_379 = arith.addi %ileft_377, %ileft_378 : i32 loc(#loc211) + tt.reduce.return %ileft_379 : i32 loc(#loc201) + }) : (tensor<32x2x8xi32>) -> tensor<32x8xi32> loc(#loc201) + %ileft_249 = tt.expand_dims %ileft_248 {axis = 1 : i32} : tensor<32x8xi32> -> tensor<32x1x8xi32> loc(#loc166) + %ileft_250 = tt.broadcast %ileft_249 : tensor<32x1x8xi32> -> tensor<32x2x8xi32> loc(#loc167) + %iright_251 = arith.muli %y_245, %flip_134 : tensor<32x2x8xi32> loc(#loc168) + %iright_252 = "tt.reduce"(%iright_251) <{axis = 1 : i32}> ({ + ^bb0(%iright_377: i32 loc(callsite(#loc1 at #loc169)), %iright_378: i32 loc(callsite(#loc1 at #loc169))): + %iright_379 = arith.addi %iright_377, %iright_378 : i32 loc(#loc212) + tt.reduce.return %iright_379 : i32 loc(#loc203) + }) : (tensor<32x2x8xi32>) -> tensor<32x8xi32> loc(#loc203) + %iright_253 = tt.expand_dims %iright_252 {axis = 1 : i32} : tensor<32x8xi32> -> tensor<32x1x8xi32> loc(#loc170) + %iright_254 = tt.broadcast %iright_253 : tensor<32x1x8xi32> -> tensor<32x2x8xi32> loc(#loc171) + %ileft_255 = tt.reshape %ileft_250 : tensor<32x2x8xi32> -> tensor<32x16xi32> loc(#loc172) + %iright_256 = tt.reshape %iright_254 : tensor<32x2x8xi32> -> tensor<32x16xi32> loc(#loc173) + %y_idx_257 = tt.reshape %new_idxs_244 : tensor<32x16xi32> -> tensor<32x2x8xi32> loc(#loc174) + %left_idx_258 = arith.muli %y_idx_257, %ileft_246 : tensor<32x2x8xi32> loc(#loc176) + %left_idx_259 = "tt.reduce"(%left_idx_258) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_377: i32 loc(callsite(#loc1 at #loc177)), %left_idx_378: i32 loc(callsite(#loc1 at #loc177))): + %left_idx_379 = arith.addi %left_idx_377, %left_idx_378 : i32 loc(#loc213) + 
tt.reduce.return %left_idx_379 : i32 loc(#loc206) + }) : (tensor<32x2x8xi32>) -> tensor<32x8xi32> loc(#loc206) + %left_idx_260 = tt.expand_dims %left_idx_259 {axis = 1 : i32} : tensor<32x8xi32> -> tensor<32x1x8xi32> loc(#loc178) + %left_idx_261 = tt.broadcast %left_idx_260 : tensor<32x1x8xi32> -> tensor<32x2x8xi32> loc(#loc179) + %right_idx_262 = arith.muli %y_idx_257, %flip_134 : tensor<32x2x8xi32> loc(#loc181) + %right_idx_263 = "tt.reduce"(%right_idx_262) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_377: i32 loc(callsite(#loc1 at #loc182)), %right_idx_378: i32 loc(callsite(#loc1 at #loc182))): + %right_idx_379 = arith.addi %right_idx_377, %right_idx_378 : i32 loc(#loc214) + tt.reduce.return %right_idx_379 : i32 loc(#loc209) + }) : (tensor<32x2x8xi32>) -> tensor<32x8xi32> loc(#loc209) + %right_idx_264 = tt.expand_dims %right_idx_263 {axis = 1 : i32} : tensor<32x8xi32> -> tensor<32x1x8xi32> loc(#loc183) + %right_idx_265 = tt.broadcast %right_idx_264 : tensor<32x1x8xi32> -> tensor<32x2x8xi32> loc(#loc184) + %left_idx_266 = tt.reshape %left_idx_261 : tensor<32x2x8xi32> -> tensor<32x16xi32> loc(#loc185) + %right_idx_267 = tt.reshape %right_idx_265 : tensor<32x2x8xi32> -> tensor<32x16xi32> loc(#loc186) + %cond_268 = arith.cmpi slt, %ileft_255, %iright_256 : tensor<32x16xi32> loc(#loc187) + %eq_269 = arith.cmpi eq, %ileft_255, %iright_256 : tensor<32x16xi32> loc(#loc188) + %cond_270 = arith.cmpi sgt, %left_idx_266, %right_idx_267 : tensor<32x16xi32> loc(#loc189) + %cond_271 = arith.andi %eq_269, %cond_270 : tensor<32x16xi1> loc(#loc190) + %cond_272 = arith.ori %cond_268, %cond_271 : tensor<32x16xi1> loc(#loc191) + %ret_273 = arith.xori %ileft_255, %iright_256 : tensor<32x16xi32> loc(#loc194) + %ret_274 = arith.select %cond_272, %ret_273, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc195) + %ret_275 = arith.xori %ret_241, %ret_274 : tensor<32x16xi32> loc(#loc196) + %new_idxs_276 = arith.xori %left_idx_266, %right_idx_267 : tensor<32x16xi32> loc(#loc197) + 
%new_idxs_277 = arith.select %cond_272, %new_idxs_276, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc198) + %new_idxs_278 = arith.xori %new_idxs_244, %new_idxs_277 : tensor<32x16xi32> loc(#loc199) + %y_279 = tt.reshape %ret_275 : tensor<32x16xi32> -> tensor<64x2x4xi32> loc(#loc162) + %ileft_280 = arith.muli %y_279, %ileft_137 : tensor<64x2x4xi32> loc(#loc164) + %ileft_281 = "tt.reduce"(%ileft_280) <{axis = 1 : i32}> ({ + ^bb0(%ileft_377: i32 loc(callsite(#loc1 at #loc165)), %ileft_378: i32 loc(callsite(#loc1 at #loc165))): + %ileft_379 = arith.addi %ileft_377, %ileft_378 : i32 loc(#loc211) + tt.reduce.return %ileft_379 : i32 loc(#loc201) + }) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc201) + %ileft_282 = tt.expand_dims %ileft_281 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc166) + %ileft_283 = tt.broadcast %ileft_282 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc167) + %iright_284 = arith.muli %y_279, %flip_59 : tensor<64x2x4xi32> loc(#loc168) + %iright_285 = "tt.reduce"(%iright_284) <{axis = 1 : i32}> ({ + ^bb0(%iright_377: i32 loc(callsite(#loc1 at #loc169)), %iright_378: i32 loc(callsite(#loc1 at #loc169))): + %iright_379 = arith.addi %iright_377, %iright_378 : i32 loc(#loc212) + tt.reduce.return %iright_379 : i32 loc(#loc203) + }) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc203) + %iright_286 = tt.expand_dims %iright_285 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc170) + %iright_287 = tt.broadcast %iright_286 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc171) + %ileft_288 = tt.reshape %ileft_283 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc172) + %iright_289 = tt.reshape %iright_287 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc173) + %y_idx_290 = tt.reshape %new_idxs_278 : tensor<32x16xi32> -> tensor<64x2x4xi32> loc(#loc174) + %left_idx_291 = arith.muli %y_idx_290, %ileft_137 : tensor<64x2x4xi32> loc(#loc176) + %left_idx_292 = "tt.reduce"(%left_idx_291) <{axis = 1 : i32}> ({ + 
^bb0(%left_idx_377: i32 loc(callsite(#loc1 at #loc177)), %left_idx_378: i32 loc(callsite(#loc1 at #loc177))): + %left_idx_379 = arith.addi %left_idx_377, %left_idx_378 : i32 loc(#loc213) + tt.reduce.return %left_idx_379 : i32 loc(#loc206) + }) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc206) + %left_idx_293 = tt.expand_dims %left_idx_292 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc178) + %left_idx_294 = tt.broadcast %left_idx_293 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc179) + %right_idx_295 = arith.muli %y_idx_290, %flip_59 : tensor<64x2x4xi32> loc(#loc181) + %right_idx_296 = "tt.reduce"(%right_idx_295) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_377: i32 loc(callsite(#loc1 at #loc182)), %right_idx_378: i32 loc(callsite(#loc1 at #loc182))): + %right_idx_379 = arith.addi %right_idx_377, %right_idx_378 : i32 loc(#loc214) + tt.reduce.return %right_idx_379 : i32 loc(#loc209) + }) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc209) + %right_idx_297 = tt.expand_dims %right_idx_296 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc183) + %right_idx_298 = tt.broadcast %right_idx_297 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc184) + %left_idx_299 = tt.reshape %left_idx_294 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc185) + %right_idx_300 = tt.reshape %right_idx_298 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc186) + %cond_301 = arith.cmpi slt, %ileft_288, %iright_289 : tensor<32x16xi32> loc(#loc187) + %eq_302 = arith.cmpi eq, %ileft_288, %iright_289 : tensor<32x16xi32> loc(#loc188) + %cond_303 = arith.cmpi sgt, %left_idx_299, %right_idx_300 : tensor<32x16xi32> loc(#loc189) + %cond_304 = arith.andi %eq_302, %cond_303 : tensor<32x16xi1> loc(#loc190) + %cond_305 = arith.ori %cond_301, %cond_304 : tensor<32x16xi1> loc(#loc191) + %ret_306 = arith.xori %ileft_288, %iright_289 : tensor<32x16xi32> loc(#loc194) + %ret_307 = arith.select %cond_305, %ret_306, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> 
loc(#loc195) + %ret_308 = arith.xori %ret_275, %ret_307 : tensor<32x16xi32> loc(#loc196) + %new_idxs_309 = arith.xori %left_idx_299, %right_idx_300 : tensor<32x16xi32> loc(#loc197) + %new_idxs_310 = arith.select %cond_305, %new_idxs_309, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc198) + %new_idxs_311 = arith.xori %new_idxs_278, %new_idxs_310 : tensor<32x16xi32> loc(#loc199) + %y_312 = tt.reshape %ret_308 : tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc162) + %ileft_313 = arith.muli %y_312, %ileft_62 : tensor<128x2x2xi32> loc(#loc164) + %ileft_314 = "tt.reduce"(%ileft_313) <{axis = 1 : i32}> ({ + ^bb0(%ileft_377: i32 loc(callsite(#loc1 at #loc165)), %ileft_378: i32 loc(callsite(#loc1 at #loc165))): + %ileft_379 = arith.addi %ileft_377, %ileft_378 : i32 loc(#loc211) + tt.reduce.return %ileft_379 : i32 loc(#loc201) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc201) + %ileft_315 = tt.expand_dims %ileft_314 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc166) + %ileft_316 = tt.broadcast %ileft_315 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc167) + %iright_317 = arith.muli %y_312, %flip_22 : tensor<128x2x2xi32> loc(#loc168) + %iright_318 = "tt.reduce"(%iright_317) <{axis = 1 : i32}> ({ + ^bb0(%iright_377: i32 loc(callsite(#loc1 at #loc169)), %iright_378: i32 loc(callsite(#loc1 at #loc169))): + %iright_379 = arith.addi %iright_377, %iright_378 : i32 loc(#loc212) + tt.reduce.return %iright_379 : i32 loc(#loc203) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc203) + %iright_319 = tt.expand_dims %iright_318 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc170) + %iright_320 = tt.broadcast %iright_319 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc171) + %ileft_321 = tt.reshape %ileft_316 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc172) + %iright_322 = tt.reshape %iright_320 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc173) + %y_idx_323 = tt.reshape %new_idxs_311 : 
tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc174) + %left_idx_324 = arith.muli %y_idx_323, %ileft_62 : tensor<128x2x2xi32> loc(#loc176) + %left_idx_325 = "tt.reduce"(%left_idx_324) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_377: i32 loc(callsite(#loc1 at #loc177)), %left_idx_378: i32 loc(callsite(#loc1 at #loc177))): + %left_idx_379 = arith.addi %left_idx_377, %left_idx_378 : i32 loc(#loc213) + tt.reduce.return %left_idx_379 : i32 loc(#loc206) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc206) + %left_idx_326 = tt.expand_dims %left_idx_325 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc178) + %left_idx_327 = tt.broadcast %left_idx_326 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc179) + %right_idx_328 = arith.muli %y_idx_323, %flip_22 : tensor<128x2x2xi32> loc(#loc181) + %right_idx_329 = "tt.reduce"(%right_idx_328) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_377: i32 loc(callsite(#loc1 at #loc182)), %right_idx_378: i32 loc(callsite(#loc1 at #loc182))): + %right_idx_379 = arith.addi %right_idx_377, %right_idx_378 : i32 loc(#loc214) + tt.reduce.return %right_idx_379 : i32 loc(#loc209) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc209) + %right_idx_330 = tt.expand_dims %right_idx_329 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc183) + %right_idx_331 = tt.broadcast %right_idx_330 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc184) + %left_idx_332 = tt.reshape %left_idx_327 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc185) + %right_idx_333 = tt.reshape %right_idx_331 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc186) + %cond_334 = arith.cmpi slt, %ileft_321, %iright_322 : tensor<32x16xi32> loc(#loc187) + %eq_335 = arith.cmpi eq, %ileft_321, %iright_322 : tensor<32x16xi32> loc(#loc188) + %cond_336 = arith.cmpi sgt, %left_idx_332, %right_idx_333 : tensor<32x16xi32> loc(#loc189) + %cond_337 = arith.andi %eq_335, %cond_336 : tensor<32x16xi1> loc(#loc190) + %cond_338 = arith.ori 
%cond_334, %cond_337 : tensor<32x16xi1> loc(#loc191) + %ret_339 = arith.xori %ileft_321, %iright_322 : tensor<32x16xi32> loc(#loc194) + %ret_340 = arith.select %cond_338, %ret_339, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc195) + %ret_341 = arith.xori %ret_308, %ret_340 : tensor<32x16xi32> loc(#loc196) + %new_idxs_342 = arith.xori %left_idx_332, %right_idx_333 : tensor<32x16xi32> loc(#loc197) + %new_idxs_343 = arith.select %cond_338, %new_idxs_342, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc198) + %new_idxs_344 = arith.xori %new_idxs_311, %new_idxs_343 : tensor<32x16xi32> loc(#loc199) + %y_345 = tt.reshape %ret_341 : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc162) + %ileft_346 = arith.muli %y_345, %ileft : tensor<256x2x1xi32> loc(#loc164) + %ileft_347 = "tt.reduce"(%ileft_346) <{axis = 1 : i32}> ({ + ^bb0(%ileft_377: i32 loc(callsite(#loc1 at #loc165)), %ileft_378: i32 loc(callsite(#loc1 at #loc165))): + %ileft_379 = arith.addi %ileft_377, %ileft_378 : i32 loc(#loc211) + tt.reduce.return %ileft_379 : i32 loc(#loc201) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc201) + %ileft_348 = tt.expand_dims %ileft_347 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc166) + %ileft_349 = tt.broadcast %ileft_348 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc167) + %iright_350 = arith.muli %y_345, %iright : tensor<256x2x1xi32> loc(#loc168) + %iright_351 = "tt.reduce"(%iright_350) <{axis = 1 : i32}> ({ + ^bb0(%iright_377: i32 loc(callsite(#loc1 at #loc169)), %iright_378: i32 loc(callsite(#loc1 at #loc169))): + %iright_379 = arith.addi %iright_377, %iright_378 : i32 loc(#loc212) + tt.reduce.return %iright_379 : i32 loc(#loc203) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc203) + %iright_352 = tt.expand_dims %iright_351 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc170) + %iright_353 = tt.broadcast %iright_352 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc171) + %ileft_354 = 
tt.reshape %ileft_349 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc172) + %iright_355 = tt.reshape %iright_353 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc173) + %y_idx_356 = tt.reshape %new_idxs_344 : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc174) + %left_idx_357 = arith.muli %y_idx_356, %ileft : tensor<256x2x1xi32> loc(#loc176) + %left_idx_358 = "tt.reduce"(%left_idx_357) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_377: i32 loc(callsite(#loc1 at #loc177)), %left_idx_378: i32 loc(callsite(#loc1 at #loc177))): + %left_idx_379 = arith.addi %left_idx_377, %left_idx_378 : i32 loc(#loc213) + tt.reduce.return %left_idx_379 : i32 loc(#loc206) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc206) + %left_idx_359 = tt.expand_dims %left_idx_358 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc178) + %left_idx_360 = tt.broadcast %left_idx_359 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc179) + %right_idx_361 = arith.muli %y_idx_356, %iright : tensor<256x2x1xi32> loc(#loc181) + %right_idx_362 = "tt.reduce"(%right_idx_361) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_377: i32 loc(callsite(#loc1 at #loc182)), %right_idx_378: i32 loc(callsite(#loc1 at #loc182))): + %right_idx_379 = arith.addi %right_idx_377, %right_idx_378 : i32 loc(#loc214) + tt.reduce.return %right_idx_379 : i32 loc(#loc209) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc209) + %right_idx_363 = tt.expand_dims %right_idx_362 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc183) + %right_idx_364 = tt.broadcast %right_idx_363 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc184) + %left_idx_365 = tt.reshape %left_idx_360 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc185) + %right_idx_366 = tt.reshape %right_idx_364 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc186) + %cond_367 = arith.cmpi slt, %ileft_354, %iright_355 : tensor<32x16xi32> loc(#loc187) + %eq_368 = arith.cmpi eq, %ileft_354, %iright_355 : tensor<32x16xi32> 
loc(#loc188) + %cond_369 = arith.cmpi sgt, %left_idx_365, %right_idx_366 : tensor<32x16xi32> loc(#loc189) + %cond_370 = arith.andi %eq_368, %cond_369 : tensor<32x16xi1> loc(#loc190) + %cond_371 = arith.ori %cond_367, %cond_370 : tensor<32x16xi1> loc(#loc191) + %new_idxs_372 = arith.xori %left_idx_365, %right_idx_366 : tensor<32x16xi32> loc(#loc197) + %new_idxs_373 = arith.select %cond_371, %new_idxs_372, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc198) + %new_idxs_374 = arith.xori %new_idxs_344, %new_idxs_373 : tensor<32x16xi32> loc(#loc199) + %tmp7 = arith.extsi %tmp0_19 : tensor<32x16xi32> to tensor<32x16xi64> loc(#loc149) + %tmp10_375 = arith.select %tmp0_18, %tmp7, %tmp10 : tensor<32x16xi1>, tensor<32x16xi64> loc(#loc86) + %tmp11 = "tt.reduce"(%tmp10_375) <{axis = 1 : i32}> ({ + ^bb0(%tmp11_377: i64 loc(callsite(#loc1 at #loc150)), %tmp11_378: i64 loc(callsite(#loc1 at #loc150))): + %tmp11_379 = arith.addi %tmp11_377, %tmp11_378 : i64 loc(#loc200) + tt.reduce.return %tmp11_379 : i64 loc(#loc160) + }) : (tensor<32x16xi64>) -> tensor<32xi64> loc(#loc160) + %tmp11_376 = tt.expand_dims %tmp11 {axis = 1 : i32} : tensor<32xi64> -> tensor<32x1xi64> loc(#loc151) + %tmp14 = arith.trunci %tmp11_376 : tensor<32x1xi64> to tensor<32x1xi32> loc(#loc152) + %0 = arith.muli %xindex_6, %cst_2 : tensor<32x1xi32> loc(#loc73) + %1 = tt.broadcast %r0_index_8 : tensor<1x16xi32> -> tensor<32x16xi32> loc(#loc74) + %2 = tt.broadcast %0 : tensor<32x1xi32> -> tensor<32x16xi32> loc(#loc74) + %3 = arith.addi %1, %2 : tensor<32x16xi32> loc(#loc74) + %4 = tt.splat %out_ptr2 : !tt.ptr -> tensor<32x16x!tt.ptr> loc(#loc75) + %5 = tt.addptr %4, %3 : tensor<32x16x!tt.ptr>, tensor<32x16xi32> loc(#loc75) + tt.store %5, %new_idxs_374, %tmp0_18 : tensor<32x16x!tt.ptr> loc(#loc76) + %6 = tt.splat %out_ptr3 : !tt.ptr -> tensor<32x1x!tt.ptr> loc(#loc77) + %7 = tt.addptr %6, %xindex_6 : tensor<32x1x!tt.ptr>, tensor<32x1xi32> loc(#loc77) + tt.store %7, %tmp14, %xmask_7 : tensor<32x1x!tt.ptr> 
loc(#loc78) + tt.return loc(#loc79) + } loc(#loc) +} loc(#loc) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":44:34) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":36:49) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":36:38) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":26:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":24:28) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":24:33) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":25:36) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":25:44) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":25:23) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":27:28) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":27:38) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":33:19) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":34:19) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":36:35) +#loc17 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":36:45) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":36:30) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":36:54) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":38:19) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":40:33) +#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:41) +#loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:44) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:60) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:68) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":533:22) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":537:21) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:40) +#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:65) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:78) +#loc36 = 
loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:41) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:67) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:80) +#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":540:30) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":541:32) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":546:29) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:36) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:23) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":290:25) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:53) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:66) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:37) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:23) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:54) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:67) +#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":553:36) +#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":554:38) +#loc56 = 
loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":574:22) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":591:21) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:40) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:29) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:23) +#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:19) +#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:28) +#loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:38) +#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:46) +#loc65 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:15) +#loc66 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:48) +#loc67 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:59) +#loc68 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:22) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":42:19) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":45:29) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":48:21) +#loc73 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":49:35) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":49:32) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":49:25) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":49:47) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":50:25) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":50:37) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":50:4) +#loc85 = loc(callsite(#loc1 at #loc2)) +#loc86 = loc("tmp10"(#loc3)) +#loc87 = loc("tmp0"(#loc4)) +#loc88 = loc("tmp0"(#loc5)) +#loc89 = loc("xmask"(#loc6)) +#loc90 = loc("xoffset"(#loc7)) +#loc91 = loc("xoffset"(#loc8)) +#loc92 = loc("xindex"(#loc9)) +#loc93 = loc("xindex"(#loc10)) +#loc94 = loc("xindex"(#loc11)) +#loc95 = loc("r0_index"(#loc12)) +#loc96 = loc("r0_index"(#loc13)) +#loc97 = loc("x0"(#loc14)) +#loc98 = loc("x1"(#loc15)) +#loc99 = loc("tmp0"(#loc16)) +#loc100 = loc("tmp0"(#loc17)) +#loc101 = loc("tmp0"(#loc18)) +#loc102 = loc("tmp0"(#loc19)) +#loc103 = loc("tmp2"(#loc20)) +#loc104 = loc("tmp4"(#loc21)) +#loc105 = loc("flip"(#loc22)) +#loc107 = loc("flip"(#loc24)) +#loc108 = loc("flip"(#loc25)) +#loc109 = loc("flip"(#loc26)) +#loc110 = loc("y"(#loc27)) +#loc111 = loc("left_mask"(#loc29)) +#loc112 = loc("ileft"(#loc30)) +#loc114 = loc("ileft"(#loc34)) +#loc115 = loc("ileft"(#loc35)) +#loc116 = loc("iright"(#loc36)) +#loc118 = loc("iright"(#loc38)) +#loc119 = loc("iright"(#loc39)) +#loc120 = loc("ileft"(#loc40)) +#loc121 = 
loc("iright"(#loc41)) +#loc122 = loc("y_idx"(#loc42)) +#loc123 = loc("left_idx"(#loc43)) +#loc124 = loc("left_idx"(#loc44)) +#loc125 = loc("input"(#loc45)) +#loc127 = loc("left_idx"(#loc47)) +#loc128 = loc("left_idx"(#loc48)) +#loc129 = loc("right_idx"(#loc49)) +#loc130 = loc("right_idx"(#loc50)) +#loc132 = loc("right_idx"(#loc52)) +#loc133 = loc("right_idx"(#loc53)) +#loc134 = loc("left_idx"(#loc54)) +#loc135 = loc("right_idx"(#loc55)) +#loc136 = loc("cond"(#loc56)) +#loc137 = loc("eq"(#loc57)) +#loc138 = loc("cond"(#loc58)) +#loc139 = loc("cond"(#loc59)) +#loc140 = loc("cond"(#loc60)) +#loc141 = loc("cond"(#loc61)) +#loc142 = loc("cond"(#loc62)) +#loc143 = loc("ret"(#loc63)) +#loc144 = loc("ret"(#loc64)) +#loc145 = loc("ret"(#loc65)) +#loc146 = loc("new_idxs"(#loc66)) +#loc147 = loc("new_idxs"(#loc67)) +#loc148 = loc("new_idxs"(#loc68)) +#loc149 = loc("tmp7"(#loc69)) +#loc151 = loc("tmp11"(#loc71)) +#loc152 = loc("tmp14"(#loc72)) +#loc153 = loc(callsite(#loc105 at #loc106)) +#loc154 = loc(callsite(#loc107 at #loc106)) +#loc155 = loc(callsite(#loc108 at #loc106)) +#loc156 = loc(callsite(#loc109 at #loc106)) +#loc158 = loc("cond"(#loc136)) +#loc159 = loc("eq"(#loc137)) +#loc160 = loc(callsite(#loc31 at #loc150)) +#loc162 = loc(callsite(#loc110 at #loc157)) +#loc163 = loc(callsite(#loc111 at #loc157)) +#loc164 = loc(callsite(#loc112 at #loc157)) +#loc166 = loc(callsite(#loc114 at #loc157)) +#loc167 = loc(callsite(#loc115 at #loc157)) +#loc168 = loc(callsite(#loc116 at #loc157)) +#loc170 = loc(callsite(#loc118 at #loc157)) +#loc171 = loc(callsite(#loc119 at #loc157)) +#loc172 = loc(callsite(#loc120 at #loc157)) +#loc173 = loc(callsite(#loc121 at #loc157)) +#loc174 = loc(callsite(#loc122 at #loc157)) +#loc175 = loc(callsite(#loc123 at #loc157)) +#loc176 = loc(callsite(#loc124 at #loc157)) +#loc178 = loc(callsite(#loc127 at #loc157)) +#loc179 = loc(callsite(#loc128 at #loc157)) +#loc180 = loc(callsite(#loc129 at #loc157)) +#loc181 = loc(callsite(#loc130 at #loc157)) 
+#loc183 = loc(callsite(#loc132 at #loc157)) +#loc184 = loc(callsite(#loc133 at #loc157)) +#loc185 = loc(callsite(#loc134 at #loc157)) +#loc186 = loc(callsite(#loc135 at #loc157)) +#loc187 = loc(callsite(#loc158 at #loc157)) +#loc188 = loc(callsite(#loc159 at #loc157)) +#loc189 = loc(callsite(#loc138 at #loc157)) +#loc190 = loc(callsite(#loc139 at #loc157)) +#loc191 = loc(callsite(#loc140 at #loc157)) +#loc192 = loc(callsite(#loc141 at #loc157)) +#loc193 = loc(callsite(#loc142 at #loc157)) +#loc194 = loc(callsite(#loc143 at #loc157)) +#loc195 = loc(callsite(#loc144 at #loc157)) +#loc196 = loc(callsite(#loc145 at #loc157)) +#loc197 = loc(callsite(#loc146 at #loc157)) +#loc198 = loc(callsite(#loc147 at #loc157)) +#loc199 = loc(callsite(#loc148 at #loc157)) +#loc200 = loc(callsite(#loc33 at #loc160)) +#loc201 = loc(callsite(#loc31 at #loc165)) +#loc203 = loc(callsite(#loc31 at #loc169)) +#loc205 = loc(callsite(#loc125 at #loc177)) +#loc206 = loc(callsite(#loc31 at #loc177)) +#loc208 = loc(callsite(#loc125 at #loc182)) +#loc209 = loc(callsite(#loc31 at #loc182)) +#loc211 = loc(callsite(#loc33 at #loc201)) +#loc212 = loc(callsite(#loc33 at #loc203)) +#loc213 = loc(callsite(#loc33 at #loc206)) +#loc214 = loc(callsite(#loc33 at #loc209)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/BZGMO2ROUNNVUMEUFCQSSF6FGNZS2MEIPAEECDHYOJVVX3KB75AA/__grp__triton_red_fused_argmax_1.json b/SpecForge-ext/cache/compiled_kernels/triton/7/BZGMO2ROUNNVUMEUFCQSSF6FGNZS2MEIPAEECDHYOJVVX3KB75AA/__grp__triton_red_fused_argmax_1.json new file mode 100644 index 0000000000000000000000000000000000000000..5af6cf54a2f50c6d76e4986bd40419d12f05e71e --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/BZGMO2ROUNNVUMEUFCQSSF6FGNZS2MEIPAEECDHYOJVVX3KB75AA/__grp__triton_red_fused_argmax_1.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused_argmax_1.source": 
"/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/BZGMO2ROUNNVUMEUFCQSSF6FGNZS2MEIPAEECDHYOJVVX3KB75AA/triton_red_fused_argmax_1.source", "triton_red_fused_argmax_1.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/BZGMO2ROUNNVUMEUFCQSSF6FGNZS2MEIPAEECDHYOJVVX3KB75AA/triton_red_fused_argmax_1.ttir", "triton_red_fused_argmax_1.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/BZGMO2ROUNNVUMEUFCQSSF6FGNZS2MEIPAEECDHYOJVVX3KB75AA/triton_red_fused_argmax_1.ttgir", "triton_red_fused_argmax_1.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/BZGMO2ROUNNVUMEUFCQSSF6FGNZS2MEIPAEECDHYOJVVX3KB75AA/triton_red_fused_argmax_1.llir", "triton_red_fused_argmax_1.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/BZGMO2ROUNNVUMEUFCQSSF6FGNZS2MEIPAEECDHYOJVVX3KB75AA/triton_red_fused_argmax_1.ptx", "triton_red_fused_argmax_1.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/BZGMO2ROUNNVUMEUFCQSSF6FGNZS2MEIPAEECDHYOJVVX3KB75AA/triton_red_fused_argmax_1.cubin", "triton_red_fused_argmax_1.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/BZGMO2ROUNNVUMEUFCQSSF6FGNZS2MEIPAEECDHYOJVVX3KB75AA/triton_red_fused_argmax_1.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/BZGMO2ROUNNVUMEUFCQSSF6FGNZS2MEIPAEECDHYOJVVX3KB75AA/triton_red_fused_argmax_1.cubin b/SpecForge-ext/cache/compiled_kernels/triton/7/BZGMO2ROUNNVUMEUFCQSSF6FGNZS2MEIPAEECDHYOJVVX3KB75AA/triton_red_fused_argmax_1.cubin new file mode 100644 index 0000000000000000000000000000000000000000..2ddb818339fee4b1766cd1cf4abdc96b40b7e433 Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/7/BZGMO2ROUNNVUMEUFCQSSF6FGNZS2MEIPAEECDHYOJVVX3KB75AA/triton_red_fused_argmax_1.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/BZGMO2ROUNNVUMEUFCQSSF6FGNZS2MEIPAEECDHYOJVVX3KB75AA/triton_red_fused_argmax_1.json 
b/SpecForge-ext/cache/compiled_kernels/triton/7/BZGMO2ROUNNVUMEUFCQSSF6FGNZS2MEIPAEECDHYOJVVX3KB75AA/triton_red_fused_argmax_1.json new file mode 100644 index 0000000000000000000000000000000000000000..943fbb2c39792e9896a6eb0b756bfbaf891c8c5a --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/BZGMO2ROUNNVUMEUFCQSSF6FGNZS2MEIPAEECDHYOJVVX3KB75AA/triton_red_fused_argmax_1.json @@ -0,0 +1 @@ +{"hash": "0e4cc76a2ea35b5a309428a12917c533732d30887808410cf8726b5bed41ff40", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 16, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 1024, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused_argmax_1"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/BZGMO2ROUNNVUMEUFCQSSF6FGNZS2MEIPAEECDHYOJVVX3KB75AA/triton_red_fused_argmax_1.llir b/SpecForge-ext/cache/compiled_kernels/triton/7/BZGMO2ROUNNVUMEUFCQSSF6FGNZS2MEIPAEECDHYOJVVX3KB75AA/triton_red_fused_argmax_1.llir new file mode 100644 index 0000000000000000000000000000000000000000..882d69ccd4ac9f167442bd71b4d3354e4a5a1978 --- /dev/null +++ 
b/SpecForge-ext/cache/compiled_kernels/triton/7/BZGMO2ROUNNVUMEUFCQSSF6FGNZS2MEIPAEECDHYOJVVX3KB75AA/triton_red_fused_argmax_1.llir @@ -0,0 +1,1166 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@global_smem = external addrspace(3) global [0 x i8], align 16 + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused_argmax_1(ptr addrspace(1) %0, ptr addrspace(1) %1, i64 %2, i64 %3, i32 %4, i32 %5, ptr addrspace(1) readnone captures(none) %6, ptr addrspace(1) readnone captures(none) %7) local_unnamed_addr #0 !dbg !4 { + %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %10 = shl i32 %9, 6, !dbg !8 + %11 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %12 = and i32 %11, 448, !dbg !9 + %13 = and i32 %11, 63, !dbg !9 + %14 = lshr exact i32 %12, 6, !dbg !9 + %15 = or disjoint i32 %14, 8, !dbg !9 + %16 = or disjoint i32 %14, 16, !dbg !9 + %17 = or disjoint i32 %14, 24, !dbg !9 + %18 = insertelement <4 x i32> poison, i32 %14, i64 0, !dbg !9 + %19 = shufflevector <4 x i32> %18, <4 x i32> poison, <4 x i32> zeroinitializer, !dbg !9 + %20 = or disjoint <4 x i32> %19, , !dbg !9 + %21 = insertelement <8 x i32> poison, i32 %17, i64 4, !dbg !10 + %22 = insertelement <8 x i32> %21, i32 %16, i64 5, !dbg !10 + %23 = insertelement <8 x i32> %22, i32 %15, i64 6, !dbg !10 + %24 = insertelement <8 x i32> %23, i32 %14, i64 7, !dbg !10 + %25 = shufflevector <4 x i32> %20, <4 x i32> poison, <8 x i32> , !dbg !10 + %26 = shufflevector <8 x i32> %25, <8 x i32> %24, <8 x i32> , !dbg !10 + %27 = insertelement <8 x i32> poison, i32 %10, i64 0, !dbg !10 + %28 = shufflevector <8 x i32> %27, <8 x i32> poison, <8 x i32> zeroinitializer, !dbg !10 + %29 = or disjoint <8 x i32> %26, %28, !dbg !10 + %30 = insertelement <8 x i32> poison, i32 %4, i64 0, !dbg !11 + %31 = shufflevector <8 x i32> %30, <8 x i32> poison, <8 x i32> 
zeroinitializer, !dbg !11 + %32 = icmp slt <8 x i32> %29, %31, !dbg !11 + %33 = extractelement <8 x i32> %29, i64 7, !dbg !12 + %34 = sext i32 %33 to i64, !dbg !12 + %35 = extractelement <8 x i32> %29, i64 6, !dbg !12 + %36 = sext i32 %35 to i64, !dbg !12 + %37 = extractelement <8 x i32> %29, i64 5, !dbg !12 + %38 = sext i32 %37 to i64, !dbg !12 + %39 = extractelement <8 x i32> %29, i64 4, !dbg !12 + %40 = sext i32 %39 to i64, !dbg !12 + %41 = extractelement <8 x i32> %29, i64 3, !dbg !12 + %42 = sext i32 %41 to i64, !dbg !12 + %43 = extractelement <8 x i32> %29, i64 2, !dbg !12 + %44 = sext i32 %43 to i64, !dbg !12 + %45 = extractelement <8 x i32> %29, i64 1, !dbg !12 + %46 = sext i32 %45 to i64, !dbg !12 + %47 = extractelement <8 x i32> %29, i64 0, !dbg !12 + %48 = sext i32 %47 to i64, !dbg !12 + %.frozen = freeze i64 %34, !dbg !13 + %.frozen70 = freeze i64 %2, !dbg !13 + %49 = sdiv i64 %.frozen, %.frozen70, !dbg !13 + %50 = mul i64 %49, %.frozen70, !dbg !12 + %.decomposed = sub i64 %.frozen, %50, !dbg !12 + %.frozen71 = freeze i64 %36, !dbg !13 + %.frozen72 = freeze i64 %2, !dbg !13 + %51 = sdiv i64 %.frozen71, %.frozen72, !dbg !13 + %52 = mul i64 %51, %.frozen72, !dbg !12 + %.decomposed73 = sub i64 %.frozen71, %52, !dbg !12 + %.frozen74 = freeze i64 %38, !dbg !13 + %.frozen75 = freeze i64 %2, !dbg !13 + %53 = sdiv i64 %.frozen74, %.frozen75, !dbg !13 + %54 = mul i64 %53, %.frozen75, !dbg !12 + %.decomposed76 = sub i64 %.frozen74, %54, !dbg !12 + %.frozen77 = freeze i64 %40, !dbg !13 + %.frozen78 = freeze i64 %2, !dbg !13 + %55 = sdiv i64 %.frozen77, %.frozen78, !dbg !13 + %56 = mul i64 %55, %.frozen78, !dbg !12 + %.decomposed79 = sub i64 %.frozen77, %56, !dbg !12 + %.frozen80 = freeze i64 %42, !dbg !13 + %.frozen81 = freeze i64 %2, !dbg !13 + %57 = sdiv i64 %.frozen80, %.frozen81, !dbg !13 + %58 = mul i64 %57, %.frozen81, !dbg !12 + %.decomposed82 = sub i64 %.frozen80, %58, !dbg !12 + %.frozen83 = freeze i64 %44, !dbg !13 + %.frozen84 = freeze i64 %2, !dbg !13 
+ %59 = sdiv i64 %.frozen83, %.frozen84, !dbg !13 + %60 = mul i64 %59, %.frozen84, !dbg !12 + %.decomposed85 = sub i64 %.frozen83, %60, !dbg !12 + %.frozen86 = freeze i64 %46, !dbg !13 + %.frozen87 = freeze i64 %2, !dbg !13 + %61 = sdiv i64 %.frozen86, %.frozen87, !dbg !13 + %62 = mul i64 %61, %.frozen87, !dbg !12 + %.decomposed88 = sub i64 %.frozen86, %62, !dbg !12 + %.frozen89 = freeze i64 %48, !dbg !13 + %.frozen90 = freeze i64 %2, !dbg !13 + %63 = sdiv i64 %.frozen89, %.frozen90, !dbg !13 + %64 = mul i64 %63, %.frozen90, !dbg !12 + %.decomposed91 = sub i64 %.frozen89, %64, !dbg !12 + %65 = mul i64 %49, %3, !dbg !14 + %66 = mul i64 %51, %3, !dbg !14 + %67 = mul i64 %53, %3, !dbg !14 + %68 = mul i64 %55, %3, !dbg !14 + %69 = mul i64 %57, %3, !dbg !14 + %70 = mul i64 %59, %3, !dbg !14 + %71 = mul i64 %61, %3, !dbg !14 + %72 = mul i64 %63, %3, !dbg !14 + %.idx = mul nsw i64 %.decomposed, 128000 + %73 = getelementptr i8, ptr addrspace(1) %0, i64 %.idx + %invariant.gep = getelementptr float, ptr addrspace(1) %73, i64 %65, !dbg !15 + %.idx1 = mul nsw i64 %.decomposed73, 128000 + %74 = getelementptr i8, ptr addrspace(1) %0, i64 %.idx1 + %invariant.gep9 = getelementptr float, ptr addrspace(1) %74, i64 %66, !dbg !15 + %.idx2 = mul nsw i64 %.decomposed76, 128000 + %75 = getelementptr i8, ptr addrspace(1) %0, i64 %.idx2 + %invariant.gep11 = getelementptr float, ptr addrspace(1) %75, i64 %67, !dbg !15 + %.idx3 = mul nsw i64 %.decomposed79, 128000 + %76 = getelementptr i8, ptr addrspace(1) %0, i64 %.idx3 + %invariant.gep13 = getelementptr float, ptr addrspace(1) %76, i64 %68, !dbg !15 + %.idx4 = mul nsw i64 %.decomposed82, 128000 + %77 = getelementptr i8, ptr addrspace(1) %0, i64 %.idx4 + %invariant.gep15 = getelementptr float, ptr addrspace(1) %77, i64 %69, !dbg !15 + %.idx5 = mul nsw i64 %.decomposed85, 128000 + %78 = getelementptr i8, ptr addrspace(1) %0, i64 %.idx5 + %invariant.gep17 = getelementptr float, ptr addrspace(1) %78, i64 %70, !dbg !15 + %.idx6 = mul nsw i64 
%.decomposed88, 128000 + %79 = getelementptr i8, ptr addrspace(1) %0, i64 %.idx6 + %invariant.gep19 = getelementptr float, ptr addrspace(1) %79, i64 %71, !dbg !15 + %.idx7 = mul nsw i64 %.decomposed91, 128000 + %80 = getelementptr i8, ptr addrspace(1) %0, i64 %.idx7 + %invariant.gep21 = getelementptr float, ptr addrspace(1) %80, i64 %72, !dbg !15 + %81 = zext nneg i32 %13 to i64, !dbg !15 + %82 = extractelement <8 x i1> %32, i64 0, !dbg !16 + %83 = extractelement <8 x i1> %32, i64 1, !dbg !16 + %84 = extractelement <8 x i1> %32, i64 2, !dbg !16 + %85 = extractelement <8 x i1> %32, i64 3, !dbg !16 + %86 = extractelement <8 x i1> %32, i64 4, !dbg !16 + %87 = extractelement <8 x i1> %32, i64 5, !dbg !16 + %88 = extractelement <8 x i1> %32, i64 6, !dbg !16 + %89 = extractelement <8 x i1> %32, i64 7, !dbg !16 + br label %90, !dbg !15 + +90: ; preds = %8, %90 + %indvars.iv = phi i64 [ 0, %8 ], [ %indvars.iv.next, %90 ] + %91 = phi <8 x float> [ splat (float 0xFFF0000000000000), %8 ], [ %139, %90 ] + %92 = phi <8 x i32> [ splat (i32 2147483647), %8 ], [ %140, %90 ] + %93 = or disjoint i64 %indvars.iv, %81, !dbg !17 + %gep = getelementptr float, ptr addrspace(1) %invariant.gep, i64 %93, !dbg !18 + %gep10 = getelementptr float, ptr addrspace(1) %invariant.gep9, i64 %93, !dbg !18 + %gep12 = getelementptr float, ptr addrspace(1) %invariant.gep11, i64 %93, !dbg !18 + %gep14 = getelementptr float, ptr addrspace(1) %invariant.gep13, i64 %93, !dbg !18 + %gep16 = getelementptr float, ptr addrspace(1) %invariant.gep15, i64 %93, !dbg !18 + %gep18 = getelementptr float, ptr addrspace(1) %invariant.gep17, i64 %93, !dbg !18 + %gep20 = getelementptr float, ptr addrspace(1) %invariant.gep19, i64 %93, !dbg !18 + %gep22 = getelementptr float, ptr addrspace(1) %invariant.gep21, i64 %93, !dbg !18 + %94 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !16 + %95 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 
ld.global.L1::evict_first.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %gep, i64 %94, i1 %89) #4, !dbg !16 + %96 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !16 + %97 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %gep10, i64 %96, i1 %88) #4, !dbg !16 + %98 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !16 + %99 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %gep12, i64 %98, i1 %87) #4, !dbg !16 + %100 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !16 + %101 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %gep14, i64 %100, i1 %86) #4, !dbg !16 + %102 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !16 + %103 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %gep16, i64 %102, i1 %85) #4, !dbg !16 + %104 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !16 + %105 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %gep18, i64 %104, i1 %84) #4, !dbg !16 + %106 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, 
!dbg !16 + %107 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %gep20, i64 %106, i1 %83) #4, !dbg !16 + %108 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !16 + %109 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %gep22, i64 %108, i1 %82) #4, !dbg !16 + %110 = fcmp uno <8 x float> %91, zeroinitializer, !dbg !19 + %111 = trunc nuw nsw i64 %93 to i32, !dbg !23 + %112 = insertelement <8 x i32> poison, i32 %109, i64 0, !dbg !16 + %113 = insertelement <8 x i32> %112, i32 %107, i64 1, !dbg !16 + %114 = insertelement <8 x i32> %113, i32 %105, i64 2, !dbg !16 + %115 = insertelement <8 x i32> %114, i32 %103, i64 3, !dbg !16 + %116 = insertelement <8 x i32> %115, i32 %101, i64 4, !dbg !16 + %117 = insertelement <8 x i32> %116, i32 %99, i64 5, !dbg !16 + %118 = insertelement <8 x i32> %117, i32 %97, i64 6, !dbg !16 + %119 = insertelement <8 x i32> %118, i32 %95, i64 7, !dbg !16 + %120 = bitcast <8 x i32> %119 to <8 x float>, !dbg !16 + %121 = fcmp ogt <8 x float> %91, %120, !dbg !24 + %122 = fcmp oeq <8 x float> %91, %120, !dbg !25 + %123 = fcmp uno <8 x float> %120, zeroinitializer, !dbg !26 + %124 = xor <8 x i1> %123, splat (i1 true), !dbg !27 + %125 = and <8 x i1> %110, %124, !dbg !28 + %126 = or <8 x i1> %121, %125, !dbg !29 + %127 = and <8 x i1> %110, %123, !dbg !30 + %128 = or <8 x i1> %122, %127, !dbg !31 + %129 = insertelement <8 x i64> poison, i64 %93, i64 0, !dbg !32 + %130 = shufflevector <8 x i64> %129, <8 x i64> poison, <8 x i32> zeroinitializer, !dbg !32 + %131 = sext <8 x i32> %92 to <8 x i64>, !dbg !32 + %132 = icmp sgt <8 x i64> %130, %131, !dbg !32 + %133 = and <8 x i1> %132, %128, !dbg !33 + %134 = or <8 x i1> %126, %133, !dbg !34 + %135 = select 
<8 x i1> %134, <8 x float> %91, <8 x float> %120, !dbg !35 + %136 = insertelement <8 x i32> poison, i32 %111, i64 0, !dbg !23 + %137 = shufflevector <8 x i32> %136, <8 x i32> poison, <8 x i32> zeroinitializer, !dbg !23 + %138 = select <8 x i1> %134, <8 x i32> %92, <8 x i32> %137, !dbg !23 + %139 = select <8 x i1> %32, <8 x float> %135, <8 x float> %91, !dbg !36 + %140 = select <8 x i1> %32, <8 x i32> %138, <8 x i32> %92, !dbg !37 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 64, !dbg !15 + %141 = icmp samesign ult i64 %indvars.iv, 31936, !dbg !15 + br i1 %141, label %90, label %142, !dbg !15 + +142: ; preds = %90 + %143 = or disjoint i32 %10, %13, !dbg !10 + %144 = icmp slt i32 %143, %4, !dbg !11 + %145 = and i32 %11, 31, !dbg !9 + %146 = lshr i32 %11, 5, !dbg !9 + %147 = extractelement <8 x float> %139, i64 7, !dbg !38 + %148 = bitcast float %147 to i32, !dbg !38 + %149 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %148, i32 16, i32 31), !dbg !38 + %150 = bitcast i32 %149 to float, !dbg !38 + %151 = extractelement <8 x i32> %140, i64 7, !dbg !38 + %152 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %151, i32 16, i32 31), !dbg !38 + %153 = fcmp ogt float %147, %150, !dbg !40 + %154 = fcmp oeq float %147, %150, !dbg !41 + %155 = fcmp uno <8 x float> %139, zeroinitializer, !dbg !42 + %156 = fcmp uno float %150, 0.000000e+00, !dbg !43 + %157 = xor i1 %156, true, !dbg !44 + %158 = extractelement <8 x i1> %155, i64 7, !dbg !45 + %159 = and i1 %158, %157, !dbg !46 + %160 = or i1 %153, %159, !dbg !47 + %161 = and i1 %158, %156, !dbg !45 + %162 = or i1 %154, %161, !dbg !48 + %163 = icmp slt i32 %151, %152, !dbg !49 + %164 = and i1 %163, %162, !dbg !50 + %165 = or i1 %160, %164, !dbg !51 + %166 = select i1 %165, float %147, float %150, !dbg !52 + %167 = select i1 %165, i32 %151, i32 %152, !dbg !53 + %168 = bitcast float %166 to i32, !dbg !38 + %169 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %168, i32 8, i32 31), !dbg !38 + %170 = 
bitcast i32 %169 to float, !dbg !38 + %171 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %167, i32 8, i32 31), !dbg !38 + %172 = fcmp ogt float %166, %170, !dbg !40 + %173 = fcmp oeq float %166, %170, !dbg !41 + %174 = fcmp uno float %166, 0.000000e+00, !dbg !42 + %175 = fcmp uno float %170, 0.000000e+00, !dbg !43 + %176 = xor i1 %175, true, !dbg !44 + %177 = and i1 %174, %176, !dbg !46 + %178 = or i1 %172, %177, !dbg !47 + %179 = and i1 %175, %174, !dbg !45 + %180 = or i1 %173, %179, !dbg !48 + %181 = icmp slt i32 %167, %171, !dbg !49 + %182 = and i1 %181, %180, !dbg !50 + %183 = or i1 %178, %182, !dbg !51 + %184 = select i1 %183, float %166, float %170, !dbg !52 + %185 = select i1 %183, i32 %167, i32 %171, !dbg !53 + %186 = bitcast float %184 to i32, !dbg !38 + %187 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %186, i32 4, i32 31), !dbg !38 + %188 = bitcast i32 %187 to float, !dbg !38 + %189 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %185, i32 4, i32 31), !dbg !38 + %190 = fcmp ogt float %184, %188, !dbg !40 + %191 = fcmp oeq float %184, %188, !dbg !41 + %192 = fcmp uno float %184, 0.000000e+00, !dbg !42 + %193 = fcmp uno float %188, 0.000000e+00, !dbg !43 + %194 = xor i1 %193, true, !dbg !44 + %195 = and i1 %192, %194, !dbg !46 + %196 = or i1 %190, %195, !dbg !47 + %197 = and i1 %193, %192, !dbg !45 + %198 = or i1 %191, %197, !dbg !48 + %199 = icmp slt i32 %185, %189, !dbg !49 + %200 = and i1 %199, %198, !dbg !50 + %201 = or i1 %196, %200, !dbg !51 + %202 = select i1 %201, float %184, float %188, !dbg !52 + %203 = select i1 %201, i32 %185, i32 %189, !dbg !53 + %204 = bitcast float %202 to i32, !dbg !38 + %205 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %204, i32 2, i32 31), !dbg !38 + %206 = bitcast i32 %205 to float, !dbg !38 + %207 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %203, i32 2, i32 31), !dbg !38 + %208 = fcmp ogt float %202, %206, !dbg !40 + %209 = fcmp oeq float %202, %206, !dbg !41 
+ %210 = fcmp uno float %202, 0.000000e+00, !dbg !42 + %211 = fcmp uno float %206, 0.000000e+00, !dbg !43 + %212 = xor i1 %211, true, !dbg !44 + %213 = and i1 %210, %212, !dbg !46 + %214 = or i1 %208, %213, !dbg !47 + %215 = and i1 %211, %210, !dbg !45 + %216 = or i1 %209, %215, !dbg !48 + %217 = icmp slt i32 %203, %207, !dbg !49 + %218 = and i1 %217, %216, !dbg !50 + %219 = or i1 %214, %218, !dbg !51 + %220 = select i1 %219, float %202, float %206, !dbg !52 + %221 = select i1 %219, i32 %203, i32 %207, !dbg !53 + %222 = bitcast float %220 to i32, !dbg !38 + %223 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %222, i32 1, i32 31), !dbg !38 + %224 = bitcast i32 %223 to float, !dbg !38 + %225 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %221, i32 1, i32 31), !dbg !38 + %226 = fcmp ogt float %220, %224, !dbg !40 + %227 = fcmp oeq float %220, %224, !dbg !41 + %228 = fcmp uno float %220, 0.000000e+00, !dbg !42 + %229 = fcmp uno float %224, 0.000000e+00, !dbg !43 + %230 = xor i1 %229, true, !dbg !44 + %231 = and i1 %228, %230, !dbg !46 + %232 = or i1 %226, %231, !dbg !47 + %233 = and i1 %229, %228, !dbg !45 + %234 = or i1 %227, %233, !dbg !48 + %235 = icmp slt i32 %221, %225, !dbg !49 + %236 = and i1 %235, %234, !dbg !50 + %237 = or i1 %232, %236, !dbg !51 + %238 = select i1 %237, i32 %221, i32 %225, !dbg !53 + %239 = extractelement <8 x float> %139, i64 6, !dbg !38 + %240 = bitcast float %239 to i32, !dbg !38 + %241 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %240, i32 16, i32 31), !dbg !38 + %242 = bitcast i32 %241 to float, !dbg !38 + %243 = extractelement <8 x i32> %140, i64 6, !dbg !38 + %244 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %243, i32 16, i32 31), !dbg !38 + %245 = fcmp ogt float %239, %242, !dbg !40 + %246 = fcmp oeq float %239, %242, !dbg !41 + %247 = fcmp uno float %242, 0.000000e+00, !dbg !43 + %248 = xor i1 %247, true, !dbg !44 + %249 = extractelement <8 x i1> %155, i64 6, !dbg !45 + %250 = and i1 
%249, %248, !dbg !46 + %251 = or i1 %245, %250, !dbg !47 + %252 = and i1 %249, %247, !dbg !45 + %253 = or i1 %246, %252, !dbg !48 + %254 = icmp slt i32 %243, %244, !dbg !49 + %255 = and i1 %254, %253, !dbg !50 + %256 = or i1 %251, %255, !dbg !51 + %257 = select i1 %256, float %239, float %242, !dbg !52 + %258 = select i1 %256, i32 %243, i32 %244, !dbg !53 + %259 = bitcast float %257 to i32, !dbg !38 + %260 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %259, i32 8, i32 31), !dbg !38 + %261 = bitcast i32 %260 to float, !dbg !38 + %262 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %258, i32 8, i32 31), !dbg !38 + %263 = fcmp ogt float %257, %261, !dbg !40 + %264 = fcmp oeq float %257, %261, !dbg !41 + %265 = fcmp uno float %257, 0.000000e+00, !dbg !42 + %266 = fcmp uno float %261, 0.000000e+00, !dbg !43 + %267 = xor i1 %266, true, !dbg !44 + %268 = and i1 %265, %267, !dbg !46 + %269 = or i1 %263, %268, !dbg !47 + %270 = and i1 %266, %265, !dbg !45 + %271 = or i1 %264, %270, !dbg !48 + %272 = icmp slt i32 %258, %262, !dbg !49 + %273 = and i1 %272, %271, !dbg !50 + %274 = or i1 %269, %273, !dbg !51 + %275 = select i1 %274, float %257, float %261, !dbg !52 + %276 = select i1 %274, i32 %258, i32 %262, !dbg !53 + %277 = bitcast float %275 to i32, !dbg !38 + %278 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %277, i32 4, i32 31), !dbg !38 + %279 = bitcast i32 %278 to float, !dbg !38 + %280 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %276, i32 4, i32 31), !dbg !38 + %281 = fcmp ogt float %275, %279, !dbg !40 + %282 = fcmp oeq float %275, %279, !dbg !41 + %283 = fcmp uno float %275, 0.000000e+00, !dbg !42 + %284 = fcmp uno float %279, 0.000000e+00, !dbg !43 + %285 = xor i1 %284, true, !dbg !44 + %286 = and i1 %283, %285, !dbg !46 + %287 = or i1 %281, %286, !dbg !47 + %288 = and i1 %284, %283, !dbg !45 + %289 = or i1 %282, %288, !dbg !48 + %290 = icmp slt i32 %276, %280, !dbg !49 + %291 = and i1 %290, %289, !dbg !50 + %292 = or 
i1 %287, %291, !dbg !51 + %293 = select i1 %292, float %275, float %279, !dbg !52 + %294 = select i1 %292, i32 %276, i32 %280, !dbg !53 + %295 = bitcast float %293 to i32, !dbg !38 + %296 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %295, i32 2, i32 31), !dbg !38 + %297 = bitcast i32 %296 to float, !dbg !38 + %298 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %294, i32 2, i32 31), !dbg !38 + %299 = fcmp ogt float %293, %297, !dbg !40 + %300 = fcmp oeq float %293, %297, !dbg !41 + %301 = fcmp uno float %293, 0.000000e+00, !dbg !42 + %302 = fcmp uno float %297, 0.000000e+00, !dbg !43 + %303 = xor i1 %302, true, !dbg !44 + %304 = and i1 %301, %303, !dbg !46 + %305 = or i1 %299, %304, !dbg !47 + %306 = and i1 %302, %301, !dbg !45 + %307 = or i1 %300, %306, !dbg !48 + %308 = icmp slt i32 %294, %298, !dbg !49 + %309 = and i1 %308, %307, !dbg !50 + %310 = or i1 %305, %309, !dbg !51 + %311 = select i1 %310, float %293, float %297, !dbg !52 + %312 = select i1 %310, i32 %294, i32 %298, !dbg !53 + %313 = bitcast float %311 to i32, !dbg !38 + %314 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %313, i32 1, i32 31), !dbg !38 + %315 = bitcast i32 %314 to float, !dbg !38 + %316 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %312, i32 1, i32 31), !dbg !38 + %317 = fcmp ogt float %311, %315, !dbg !40 + %318 = fcmp oeq float %311, %315, !dbg !41 + %319 = fcmp uno float %311, 0.000000e+00, !dbg !42 + %320 = fcmp uno float %315, 0.000000e+00, !dbg !43 + %321 = xor i1 %320, true, !dbg !44 + %322 = and i1 %319, %321, !dbg !46 + %323 = or i1 %317, %322, !dbg !47 + %324 = and i1 %320, %319, !dbg !45 + %325 = or i1 %318, %324, !dbg !48 + %326 = icmp slt i32 %312, %316, !dbg !49 + %327 = and i1 %326, %325, !dbg !50 + %328 = or i1 %323, %327, !dbg !51 + %329 = select i1 %328, i32 %312, i32 %316, !dbg !53 + %330 = extractelement <8 x float> %139, i64 5, !dbg !38 + %331 = bitcast float %330 to i32, !dbg !38 + %332 = tail call i32 
@llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %331, i32 16, i32 31), !dbg !38 + %333 = bitcast i32 %332 to float, !dbg !38 + %334 = extractelement <8 x i32> %140, i64 5, !dbg !38 + %335 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %334, i32 16, i32 31), !dbg !38 + %336 = fcmp ogt float %330, %333, !dbg !40 + %337 = fcmp oeq float %330, %333, !dbg !41 + %338 = fcmp uno float %333, 0.000000e+00, !dbg !43 + %339 = xor i1 %338, true, !dbg !44 + %340 = extractelement <8 x i1> %155, i64 5, !dbg !45 + %341 = and i1 %340, %339, !dbg !46 + %342 = or i1 %336, %341, !dbg !47 + %343 = and i1 %340, %338, !dbg !45 + %344 = or i1 %337, %343, !dbg !48 + %345 = icmp slt i32 %334, %335, !dbg !49 + %346 = and i1 %345, %344, !dbg !50 + %347 = or i1 %342, %346, !dbg !51 + %348 = select i1 %347, float %330, float %333, !dbg !52 + %349 = select i1 %347, i32 %334, i32 %335, !dbg !53 + %350 = bitcast float %348 to i32, !dbg !38 + %351 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %350, i32 8, i32 31), !dbg !38 + %352 = bitcast i32 %351 to float, !dbg !38 + %353 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %349, i32 8, i32 31), !dbg !38 + %354 = fcmp ogt float %348, %352, !dbg !40 + %355 = fcmp oeq float %348, %352, !dbg !41 + %356 = fcmp uno float %348, 0.000000e+00, !dbg !42 + %357 = fcmp uno float %352, 0.000000e+00, !dbg !43 + %358 = xor i1 %357, true, !dbg !44 + %359 = and i1 %356, %358, !dbg !46 + %360 = or i1 %354, %359, !dbg !47 + %361 = and i1 %357, %356, !dbg !45 + %362 = or i1 %355, %361, !dbg !48 + %363 = icmp slt i32 %349, %353, !dbg !49 + %364 = and i1 %363, %362, !dbg !50 + %365 = or i1 %360, %364, !dbg !51 + %366 = select i1 %365, float %348, float %352, !dbg !52 + %367 = select i1 %365, i32 %349, i32 %353, !dbg !53 + %368 = bitcast float %366 to i32, !dbg !38 + %369 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %368, i32 4, i32 31), !dbg !38 + %370 = bitcast i32 %369 to float, !dbg !38 + %371 = tail call i32 
@llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %367, i32 4, i32 31), !dbg !38 + %372 = fcmp ogt float %366, %370, !dbg !40 + %373 = fcmp oeq float %366, %370, !dbg !41 + %374 = fcmp uno float %366, 0.000000e+00, !dbg !42 + %375 = fcmp uno float %370, 0.000000e+00, !dbg !43 + %376 = xor i1 %375, true, !dbg !44 + %377 = and i1 %374, %376, !dbg !46 + %378 = or i1 %372, %377, !dbg !47 + %379 = and i1 %375, %374, !dbg !45 + %380 = or i1 %373, %379, !dbg !48 + %381 = icmp slt i32 %367, %371, !dbg !49 + %382 = and i1 %381, %380, !dbg !50 + %383 = or i1 %378, %382, !dbg !51 + %384 = select i1 %383, float %366, float %370, !dbg !52 + %385 = select i1 %383, i32 %367, i32 %371, !dbg !53 + %386 = bitcast float %384 to i32, !dbg !38 + %387 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %386, i32 2, i32 31), !dbg !38 + %388 = bitcast i32 %387 to float, !dbg !38 + %389 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %385, i32 2, i32 31), !dbg !38 + %390 = fcmp ogt float %384, %388, !dbg !40 + %391 = fcmp oeq float %384, %388, !dbg !41 + %392 = fcmp uno float %384, 0.000000e+00, !dbg !42 + %393 = fcmp uno float %388, 0.000000e+00, !dbg !43 + %394 = xor i1 %393, true, !dbg !44 + %395 = and i1 %392, %394, !dbg !46 + %396 = or i1 %390, %395, !dbg !47 + %397 = and i1 %393, %392, !dbg !45 + %398 = or i1 %391, %397, !dbg !48 + %399 = icmp slt i32 %385, %389, !dbg !49 + %400 = and i1 %399, %398, !dbg !50 + %401 = or i1 %396, %400, !dbg !51 + %402 = select i1 %401, float %384, float %388, !dbg !52 + %403 = select i1 %401, i32 %385, i32 %389, !dbg !53 + %404 = bitcast float %402 to i32, !dbg !38 + %405 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %404, i32 1, i32 31), !dbg !38 + %406 = bitcast i32 %405 to float, !dbg !38 + %407 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %403, i32 1, i32 31), !dbg !38 + %408 = fcmp ogt float %402, %406, !dbg !40 + %409 = fcmp oeq float %402, %406, !dbg !41 + %410 = fcmp uno float %402, 0.000000e+00, !dbg !42 + 
%411 = fcmp uno float %406, 0.000000e+00, !dbg !43 + %412 = xor i1 %411, true, !dbg !44 + %413 = and i1 %410, %412, !dbg !46 + %414 = or i1 %408, %413, !dbg !47 + %415 = and i1 %411, %410, !dbg !45 + %416 = or i1 %409, %415, !dbg !48 + %417 = icmp slt i32 %403, %407, !dbg !49 + %418 = and i1 %417, %416, !dbg !50 + %419 = or i1 %414, %418, !dbg !51 + %420 = select i1 %419, i32 %403, i32 %407, !dbg !53 + %421 = extractelement <8 x float> %139, i64 4, !dbg !38 + %422 = bitcast float %421 to i32, !dbg !38 + %423 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %422, i32 16, i32 31), !dbg !38 + %424 = bitcast i32 %423 to float, !dbg !38 + %425 = extractelement <8 x i32> %140, i64 4, !dbg !38 + %426 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %425, i32 16, i32 31), !dbg !38 + %427 = fcmp ogt float %421, %424, !dbg !40 + %428 = fcmp oeq float %421, %424, !dbg !41 + %429 = fcmp uno float %424, 0.000000e+00, !dbg !43 + %430 = xor i1 %429, true, !dbg !44 + %431 = extractelement <8 x i1> %155, i64 4, !dbg !45 + %432 = and i1 %431, %430, !dbg !46 + %433 = or i1 %427, %432, !dbg !47 + %434 = and i1 %431, %429, !dbg !45 + %435 = or i1 %428, %434, !dbg !48 + %436 = icmp slt i32 %425, %426, !dbg !49 + %437 = and i1 %436, %435, !dbg !50 + %438 = or i1 %433, %437, !dbg !51 + %439 = select i1 %438, float %421, float %424, !dbg !52 + %440 = select i1 %438, i32 %425, i32 %426, !dbg !53 + %441 = bitcast float %439 to i32, !dbg !38 + %442 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %441, i32 8, i32 31), !dbg !38 + %443 = bitcast i32 %442 to float, !dbg !38 + %444 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %440, i32 8, i32 31), !dbg !38 + %445 = fcmp ogt float %439, %443, !dbg !40 + %446 = fcmp oeq float %439, %443, !dbg !41 + %447 = fcmp uno float %439, 0.000000e+00, !dbg !42 + %448 = fcmp uno float %443, 0.000000e+00, !dbg !43 + %449 = xor i1 %448, true, !dbg !44 + %450 = and i1 %447, %449, !dbg !46 + %451 = or i1 %445, %450, !dbg !47 
+ %452 = and i1 %448, %447, !dbg !45 + %453 = or i1 %446, %452, !dbg !48 + %454 = icmp slt i32 %440, %444, !dbg !49 + %455 = and i1 %454, %453, !dbg !50 + %456 = or i1 %451, %455, !dbg !51 + %457 = select i1 %456, float %439, float %443, !dbg !52 + %458 = select i1 %456, i32 %440, i32 %444, !dbg !53 + %459 = bitcast float %457 to i32, !dbg !38 + %460 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %459, i32 4, i32 31), !dbg !38 + %461 = bitcast i32 %460 to float, !dbg !38 + %462 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %458, i32 4, i32 31), !dbg !38 + %463 = fcmp ogt float %457, %461, !dbg !40 + %464 = fcmp oeq float %457, %461, !dbg !41 + %465 = fcmp uno float %457, 0.000000e+00, !dbg !42 + %466 = fcmp uno float %461, 0.000000e+00, !dbg !43 + %467 = xor i1 %466, true, !dbg !44 + %468 = and i1 %465, %467, !dbg !46 + %469 = or i1 %463, %468, !dbg !47 + %470 = and i1 %466, %465, !dbg !45 + %471 = or i1 %464, %470, !dbg !48 + %472 = icmp slt i32 %458, %462, !dbg !49 + %473 = and i1 %472, %471, !dbg !50 + %474 = or i1 %469, %473, !dbg !51 + %475 = select i1 %474, float %457, float %461, !dbg !52 + %476 = select i1 %474, i32 %458, i32 %462, !dbg !53 + %477 = bitcast float %475 to i32, !dbg !38 + %478 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %477, i32 2, i32 31), !dbg !38 + %479 = bitcast i32 %478 to float, !dbg !38 + %480 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %476, i32 2, i32 31), !dbg !38 + %481 = fcmp ogt float %475, %479, !dbg !40 + %482 = fcmp oeq float %475, %479, !dbg !41 + %483 = fcmp uno float %475, 0.000000e+00, !dbg !42 + %484 = fcmp uno float %479, 0.000000e+00, !dbg !43 + %485 = xor i1 %484, true, !dbg !44 + %486 = and i1 %483, %485, !dbg !46 + %487 = or i1 %481, %486, !dbg !47 + %488 = and i1 %484, %483, !dbg !45 + %489 = or i1 %482, %488, !dbg !48 + %490 = icmp slt i32 %476, %480, !dbg !49 + %491 = and i1 %490, %489, !dbg !50 + %492 = or i1 %487, %491, !dbg !51 + %493 = select i1 %492, float 
%475, float %479, !dbg !52 + %494 = select i1 %492, i32 %476, i32 %480, !dbg !53 + %495 = bitcast float %493 to i32, !dbg !38 + %496 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %495, i32 1, i32 31), !dbg !38 + %497 = bitcast i32 %496 to float, !dbg !38 + %498 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %494, i32 1, i32 31), !dbg !38 + %499 = fcmp ogt float %493, %497, !dbg !40 + %500 = fcmp oeq float %493, %497, !dbg !41 + %501 = fcmp uno float %493, 0.000000e+00, !dbg !42 + %502 = fcmp uno float %497, 0.000000e+00, !dbg !43 + %503 = xor i1 %502, true, !dbg !44 + %504 = and i1 %501, %503, !dbg !46 + %505 = or i1 %499, %504, !dbg !47 + %506 = and i1 %502, %501, !dbg !45 + %507 = or i1 %500, %506, !dbg !48 + %508 = icmp slt i32 %494, %498, !dbg !49 + %509 = and i1 %508, %507, !dbg !50 + %510 = or i1 %505, %509, !dbg !51 + %511 = select i1 %510, i32 %494, i32 %498, !dbg !53 + %512 = extractelement <8 x float> %139, i64 3, !dbg !38 + %513 = bitcast float %512 to i32, !dbg !38 + %514 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %513, i32 16, i32 31), !dbg !38 + %515 = bitcast i32 %514 to float, !dbg !38 + %516 = extractelement <8 x i32> %140, i64 3, !dbg !38 + %517 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %516, i32 16, i32 31), !dbg !38 + %518 = fcmp ogt float %512, %515, !dbg !40 + %519 = fcmp oeq float %512, %515, !dbg !41 + %520 = fcmp uno float %515, 0.000000e+00, !dbg !43 + %521 = xor i1 %520, true, !dbg !44 + %522 = extractelement <8 x i1> %155, i64 3, !dbg !45 + %523 = and i1 %522, %521, !dbg !46 + %524 = or i1 %518, %523, !dbg !47 + %525 = and i1 %522, %520, !dbg !45 + %526 = or i1 %519, %525, !dbg !48 + %527 = icmp slt i32 %516, %517, !dbg !49 + %528 = and i1 %527, %526, !dbg !50 + %529 = or i1 %524, %528, !dbg !51 + %530 = select i1 %529, float %512, float %515, !dbg !52 + %531 = select i1 %529, i32 %516, i32 %517, !dbg !53 + %532 = bitcast float %530 to i32, !dbg !38 + %533 = tail call i32 
@llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %532, i32 8, i32 31), !dbg !38 + %534 = bitcast i32 %533 to float, !dbg !38 + %535 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %531, i32 8, i32 31), !dbg !38 + %536 = fcmp ogt float %530, %534, !dbg !40 + %537 = fcmp oeq float %530, %534, !dbg !41 + %538 = fcmp uno float %530, 0.000000e+00, !dbg !42 + %539 = fcmp uno float %534, 0.000000e+00, !dbg !43 + %540 = xor i1 %539, true, !dbg !44 + %541 = and i1 %538, %540, !dbg !46 + %542 = or i1 %536, %541, !dbg !47 + %543 = and i1 %539, %538, !dbg !45 + %544 = or i1 %537, %543, !dbg !48 + %545 = icmp slt i32 %531, %535, !dbg !49 + %546 = and i1 %545, %544, !dbg !50 + %547 = or i1 %542, %546, !dbg !51 + %548 = select i1 %547, float %530, float %534, !dbg !52 + %549 = select i1 %547, i32 %531, i32 %535, !dbg !53 + %550 = bitcast float %548 to i32, !dbg !38 + %551 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %550, i32 4, i32 31), !dbg !38 + %552 = bitcast i32 %551 to float, !dbg !38 + %553 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %549, i32 4, i32 31), !dbg !38 + %554 = fcmp ogt float %548, %552, !dbg !40 + %555 = fcmp oeq float %548, %552, !dbg !41 + %556 = fcmp uno float %548, 0.000000e+00, !dbg !42 + %557 = fcmp uno float %552, 0.000000e+00, !dbg !43 + %558 = xor i1 %557, true, !dbg !44 + %559 = and i1 %556, %558, !dbg !46 + %560 = or i1 %554, %559, !dbg !47 + %561 = and i1 %557, %556, !dbg !45 + %562 = or i1 %555, %561, !dbg !48 + %563 = icmp slt i32 %549, %553, !dbg !49 + %564 = and i1 %563, %562, !dbg !50 + %565 = or i1 %560, %564, !dbg !51 + %566 = select i1 %565, float %548, float %552, !dbg !52 + %567 = select i1 %565, i32 %549, i32 %553, !dbg !53 + %568 = bitcast float %566 to i32, !dbg !38 + %569 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %568, i32 2, i32 31), !dbg !38 + %570 = bitcast i32 %569 to float, !dbg !38 + %571 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %567, i32 2, i32 31), !dbg !38 + %572 
= fcmp ogt float %566, %570, !dbg !40 + %573 = fcmp oeq float %566, %570, !dbg !41 + %574 = fcmp uno float %566, 0.000000e+00, !dbg !42 + %575 = fcmp uno float %570, 0.000000e+00, !dbg !43 + %576 = xor i1 %575, true, !dbg !44 + %577 = and i1 %574, %576, !dbg !46 + %578 = or i1 %572, %577, !dbg !47 + %579 = and i1 %575, %574, !dbg !45 + %580 = or i1 %573, %579, !dbg !48 + %581 = icmp slt i32 %567, %571, !dbg !49 + %582 = and i1 %581, %580, !dbg !50 + %583 = or i1 %578, %582, !dbg !51 + %584 = select i1 %583, float %566, float %570, !dbg !52 + %585 = select i1 %583, i32 %567, i32 %571, !dbg !53 + %586 = bitcast float %584 to i32, !dbg !38 + %587 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %586, i32 1, i32 31), !dbg !38 + %588 = bitcast i32 %587 to float, !dbg !38 + %589 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %585, i32 1, i32 31), !dbg !38 + %590 = fcmp ogt float %584, %588, !dbg !40 + %591 = fcmp oeq float %584, %588, !dbg !41 + %592 = fcmp uno float %584, 0.000000e+00, !dbg !42 + %593 = fcmp uno float %588, 0.000000e+00, !dbg !43 + %594 = xor i1 %593, true, !dbg !44 + %595 = and i1 %592, %594, !dbg !46 + %596 = or i1 %590, %595, !dbg !47 + %597 = and i1 %593, %592, !dbg !45 + %598 = or i1 %591, %597, !dbg !48 + %599 = icmp slt i32 %585, %589, !dbg !49 + %600 = and i1 %599, %598, !dbg !50 + %601 = or i1 %596, %600, !dbg !51 + %602 = select i1 %601, i32 %585, i32 %589, !dbg !53 + %603 = extractelement <8 x float> %139, i64 2, !dbg !38 + %604 = bitcast float %603 to i32, !dbg !38 + %605 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %604, i32 16, i32 31), !dbg !38 + %606 = bitcast i32 %605 to float, !dbg !38 + %607 = extractelement <8 x i32> %140, i64 2, !dbg !38 + %608 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %607, i32 16, i32 31), !dbg !38 + %609 = fcmp ogt float %603, %606, !dbg !40 + %610 = fcmp oeq float %603, %606, !dbg !41 + %611 = fcmp uno float %606, 0.000000e+00, !dbg !43 + %612 = xor i1 %611, true, 
!dbg !44 + %613 = extractelement <8 x i1> %155, i64 2, !dbg !45 + %614 = and i1 %613, %612, !dbg !46 + %615 = or i1 %609, %614, !dbg !47 + %616 = and i1 %613, %611, !dbg !45 + %617 = or i1 %610, %616, !dbg !48 + %618 = icmp slt i32 %607, %608, !dbg !49 + %619 = and i1 %618, %617, !dbg !50 + %620 = or i1 %615, %619, !dbg !51 + %621 = select i1 %620, float %603, float %606, !dbg !52 + %622 = select i1 %620, i32 %607, i32 %608, !dbg !53 + %623 = bitcast float %621 to i32, !dbg !38 + %624 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %623, i32 8, i32 31), !dbg !38 + %625 = bitcast i32 %624 to float, !dbg !38 + %626 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %622, i32 8, i32 31), !dbg !38 + %627 = fcmp ogt float %621, %625, !dbg !40 + %628 = fcmp oeq float %621, %625, !dbg !41 + %629 = fcmp uno float %621, 0.000000e+00, !dbg !42 + %630 = fcmp uno float %625, 0.000000e+00, !dbg !43 + %631 = xor i1 %630, true, !dbg !44 + %632 = and i1 %629, %631, !dbg !46 + %633 = or i1 %627, %632, !dbg !47 + %634 = and i1 %630, %629, !dbg !45 + %635 = or i1 %628, %634, !dbg !48 + %636 = icmp slt i32 %622, %626, !dbg !49 + %637 = and i1 %636, %635, !dbg !50 + %638 = or i1 %633, %637, !dbg !51 + %639 = select i1 %638, float %621, float %625, !dbg !52 + %640 = select i1 %638, i32 %622, i32 %626, !dbg !53 + %641 = bitcast float %639 to i32, !dbg !38 + %642 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %641, i32 4, i32 31), !dbg !38 + %643 = bitcast i32 %642 to float, !dbg !38 + %644 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %640, i32 4, i32 31), !dbg !38 + %645 = fcmp ogt float %639, %643, !dbg !40 + %646 = fcmp oeq float %639, %643, !dbg !41 + %647 = fcmp uno float %639, 0.000000e+00, !dbg !42 + %648 = fcmp uno float %643, 0.000000e+00, !dbg !43 + %649 = xor i1 %648, true, !dbg !44 + %650 = and i1 %647, %649, !dbg !46 + %651 = or i1 %645, %650, !dbg !47 + %652 = and i1 %648, %647, !dbg !45 + %653 = or i1 %646, %652, !dbg !48 + %654 = 
icmp slt i32 %640, %644, !dbg !49 + %655 = and i1 %654, %653, !dbg !50 + %656 = or i1 %651, %655, !dbg !51 + %657 = select i1 %656, float %639, float %643, !dbg !52 + %658 = select i1 %656, i32 %640, i32 %644, !dbg !53 + %659 = bitcast float %657 to i32, !dbg !38 + %660 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %659, i32 2, i32 31), !dbg !38 + %661 = bitcast i32 %660 to float, !dbg !38 + %662 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %658, i32 2, i32 31), !dbg !38 + %663 = fcmp ogt float %657, %661, !dbg !40 + %664 = fcmp oeq float %657, %661, !dbg !41 + %665 = fcmp uno float %657, 0.000000e+00, !dbg !42 + %666 = fcmp uno float %661, 0.000000e+00, !dbg !43 + %667 = xor i1 %666, true, !dbg !44 + %668 = and i1 %665, %667, !dbg !46 + %669 = or i1 %663, %668, !dbg !47 + %670 = and i1 %666, %665, !dbg !45 + %671 = or i1 %664, %670, !dbg !48 + %672 = icmp slt i32 %658, %662, !dbg !49 + %673 = and i1 %672, %671, !dbg !50 + %674 = or i1 %669, %673, !dbg !51 + %675 = select i1 %674, float %657, float %661, !dbg !52 + %676 = select i1 %674, i32 %658, i32 %662, !dbg !53 + %677 = bitcast float %675 to i32, !dbg !38 + %678 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %677, i32 1, i32 31), !dbg !38 + %679 = bitcast i32 %678 to float, !dbg !38 + %680 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %676, i32 1, i32 31), !dbg !38 + %681 = fcmp ogt float %675, %679, !dbg !40 + %682 = fcmp oeq float %675, %679, !dbg !41 + %683 = fcmp uno float %675, 0.000000e+00, !dbg !42 + %684 = fcmp uno float %679, 0.000000e+00, !dbg !43 + %685 = xor i1 %684, true, !dbg !44 + %686 = and i1 %683, %685, !dbg !46 + %687 = or i1 %681, %686, !dbg !47 + %688 = and i1 %684, %683, !dbg !45 + %689 = or i1 %682, %688, !dbg !48 + %690 = icmp slt i32 %676, %680, !dbg !49 + %691 = and i1 %690, %689, !dbg !50 + %692 = or i1 %687, %691, !dbg !51 + %693 = select i1 %692, i32 %676, i32 %680, !dbg !53 + %694 = extractelement <8 x float> %139, i64 1, !dbg !38 + 
%695 = bitcast float %694 to i32, !dbg !38 + %696 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %695, i32 16, i32 31), !dbg !38 + %697 = bitcast i32 %696 to float, !dbg !38 + %698 = extractelement <8 x i32> %140, i64 1, !dbg !38 + %699 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %698, i32 16, i32 31), !dbg !38 + %700 = fcmp ogt float %694, %697, !dbg !40 + %701 = fcmp oeq float %694, %697, !dbg !41 + %702 = fcmp uno float %697, 0.000000e+00, !dbg !43 + %703 = xor i1 %702, true, !dbg !44 + %704 = extractelement <8 x i1> %155, i64 1, !dbg !45 + %705 = and i1 %704, %703, !dbg !46 + %706 = or i1 %700, %705, !dbg !47 + %707 = and i1 %704, %702, !dbg !45 + %708 = or i1 %701, %707, !dbg !48 + %709 = icmp slt i32 %698, %699, !dbg !49 + %710 = and i1 %709, %708, !dbg !50 + %711 = or i1 %706, %710, !dbg !51 + %712 = select i1 %711, float %694, float %697, !dbg !52 + %713 = select i1 %711, i32 %698, i32 %699, !dbg !53 + %714 = bitcast float %712 to i32, !dbg !38 + %715 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %714, i32 8, i32 31), !dbg !38 + %716 = bitcast i32 %715 to float, !dbg !38 + %717 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %713, i32 8, i32 31), !dbg !38 + %718 = fcmp ogt float %712, %716, !dbg !40 + %719 = fcmp oeq float %712, %716, !dbg !41 + %720 = fcmp uno float %712, 0.000000e+00, !dbg !42 + %721 = fcmp uno float %716, 0.000000e+00, !dbg !43 + %722 = xor i1 %721, true, !dbg !44 + %723 = and i1 %720, %722, !dbg !46 + %724 = or i1 %718, %723, !dbg !47 + %725 = and i1 %721, %720, !dbg !45 + %726 = or i1 %719, %725, !dbg !48 + %727 = icmp slt i32 %713, %717, !dbg !49 + %728 = and i1 %727, %726, !dbg !50 + %729 = or i1 %724, %728, !dbg !51 + %730 = select i1 %729, float %712, float %716, !dbg !52 + %731 = select i1 %729, i32 %713, i32 %717, !dbg !53 + %732 = bitcast float %730 to i32, !dbg !38 + %733 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %732, i32 4, i32 31), !dbg !38 + %734 = bitcast i32 
%733 to float, !dbg !38 + %735 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %731, i32 4, i32 31), !dbg !38 + %736 = fcmp ogt float %730, %734, !dbg !40 + %737 = fcmp oeq float %730, %734, !dbg !41 + %738 = fcmp uno float %730, 0.000000e+00, !dbg !42 + %739 = fcmp uno float %734, 0.000000e+00, !dbg !43 + %740 = xor i1 %739, true, !dbg !44 + %741 = and i1 %738, %740, !dbg !46 + %742 = or i1 %736, %741, !dbg !47 + %743 = and i1 %739, %738, !dbg !45 + %744 = or i1 %737, %743, !dbg !48 + %745 = icmp slt i32 %731, %735, !dbg !49 + %746 = and i1 %745, %744, !dbg !50 + %747 = or i1 %742, %746, !dbg !51 + %748 = select i1 %747, float %730, float %734, !dbg !52 + %749 = select i1 %747, i32 %731, i32 %735, !dbg !53 + %750 = bitcast float %748 to i32, !dbg !38 + %751 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %750, i32 2, i32 31), !dbg !38 + %752 = bitcast i32 %751 to float, !dbg !38 + %753 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %749, i32 2, i32 31), !dbg !38 + %754 = fcmp ogt float %748, %752, !dbg !40 + %755 = fcmp oeq float %748, %752, !dbg !41 + %756 = fcmp uno float %748, 0.000000e+00, !dbg !42 + %757 = fcmp uno float %752, 0.000000e+00, !dbg !43 + %758 = xor i1 %757, true, !dbg !44 + %759 = and i1 %756, %758, !dbg !46 + %760 = or i1 %754, %759, !dbg !47 + %761 = and i1 %757, %756, !dbg !45 + %762 = or i1 %755, %761, !dbg !48 + %763 = icmp slt i32 %749, %753, !dbg !49 + %764 = and i1 %763, %762, !dbg !50 + %765 = or i1 %760, %764, !dbg !51 + %766 = select i1 %765, float %748, float %752, !dbg !52 + %767 = select i1 %765, i32 %749, i32 %753, !dbg !53 + %768 = bitcast float %766 to i32, !dbg !38 + %769 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %768, i32 1, i32 31), !dbg !38 + %770 = bitcast i32 %769 to float, !dbg !38 + %771 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %767, i32 1, i32 31), !dbg !38 + %772 = fcmp ogt float %766, %770, !dbg !40 + %773 = fcmp oeq float %766, %770, !dbg !41 + %774 = 
fcmp uno float %766, 0.000000e+00, !dbg !42 + %775 = fcmp uno float %770, 0.000000e+00, !dbg !43 + %776 = xor i1 %775, true, !dbg !44 + %777 = and i1 %774, %776, !dbg !46 + %778 = or i1 %772, %777, !dbg !47 + %779 = and i1 %775, %774, !dbg !45 + %780 = or i1 %773, %779, !dbg !48 + %781 = icmp slt i32 %767, %771, !dbg !49 + %782 = and i1 %781, %780, !dbg !50 + %783 = or i1 %778, %782, !dbg !51 + %784 = select i1 %783, i32 %767, i32 %771, !dbg !53 + %785 = extractelement <8 x float> %139, i64 0, !dbg !38 + %786 = bitcast float %785 to i32, !dbg !38 + %787 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %786, i32 16, i32 31), !dbg !38 + %788 = bitcast i32 %787 to float, !dbg !38 + %789 = extractelement <8 x i32> %140, i64 0, !dbg !38 + %790 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %789, i32 16, i32 31), !dbg !38 + %791 = fcmp ogt float %785, %788, !dbg !40 + %792 = fcmp oeq float %785, %788, !dbg !41 + %793 = fcmp uno float %788, 0.000000e+00, !dbg !43 + %794 = xor i1 %793, true, !dbg !44 + %795 = extractelement <8 x i1> %155, i64 0, !dbg !45 + %796 = and i1 %795, %794, !dbg !46 + %797 = or i1 %791, %796, !dbg !47 + %798 = and i1 %795, %793, !dbg !45 + %799 = or i1 %792, %798, !dbg !48 + %800 = icmp slt i32 %789, %790, !dbg !49 + %801 = and i1 %800, %799, !dbg !50 + %802 = or i1 %797, %801, !dbg !51 + %803 = select i1 %802, float %785, float %788, !dbg !52 + %804 = select i1 %802, i32 %789, i32 %790, !dbg !53 + %805 = bitcast float %803 to i32, !dbg !38 + %806 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %805, i32 8, i32 31), !dbg !38 + %807 = bitcast i32 %806 to float, !dbg !38 + %808 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %804, i32 8, i32 31), !dbg !38 + %809 = fcmp ogt float %803, %807, !dbg !40 + %810 = fcmp oeq float %803, %807, !dbg !41 + %811 = fcmp uno float %803, 0.000000e+00, !dbg !42 + %812 = fcmp uno float %807, 0.000000e+00, !dbg !43 + %813 = xor i1 %812, true, !dbg !44 + %814 = and i1 %811, %813, 
!dbg !46 + %815 = or i1 %809, %814, !dbg !47 + %816 = and i1 %812, %811, !dbg !45 + %817 = or i1 %810, %816, !dbg !48 + %818 = icmp slt i32 %804, %808, !dbg !49 + %819 = and i1 %818, %817, !dbg !50 + %820 = or i1 %815, %819, !dbg !51 + %821 = select i1 %820, float %803, float %807, !dbg !52 + %822 = select i1 %820, i32 %804, i32 %808, !dbg !53 + %823 = bitcast float %821 to i32, !dbg !38 + %824 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %823, i32 4, i32 31), !dbg !38 + %825 = bitcast i32 %824 to float, !dbg !38 + %826 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %822, i32 4, i32 31), !dbg !38 + %827 = fcmp ogt float %821, %825, !dbg !40 + %828 = fcmp oeq float %821, %825, !dbg !41 + %829 = fcmp uno float %821, 0.000000e+00, !dbg !42 + %830 = fcmp uno float %825, 0.000000e+00, !dbg !43 + %831 = xor i1 %830, true, !dbg !44 + %832 = and i1 %829, %831, !dbg !46 + %833 = or i1 %827, %832, !dbg !47 + %834 = and i1 %830, %829, !dbg !45 + %835 = or i1 %828, %834, !dbg !48 + %836 = icmp slt i32 %822, %826, !dbg !49 + %837 = and i1 %836, %835, !dbg !50 + %838 = or i1 %833, %837, !dbg !51 + %839 = select i1 %838, float %821, float %825, !dbg !52 + %840 = select i1 %838, i32 %822, i32 %826, !dbg !53 + %841 = bitcast float %839 to i32, !dbg !38 + %842 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %841, i32 2, i32 31), !dbg !38 + %843 = bitcast i32 %842 to float, !dbg !38 + %844 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %840, i32 2, i32 31), !dbg !38 + %845 = fcmp ogt float %839, %843, !dbg !40 + %846 = fcmp oeq float %839, %843, !dbg !41 + %847 = fcmp uno float %839, 0.000000e+00, !dbg !42 + %848 = fcmp uno float %843, 0.000000e+00, !dbg !43 + %849 = xor i1 %848, true, !dbg !44 + %850 = and i1 %847, %849, !dbg !46 + %851 = or i1 %845, %850, !dbg !47 + %852 = and i1 %848, %847, !dbg !45 + %853 = or i1 %846, %852, !dbg !48 + %854 = icmp slt i32 %840, %844, !dbg !49 + %855 = and i1 %854, %853, !dbg !50 + %856 = or i1 %851, 
%855, !dbg !51 + %857 = select i1 %856, float %839, float %843, !dbg !52 + %858 = select i1 %856, i32 %840, i32 %844, !dbg !53 + %859 = bitcast float %857 to i32, !dbg !38 + %860 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %859, i32 1, i32 31), !dbg !38 + %861 = bitcast i32 %860 to float, !dbg !38 + %862 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %858, i32 1, i32 31), !dbg !38 + %863 = fcmp ogt float %857, %861, !dbg !40 + %864 = fcmp oeq float %857, %861, !dbg !41 + %865 = fcmp uno float %857, 0.000000e+00, !dbg !42 + %866 = fcmp uno float %861, 0.000000e+00, !dbg !43 + %867 = xor i1 %866, true, !dbg !44 + %868 = and i1 %865, %867, !dbg !46 + %869 = or i1 %863, %868, !dbg !47 + %870 = and i1 %866, %865, !dbg !45 + %871 = or i1 %864, %870, !dbg !48 + %872 = icmp slt i32 %858, %862, !dbg !49 + %873 = and i1 %872, %871, !dbg !50 + %874 = or i1 %869, %873, !dbg !51 + %875 = select i1 %874, i32 %858, i32 %862, !dbg !53 + %876 = and i32 %146, 1, !dbg !38 + %877 = icmp eq i32 %145, 0, !dbg !38 + %878 = lshr exact i32 %12, 5, !dbg !38 + %879 = or disjoint i32 %878, %876, !dbg !38 + %880 = getelementptr float, ptr addrspace(3) @global_smem, i32 %879, !dbg !38 + %881 = select i1 %237, i32 %222, i32 %223, !dbg !52 + %882 = insertelement <1 x i32> poison, i32 %881, i64 0, !dbg !38 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %880, <1 x i32> %882, i1 %877) #4, !dbg !38 + %883 = getelementptr i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 512), i32 %879, !dbg !38 + %884 = insertelement <1 x i32> poison, i32 %238, i64 0, !dbg !38 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %883, <1 x i32> %884, i1 %877) #4, !dbg !38 + %885 = shl nuw nsw i32 %15, 1, !dbg !38 + %886 = or disjoint i32 %885, %876, !dbg !38 + %887 = getelementptr float, ptr addrspace(3) @global_smem, i32 %886, !dbg !38 + %888 = select i1 %328, i32 %313, i32 %314, 
!dbg !52 + %889 = insertelement <1 x i32> poison, i32 %888, i64 0, !dbg !38 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %887, <1 x i32> %889, i1 %877) #4, !dbg !38 + %890 = getelementptr i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 512), i32 %886, !dbg !38 + %891 = insertelement <1 x i32> poison, i32 %329, i64 0, !dbg !38 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %890, <1 x i32> %891, i1 %877) #4, !dbg !38 + %892 = shl nuw nsw i32 %16, 1, !dbg !38 + %893 = or disjoint i32 %892, %876, !dbg !38 + %894 = getelementptr float, ptr addrspace(3) @global_smem, i32 %893, !dbg !38 + %895 = select i1 %419, i32 %404, i32 %405, !dbg !52 + %896 = insertelement <1 x i32> poison, i32 %895, i64 0, !dbg !38 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %894, <1 x i32> %896, i1 %877) #4, !dbg !38 + %897 = getelementptr i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 512), i32 %893, !dbg !38 + %898 = insertelement <1 x i32> poison, i32 %420, i64 0, !dbg !38 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %897, <1 x i32> %898, i1 %877) #4, !dbg !38 + %899 = shl nuw nsw i32 %17, 1, !dbg !38 + %900 = or disjoint i32 %899, %876, !dbg !38 + %901 = getelementptr float, ptr addrspace(3) @global_smem, i32 %900, !dbg !38 + %902 = select i1 %510, i32 %495, i32 %496, !dbg !52 + %903 = insertelement <1 x i32> poison, i32 %902, i64 0, !dbg !38 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %901, <1 x i32> %903, i1 %877) #4, !dbg !38 + %904 = getelementptr i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 512), i32 %900, !dbg !38 + %905 = insertelement <1 x i32> poison, i32 %511, i64 0, !dbg !38 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr 
addrspace(3) %904, <1 x i32> %905, i1 %877) #4, !dbg !38 + %906 = extractelement <4 x i32> %20, i64 3, !dbg !38 + %907 = shl nuw nsw i32 %906, 1, !dbg !38 + %908 = or disjoint i32 %907, %876, !dbg !38 + %909 = getelementptr float, ptr addrspace(3) @global_smem, i32 %908, !dbg !38 + %910 = select i1 %601, i32 %586, i32 %587, !dbg !52 + %911 = insertelement <1 x i32> poison, i32 %910, i64 0, !dbg !38 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %909, <1 x i32> %911, i1 %877) #4, !dbg !38 + %912 = getelementptr i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 512), i32 %908, !dbg !38 + %913 = insertelement <1 x i32> poison, i32 %602, i64 0, !dbg !38 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %912, <1 x i32> %913, i1 %877) #4, !dbg !38 + %914 = extractelement <4 x i32> %20, i64 2, !dbg !38 + %915 = shl nuw nsw i32 %914, 1, !dbg !38 + %916 = or disjoint i32 %915, %876, !dbg !38 + %917 = getelementptr float, ptr addrspace(3) @global_smem, i32 %916, !dbg !38 + %918 = select i1 %692, i32 %677, i32 %678, !dbg !52 + %919 = insertelement <1 x i32> poison, i32 %918, i64 0, !dbg !38 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %917, <1 x i32> %919, i1 %877) #4, !dbg !38 + %920 = getelementptr i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 512), i32 %916, !dbg !38 + %921 = insertelement <1 x i32> poison, i32 %693, i64 0, !dbg !38 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %920, <1 x i32> %921, i1 %877) #4, !dbg !38 + %922 = extractelement <4 x i32> %20, i64 1, !dbg !38 + %923 = shl nuw nsw i32 %922, 1, !dbg !38 + %924 = or disjoint i32 %923, %876, !dbg !38 + %925 = getelementptr float, ptr addrspace(3) @global_smem, i32 %924, !dbg !38 + %926 = select i1 %783, i32 %768, i32 %769, !dbg !52 + %927 = insertelement <1 x i32> poison, 
i32 %926, i64 0, !dbg !38 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %925, <1 x i32> %927, i1 %877) #4, !dbg !38 + %928 = getelementptr i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 512), i32 %924, !dbg !38 + %929 = insertelement <1 x i32> poison, i32 %784, i64 0, !dbg !38 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %928, <1 x i32> %929, i1 %877) #4, !dbg !38 + %930 = extractelement <4 x i32> %20, i64 0, !dbg !38 + %931 = shl nuw nsw i32 %930, 1, !dbg !38 + %932 = or disjoint i32 %931, %876, !dbg !38 + %933 = getelementptr float, ptr addrspace(3) @global_smem, i32 %932, !dbg !38 + %934 = select i1 %874, i32 %859, i32 %860, !dbg !52 + %935 = insertelement <1 x i32> poison, i32 %934, i64 0, !dbg !38 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %933, <1 x i32> %935, i1 %877) #4, !dbg !38 + %936 = getelementptr i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 512), i32 %932, !dbg !38 + %937 = insertelement <1 x i32> poison, i32 %875, i64 0, !dbg !38 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %936, <1 x i32> %937, i1 %877) #4, !dbg !38 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !38 + %938 = icmp samesign ult i32 %11, 128, !dbg !38 + %939 = getelementptr float, ptr addrspace(3) @global_smem, i32 %11, !dbg !38 + %940 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %939, i1 %938) #4, !dbg !38 + %941 = bitcast i32 %940 to float, !dbg !38 + %942 = getelementptr i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 512), i32 %11, !dbg !38 + %943 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %942, i1 %938) #4, !dbg !38 + %944 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 
%940, i32 1, i32 31), !dbg !38 + %945 = bitcast i32 %944 to float, !dbg !38 + %946 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %943, i32 1, i32 31), !dbg !38 + %947 = fcmp ogt float %941, %945, !dbg !40 + %948 = fcmp oeq float %941, %945, !dbg !41 + %949 = fcmp uno float %941, 0.000000e+00, !dbg !42 + %950 = fcmp uno float %945, 0.000000e+00, !dbg !43 + %951 = xor i1 %950, true, !dbg !44 + %952 = and i1 %949, %951, !dbg !46 + %953 = or i1 %947, %952, !dbg !47 + %954 = and i1 %949, %950, !dbg !45 + %955 = or i1 %948, %954, !dbg !48 + %956 = icmp slt i32 %943, %946, !dbg !49 + %957 = and i1 %956, %955, !dbg !50 + %958 = or i1 %953, %957, !dbg !51 + %959 = select i1 %958, i32 %943, i32 %946, !dbg !53 + %960 = and i32 %11, 897, !dbg !38 + %961 = icmp eq i32 %960, 0, !dbg !38 + %962 = select i1 %958, i32 %940, i32 %944, !dbg !52 + %963 = insertelement <1 x i32> poison, i32 %962, i64 0, !dbg !38 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %939, <1 x i32> %963, i1 %961) #4, !dbg !38 + %964 = insertelement <1 x i32> poison, i32 %959, i64 0, !dbg !38 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %942, <1 x i32> %964, i1 %961) #4, !dbg !38 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !38 + %965 = getelementptr i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 512), i32 %878, !dbg !38 + %966 = load i32, ptr addrspace(3) %965, align 8, !dbg !38 + %967 = getelementptr i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 512), i32 %885, !dbg !38 + %968 = load i32, ptr addrspace(3) %967, align 8, !dbg !38 + %969 = getelementptr i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 512), i32 %892, !dbg !38 + %970 = load i32, ptr addrspace(3) %969, align 8, !dbg !38 + %971 = getelementptr i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 512), i32 
%899, !dbg !38 + %972 = load i32, ptr addrspace(3) %971, align 8, !dbg !38 + %973 = getelementptr i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 512), i32 %907, !dbg !38 + %974 = load i32, ptr addrspace(3) %973, align 8, !dbg !38 + %975 = getelementptr i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 512), i32 %915, !dbg !38 + %976 = load i32, ptr addrspace(3) %975, align 8, !dbg !38 + %977 = getelementptr i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 512), i32 %923, !dbg !38 + %978 = load i32, ptr addrspace(3) %977, align 8, !dbg !38 + %979 = getelementptr i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 512), i32 %931, !dbg !38 + %980 = load i32, ptr addrspace(3) %979, align 8, !dbg !38 + %981 = sext i32 %143 to i64, !dbg !54 + %982 = getelementptr i64, ptr addrspace(1) %1, i64 %981, !dbg !54 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !55 + %983 = lshr exact i32 %12, 2, !dbg !55 + %984 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %983, !dbg !55 + %985 = insertelement <4 x i32> poison, i32 %966, i64 0, !dbg !55 + %986 = insertelement <4 x i32> %985, i32 %968, i64 1, !dbg !55 + %987 = insertelement <4 x i32> %986, i32 %970, i64 2, !dbg !55 + %988 = insertelement <4 x i32> %987, i32 %972, i64 3, !dbg !55 + store <4 x i32> %988, ptr addrspace(3) %984, align 16, !dbg !55 + %989 = getelementptr inbounds nuw i8, ptr addrspace(3) %984, i32 128, !dbg !55 + %990 = insertelement <4 x i32> poison, i32 %974, i64 0, !dbg !55 + %991 = insertelement <4 x i32> %990, i32 %976, i64 1, !dbg !55 + %992 = insertelement <4 x i32> %991, i32 %978, i64 2, !dbg !55 + %993 = insertelement <4 x i32> %992, i32 %980, i64 3, !dbg !55 + store <4 x i32> %993, ptr addrspace(3) %989, align 16, !dbg !55 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !55 + %994 = shl nuw nsw i32 %11, 4, !dbg !55 + %995 = and i32 %994, 
112, !dbg !55 + %996 = lshr i32 %11, 1, !dbg !55 + %997 = and i32 %996, 12, !dbg !55 + %998 = shl nuw nsw i32 %11, 2, !dbg !55 + %999 = and i32 %998, 128, !dbg !55 + %1000 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %995, !dbg !55 + %1001 = getelementptr inbounds nuw i8, ptr addrspace(3) %1000, i32 %999, !dbg !55 + %1002 = getelementptr inbounds nuw i8, ptr addrspace(3) %1001, i32 %997, !dbg !55 + %1003 = load i32, ptr addrspace(3) %1002, align 4, !dbg !55 + %1004 = sext i32 %1003 to i64, !dbg !55 + %1005 = icmp eq i32 %12, 0, !dbg !55 + %1006 = and i1 %1005, %144, !dbg !55 + tail call void asm sideeffect "@$2 st.global.b64 [ $1 + 0 ], { $0 };", "l,l,b"(i64 %1004, ptr addrspace(1) %982, i1 %1006) #4, !dbg !55 + ret void, !dbg !56 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3 + +attributes #0 = { nounwind "nvvm.reqntid"="512" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { convergent nocallback nounwind } +attributes #4 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: 
"c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_red_fused_argmax_1", linkageName: "triton_red_fused_argmax_1", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 22, column: 28, scope: !4) +!8 = !DILocation(line: 22, column: 33, scope: !4) +!9 = !DILocation(line: 23, column: 44, scope: !4) +!10 = !DILocation(line: 23, column: 23, scope: !4) +!11 = !DILocation(line: 24, column: 21, scope: !4) +!12 = !DILocation(line: 27, column: 19, scope: !4) +!13 = !DILocation(line: 28, column: 19, scope: !4) +!14 = !DILocation(line: 38, column: 56, scope: !4) +!15 = !DILocation(line: 32, column: 40, scope: !4) +!16 = !DILocation(line: 38, column: 61, scope: !4) +!17 = !DILocation(line: 33, column: 31, scope: !4) +!18 = !DILocation(line: 38, column: 34, scope: !4) +!19 = !DILocation(line: 147, column: 29, scope: !20, inlinedAt: !22) +!20 = distinct !DILexicalBlockFile(scope: !4, file: !21, discriminator: 0) +!21 = !DIFile(filename: "triton_helpers.py", directory: "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime") +!22 = !DILocation(line: 41, column: 38, scope: !4) +!23 = !DILocation(line: 155, column: 69, scope: !20, inlinedAt: !22) +!24 = !DILocation(line: 144, column: 21, scope: !20, inlinedAt: !22) +!25 = !DILocation(line: 145, column: 23, scope: !20, inlinedAt: !22) +!26 = !DILocation(line: 148, column: 29, scope: !20, inlinedAt: !22) +!27 = !DILocation(line: 149, column: 31, scope: !20, inlinedAt: !22) +!28 = !DILocation(line: 149, column: 27, scope: !20, inlinedAt: !22) +!29 = !DILocation(line: 149, column: 16, scope: !20, inlinedAt: !22) +!30 = !DILocation(line: 151, column: 27, scope: !20, 
inlinedAt: !22) +!31 = !DILocation(line: 151, column: 17, scope: !20, inlinedAt: !22) +!32 = !DILocation(line: 154, column: 31, scope: !20, inlinedAt: !22) +!33 = !DILocation(line: 154, column: 21, scope: !20, inlinedAt: !22) +!34 = !DILocation(line: 154, column: 12, scope: !20, inlinedAt: !22) +!35 = !DILocation(line: 155, column: 35, scope: !20, inlinedAt: !22) +!36 = !DILocation(line: 43, column: 54, scope: !4) +!37 = !DILocation(line: 44, column: 66, scope: !4) +!38 = !DILocation(line: 165, column: 42, scope: !20, inlinedAt: !39) +!39 = !DILocation(line: 45, column: 75, scope: !4) +!40 = !DILocation(line: 144, column: 21, scope: !20, inlinedAt: !39) +!41 = !DILocation(line: 145, column: 23, scope: !20, inlinedAt: !39) +!42 = !DILocation(line: 147, column: 29, scope: !20, inlinedAt: !39) +!43 = !DILocation(line: 148, column: 29, scope: !20, inlinedAt: !39) +!44 = !DILocation(line: 149, column: 31, scope: !20, inlinedAt: !39) +!45 = !DILocation(line: 151, column: 27, scope: !20, inlinedAt: !39) +!46 = !DILocation(line: 149, column: 27, scope: !20, inlinedAt: !39) +!47 = !DILocation(line: 149, column: 16, scope: !20, inlinedAt: !39) +!48 = !DILocation(line: 151, column: 17, scope: !20, inlinedAt: !39) +!49 = !DILocation(line: 154, column: 31, scope: !20, inlinedAt: !39) +!50 = !DILocation(line: 154, column: 21, scope: !20, inlinedAt: !39) +!51 = !DILocation(line: 154, column: 12, scope: !20, inlinedAt: !39) +!52 = !DILocation(line: 155, column: 35, scope: !20, inlinedAt: !39) +!53 = !DILocation(line: 155, column: 69, scope: !20, inlinedAt: !39) +!54 = !DILocation(line: 47, column: 25, scope: !4) +!55 = !DILocation(line: 47, column: 36, scope: !4) +!56 = !DILocation(line: 47, column: 4, scope: !4) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/BZGMO2ROUNNVUMEUFCQSSF6FGNZS2MEIPAEECDHYOJVVX3KB75AA/triton_red_fused_argmax_1.ptx b/SpecForge-ext/cache/compiled_kernels/triton/7/BZGMO2ROUNNVUMEUFCQSSF6FGNZS2MEIPAEECDHYOJVVX3KB75AA/triton_red_fused_argmax_1.ptx 
new file mode 100644 index 0000000000000000000000000000000000000000..569951c2b9b49acd37b359d808b2bdef804afb5c --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/BZGMO2ROUNNVUMEUFCQSSF6FGNZS2MEIPAEECDHYOJVVX3KB75AA/triton_red_fused_argmax_1.ptx @@ -0,0 +1,2198 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_red_fused_argmax_1 // -- Begin function triton_red_fused_argmax_1 +.extern .shared .align 16 .b8 global_smem[]; + // @triton_red_fused_argmax_1 +.visible .entry triton_red_fused_argmax_1( + .param .u64 .ptr .global .align 1 triton_red_fused_argmax_1_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused_argmax_1_param_1, + .param .u64 triton_red_fused_argmax_1_param_2, + .param .u64 triton_red_fused_argmax_1_param_3, + .param .u32 triton_red_fused_argmax_1_param_4, + .param .u32 triton_red_fused_argmax_1_param_5, + .param .u64 .ptr .global .align 1 triton_red_fused_argmax_1_param_6, + .param .u64 .ptr .global .align 1 triton_red_fused_argmax_1_param_7 +) +.reqntid 512 +{ + .reg .pred %p<645>; + .reg .b32 %r<399>; + .reg .b64 %rd<214>; +$L__func_begin0: + +// %bb.0: + .loc 1 22 28 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:22:28 + mov.u32 %r45, %ctaid.x; + .loc 1 22 33 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:22:33 + shl.b32 %r1, %r45, 6; + ld.param.b64 %rd87, [triton_red_fused_argmax_1_param_2]; + .loc 1 23 44 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:23:44 + mov.u32 %r2, %tid.x; + bfe.u32 %r46, %r2, 6, 3; + or.b32 %r5, %r46, 8; + or.b32 %r6, %r46, 16; + or.b32 %r7, %r46, 24; + .loc 1 23 23 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:23:23 + or.b32 %r52, %r6, %r1; + or.b32 %r53, %r5, %r1; + or.b32 %r54, %r46, %r1; + .loc 1 27 19 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:27:19 + cvt.s64.s32 %rd8, %r54; + cvt.s64.s32 %rd1, %r53; + .loc 1 28 19 // 
c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:28:19 + or.b64 %rd88, %rd8, %rd87; + and.b64 %rd89, %rd88, -4294967296; + setp.ne.b64 %p9, %rd89, 0; + @%p9 bra $L__BB0_2; + bra.uni $L__BB0_1; +$L__BB0_2: + div.s64 %rd193, %rd8, %rd87; + bra.uni $L__BB0_3; +$L__BB0_1: + cvt.u32.u64 %r55, %rd87; + cvt.u32.u64 %r56, %rd8; + div.u32 %r57, %r56, %r55; + cvt.u64.u32 %rd193, %r57; +$L__BB0_3: + .loc 1 0 0 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:0 + or.b32 %r11, %r46, 32; + or.b32 %r51, %r7, %r1; + .loc 1 27 19 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:27:19 + cvt.s64.s32 %rd2, %r52; + .loc 1 28 19 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:28:19 + or.b64 %rd91, %rd1, %rd87; + and.b64 %rd92, %rd91, -4294967296; + setp.ne.b64 %p10, %rd92, 0; + @%p10 bra $L__BB0_5; + bra.uni $L__BB0_4; +$L__BB0_5: + div.s64 %rd194, %rd1, %rd87; + bra.uni $L__BB0_6; +$L__BB0_4: + cvt.u32.u64 %r58, %rd87; + cvt.u32.u64 %r59, %rd1; + div.u32 %r60, %r59, %r58; + cvt.u64.u32 %rd194, %r60; +$L__BB0_6: + .loc 1 0 0 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:0 + or.b32 %r10, %r46, 40; + or.b32 %r50, %r11, %r1; + .loc 1 27 19 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:27:19 + cvt.s64.s32 %rd3, %r51; + .loc 1 28 19 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:28:19 + or.b64 %rd94, %rd2, %rd87; + and.b64 %rd95, %rd94, -4294967296; + setp.ne.b64 %p11, %rd95, 0; + @%p11 bra $L__BB0_8; + bra.uni $L__BB0_7; +$L__BB0_8: + div.s64 %rd195, %rd2, %rd87; + bra.uni $L__BB0_9; +$L__BB0_7: + cvt.u32.u64 %r61, %rd87; + cvt.u32.u64 %r62, %rd2; + div.u32 %r63, %r62, %r61; + cvt.u64.u32 %rd195, %r63; +$L__BB0_9: + .loc 1 0 0 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:0 + or.b32 %r9, %r46, 48; + or.b32 %r49, %r10, %r1; + .loc 1 27 19 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:27:19 + cvt.s64.s32 %rd4, %r50; + .loc 1 28 19 // 
c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:28:19 + or.b64 %rd97, %rd3, %rd87; + and.b64 %rd98, %rd97, -4294967296; + setp.ne.b64 %p12, %rd98, 0; + @%p12 bra $L__BB0_11; + bra.uni $L__BB0_10; +$L__BB0_11: + div.s64 %rd196, %rd3, %rd87; + bra.uni $L__BB0_12; +$L__BB0_10: + cvt.u32.u64 %r64, %rd87; + cvt.u32.u64 %r65, %rd3; + div.u32 %r66, %r65, %r64; + cvt.u64.u32 %rd196, %r66; +$L__BB0_12: + .loc 1 0 0 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:0 + or.b32 %r8, %r46, 56; + or.b32 %r48, %r9, %r1; + .loc 1 27 19 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:27:19 + cvt.s64.s32 %rd5, %r49; + .loc 1 28 19 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:28:19 + or.b64 %rd100, %rd4, %rd87; + and.b64 %rd101, %rd100, -4294967296; + setp.ne.b64 %p13, %rd101, 0; + @%p13 bra $L__BB0_14; + bra.uni $L__BB0_13; +$L__BB0_14: + div.s64 %rd197, %rd4, %rd87; + bra.uni $L__BB0_15; +$L__BB0_13: + cvt.u32.u64 %r67, %rd87; + cvt.u32.u64 %r68, %rd4; + div.u32 %r69, %r68, %r67; + cvt.u64.u32 %rd197, %r69; +$L__BB0_15: + .loc 1 0 0 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:0 + or.b32 %r47, %r8, %r1; + .loc 1 27 19 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:27:19 + cvt.s64.s32 %rd6, %r48; + .loc 1 28 19 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:28:19 + or.b64 %rd103, %rd5, %rd87; + and.b64 %rd104, %rd103, -4294967296; + setp.ne.b64 %p14, %rd104, 0; + @%p14 bra $L__BB0_17; + bra.uni $L__BB0_16; +$L__BB0_17: + div.s64 %rd198, %rd5, %rd87; + bra.uni $L__BB0_18; +$L__BB0_16: + cvt.u32.u64 %r70, %rd87; + cvt.u32.u64 %r71, %rd5; + div.u32 %r72, %r71, %r70; + cvt.u64.u32 %rd198, %r72; +$L__BB0_18: + .loc 1 27 19 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:27:19 + mul.lo.s64 %rd90, %rd193, %rd87; + mul.lo.s64 %rd93, %rd194, %rd87; + mul.lo.s64 %rd96, %rd195, %rd87; + mul.lo.s64 %rd99, %rd196, %rd87; + mul.lo.s64 %rd102, %rd197, %rd87; + cvt.s64.s32 %rd7, %r47; + mul.lo.s64 %rd105, 
%rd198, %rd87; + .loc 1 28 19 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:28:19 + or.b64 %rd106, %rd6, %rd87; + and.b64 %rd107, %rd106, -4294967296; + setp.ne.b64 %p15, %rd107, 0; + @%p15 bra $L__BB0_20; + bra.uni $L__BB0_19; +$L__BB0_20: + div.s64 %rd199, %rd6, %rd87; + bra.uni $L__BB0_21; +$L__BB0_19: + cvt.u32.u64 %r73, %rd87; + cvt.u32.u64 %r74, %rd6; + div.u32 %r75, %r74, %r73; + cvt.u64.u32 %rd199, %r75; +$L__BB0_21: + .loc 1 0 19 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:0:19 + ld.param.b32 %r44, [triton_red_fused_argmax_1_param_4]; + ld.param.b64 %rd86, [triton_red_fused_argmax_1_param_3]; + ld.param.b64 %rd84, [triton_red_fused_argmax_1_param_0]; + and.b32 %r4, %r2, 63; + .loc 1 27 19 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:27:19 + sub.s64 %rd13, %rd8, %rd90; + sub.s64 %rd18, %rd1, %rd93; + sub.s64 %rd23, %rd2, %rd96; + sub.s64 %rd28, %rd3, %rd99; + sub.s64 %rd33, %rd4, %rd102; + sub.s64 %rd38, %rd5, %rd105; + mul.lo.s64 %rd108, %rd199, %rd87; + sub.s64 %rd43, %rd6, %rd108; + .loc 1 28 19 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:28:19 + or.b64 %rd109, %rd7, %rd87; + and.b64 %rd110, %rd109, -4294967296; + setp.ne.b64 %p16, %rd110, 0; + @%p16 bra $L__BB0_23; + bra.uni $L__BB0_22; +$L__BB0_23: + div.s64 %rd200, %rd7, %rd87; + bra.uni $L__BB0_24; +$L__BB0_22: + cvt.u32.u64 %r76, %rd87; + cvt.u32.u64 %r77, %rd7; + div.u32 %r78, %r77, %r76; + cvt.u64.u32 %rd200, %r78; +$L__BB0_24: + .loc 1 0 19 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:0:19 + ld.param.b64 %rd85, [triton_red_fused_argmax_1_param_1]; + and.b32 %r3, %r2, 448; + .loc 1 24 21 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:24:21 + setp.lt.s32 %p24, %r47, %r44; + setp.lt.s32 %p23, %r48, %r44; + setp.lt.s32 %p22, %r49, %r44; + setp.lt.s32 %p21, %r50, %r44; + setp.lt.s32 %p20, %r51, %r44; + setp.lt.s32 %p19, %r52, %r44; + setp.lt.s32 %p18, %r53, %r44; + setp.lt.s32 %p17, %r54, %r44; + .loc 1 27 19 // 
c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:27:19 + mul.lo.s64 %rd116, %rd200, %rd87; + sub.s64 %rd117, %rd7, %rd116; + .loc 1 38 56 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:38:56 + mul.lo.s64 %rd118, %rd193, %rd86; + mul.lo.s64 %rd119, %rd194, %rd86; + mul.lo.s64 %rd120, %rd195, %rd86; + mul.lo.s64 %rd121, %rd196, %rd86; + mul.lo.s64 %rd122, %rd197, %rd86; + mul.lo.s64 %rd123, %rd198, %rd86; + mul.lo.s64 %rd124, %rd199, %rd86; + mul.lo.s64 %rd125, %rd200, %rd86; + mad.lo.s64 %rd126, %rd13, 128000, %rd84; + .loc 1 32 40 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:32:40 + shl.b64 %rd127, %rd118, 2; + add.s64 %rd201, %rd126, %rd127; + mad.lo.s64 %rd128, %rd18, 128000, %rd84; + shl.b64 %rd129, %rd119, 2; + add.s64 %rd202, %rd128, %rd129; + mad.lo.s64 %rd130, %rd23, 128000, %rd84; + shl.b64 %rd131, %rd120, 2; + add.s64 %rd203, %rd130, %rd131; + mad.lo.s64 %rd132, %rd28, 128000, %rd84; + shl.b64 %rd133, %rd121, 2; + add.s64 %rd204, %rd132, %rd133; + mad.lo.s64 %rd134, %rd33, 128000, %rd84; + shl.b64 %rd135, %rd122, 2; + add.s64 %rd205, %rd134, %rd135; + mad.lo.s64 %rd136, %rd38, 128000, %rd84; + shl.b64 %rd137, %rd123, 2; + add.s64 %rd206, %rd136, %rd137; + mad.lo.s64 %rd138, %rd43, 128000, %rd84; + shl.b64 %rd139, %rd124, 2; + add.s64 %rd207, %rd138, %rd139; + mad.lo.s64 %rd140, %rd117, 128000, %rd84; + shl.b64 %rd141, %rd125, 2; + add.s64 %rd208, %rd140, %rd141; + cvt.u64.u32 %rd56, %r4; + mov.b32 %r87, 0fFF800000; + mov.b64 %rd210, {%r87, %r87}; + mul.wide.u32 %rd57, %r4, 4; + mov.b32 %r391, 2147483647; + mov.b64 %rd209, 0; + mov.b64 %rd211, %rd210; + mov.b64 %rd212, %rd210; + mov.b64 %rd213, %rd210; + mov.b32 %r392, %r391; + mov.b32 %r393, %r391; + mov.b32 %r394, %r391; + mov.b32 %r395, %r391; + mov.b32 %r396, %r391; + mov.b32 %r397, %r391; + mov.b32 %r398, %r391; +$L__BB0_25: // =>This Inner Loop Header: Depth=1 + .loc 1 38 34 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:38:34 + add.s64 %rd166, %rd56, 
%rd209; + add.s64 %rd143, %rd201, %rd57; + add.s64 %rd146, %rd202, %rd57; + add.s64 %rd149, %rd203, %rd57; + add.s64 %rd152, %rd204, %rd57; + add.s64 %rd155, %rd205, %rd57; + add.s64 %rd158, %rd206, %rd57; + add.s64 %rd161, %rd207, %rd57; + .loc 1 38 61 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:38:61 + add.s64 %rd164, %rd208, %rd57; + // begin inline asm + mov.u64 %rd142, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd142, 1.0; + // end inline asm + mov.b32 %r89, 0; + // begin inline asm + mov.u32 %r88, %r89; + @%p17 ld.global.L1::evict_first.L2::cache_hint.b32 { %r88 }, [ %rd143 + 0 ], %rd142; + // end inline asm + // begin inline asm + mov.u64 %rd145, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd145, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r90, %r89; + @%p18 ld.global.L1::evict_first.L2::cache_hint.b32 { %r90 }, [ %rd146 + 0 ], %rd145; + // end inline asm + // begin inline asm + mov.u64 %rd148, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd148, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r92, %r89; + @%p19 ld.global.L1::evict_first.L2::cache_hint.b32 { %r92 }, [ %rd149 + 0 ], %rd148; + // end inline asm + // begin inline asm + mov.u64 %rd151, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd151, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r94, %r89; + @%p20 ld.global.L1::evict_first.L2::cache_hint.b32 { %r94 }, [ %rd152 + 0 ], %rd151; + // end inline asm + // begin inline asm + mov.u64 %rd154, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd154, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r96, %r89; + @%p21 ld.global.L1::evict_first.L2::cache_hint.b32 { %r96 }, [ %rd155 + 0 ], %rd154; + // end inline asm + // begin inline asm + mov.u64 %rd157, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd157, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r98, %r89; + @%p22 ld.global.L1::evict_first.L2::cache_hint.b32 { %r98 }, [ %rd158 + 0 ], 
%rd157; + // end inline asm + // begin inline asm + mov.u64 %rd160, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd160, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r100, %r89; + @%p23 ld.global.L1::evict_first.L2::cache_hint.b32 { %r100 }, [ %rd161 + 0 ], %rd160; + // end inline asm + // begin inline asm + mov.u64 %rd163, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd163, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r102, %r89; + @%p24 ld.global.L1::evict_first.L2::cache_hint.b32 { %r102 }, [ %rd164 + 0 ], %rd163; + // end inline asm +$L__tmp0: + .loc 2 147 29 // triton_helpers.py:147:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:41:38 ] + mov.b64 {%r104, %r105}, %rd210; + setp.nan.f32 %p33, %r104, %r104; + setp.nan.f32 %p34, %r105, %r105; + mov.b64 {%r106, %r107}, %rd211; + setp.nan.f32 %p35, %r106, %r106; + setp.nan.f32 %p36, %r107, %r107; + mov.b64 {%r108, %r109}, %rd212; + setp.nan.f32 %p37, %r108, %r108; + setp.nan.f32 %p38, %r109, %r109; + mov.b64 {%r110, %r111}, %rd213; + setp.nan.f32 %p39, %r110, %r110; + setp.nan.f32 %p40, %r111, %r111; +$L__tmp1: + .loc 1 38 61 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:38:61 + cvt.u64.u32 %rd167, %r100; + shl.b64 %rd168, %rd167, 32; + cvt.u64.u32 %rd169, %r102; + or.b64 %rd170, %rd169, %rd168; + cvt.u64.u32 %rd171, %r96; + shl.b64 %rd172, %rd171, 32; + cvt.u64.u32 %rd173, %r98; + or.b64 %rd174, %rd173, %rd172; + cvt.u64.u32 %rd175, %r92; + shl.b64 %rd176, %rd175, 32; + cvt.u64.u32 %rd177, %r94; + or.b64 %rd178, %rd177, %rd176; + cvt.u64.u32 %rd179, %r88; + shl.b64 %rd180, %rd179, 32; + cvt.u64.u32 %rd181, %r90; + or.b64 %rd182, %rd181, %rd180; +$L__tmp2: + .loc 2 148 29 // triton_helpers.py:148:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:41:38 ] + mov.b64 {%r112, %r113}, %rd182; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:41:38 ] + setp.gt.f32 %p41, %r111, %r113; + 
setp.gt.f32 %p42, %r110, %r112; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:41:38 ] + mov.b64 {%r114, %r115}, %rd178; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:41:38 ] + setp.gt.f32 %p43, %r109, %r115; + setp.gt.f32 %p44, %r108, %r114; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:41:38 ] + mov.b64 {%r116, %r117}, %rd174; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:41:38 ] + setp.gt.f32 %p45, %r107, %r117; + setp.gt.f32 %p46, %r106, %r116; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:41:38 ] + mov.b64 {%r118, %r119}, %rd170; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:41:38 ] + setp.gt.f32 %p47, %r105, %r119; + setp.gt.f32 %p48, %r104, %r118; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:41:38 ] + setp.eq.f32 %p49, %r104, %r118; + setp.eq.f32 %p50, %r105, %r119; + setp.eq.f32 %p51, %r106, %r116; + setp.eq.f32 %p52, %r107, %r117; + setp.eq.f32 %p53, %r108, %r114; + setp.eq.f32 %p54, %r109, %r115; + setp.eq.f32 %p55, %r110, %r112; + setp.eq.f32 %p56, %r111, %r113; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:41:38 ] + setp.nan.f32 %p57, %r113, %r113; + setp.nan.f32 %p58, %r112, %r112; + setp.nan.f32 %p59, %r115, %r115; + setp.nan.f32 %p60, %r114, %r114; + setp.nan.f32 %p61, %r117, %r117; + setp.nan.f32 %p62, %r116, %r116; + setp.nan.f32 %p63, %r119, %r119; + setp.nan.f32 %p64, %r118, %r118; + setp.num.f32 %p65, %r118, %r118; + setp.num.f32 %p66, %r119, %r119; + setp.num.f32 %p67, %r116, %r116; + setp.num.f32 %p68, %r117, %r117; + setp.num.f32 %p69, %r114, %r114; + setp.num.f32 %p70, %r115, %r115; + 
setp.num.f32 %p71, %r112, %r112; + setp.num.f32 %p72, %r113, %r113; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:41:38 ] + and.pred %p73, %p40, %p72; + and.pred %p74, %p39, %p71; + and.pred %p75, %p38, %p70; + and.pred %p76, %p37, %p69; + and.pred %p77, %p36, %p68; + and.pred %p78, %p35, %p67; + and.pred %p79, %p34, %p66; + and.pred %p80, %p33, %p65; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:41:38 ] + or.pred %p81, %p48, %p80; + or.pred %p82, %p47, %p79; + or.pred %p83, %p46, %p78; + or.pred %p84, %p45, %p77; + or.pred %p85, %p44, %p76; + or.pred %p86, %p43, %p75; + or.pred %p87, %p42, %p74; + or.pred %p88, %p41, %p73; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:41:38 ] + and.pred %p89, %p33, %p64; + and.pred %p90, %p34, %p63; + and.pred %p91, %p35, %p62; + and.pred %p92, %p36, %p61; + and.pred %p93, %p37, %p60; + and.pred %p94, %p38, %p59; + and.pred %p95, %p39, %p58; + and.pred %p96, %p40, %p57; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:41:38 ] + or.pred %p97, %p56, %p96; + or.pred %p98, %p55, %p95; + or.pred %p99, %p54, %p94; + or.pred %p100, %p53, %p93; + or.pred %p101, %p52, %p92; + or.pred %p102, %p51, %p91; + or.pred %p103, %p50, %p90; + or.pred %p104, %p49, %p89; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:41:38 ] + cvt.s64.s32 %rd183, %r391; + cvt.s64.s32 %rd184, %r392; + cvt.s64.s32 %rd185, %r393; + cvt.s64.s32 %rd186, %r394; + cvt.s64.s32 %rd187, %r395; + cvt.s64.s32 %rd188, %r396; + cvt.s64.s32 %rd189, %r397; + cvt.s64.s32 %rd190, %r398; + setp.gt.s64 %p105, %rd166, %rd190; + setp.gt.s64 %p106, %rd166, %rd189; + setp.gt.s64 %p107, %rd166, %rd188; + setp.gt.s64 %p108, %rd166, %rd187; + setp.gt.s64 %p109, %rd166, %rd186; + setp.gt.s64 %p110, %rd166, %rd185; + 
setp.gt.s64 %p111, %rd166, %rd184; + setp.gt.s64 %p112, %rd166, %rd183; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:41:38 ] + and.pred %p113, %p112, %p104; + and.pred %p114, %p111, %p103; + and.pred %p115, %p110, %p102; + and.pred %p116, %p109, %p101; + and.pred %p117, %p108, %p100; + and.pred %p118, %p107, %p99; + and.pred %p119, %p106, %p98; + and.pred %p120, %p105, %p97; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:41:38 ] + or.pred %p121, %p88, %p120; + or.pred %p122, %p87, %p119; + or.pred %p123, %p86, %p118; + or.pred %p124, %p85, %p117; + or.pred %p125, %p84, %p116; + or.pred %p126, %p83, %p115; + or.pred %p127, %p82, %p114; + or.pred %p128, %p81, %p113; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:41:38 ] + selp.f32 %r120, %r104, %r118, %p128; + selp.f32 %r121, %r105, %r119, %p127; + selp.f32 %r122, %r106, %r116, %p126; + selp.f32 %r123, %r107, %r117, %p125; + selp.f32 %r124, %r108, %r114, %p124; + selp.f32 %r125, %r109, %r115, %p123; + selp.f32 %r126, %r110, %r112, %p122; + selp.f32 %r127, %r111, %r113, %p121; + cvt.u32.u64 %r128, %rd166; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:41:38 ] + selp.b32 %r129, %r391, %r128, %p128; + selp.b32 %r130, %r392, %r128, %p127; + selp.b32 %r131, %r393, %r128, %p126; + selp.b32 %r132, %r394, %r128, %p125; + selp.b32 %r133, %r395, %r128, %p124; + selp.b32 %r134, %r396, %r128, %p123; + selp.b32 %r135, %r397, %r128, %p122; + selp.b32 %r136, %r398, %r128, %p121; +$L__tmp3: + .loc 1 43 54 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:43:54 + selp.f32 %r137, %r127, %r111, %p17; + selp.f32 %r138, %r126, %r110, %p18; + mov.b64 %rd213, {%r138, %r137}; + selp.f32 %r139, %r125, %r109, %p19; + selp.f32 %r140, %r124, %r108, %p20; + mov.b64 %rd212, {%r140, %r139}; + selp.f32 %r141, %r123, 
%r107, %p21; + selp.f32 %r142, %r122, %r106, %p22; + mov.b64 %rd211, {%r142, %r141}; + selp.f32 %r143, %r121, %r105, %p23; + selp.f32 %r144, %r120, %r104, %p24; + mov.b64 %rd210, {%r144, %r143}; + .loc 1 44 66 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:44:66 + selp.b32 %r398, %r136, %r398, %p17; + selp.b32 %r397, %r135, %r397, %p18; + selp.b32 %r396, %r134, %r396, %p19; + selp.b32 %r395, %r133, %r395, %p20; + selp.b32 %r394, %r132, %r394, %p21; + selp.b32 %r393, %r131, %r393, %p22; + selp.b32 %r392, %r130, %r392, %p23; + selp.b32 %r391, %r129, %r391, %p24; + .loc 1 32 40 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:32:40 + add.s64 %rd75, %rd209, 64; + add.s64 %rd208, %rd208, 256; + add.s64 %rd207, %rd207, 256; + add.s64 %rd206, %rd206, 256; + add.s64 %rd205, %rd205, 256; + add.s64 %rd204, %rd204, 256; + add.s64 %rd203, %rd203, 256; + add.s64 %rd202, %rd202, 256; + add.s64 %rd201, %rd201, 256; + setp.lt.u64 %p129, %rd209, 31936; + mov.b64 %rd209, %rd75; + @%p129 bra $L__BB0_25; +// %bb.26: + .loc 1 23 23 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:23:23 + or.b32 %r185, %r1, %r4; + .loc 1 24 21 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:24:21 + setp.lt.s32 %p151, %r185, %r44; + .loc 1 23 44 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:23:44 + and.b32 %r186, %r2, 31; +$L__tmp4: + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + mov.b64 {%r187, %r188}, %rd213; + shfl.sync.bfly.b32 %r189, %r188, 16, 31, -1; + shfl.sync.bfly.b32 %r190, %r398, 16, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.gt.f32 %p152, %r188, %r189; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.eq.f32 %p153, %r188, %r189; + .loc 2 147 29 // triton_helpers.py:147:29 @[ 
c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + mov.b64 {%r191, %r192}, %rd210; + setp.nan.f32 %p154, %r191, %r191; + setp.nan.f32 %p155, %r192, %r192; + mov.b64 {%r193, %r194}, %rd211; + setp.nan.f32 %p156, %r193, %r193; + setp.nan.f32 %p157, %r194, %r194; + mov.b64 {%r195, %r196}, %rd212; + setp.nan.f32 %p158, %r195, %r195; + setp.nan.f32 %p159, %r196, %r196; + setp.nan.f32 %p160, %r187, %r187; + setp.nan.f32 %p161, %r188, %r188; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p162, %r189, %r189; + setp.num.f32 %p163, %r189, %r189; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p164, %p161, %p163; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p165, %p152, %p164; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p166, %p161, %p162; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p167, %p153, %p166; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.lt.s32 %p168, %r398, %r190; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p169, %p168, %p167; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p170, %p165, %p169; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.f32 %r197, %r188, %r189, %p170; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r198, %r398, %r190, %p170; + .loc 2 165 42 // triton_helpers.py:165:42 @[ 
c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + shfl.sync.bfly.b32 %r199, %r197, 8, 31, -1; + shfl.sync.bfly.b32 %r200, %r198, 8, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.gt.f32 %p171, %r197, %r199; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.eq.f32 %p172, %r197, %r199; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p173, %r197, %r197; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p174, %r199, %r199; + setp.num.f32 %p175, %r199, %r199; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p176, %p173, %p175; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p177, %p171, %p176; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p178, %p174, %p173; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p179, %p172, %p178; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.lt.s32 %p180, %r198, %r200; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p181, %p180, %p179; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p182, %p177, %p181; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.f32 %r201, %r197, %r199, %p182; + .loc 2 155 69 // triton_helpers.py:155:69 @[ 
c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r202, %r198, %r200, %p182; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + shfl.sync.bfly.b32 %r203, %r201, 4, 31, -1; + shfl.sync.bfly.b32 %r204, %r202, 4, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.gt.f32 %p183, %r201, %r203; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.eq.f32 %p184, %r201, %r203; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p185, %r201, %r201; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p186, %r203, %r203; + setp.num.f32 %p187, %r203, %r203; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p188, %p185, %p187; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p189, %p183, %p188; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p190, %p186, %p185; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p191, %p184, %p190; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.lt.s32 %p192, %r202, %r204; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p193, %p192, %p191; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p194, %p189, %p193; + .loc 2 155 35 // triton_helpers.py:155:35 @[ 
c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.f32 %r205, %r201, %r203, %p194; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r206, %r202, %r204, %p194; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + shfl.sync.bfly.b32 %r207, %r205, 2, 31, -1; + shfl.sync.bfly.b32 %r208, %r206, 2, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.gt.f32 %p195, %r205, %r207; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.eq.f32 %p196, %r205, %r207; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p197, %r205, %r205; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p198, %r207, %r207; + setp.num.f32 %p199, %r207, %r207; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p200, %p197, %p199; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p201, %p195, %p200; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p202, %p198, %p197; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p203, %p196, %p202; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.lt.s32 %p204, %r206, %r208; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p205, %p204, %p203; + .loc 2 154 12 // triton_helpers.py:154:12 @[ 
c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p206, %p201, %p205; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.f32 %r209, %r205, %r207, %p206; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r210, %r206, %r208, %p206; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + shfl.sync.bfly.b32 %r211, %r209, 1, 31, -1; + shfl.sync.bfly.b32 %r212, %r210, 1, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.gt.f32 %p207, %r209, %r211; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.eq.f32 %p208, %r209, %r211; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p209, %r209, %r209; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p210, %r211, %r211; + setp.num.f32 %p211, %r211, %r211; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p212, %p209, %p211; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p213, %p207, %p212; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p214, %p210, %p209; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p215, %p208, %p214; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.lt.s32 %p216, %r210, %r212; + .loc 2 154 21 // triton_helpers.py:154:21 @[ 
c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p217, %p216, %p215; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p218, %p213, %p217; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r148, %r210, %r212, %p218; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + shfl.sync.bfly.b32 %r213, %r187, 16, 31, -1; + shfl.sync.bfly.b32 %r214, %r397, 16, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.gt.f32 %p219, %r187, %r213; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.eq.f32 %p220, %r187, %r213; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p221, %r213, %r213; + setp.num.f32 %p222, %r213, %r213; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p223, %p160, %p222; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p224, %p219, %p223; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p225, %p160, %p221; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p226, %p220, %p225; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.lt.s32 %p227, %r397, %r214; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p228, %p227, %p226; + .loc 2 154 12 // triton_helpers.py:154:12 @[ 
c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p229, %p224, %p228; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.f32 %r215, %r187, %r213, %p229; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r216, %r397, %r214, %p229; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + shfl.sync.bfly.b32 %r217, %r215, 8, 31, -1; + shfl.sync.bfly.b32 %r218, %r216, 8, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.gt.f32 %p230, %r215, %r217; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.eq.f32 %p231, %r215, %r217; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p232, %r215, %r215; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p233, %r217, %r217; + setp.num.f32 %p234, %r217, %r217; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p235, %p232, %p234; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p236, %p230, %p235; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p237, %p233, %p232; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p238, %p231, %p237; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.lt.s32 %p239, %r216, %r218; + .loc 2 154 21 // triton_helpers.py:154:21 @[ 
c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p240, %p239, %p238; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p241, %p236, %p240; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.f32 %r219, %r215, %r217, %p241; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r220, %r216, %r218, %p241; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + shfl.sync.bfly.b32 %r221, %r219, 4, 31, -1; + shfl.sync.bfly.b32 %r222, %r220, 4, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.gt.f32 %p242, %r219, %r221; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.eq.f32 %p243, %r219, %r221; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p244, %r219, %r219; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p245, %r221, %r221; + setp.num.f32 %p246, %r221, %r221; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p247, %p244, %p246; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p248, %p242, %p247; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p249, %p245, %p244; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p250, %p243, %p249; + .loc 2 154 31 // triton_helpers.py:154:31 @[ 
c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.lt.s32 %p251, %r220, %r222; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p252, %p251, %p250; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p253, %p248, %p252; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.f32 %r223, %r219, %r221, %p253; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r224, %r220, %r222, %p253; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + shfl.sync.bfly.b32 %r225, %r223, 2, 31, -1; + shfl.sync.bfly.b32 %r226, %r224, 2, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.gt.f32 %p254, %r223, %r225; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.eq.f32 %p255, %r223, %r225; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p256, %r223, %r223; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p257, %r225, %r225; + setp.num.f32 %p258, %r225, %r225; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p259, %p256, %p258; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p260, %p254, %p259; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p261, %p257, %p256; + .loc 2 151 17 // triton_helpers.py:151:17 @[ 
c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p262, %p255, %p261; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.lt.s32 %p263, %r224, %r226; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p264, %p263, %p262; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p265, %p260, %p264; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.f32 %r227, %r223, %r225, %p265; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r228, %r224, %r226, %p265; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + shfl.sync.bfly.b32 %r229, %r227, 1, 31, -1; + shfl.sync.bfly.b32 %r230, %r228, 1, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.gt.f32 %p266, %r227, %r229; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.eq.f32 %p267, %r227, %r229; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p268, %r227, %r227; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p269, %r229, %r229; + setp.num.f32 %p270, %r229, %r229; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p271, %p268, %p270; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p272, %p266, %p271; + .loc 2 151 27 // triton_helpers.py:151:27 @[ 
c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p273, %p269, %p268; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p274, %p267, %p273; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.lt.s32 %p275, %r228, %r230; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p276, %p275, %p274; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p277, %p272, %p276; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r152, %r228, %r230, %p277; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + shfl.sync.bfly.b32 %r231, %r196, 16, 31, -1; + shfl.sync.bfly.b32 %r232, %r396, 16, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.gt.f32 %p278, %r196, %r231; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.eq.f32 %p279, %r196, %r231; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p280, %r231, %r231; + setp.num.f32 %p281, %r231, %r231; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p282, %p159, %p281; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p283, %p278, %p282; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p284, %p159, %p280; + .loc 2 151 17 // triton_helpers.py:151:17 @[ 
c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p285, %p279, %p284; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.lt.s32 %p286, %r396, %r232; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p287, %p286, %p285; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p288, %p283, %p287; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.f32 %r233, %r196, %r231, %p288; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r234, %r396, %r232, %p288; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + shfl.sync.bfly.b32 %r235, %r233, 8, 31, -1; + shfl.sync.bfly.b32 %r236, %r234, 8, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.gt.f32 %p289, %r233, %r235; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.eq.f32 %p290, %r233, %r235; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p291, %r233, %r233; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p292, %r235, %r235; + setp.num.f32 %p293, %r235, %r235; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p294, %p291, %p293; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p295, %p289, %p294; + .loc 2 151 27 // triton_helpers.py:151:27 @[ 
c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p296, %p292, %p291; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p297, %p290, %p296; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.lt.s32 %p298, %r234, %r236; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p299, %p298, %p297; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p300, %p295, %p299; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.f32 %r237, %r233, %r235, %p300; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r238, %r234, %r236, %p300; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + shfl.sync.bfly.b32 %r239, %r237, 4, 31, -1; + shfl.sync.bfly.b32 %r240, %r238, 4, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.gt.f32 %p301, %r237, %r239; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.eq.f32 %p302, %r237, %r239; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p303, %r237, %r237; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p304, %r239, %r239; + setp.num.f32 %p305, %r239, %r239; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p306, %p303, %p305; + .loc 2 149 16 // triton_helpers.py:149:16 @[ 
c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p307, %p301, %p306; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p308, %p304, %p303; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p309, %p302, %p308; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.lt.s32 %p310, %r238, %r240; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p311, %p310, %p309; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p312, %p307, %p311; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.f32 %r241, %r237, %r239, %p312; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r242, %r238, %r240, %p312; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + shfl.sync.bfly.b32 %r243, %r241, 2, 31, -1; + shfl.sync.bfly.b32 %r244, %r242, 2, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.gt.f32 %p313, %r241, %r243; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.eq.f32 %p314, %r241, %r243; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p315, %r241, %r241; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p316, %r243, %r243; + setp.num.f32 %p317, %r243, %r243; + .loc 2 149 27 // triton_helpers.py:149:27 @[ 
c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p318, %p315, %p317; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p319, %p313, %p318; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p320, %p316, %p315; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p321, %p314, %p320; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.lt.s32 %p322, %r242, %r244; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p323, %p322, %p321; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p324, %p319, %p323; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.f32 %r245, %r241, %r243, %p324; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r246, %r242, %r244, %p324; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + shfl.sync.bfly.b32 %r247, %r245, 1, 31, -1; + shfl.sync.bfly.b32 %r248, %r246, 1, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.gt.f32 %p325, %r245, %r247; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.eq.f32 %p326, %r245, %r247; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p327, %r245, %r245; + .loc 2 148 29 // triton_helpers.py:148:29 @[ 
c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p328, %r247, %r247; + setp.num.f32 %p329, %r247, %r247; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p330, %p327, %p329; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p331, %p325, %p330; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p332, %p328, %p327; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p333, %p326, %p332; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.lt.s32 %p334, %r246, %r248; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p335, %p334, %p333; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p336, %p331, %p335; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r156, %r246, %r248, %p336; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + shfl.sync.bfly.b32 %r249, %r195, 16, 31, -1; + shfl.sync.bfly.b32 %r250, %r395, 16, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.gt.f32 %p337, %r195, %r249; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.eq.f32 %p338, %r195, %r249; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p339, %r249, %r249; + setp.num.f32 %p340, %r249, %r249; + .loc 2 149 27 // 
triton_helpers.py:149:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p341, %p158, %p340; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p342, %p337, %p341; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p343, %p158, %p339; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p344, %p338, %p343; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.lt.s32 %p345, %r395, %r250; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p346, %p345, %p344; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p347, %p342, %p346; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.f32 %r251, %r195, %r249, %p347; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r252, %r395, %r250, %p347; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + shfl.sync.bfly.b32 %r253, %r251, 8, 31, -1; + shfl.sync.bfly.b32 %r254, %r252, 8, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.gt.f32 %p348, %r251, %r253; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.eq.f32 %p349, %r251, %r253; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p350, %r251, %r251; + .loc 2 148 29 // triton_helpers.py:148:29 @[ 
c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p351, %r253, %r253; + setp.num.f32 %p352, %r253, %r253; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p353, %p350, %p352; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p354, %p348, %p353; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p355, %p351, %p350; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p356, %p349, %p355; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.lt.s32 %p357, %r252, %r254; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p358, %p357, %p356; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p359, %p354, %p358; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.f32 %r255, %r251, %r253, %p359; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r256, %r252, %r254, %p359; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + shfl.sync.bfly.b32 %r257, %r255, 4, 31, -1; + shfl.sync.bfly.b32 %r258, %r256, 4, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.gt.f32 %p360, %r255, %r257; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.eq.f32 %p361, %r255, %r257; + .loc 2 147 29 // triton_helpers.py:147:29 @[ 
c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p362, %r255, %r255; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p363, %r257, %r257; + setp.num.f32 %p364, %r257, %r257; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p365, %p362, %p364; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p366, %p360, %p365; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p367, %p363, %p362; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p368, %p361, %p367; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.lt.s32 %p369, %r256, %r258; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p370, %p369, %p368; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p371, %p366, %p370; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.f32 %r259, %r255, %r257, %p371; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r260, %r256, %r258, %p371; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + shfl.sync.bfly.b32 %r261, %r259, 2, 31, -1; + shfl.sync.bfly.b32 %r262, %r260, 2, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.gt.f32 %p372, %r259, %r261; + .loc 2 145 23 // triton_helpers.py:145:23 @[ 
c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.eq.f32 %p373, %r259, %r261; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p374, %r259, %r259; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p375, %r261, %r261; + setp.num.f32 %p376, %r261, %r261; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p377, %p374, %p376; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p378, %p372, %p377; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p379, %p375, %p374; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p380, %p373, %p379; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.lt.s32 %p381, %r260, %r262; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p382, %p381, %p380; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p383, %p378, %p382; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.f32 %r263, %r259, %r261, %p383; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r264, %r260, %r262, %p383; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + shfl.sync.bfly.b32 %r265, %r263, 1, 31, -1; + shfl.sync.bfly.b32 %r266, %r264, 1, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ 
c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.gt.f32 %p384, %r263, %r265; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.eq.f32 %p385, %r263, %r265; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p386, %r263, %r263; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p387, %r265, %r265; + setp.num.f32 %p388, %r265, %r265; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p389, %p386, %p388; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p390, %p384, %p389; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p391, %p387, %p386; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p392, %p385, %p391; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.lt.s32 %p393, %r264, %r266; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p394, %p393, %p392; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p395, %p390, %p394; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r160, %r264, %r266, %p395; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + shfl.sync.bfly.b32 %r267, %r194, 16, 31, -1; + shfl.sync.bfly.b32 %r268, %r394, 16, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ 
c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.gt.f32 %p396, %r194, %r267; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.eq.f32 %p397, %r194, %r267; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p398, %r267, %r267; + setp.num.f32 %p399, %r267, %r267; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p400, %p157, %p399; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p401, %p396, %p400; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p402, %p157, %p398; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p403, %p397, %p402; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.lt.s32 %p404, %r394, %r268; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p405, %p404, %p403; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p406, %p401, %p405; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.f32 %r269, %r194, %r267, %p406; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r270, %r394, %r268, %p406; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + shfl.sync.bfly.b32 %r271, %r269, 8, 31, -1; + shfl.sync.bfly.b32 %r272, %r270, 8, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ 
c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.gt.f32 %p407, %r269, %r271; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.eq.f32 %p408, %r269, %r271; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p409, %r269, %r269; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p410, %r271, %r271; + setp.num.f32 %p411, %r271, %r271; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p412, %p409, %p411; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p413, %p407, %p412; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p414, %p410, %p409; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p415, %p408, %p414; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.lt.s32 %p416, %r270, %r272; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p417, %p416, %p415; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p418, %p413, %p417; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.f32 %r273, %r269, %r271, %p418; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r274, %r270, %r272, %p418; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + 
shfl.sync.bfly.b32 %r275, %r273, 4, 31, -1; + shfl.sync.bfly.b32 %r276, %r274, 4, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.gt.f32 %p419, %r273, %r275; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.eq.f32 %p420, %r273, %r275; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p421, %r273, %r273; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p422, %r275, %r275; + setp.num.f32 %p423, %r275, %r275; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p424, %p421, %p423; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p425, %p419, %p424; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p426, %p422, %p421; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p427, %p420, %p426; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.lt.s32 %p428, %r274, %r276; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p429, %p428, %p427; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p430, %p425, %p429; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.f32 %r277, %r273, %r275, %p430; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r278, %r274, 
%r276, %p430; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + shfl.sync.bfly.b32 %r279, %r277, 2, 31, -1; + shfl.sync.bfly.b32 %r280, %r278, 2, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.gt.f32 %p431, %r277, %r279; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.eq.f32 %p432, %r277, %r279; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p433, %r277, %r277; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p434, %r279, %r279; + setp.num.f32 %p435, %r279, %r279; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p436, %p433, %p435; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p437, %p431, %p436; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p438, %p434, %p433; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p439, %p432, %p438; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.lt.s32 %p440, %r278, %r280; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p441, %p440, %p439; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p442, %p437, %p441; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.f32 %r281, %r277, %r279, %p442; + .loc 2 
155 69 // triton_helpers.py:155:69 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r282, %r278, %r280, %p442; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + shfl.sync.bfly.b32 %r283, %r281, 1, 31, -1; + shfl.sync.bfly.b32 %r284, %r282, 1, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.gt.f32 %p443, %r281, %r283; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.eq.f32 %p444, %r281, %r283; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p445, %r281, %r281; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p446, %r283, %r283; + setp.num.f32 %p447, %r283, %r283; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p448, %p445, %p447; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p449, %p443, %p448; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p450, %p446, %p445; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p451, %p444, %p450; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.lt.s32 %p452, %r282, %r284; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p453, %p452, %p451; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p454, %p449, %p453; + .loc 2 155 69 // 
triton_helpers.py:155:69 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r164, %r282, %r284, %p454; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + shfl.sync.bfly.b32 %r285, %r193, 16, 31, -1; + shfl.sync.bfly.b32 %r286, %r393, 16, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.gt.f32 %p455, %r193, %r285; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.eq.f32 %p456, %r193, %r285; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p457, %r285, %r285; + setp.num.f32 %p458, %r285, %r285; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p459, %p156, %p458; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p460, %p455, %p459; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p461, %p156, %p457; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p462, %p456, %p461; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.lt.s32 %p463, %r393, %r286; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p464, %p463, %p462; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p465, %p460, %p464; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.f32 %r287, %r193, %r285, %p465; + .loc 2 155 69 // 
triton_helpers.py:155:69 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r288, %r393, %r286, %p465; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + shfl.sync.bfly.b32 %r289, %r287, 8, 31, -1; + shfl.sync.bfly.b32 %r290, %r288, 8, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.gt.f32 %p466, %r287, %r289; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.eq.f32 %p467, %r287, %r289; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p468, %r287, %r287; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p469, %r289, %r289; + setp.num.f32 %p470, %r289, %r289; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p471, %p468, %p470; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p472, %p466, %p471; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p473, %p469, %p468; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p474, %p467, %p473; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.lt.s32 %p475, %r288, %r290; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p476, %p475, %p474; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p477, %p472, %p476; + .loc 2 155 35 // 
triton_helpers.py:155:35 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.f32 %r291, %r287, %r289, %p477; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r292, %r288, %r290, %p477; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + shfl.sync.bfly.b32 %r293, %r291, 4, 31, -1; + shfl.sync.bfly.b32 %r294, %r292, 4, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.gt.f32 %p478, %r291, %r293; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.eq.f32 %p479, %r291, %r293; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p480, %r291, %r291; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p481, %r293, %r293; + setp.num.f32 %p482, %r293, %r293; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p483, %p480, %p482; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p484, %p478, %p483; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p485, %p481, %p480; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p486, %p479, %p485; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.lt.s32 %p487, %r292, %r294; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p488, %p487, %p486; + .loc 2 154 12 // 
triton_helpers.py:154:12 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p489, %p484, %p488; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.f32 %r295, %r291, %r293, %p489; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r296, %r292, %r294, %p489; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + shfl.sync.bfly.b32 %r297, %r295, 2, 31, -1; + shfl.sync.bfly.b32 %r298, %r296, 2, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.gt.f32 %p490, %r295, %r297; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.eq.f32 %p491, %r295, %r297; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p492, %r295, %r295; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p493, %r297, %r297; + setp.num.f32 %p494, %r297, %r297; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p495, %p492, %p494; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p496, %p490, %p495; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p497, %p493, %p492; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p498, %p491, %p497; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.lt.s32 %p499, %r296, %r298; + .loc 2 154 21 // 
triton_helpers.py:154:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p500, %p499, %p498; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p501, %p496, %p500; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.f32 %r299, %r295, %r297, %p501; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r300, %r296, %r298, %p501; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + shfl.sync.bfly.b32 %r301, %r299, 1, 31, -1; + shfl.sync.bfly.b32 %r302, %r300, 1, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.gt.f32 %p502, %r299, %r301; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.eq.f32 %p503, %r299, %r301; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p504, %r299, %r299; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p505, %r301, %r301; + setp.num.f32 %p506, %r301, %r301; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p507, %p504, %p506; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p508, %p502, %p507; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p509, %p505, %p504; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p510, %p503, %p509; + .loc 2 154 31 // 
triton_helpers.py:154:31 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.lt.s32 %p511, %r300, %r302; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p512, %p511, %p510; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p513, %p508, %p512; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r168, %r300, %r302, %p513; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + shfl.sync.bfly.b32 %r303, %r192, 16, 31, -1; + shfl.sync.bfly.b32 %r304, %r392, 16, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.gt.f32 %p514, %r192, %r303; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.eq.f32 %p515, %r192, %r303; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p516, %r303, %r303; + setp.num.f32 %p517, %r303, %r303; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p518, %p155, %p517; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p519, %p514, %p518; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p520, %p155, %p516; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p521, %p515, %p520; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.lt.s32 %p522, %r392, %r304; + .loc 2 154 21 // 
triton_helpers.py:154:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p523, %p522, %p521; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p524, %p519, %p523; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.f32 %r305, %r192, %r303, %p524; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r306, %r392, %r304, %p524; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + shfl.sync.bfly.b32 %r307, %r305, 8, 31, -1; + shfl.sync.bfly.b32 %r308, %r306, 8, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.gt.f32 %p525, %r305, %r307; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.eq.f32 %p526, %r305, %r307; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p527, %r305, %r305; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p528, %r307, %r307; + setp.num.f32 %p529, %r307, %r307; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p530, %p527, %p529; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p531, %p525, %p530; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p532, %p528, %p527; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p533, %p526, %p532; + .loc 2 154 31 // 
triton_helpers.py:154:31 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.lt.s32 %p534, %r306, %r308; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p535, %p534, %p533; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p536, %p531, %p535; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.f32 %r309, %r305, %r307, %p536; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r310, %r306, %r308, %p536; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + shfl.sync.bfly.b32 %r311, %r309, 4, 31, -1; + shfl.sync.bfly.b32 %r312, %r310, 4, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.gt.f32 %p537, %r309, %r311; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.eq.f32 %p538, %r309, %r311; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p539, %r309, %r309; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p540, %r311, %r311; + setp.num.f32 %p541, %r311, %r311; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p542, %p539, %p541; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p543, %p537, %p542; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p544, %p540, %p539; + .loc 2 151 17 // 
triton_helpers.py:151:17 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p545, %p538, %p544; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.lt.s32 %p546, %r310, %r312; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p547, %p546, %p545; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p548, %p543, %p547; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.f32 %r313, %r309, %r311, %p548; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r314, %r310, %r312, %p548; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + shfl.sync.bfly.b32 %r315, %r313, 2, 31, -1; + shfl.sync.bfly.b32 %r316, %r314, 2, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.gt.f32 %p549, %r313, %r315; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.eq.f32 %p550, %r313, %r315; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p551, %r313, %r313; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p552, %r315, %r315; + setp.num.f32 %p553, %r315, %r315; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p554, %p551, %p553; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p555, %p549, %p554; + .loc 2 151 27 // 
triton_helpers.py:151:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p556, %p552, %p551; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p557, %p550, %p556; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.lt.s32 %p558, %r314, %r316; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p559, %p558, %p557; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p560, %p555, %p559; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.f32 %r317, %r313, %r315, %p560; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r318, %r314, %r316, %p560; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + shfl.sync.bfly.b32 %r319, %r317, 1, 31, -1; + shfl.sync.bfly.b32 %r320, %r318, 1, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.gt.f32 %p561, %r317, %r319; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.eq.f32 %p562, %r317, %r319; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p563, %r317, %r317; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p564, %r319, %r319; + setp.num.f32 %p565, %r319, %r319; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p566, %p563, %p565; + .loc 2 149 16 // 
triton_helpers.py:149:16 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p567, %p561, %p566; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p568, %p564, %p563; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p569, %p562, %p568; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.lt.s32 %p570, %r318, %r320; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p571, %p570, %p569; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p572, %p567, %p571; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r172, %r318, %r320, %p572; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + shfl.sync.bfly.b32 %r321, %r191, 16, 31, -1; + shfl.sync.bfly.b32 %r322, %r391, 16, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.gt.f32 %p573, %r191, %r321; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.eq.f32 %p574, %r191, %r321; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p575, %r321, %r321; + setp.num.f32 %p576, %r321, %r321; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p577, %p154, %p576; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p578, %p573, %p577; + .loc 2 151 27 // triton_helpers.py:151:27 
@[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p579, %p154, %p575; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p580, %p574, %p579; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.lt.s32 %p581, %r391, %r322; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p582, %p581, %p580; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p583, %p578, %p582; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.f32 %r323, %r191, %r321, %p583; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r324, %r391, %r322, %p583; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + shfl.sync.bfly.b32 %r325, %r323, 8, 31, -1; + shfl.sync.bfly.b32 %r326, %r324, 8, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.gt.f32 %p584, %r323, %r325; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.eq.f32 %p585, %r323, %r325; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p586, %r323, %r323; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p587, %r325, %r325; + setp.num.f32 %p588, %r325, %r325; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p589, %p586, %p588; + .loc 2 149 16 // triton_helpers.py:149:16 @[ 
c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p590, %p584, %p589; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p591, %p587, %p586; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p592, %p585, %p591; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.lt.s32 %p593, %r324, %r326; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p594, %p593, %p592; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p595, %p590, %p594; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.f32 %r327, %r323, %r325, %p595; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r328, %r324, %r326, %p595; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + shfl.sync.bfly.b32 %r329, %r327, 4, 31, -1; + shfl.sync.bfly.b32 %r330, %r328, 4, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.gt.f32 %p596, %r327, %r329; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.eq.f32 %p597, %r327, %r329; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p598, %r327, %r327; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p599, %r329, %r329; + setp.num.f32 %p600, %r329, %r329; + .loc 2 149 27 // triton_helpers.py:149:27 @[ 
c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p601, %p598, %p600; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p602, %p596, %p601; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p603, %p599, %p598; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p604, %p597, %p603; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.lt.s32 %p605, %r328, %r330; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p606, %p605, %p604; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p607, %p602, %p606; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.f32 %r331, %r327, %r329, %p607; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r332, %r328, %r330, %p607; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + shfl.sync.bfly.b32 %r333, %r331, 2, 31, -1; + shfl.sync.bfly.b32 %r334, %r332, 2, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.gt.f32 %p608, %r331, %r333; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.eq.f32 %p609, %r331, %r333; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p610, %r331, %r331; + .loc 2 148 29 // triton_helpers.py:148:29 @[ 
c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p611, %r333, %r333; + setp.num.f32 %p612, %r333, %r333; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p613, %p610, %p612; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p614, %p608, %p613; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p615, %p611, %p610; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p616, %p609, %p615; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.lt.s32 %p617, %r332, %r334; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p618, %p617, %p616; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p619, %p614, %p618; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.f32 %r335, %r331, %r333, %p619; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r336, %r332, %r334, %p619; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + shfl.sync.bfly.b32 %r337, %r335, 1, 31, -1; + shfl.sync.bfly.b32 %r338, %r336, 1, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.gt.f32 %p620, %r335, %r337; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.eq.f32 %p621, %r335, %r337; + .loc 2 147 29 // triton_helpers.py:147:29 @[ 
c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p622, %r335, %r335; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p623, %r337, %r337; + setp.num.f32 %p624, %r337, %r337; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p625, %p622, %p624; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p626, %p620, %p625; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p627, %p623, %p622; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p628, %p621, %p627; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.lt.s32 %p629, %r336, %r338; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p630, %p629, %p628; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p631, %p626, %p630; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r176, %r336, %r338, %p631; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + bfe.u32 %r339, %r2, 5, 1; + setp.eq.b32 %p130, %r186, 0; + shr.u32 %r340, %r3, 5; + or.b32 %r341, %r340, %r339; + shl.b32 %r342, %r341, 2; + mov.b32 %r343, global_smem; + add.s32 %r145, %r343, %r342; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r146, %r209, %r211, %p218; + .loc 2 165 42 // triton_helpers.py:165:42 @[ 
c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + // begin inline asm + @%p130 st.shared.b32 [ %r145 + 0 ], %r146; + // end inline asm + add.s32 %r344, %r343, 512; + add.s32 %r147, %r344, %r342; + // begin inline asm + @%p130 st.shared.b32 [ %r147 + 0 ], %r148; + // end inline asm + shl.b32 %r345, %r339, 2; + shl.b32 %r346, %r5, 3; + or.b32 %r347, %r346, %r345; + add.s32 %r149, %r343, %r347; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r150, %r227, %r229, %p277; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + // begin inline asm + @%p130 st.shared.b32 [ %r149 + 0 ], %r150; + // end inline asm + add.s32 %r151, %r344, %r347; + // begin inline asm + @%p130 st.shared.b32 [ %r151 + 0 ], %r152; + // end inline asm + shl.b32 %r348, %r6, 3; + or.b32 %r349, %r348, %r345; + add.s32 %r153, %r343, %r349; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r154, %r245, %r247, %p336; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + // begin inline asm + @%p130 st.shared.b32 [ %r153 + 0 ], %r154; + // end inline asm + add.s32 %r155, %r344, %r349; + // begin inline asm + @%p130 st.shared.b32 [ %r155 + 0 ], %r156; + // end inline asm + shl.b32 %r350, %r7, 3; + or.b32 %r351, %r350, %r345; + add.s32 %r157, %r343, %r351; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r158, %r263, %r265, %p395; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + // begin inline asm + @%p130 st.shared.b32 [ %r157 + 0 ], %r158; + // end inline asm + add.s32 %r159, %r344, %r351; + // begin inline asm + @%p130 st.shared.b32 [ %r159 + 0 ], %r160; + // end inline asm + shl.b32 %r352, 
%r11, 3; + or.b32 %r353, %r352, %r345; + add.s32 %r161, %r343, %r353; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r162, %r281, %r283, %p454; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + // begin inline asm + @%p130 st.shared.b32 [ %r161 + 0 ], %r162; + // end inline asm + add.s32 %r163, %r344, %r353; + // begin inline asm + @%p130 st.shared.b32 [ %r163 + 0 ], %r164; + // end inline asm + shl.b32 %r354, %r10, 3; + or.b32 %r355, %r354, %r345; + add.s32 %r165, %r343, %r355; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r166, %r299, %r301, %p513; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + // begin inline asm + @%p130 st.shared.b32 [ %r165 + 0 ], %r166; + // end inline asm + add.s32 %r167, %r344, %r355; + // begin inline asm + @%p130 st.shared.b32 [ %r167 + 0 ], %r168; + // end inline asm + shl.b32 %r356, %r9, 3; + or.b32 %r357, %r356, %r345; + add.s32 %r169, %r343, %r357; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r170, %r317, %r319, %p572; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + // begin inline asm + @%p130 st.shared.b32 [ %r169 + 0 ], %r170; + // end inline asm + add.s32 %r171, %r344, %r357; + // begin inline asm + @%p130 st.shared.b32 [ %r171 + 0 ], %r172; + // end inline asm + shl.b32 %r358, %r8, 3; + or.b32 %r359, %r358, %r345; + add.s32 %r173, %r343, %r359; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r174, %r335, %r337, %p631; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + // begin 
inline asm + @%p130 st.shared.b32 [ %r173 + 0 ], %r174; + // end inline asm + add.s32 %r175, %r344, %r359; + // begin inline asm + @%p130 st.shared.b32 [ %r175 + 0 ], %r176; + // end inline asm + bar.sync 0; + setp.lt.u32 %p146, %r2, 128; + shl.b32 %r360, %r2, 2; + add.s32 %r178, %r343, %r360; + // begin inline asm + @%p146 ld.shared.b32 %r177, [ %r178 + 0 ]; + // end inline asm + add.s32 %r180, %r344, %r360; + // begin inline asm + @%p146 ld.shared.b32 %r179, [ %r180 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r361, %r177, 1, 31, -1; + shfl.sync.bfly.b32 %r362, %r179, 1, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.gt.f32 %p632, %r177, %r361; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.eq.f32 %p633, %r177, %r361; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p634, %r177, %r177; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p635, %r361, %r361; + setp.num.f32 %p636, %r361, %r361; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p637, %p634, %p636; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p638, %p632, %p637; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p639, %p634, %p635; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p640, %p633, %p639; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.lt.s32 %p641, %r179, %r362; + .loc 2 154 21 // triton_helpers.py:154:21 @[ 
c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p642, %p641, %p640; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p643, %p638, %p642; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r184, %r179, %r362, %p643; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.b32 %r363, %r2, 897; + setp.eq.b32 %p148, %r363, 0; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r182, %r177, %r361, %p643; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + // begin inline asm + @%p148 st.shared.b32 [ %r178 + 0 ], %r182; + // end inline asm + // begin inline asm + @%p148 st.shared.b32 [ %r180 + 0 ], %r184; + // end inline asm + bar.sync 0; + shr.u32 %r364, %r3, 3; + add.s32 %r365, %r344, %r364; + ld.shared.b32 %r366, [%r365]; + add.s32 %r367, %r344, %r346; + ld.shared.b32 %r368, [%r367]; + add.s32 %r369, %r344, %r348; + ld.shared.b32 %r370, [%r369]; + add.s32 %r371, %r344, %r350; + ld.shared.b32 %r372, [%r371]; + add.s32 %r373, %r344, %r352; + ld.shared.b32 %r374, [%r373]; + add.s32 %r375, %r344, %r354; + ld.shared.b32 %r376, [%r375]; + add.s32 %r377, %r344, %r356; + ld.shared.b32 %r378, [%r377]; + add.s32 %r379, %r344, %r358; + ld.shared.b32 %r380, [%r379]; +$L__tmp5: + .loc 1 47 25 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:47:25 + mad.wide.s32 %rd192, %r185, 8, %rd85; + .loc 1 47 36 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:47:36 + bar.sync 0; + shr.u32 %r381, %r3, 2; + add.s32 %r382, %r343, %r381; + st.shared.v4.b32 [%r382], {%r366, %r368, %r370, %r372}; + st.shared.v4.b32 [%r382+128], {%r374, %r376, %r378, %r380}; + bar.sync 0; + shl.b32 %r383, %r2, 4; + and.b32 %r384, %r383, 
112; + shr.u32 %r385, %r2, 1; + and.b32 %r386, %r385, 12; + and.b32 %r387, %r360, 128; + add.s32 %r388, %r343, %r384; + add.s32 %r389, %r388, %r387; + add.s32 %r390, %r389, %r386; + ld.shared.s32 %rd191, [%r390]; + setp.eq.b32 %p644, %r3, 0; + and.pred %p150, %p644, %p151; + // begin inline asm + @%p150 st.global.b64 [ %rd192 + 0 ], { %rd191 }; + // end inline asm + .loc 1 47 4 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:47:4 + ret; +$L__tmp6: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // 
DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 234 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0xe3 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 52 +.b8 119 +.b8 100 +.b8 104 +.b8 119 +.b8 108 +.b8 117 +.b8 54 +.b8 121 +.b8 98 +.b8 51 +.b8 119 +.b8 99 +.b8 119 +.b8 97 +.b8 122 +.b8 100 +.b8 110 +.b8 122 +.b8 109 +.b8 103 +.b8 122 +.b8 101 +.b8 119 +.b8 105 +.b8 101 +.b8 109 +.b8 118 +.b8 122 +.b8 110 +.b8 120 +.b8 118 +.b8 114 +.b8 114 +.b8 51 +.b8 53 +.b8 50 +.b8 53 +.b8 101 +.b8 111 +.b8 106 +.b8 117 +.b8 112 +.b8 113 +.b8 106 +.b8 108 +.b8 100 +.b8 111 +.b8 53 +.b8 112 +.b8 116 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 52 +.b8 119 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x1c DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 97 +.b8 114 +.b8 103 +.b8 109 +.b8 97 +.b8 120 +.b8 95 +.b8 49 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0xa7:0x46 DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0xbc:0x18 DW_TAG_inlined_subroutine +.b32 
139 // DW_AT_abstract_origin +.b64 $L__tmp0 // DW_AT_low_pc +.b64 $L__tmp3 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 41 // DW_AT_call_line +.b8 38 // DW_AT_call_column +.b8 4 // Abbrev [4] 0xd4:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp4 // DW_AT_low_pc +.b64 $L__tmp5 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 45 // DW_AT_call_line +.b8 75 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/BZGMO2ROUNNVUMEUFCQSSF6FGNZS2MEIPAEECDHYOJVVX3KB75AA/triton_red_fused_argmax_1.source b/SpecForge-ext/cache/compiled_kernels/triton/7/BZGMO2ROUNNVUMEUFCQSSF6FGNZS2MEIPAEECDHYOJVVX3KB75AA/triton_red_fused_argmax_1.source new file mode 100644 index 0000000000000000000000000000000000000000..34de811fbbc5d6835b2b600008f5a1c0bb0586dd --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/BZGMO2ROUNNVUMEUFCQSSF6FGNZS2MEIPAEECDHYOJVVX3KB75AA/triton_red_fused_argmax_1.source @@ -0,0 +1,323 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":18:0) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":143:0) +#loc47 = loc(unknown) +#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":86:0) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":63:0) +#loc68 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":164:0) +#loc72 = loc("in_ptr0"(#loc)) +#loc73 = loc("out_ptr0"(#loc)) +#loc74 = loc("ks0"(#loc)) +#loc75 = loc("ks1"(#loc)) +#loc76 = loc("xnumel"(#loc)) +#loc77 = loc("r0_numel"(#loc)) +#loc106 = loc("a_value"(#loc35)) +#loc107 = loc("a_index"(#loc35)) +#loc108 = loc("b_value"(#loc35)) +#loc109 = loc("b_index"(#loc35)) 
+#loc122 = loc("x"(#loc55)) +#loc123 = loc("x"(#loc59)) +#loc124 = loc("value"(#loc68)) +#loc125 = loc("index"(#loc68)) +module { + tt.func public @triton_red_fused_argmax_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %r0_numel_0 = arith.constant 32000 : i32 loc(#loc78) + %xoffset = tt.get_program_id x : i32 loc(#loc79) + %xoffset_1 = arith.constant 64 : i32 loc(#loc80) + %xoffset_2 = arith.constant 64 : i32 loc(#loc80) + %xoffset_3 = arith.muli %xoffset, %xoffset_2 : i32 loc(#loc80) + %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc81) + %xindex_4 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc82) + %xindex_5 = tt.splat %xoffset_3 : i32 -> tensor<64x1xi32> loc(#loc83) + %xindex_6 = arith.addi %xindex_5, %xindex_4 : tensor<64x1xi32> loc(#loc83) + %xmask = tt.splat %xnumel : i32 -> tensor<64x1xi32> loc(#loc84) + %xmask_7 = arith.cmpi slt, %xindex_6, %xmask : tensor<64x1xi32> loc(#loc84) + %r0_base = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc85) + %r0_base_8 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc86) + %x0 = arith.extsi %xindex_6 : tensor<64x1xi32> to tensor<64x1xi64> loc(#loc87) + %x0_9 = tt.splat %ks0 : i64 -> tensor<64x1xi64> loc(#loc87) + %x0_10 = arith.remsi %x0, %x0_9 : tensor<64x1xi64> loc(#loc87) + %x1 = arith.extsi %xindex_6 : tensor<64x1xi32> to tensor<64x1xi64> loc(#loc88) + %x1_11 = tt.splat %ks0 : i64 -> tensor<64x1xi64> loc(#loc88) + %x1_12 = arith.divsi %x1, %x1_11 : tensor<64x1xi64> loc(#loc88) + %_tmp2 = arith.constant 0xFF800000 : f32 loc(#loc89) + %_tmp2_13 = arith.constant dense<0xFF800000> : tensor<64x64xf32> 
loc(#loc89) + %_tmp2_index = arith.constant 2147483647 : i32 loc(#loc90) + %_tmp2_index_14 = arith.constant dense<2147483647> : tensor<64x64xi32> loc(#loc90) + %c0_i32 = arith.constant 0 : i32 loc(#loc14) + %c64_i32 = arith.constant 64 : i32 loc(#loc14) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc14) + %1 = arith.bitcast %r0_numel_0 : i32 to i32 loc(#loc14) + %2 = arith.bitcast %c64_i32 : i32 to i32 loc(#loc14) + %3 = ub.poison : i32 loc(#loc14) + %_tmp2_index_15:2 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp2_16 = %_tmp2_13, %_tmp2_index_17 = %_tmp2_index_14) -> (tensor<64x64xf32>, tensor<64x64xi32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x64xi32> loc(#loc92) + %r0_index_18 = arith.addi %r0_index, %r0_base_8 : tensor<1x64xi32> loc(#loc92) + %r0_mask = arith.constant dense<32000> : tensor<1x64xi32> loc(#loc93) + %r0_mask_19 = arith.cmpi slt, %r0_index_18, %r0_mask : tensor<1x64xi32> loc(#loc93) + %tmp0 = arith.constant 32000 : i32 loc(#loc94) + %tmp0_20 = arith.constant 32000 : i64 loc(#loc94) + %tmp0_21 = arith.constant dense<32000> : tensor<64x1xi64> loc(#loc94) + %tmp0_22 = arith.muli %tmp0_21, %x0_10 : tensor<64x1xi64> loc(#loc94) + %tmp0_23 = arith.extsi %r0_index_18 : tensor<1x64xi32> to tensor<1x64xi64> loc(#loc95) + %tmp0_24 = tt.broadcast %tmp0_23 : tensor<1x64xi64> -> tensor<64x64xi64> loc(#loc95) + %tmp0_25 = tt.broadcast %tmp0_22 : tensor<64x1xi64> -> tensor<64x64xi64> loc(#loc95) + %tmp0_26 = arith.addi %tmp0_24, %tmp0_25 : tensor<64x64xi64> loc(#loc95) + %tmp0_27 = tt.splat %ks1 : i64 -> tensor<64x1xi64> loc(#loc96) + %tmp0_28 = arith.muli %tmp0_27, %x1_12 : tensor<64x1xi64> loc(#loc96) + %tmp0_29 = tt.broadcast %tmp0_28 : tensor<64x1xi64> -> tensor<64x64xi64> loc(#loc97) + %tmp0_30 = arith.addi %tmp0_26, %tmp0_29 : tensor<64x64xi64> loc(#loc97) + %tmp0_31 = tt.splat %in_ptr0 : !tt.ptr -> tensor<64x64x!tt.ptr> loc(#loc98) + %tmp0_32 = tt.addptr %tmp0_31, %tmp0_30 : tensor<64x64x!tt.ptr>, tensor<64x64xi64> 
loc(#loc98) + %tmp0_33 = tt.broadcast %r0_mask_19 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc99) + %tmp0_34 = tt.broadcast %xmask_7 : tensor<64x1xi1> -> tensor<64x64xi1> loc(#loc99) + %tmp0_35 = arith.andi %tmp0_33, %tmp0_34 : tensor<64x64xi1> loc(#loc99) + %tmp0_36 = arith.constant 0.000000e+00 : f32 loc(#loc100) + %tmp0_37 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc100) + %tmp0_38 = tt.load %tmp0_32, %tmp0_35, %tmp0_37 evictionPolicy = evict_first : tensor<64x64x!tt.ptr> loc(#loc100) + %8:2 = tt.call @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32S64_64S_i32S64_64S_fp32S64_64S_i32S1_64S__(%_tmp2_16, %_tmp2_index_17, %tmp0_38, %r0_index_18) : (tensor<64x64xf32>, tensor<64x64xi32>, tensor<64x64xf32>, tensor<1x64xi32>) -> (tensor<64x64xf32>, tensor<64x64xi32>) loc(#loc24) + %_tmp2_39 = tt.broadcast %r0_mask_19 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc101) + %_tmp2_40 = tt.broadcast %xmask_7 : tensor<64x1xi1> -> tensor<64x64xi1> loc(#loc101) + %_tmp2_41 = arith.andi %_tmp2_39, %_tmp2_40 : tensor<64x64xi1> loc(#loc101) + %_tmp2_42 = arith.select %_tmp2_41, %8#0, %_tmp2_16 : tensor<64x64xi1>, tensor<64x64xf32> loc(#loc102) + %_tmp2_index_43 = tt.broadcast %r0_mask_19 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc103) + %_tmp2_index_44 = tt.broadcast %xmask_7 : tensor<64x1xi1> -> tensor<64x64xi1> loc(#loc103) + %_tmp2_index_45 = arith.andi %_tmp2_index_43, %_tmp2_index_44 : tensor<64x64xi1> loc(#loc103) + %_tmp2_index_46 = arith.select %_tmp2_index_45, %8#1, %_tmp2_index_17 : tensor<64x64xi1>, tensor<64x64xi32> loc(#loc104) + scf.yield %_tmp2_42, %_tmp2_index_46 : tensor<64x64xf32>, tensor<64x64xi32> loc(#loc29) + } loc(#loc126) + %4:2 = tt.call @"torch._inductor.runtime.triton_helpers.max_with_index__fp32S64_64S_i32S64_64S__(2,)cconstexpr_1_"(%_tmp2_index_15#0, %_tmp2_index_15#1) : (tensor<64x64xf32>, tensor<64x64xi32>) -> (tensor<64xf32>, tensor<64xi32>) loc(#loc30) + %tmp2 = tt.expand_dims %4#1 {axis = 1 : i32} : 
tensor<64xi32> -> tensor<64x1xi32> loc(#loc105) + %5 = tt.splat %out_ptr0 : !tt.ptr -> tensor<64x1x!tt.ptr> loc(#loc32) + %6 = tt.addptr %5, %xindex_6 : tensor<64x1x!tt.ptr>, tensor<64x1xi32> loc(#loc32) + %7 = arith.extsi %tmp2 : tensor<64x1xi32> to tensor<64x1xi64> loc(#loc33) + tt.store %6, %7, %xmask_7 : tensor<64x1x!tt.ptr> loc(#loc33) + tt.return loc(#loc34) + } loc(#loc) + tt.func private @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32S64_64S_i32S64_64S_fp32S64_64S_i32S1_64S__(%a_value: tensor<64x64xf32> loc("a_value"(#loc35)), %a_index: tensor<64x64xi32> loc("a_index"(#loc35)), %b_value: tensor<64x64xf32> loc("b_value"(#loc35)), %b_index: tensor<1x64xi32> loc("b_index"(#loc35))) -> (tensor<64x64xf32>, tensor<64x64xi32>) attributes {noinline = false} { + %mask = arith.cmpf ogt, %a_value, %b_value : tensor<64x64xf32> loc(#loc127) + %equal = arith.cmpf oeq, %a_value, %b_value : tensor<64x64xf32> loc(#loc128) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__fp32S64_64S__(%a_value) : (tensor<64x64xf32>) -> i1 loc(#loc38) + %1:2 = scf.if %0 -> (tensor<64x64xi1>, tensor<64x64xi1>) { + %a_isnan = arith.cmpf une, %a_value, %a_value : tensor<64x64xf32> loc(#loc112) + %b_isnan = arith.cmpf une, %b_value, %b_value : tensor<64x64xf32> loc(#loc113) + %mask_4 = arith.constant true loc(#loc114) + %mask_5 = arith.constant dense : tensor<64x64xi1> loc(#loc114) + %mask_6 = arith.xori %b_isnan, %mask_5 : tensor<64x64xi1> loc(#loc114) + %mask_7 = arith.andi %a_isnan, %mask_6 : tensor<64x64xi1> loc(#loc115) + %mask_8 = arith.ori %mask, %mask_7 : tensor<64x64xi1> loc(#loc129) + %equal_9 = arith.andi %a_isnan, %b_isnan : tensor<64x64xi1> loc(#loc117) + %equal_10 = arith.ori %equal, %equal_9 : tensor<64x64xi1> loc(#loc130) + scf.yield %mask_8, %equal_10 : tensor<64x64xi1>, tensor<64x64xi1> loc(#loc130) + } else { + scf.yield %mask, %equal : tensor<64x64xi1>, tensor<64x64xi1> loc(#loc47) + } loc(#loc39) + %mask_0 = tt.broadcast %b_index : 
tensor<1x64xi32> -> tensor<64x64xi32> loc(#loc119) + %mask_1 = arith.cmpi slt, %a_index, %mask_0 : tensor<64x64xi32> loc(#loc119) + %mask_2 = arith.andi %1#1, %mask_1 : tensor<64x64xi1> loc(#loc120) + %mask_3 = arith.ori %1#0, %mask_2 : tensor<64x64xi1> loc(#loc121) + %2 = arith.select %mask_3, %a_value, %b_value : tensor<64x64xi1>, tensor<64x64xf32> loc(#loc51) + %3 = tt.broadcast %b_index : tensor<1x64xi32> -> tensor<64x64xi32> loc(#loc52) + %4 = arith.select %mask_3, %a_index, %3 : tensor<64x64xi1>, tensor<64x64xi32> loc(#loc52) + tt.return %2, %4 : tensor<64x64xf32>, tensor<64x64xi32> loc(#loc53) + ^bb1: // no predecessors + %5 = ub.poison : tensor<64x64xf32> loc(#loc54) + %6 = ub.poison : tensor<64x64xi32> loc(#loc54) + tt.return %5, %6 : tensor<64x64xf32>, tensor<64x64xi32> loc(#loc54) + } loc(#loc35) + tt.func private @torch._inductor.runtime.triton_helpers.is_floating__fp32S64_64S__(%x: tensor<64x64xf32> loc("x"(#loc55))) -> i1 attributes {noinline = false} { + %0 = tt.call @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32S64_64S__(%x) : (tensor<64x64xf32>) -> tensor<64x64xf32> loc(#loc56) + %true = arith.constant true loc(#loc57) + tt.return %true : i1 loc(#loc57) + ^bb1: // no predecessors + %1 = ub.poison : i1 loc(#loc58) + tt.return %1 : i1 loc(#loc58) + } loc(#loc55) + tt.func private @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32S64_64S__(%x: tensor<64x64xf32> loc("x"(#loc59))) -> tensor<64x64xf32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc60) + %1 = arith.uitofp %0 : tensor<1xi1> to tensor<1xf32> loc(#loc61) + %2 = tt.expand_dims %1 {axis = 0 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc61) + %3 = tt.broadcast %2 : tensor<1x1xf32> -> tensor<64x64xf32> loc(#loc61) + %4 = arith.addf %x, %3 : tensor<64x64xf32> loc(#loc61) + tt.return %4 : tensor<64x64xf32> loc(#loc62) + ^bb1: // no predecessors + %5 = 
ub.poison : tensor<64x64xf32> loc(#loc63) + tt.return %5 : tensor<64x64xf32> loc(#loc63) + } loc(#loc59) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() -> tensor<1xi1> attributes {noinline = false} { + %false = arith.constant false loc(#loc65) + %cst = arith.constant dense : tensor<1xi1> loc(#loc65) + tt.return %cst : tensor<1xi1> loc(#loc66) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1xi1> loc(#loc67) + tt.return %0 : tensor<1xi1> loc(#loc67) + } loc(#loc64) + tt.func private @"torch._inductor.runtime.triton_helpers.max_with_index__fp32S64_64S_i32S64_64S__(2,)cconstexpr_1_"(%value: tensor<64x64xf32> loc("value"(#loc68)), %index: tensor<64x64xi32> loc("index"(#loc68))) -> (tensor<64xf32>, tensor<64xi32>) attributes {noinline = false} { + %0:2 = "tt.reduce"(%value, %index) <{axis = 1 : i32}> ({ + ^bb0(%arg2: f32 loc(unknown), %arg3: i32 loc(unknown), %arg4: f32 loc(unknown), %arg5: i32 loc(unknown)): + %3:2 = tt.call @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32_i32_fp32_i32__(%arg2, %arg3, %arg4, %arg5) : (f32, i32, f32, i32) -> (f32, i32) loc(#loc69) + tt.reduce.return %3#0, %3#1 : f32, i32 loc(#loc69) + }) : (tensor<64x64xf32>, tensor<64x64xi32>) -> (tensor<64xf32>, tensor<64xi32>) loc(#loc69) + tt.return %0#0, %0#1 : tensor<64xf32>, tensor<64xi32> loc(#loc70) + ^bb1: // no predecessors + %1 = ub.poison : tensor<64xf32> loc(#loc71) + %2 = ub.poison : tensor<64xi32> loc(#loc71) + tt.return %1, %2 : tensor<64xf32>, tensor<64xi32> loc(#loc71) + } loc(#loc68) + tt.func private @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32_i32_fp32_i32__(%a_value: f32 loc("a_value"(#loc35)), %a_index: i32 loc("a_index"(#loc35)), %b_value: f32 loc("b_value"(#loc35)), %b_index: i32 loc("b_index"(#loc35))) -> (f32, i32) attributes {noinline = false} { + %mask = arith.cmpf ogt, %a_value, %b_value : f32 loc(#loc127) + %equal = arith.cmpf oeq, %a_value, %b_value : f32 loc(#loc128) + %0 = 
tt.call @torch._inductor.runtime.triton_helpers.is_floating__fp32__(%a_value) : (f32) -> i1 loc(#loc38) + %1:2 = scf.if %0 -> (i1, i1) { + %a_isnan = arith.cmpf une, %a_value, %a_value : f32 loc(#loc112) + %b_isnan = arith.cmpf une, %b_value, %b_value : f32 loc(#loc113) + %mask_3 = arith.constant true loc(#loc114) + %mask_4 = arith.xori %b_isnan, %mask_3 : i1 loc(#loc114) + %mask_5 = arith.andi %a_isnan, %mask_4 : i1 loc(#loc115) + %mask_6 = arith.ori %mask, %mask_5 : i1 loc(#loc129) + %equal_7 = arith.andi %a_isnan, %b_isnan : i1 loc(#loc117) + %equal_8 = arith.ori %equal, %equal_7 : i1 loc(#loc130) + scf.yield %mask_6, %equal_8 : i1, i1 loc(#loc130) + } else { + scf.yield %mask, %equal : i1, i1 loc(#loc47) + } loc(#loc39) + %mask_0 = arith.cmpi slt, %a_index, %b_index : i32 loc(#loc119) + %mask_1 = arith.andi %1#1, %mask_0 : i1 loc(#loc120) + %mask_2 = arith.ori %1#0, %mask_1 : i1 loc(#loc121) + %2 = arith.select %mask_2, %a_value, %b_value : f32 loc(#loc51) + %3 = arith.select %mask_2, %a_index, %b_index : i32 loc(#loc52) + tt.return %2, %3 : f32, i32 loc(#loc53) + ^bb1: // no predecessors + %4 = ub.poison : f32 loc(#loc54) + %5 = ub.poison : i32 loc(#loc54) + tt.return %4, %5 : f32, i32 loc(#loc54) + } loc(#loc35) + tt.func private @torch._inductor.runtime.triton_helpers.is_floating__fp32__(%x: f32 loc("x"(#loc55))) -> i1 attributes {noinline = false} { + %0 = tt.call @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32__(%x) : (f32) -> tensor<1xf32> loc(#loc56) + %true = arith.constant true loc(#loc57) + tt.return %true : i1 loc(#loc57) + ^bb1: // no predecessors + %1 = ub.poison : i1 loc(#loc58) + tt.return %1 : i1 loc(#loc58) + } loc(#loc55) + tt.func private @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32__(%x: f32 loc("x"(#loc59))) -> tensor<1xf32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc60) + %1 = arith.uitofp %0 : 
tensor<1xi1> to tensor<1xf32> loc(#loc61) + %2 = tt.splat %x : f32 -> tensor<1xf32> loc(#loc61) + %3 = arith.addf %2, %1 : tensor<1xf32> loc(#loc61) + tt.return %3 : tensor<1xf32> loc(#loc62) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1xf32> loc(#loc63) + tt.return %4 : tensor<1xf32> loc(#loc63) + } loc(#loc59) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":19:15) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":22:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":22:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":23:36) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":23:44) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":23:23) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":24:21) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":25:27) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":25:37) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":27:19) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":28:19) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":29:55) +#loc13 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":30:58) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":32:40) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":33:31) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":34:29) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:47) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:41) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:56) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:52) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:34) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:71) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:61) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":41:38) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":43:35) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":43:54) +#loc27 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":44:41) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":44:66) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":44:8) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":45:75) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":46:20) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":47:25) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":47:36) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":47:4) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":144:21) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":145:23) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":146:19) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":146:7) +#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":147:29) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":148:29) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:31) +#loc43 = 
loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:27) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:16) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:27) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:17) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:31) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:21) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:12) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:35) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:69) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:11) +#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:4) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:29) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:11) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:4) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:30) +#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:15) +#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:11) +#loc63 = 
loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:4) +#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":118:0) +#loc65 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:31) +#loc66 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:11) +#loc67 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:4) +#loc69 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:42) +#loc70 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:11) +#loc71 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:4) +#loc78 = loc("r0_numel"(#loc1)) +#loc79 = loc("xoffset"(#loc2)) +#loc80 = loc("xoffset"(#loc3)) +#loc81 = loc("xindex"(#loc4)) +#loc82 = loc("xindex"(#loc5)) +#loc83 = loc("xindex"(#loc6)) +#loc84 = loc("xmask"(#loc7)) +#loc85 = loc("r0_base"(#loc8)) +#loc86 = loc("r0_base"(#loc9)) +#loc87 = loc("x0"(#loc10)) +#loc88 = loc("x1"(#loc11)) +#loc89 = loc("_tmp2"(#loc12)) +#loc90 = loc("_tmp2_index"(#loc13)) +#loc91 = loc("_tmp2"(#loc14)) +#loc92 = loc("r0_index"(#loc15)) +#loc93 = loc("r0_mask"(#loc16)) +#loc94 = loc("tmp0"(#loc17)) +#loc95 = loc("tmp0"(#loc18)) +#loc96 = loc("tmp0"(#loc19)) +#loc97 = loc("tmp0"(#loc20)) +#loc98 = loc("tmp0"(#loc21)) +#loc99 = loc("tmp0"(#loc22)) +#loc100 = loc("tmp0"(#loc23)) +#loc101 = loc("_tmp2"(#loc25)) +#loc102 = loc("_tmp2"(#loc26)) +#loc103 = loc("_tmp2_index"(#loc27)) +#loc104 = loc("_tmp2_index"(#loc28)) +#loc105 = loc("tmp2"(#loc31)) +#loc110 = loc("mask"(#loc36)) +#loc111 = loc("equal"(#loc37)) +#loc112 = loc("a_isnan"(#loc40)) +#loc113 = loc("b_isnan"(#loc41)) +#loc114 = loc("mask"(#loc42)) +#loc115 = loc("mask"(#loc43)) +#loc116 = loc("mask"(#loc44)) +#loc117 = 
loc("equal"(#loc45)) +#loc118 = loc("equal"(#loc46)) +#loc119 = loc("mask"(#loc48)) +#loc120 = loc("mask"(#loc49)) +#loc121 = loc("mask"(#loc50)) +#loc126 = loc("_tmp2_index"(#loc91)) +#loc127 = loc("mask"(#loc110)) +#loc128 = loc("equal"(#loc111)) +#loc129 = loc("mask"(#loc116)) +#loc130 = loc("equal"(#loc118)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/BZGMO2ROUNNVUMEUFCQSSF6FGNZS2MEIPAEECDHYOJVVX3KB75AA/triton_red_fused_argmax_1.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/7/BZGMO2ROUNNVUMEUFCQSSF6FGNZS2MEIPAEECDHYOJVVX3KB75AA/triton_red_fused_argmax_1.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..3089fd8704bd2acf06bd60d81ee5e4dfb930af80 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/BZGMO2ROUNNVUMEUFCQSSF6FGNZS2MEIPAEECDHYOJVVX3KB75AA/triton_red_fused_argmax_1.ttgir @@ -0,0 +1,217 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [8, 2], order = [1, 0]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 8], order = [0, 1]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":18:0) +#loc1 = loc(unknown) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":45:75) +#loc44 = loc("in_ptr0"(#loc)) +#loc45 = loc("out_ptr0"(#loc)) +#loc46 = loc("ks0"(#loc)) +#loc47 = loc("ks1"(#loc)) +#loc48 = loc("xnumel"(#loc)) +#loc49 = loc("r0_numel"(#loc)) +#loc85 = loc(callsite(#loc1 at #loc39)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 16 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused_argmax_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %xnumel: 
i32 loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<32000> : tensor<64x1xi64, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<64x64xf32, #blocked> loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %c32000_i32 = arith.constant 32000 : i32 loc(#loc1) + %cst_1 = arith.constant dense : tensor<64x64xi1, #blocked> loc(#loc1) + %true = arith.constant true loc(#loc1) + %cst_2 = arith.constant dense<32000> : tensor<1x64xi32, #blocked> loc(#loc1) + %cst_3 = arith.constant dense<2147483647> : tensor<64x64xi32, #blocked> loc(#loc1) + %cst_4 = arith.constant dense<0xFF800000> : tensor<64x64xf32, #blocked> loc(#loc1) + %c64_i32 = arith.constant 64 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc50) + %xoffset_5 = arith.muli %xoffset, %c64_i32 : i32 loc(#loc51) + %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc52) + %xindex_6 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc52) + %xindex_7 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc52) + %xindex_8 = tt.expand_dims %xindex_6 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<64x1xi32, #blocked1> loc(#loc52) + %xindex_9 = tt.splat %xoffset_5 : i32 -> tensor<64x1xi32, #blocked> loc(#loc53) + %xindex_10 = tt.splat %xoffset_5 : i32 -> tensor<64x1xi32, #blocked1> loc(#loc53) + %xindex_11 = arith.addi %xindex_9, %xindex_7 : tensor<64x1xi32, #blocked> loc(#loc53) + %xindex_12 = arith.addi %xindex_10, %xindex_8 : tensor<64x1xi32, #blocked1> loc(#loc53) + %xmask = tt.splat %xnumel : i32 -> tensor<64x1xi32, #blocked> loc(#loc54) + %xmask_13 = tt.splat %xnumel : i32 -> tensor<64x1xi32, #blocked1> loc(#loc54) + 
%xmask_14 = arith.cmpi slt, %xindex_11, %xmask : tensor<64x1xi32, #blocked> loc(#loc54) + %xmask_15 = arith.cmpi slt, %xindex_12, %xmask_13 : tensor<64x1xi32, #blocked1> loc(#loc54) + %r0_base = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc55) + %r0_base_16 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x64xi32, #blocked> loc(#loc55) + %x0 = arith.extsi %xindex_11 : tensor<64x1xi32, #blocked> to tensor<64x1xi64, #blocked> loc(#loc56) + %x0_17 = tt.splat %ks0 : i64 -> tensor<64x1xi64, #blocked> loc(#loc56) + %x0_18 = arith.remsi %x0, %x0_17 : tensor<64x1xi64, #blocked> loc(#loc56) + %x1 = arith.divsi %x0, %x0_17 : tensor<64x1xi64, #blocked> loc(#loc57) + %tmp0 = arith.muli %x0_18, %cst : tensor<64x1xi64, #blocked> loc(#loc58) + %tmp0_19 = tt.broadcast %tmp0 : tensor<64x1xi64, #blocked> -> tensor<64x64xi64, #blocked> loc(#loc59) + %tmp0_20 = tt.splat %ks1 : i64 -> tensor<64x1xi64, #blocked> loc(#loc60) + %tmp0_21 = arith.muli %tmp0_20, %x1 : tensor<64x1xi64, #blocked> loc(#loc60) + %tmp0_22 = tt.broadcast %tmp0_21 : tensor<64x1xi64, #blocked> -> tensor<64x64xi64, #blocked> loc(#loc61) + %tmp0_23 = tt.splat %in_ptr0 : !tt.ptr -> tensor<64x64x!tt.ptr, #blocked> loc(#loc62) + %tmp0_24 = tt.broadcast %xmask_14 : tensor<64x1xi1, #blocked> -> tensor<64x64xi1, #blocked> loc(#loc63) + %_tmp2_index:2 = scf.for %r0_offset = %c0_i32 to %c32000_i32 step %c64_i32 iter_args(%_tmp2 = %cst_4, %_tmp2_index_25 = %cst_3) -> (tensor<64x64xf32, #blocked>, tensor<64x64xi32, #blocked>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x64xi32, #blocked> loc(#loc65) + %r0_index_26 = arith.addi %r0_index, %r0_base_16 : tensor<1x64xi32, #blocked> loc(#loc65) + %r0_mask = arith.cmpi slt, %r0_index_26, %cst_2 : tensor<1x64xi32, #blocked> loc(#loc66) + %tmp0_27 = arith.extsi %r0_index_26 : tensor<1x64xi32, #blocked> to tensor<1x64xi64, #blocked> 
loc(#loc59) + %tmp0_28 = tt.broadcast %tmp0_27 : tensor<1x64xi64, #blocked> -> tensor<64x64xi64, #blocked> loc(#loc59) + %tmp0_29 = arith.addi %tmp0_28, %tmp0_19 : tensor<64x64xi64, #blocked> loc(#loc59) + %tmp0_30 = arith.addi %tmp0_29, %tmp0_22 : tensor<64x64xi64, #blocked> loc(#loc61) + %tmp0_31 = tt.addptr %tmp0_23, %tmp0_30 : tensor<64x64x!tt.ptr, #blocked>, tensor<64x64xi64, #blocked> loc(#loc62) + %tmp0_32 = tt.broadcast %r0_mask : tensor<1x64xi1, #blocked> -> tensor<64x64xi1, #blocked> loc(#loc63) + %tmp0_33 = arith.andi %tmp0_32, %tmp0_24 : tensor<64x64xi1, #blocked> loc(#loc63) + %tmp0_34 = tt.load %tmp0_31, %tmp0_33, %cst_0 evictionPolicy = evict_first : tensor<64x64x!tt.ptr, #blocked> loc(#loc67) + %mask = arith.cmpf ogt, %_tmp2, %tmp0_34 : tensor<64x64xf32, #blocked> loc(#loc110) + %equal = arith.cmpf oeq, %_tmp2, %tmp0_34 : tensor<64x64xf32, #blocked> loc(#loc111) + %a_isnan = arith.cmpf une, %_tmp2, %_tmp2 : tensor<64x64xf32, #blocked> loc(#loc90) + %b_isnan = arith.cmpf une, %tmp0_34, %tmp0_34 : tensor<64x64xf32, #blocked> loc(#loc91) + %mask_35 = arith.xori %b_isnan, %cst_1 : tensor<64x64xi1, #blocked> loc(#loc92) + %mask_36 = arith.andi %a_isnan, %mask_35 : tensor<64x64xi1, #blocked> loc(#loc93) + %mask_37 = arith.ori %mask, %mask_36 : tensor<64x64xi1, #blocked> loc(#loc112) + %equal_38 = arith.andi %a_isnan, %b_isnan : tensor<64x64xi1, #blocked> loc(#loc95) + %equal_39 = arith.ori %equal, %equal_38 : tensor<64x64xi1, #blocked> loc(#loc113) + %mask_40 = tt.broadcast %r0_index_26 : tensor<1x64xi32, #blocked> -> tensor<64x64xi32, #blocked> loc(#loc97) + %mask_41 = arith.cmpi slt, %_tmp2_index_25, %mask_40 : tensor<64x64xi32, #blocked> loc(#loc97) + %mask_42 = arith.andi %equal_39, %mask_41 : tensor<64x64xi1, #blocked> loc(#loc98) + %mask_43 = arith.ori %mask_37, %mask_42 : tensor<64x64xi1, #blocked> loc(#loc99) + %5 = arith.select %mask_43, %_tmp2, %tmp0_34 : tensor<64x64xi1, #blocked>, tensor<64x64xf32, #blocked> loc(#loc80) + %6 = arith.select 
%mask_43, %_tmp2_index_25, %mask_40 : tensor<64x64xi1, #blocked>, tensor<64x64xi32, #blocked> loc(#loc81) + %_tmp2_44 = arith.select %tmp0_33, %5, %_tmp2 : tensor<64x64xi1, #blocked>, tensor<64x64xf32, #blocked> loc(#loc82) + %_tmp2_index_45 = arith.select %tmp0_33, %6, %_tmp2_index_25 : tensor<64x64xi1, #blocked>, tensor<64x64xi32, #blocked> loc(#loc83) + scf.yield %_tmp2_44, %_tmp2_index_45 : tensor<64x64xf32, #blocked>, tensor<64x64xi32, #blocked> loc(#loc37) + } loc(#loc87) + %0:2 = "tt.reduce"(%_tmp2_index#0, %_tmp2_index#1) <{axis = 1 : i32}> ({ + ^bb0(%arg6: f32 loc(callsite(#loc1 at #loc39)), %arg7: i32 loc(callsite(#loc1 at #loc39)), %arg8: f32 loc(callsite(#loc1 at #loc39)), %arg9: i32 loc(callsite(#loc1 at #loc39))): + %mask = arith.cmpf ogt, %arg6, %arg8 : f32 loc(#loc114) + %equal = arith.cmpf oeq, %arg6, %arg8 : f32 loc(#loc115) + %a_isnan = arith.cmpf une, %arg6, %arg6 : f32 loc(#loc100) + %b_isnan = arith.cmpf une, %arg8, %arg8 : f32 loc(#loc101) + %mask_25 = arith.xori %b_isnan, %true : i1 loc(#loc102) + %mask_26 = arith.andi %a_isnan, %mask_25 : i1 loc(#loc103) + %mask_27 = arith.ori %mask, %mask_26 : i1 loc(#loc116) + %equal_28 = arith.andi %a_isnan, %b_isnan : i1 loc(#loc104) + %equal_29 = arith.ori %equal, %equal_28 : i1 loc(#loc117) + %mask_30 = arith.cmpi slt, %arg7, %arg9 : i32 loc(#loc105) + %mask_31 = arith.andi %equal_29, %mask_30 : i1 loc(#loc106) + %mask_32 = arith.ori %mask_27, %mask_31 : i1 loc(#loc107) + %5 = arith.select %mask_32, %arg6, %arg8 : f32 loc(#loc108) + %6 = arith.select %mask_32, %arg7, %arg9 : i32 loc(#loc109) + tt.reduce.return %5, %6 : f32, i32 loc(#loc84) + }) : (tensor<64x64xf32, #blocked>, tensor<64x64xi32, #blocked>) -> (tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked}>>, tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>>) loc(#loc84) + %tmp2 = tt.expand_dims %0#1 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc86) + %1 = tt.splat 
%out_ptr0 : !tt.ptr -> tensor<64x1x!tt.ptr, #blocked1> loc(#loc41) + %2 = tt.addptr %1, %xindex_12 : tensor<64x1x!tt.ptr, #blocked1>, tensor<64x1xi32, #blocked1> loc(#loc41) + %3 = ttg.convert_layout %tmp2 : tensor<64x1xi32, #blocked> -> tensor<64x1xi32, #blocked1> loc(#loc42) + %4 = arith.extsi %3 : tensor<64x1xi32, #blocked1> to tensor<64x1xi64, #blocked1> loc(#loc42) + tt.store %2, %4, %xmask_15 : tensor<64x1x!tt.ptr, #blocked1> loc(#loc42) + tt.return loc(#loc43) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":22:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":22:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":23:44) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":23:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":24:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":25:37) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":27:19) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":28:19) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:47) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:41) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:56) 
+#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:52) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:34) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:71) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":32:40) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":33:31) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":34:29) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:61) +#loc20 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":144:21) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":41:38) +#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":145:23) +#loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":147:29) +#loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":148:29) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:31) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:27) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:16) +#loc28 = 
loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:27) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:17) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:31) +#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:21) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:12) +#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:35) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:69) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":43:54) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":44:66) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":44:8) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:42) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":46:20) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":47:25) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":47:36) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":47:4) +#loc50 = loc("xoffset"(#loc2)) +#loc51 = loc("xoffset"(#loc3)) +#loc52 = loc("xindex"(#loc4)) +#loc53 = 
loc("xindex"(#loc5)) +#loc54 = loc("xmask"(#loc6)) +#loc55 = loc("r0_base"(#loc7)) +#loc56 = loc("x0"(#loc8)) +#loc57 = loc("x1"(#loc9)) +#loc58 = loc("tmp0"(#loc10)) +#loc59 = loc("tmp0"(#loc11)) +#loc60 = loc("tmp0"(#loc12)) +#loc61 = loc("tmp0"(#loc13)) +#loc62 = loc("tmp0"(#loc14)) +#loc63 = loc("tmp0"(#loc15)) +#loc64 = loc("_tmp2"(#loc16)) +#loc65 = loc("r0_index"(#loc17)) +#loc66 = loc("r0_mask"(#loc18)) +#loc67 = loc("tmp0"(#loc19)) +#loc68 = loc("mask"(#loc20)) +#loc69 = loc("equal"(#loc22)) +#loc70 = loc("a_isnan"(#loc23)) +#loc71 = loc("b_isnan"(#loc24)) +#loc72 = loc("mask"(#loc25)) +#loc73 = loc("mask"(#loc26)) +#loc74 = loc("mask"(#loc27)) +#loc75 = loc("equal"(#loc28)) +#loc76 = loc("equal"(#loc29)) +#loc77 = loc("mask"(#loc30)) +#loc78 = loc("mask"(#loc31)) +#loc79 = loc("mask"(#loc32)) +#loc80 = loc(callsite(#loc33 at #loc21)) +#loc81 = loc(callsite(#loc34 at #loc21)) +#loc82 = loc("_tmp2"(#loc35)) +#loc83 = loc("_tmp2_index"(#loc36)) +#loc84 = loc(callsite(#loc38 at #loc39)) +#loc86 = loc("tmp2"(#loc40)) +#loc87 = loc("_tmp2_index"(#loc64)) +#loc88 = loc("mask"(#loc68)) +#loc89 = loc("equal"(#loc69)) +#loc90 = loc(callsite(#loc70 at #loc21)) +#loc91 = loc(callsite(#loc71 at #loc21)) +#loc92 = loc(callsite(#loc72 at #loc21)) +#loc93 = loc(callsite(#loc73 at #loc21)) +#loc94 = loc("mask"(#loc74)) +#loc95 = loc(callsite(#loc75 at #loc21)) +#loc96 = loc("equal"(#loc76)) +#loc97 = loc(callsite(#loc77 at #loc21)) +#loc98 = loc(callsite(#loc78 at #loc21)) +#loc99 = loc(callsite(#loc79 at #loc21)) +#loc100 = loc(callsite(#loc70 at #loc84)) +#loc101 = loc(callsite(#loc71 at #loc84)) +#loc102 = loc(callsite(#loc72 at #loc84)) +#loc103 = loc(callsite(#loc73 at #loc84)) +#loc104 = loc(callsite(#loc75 at #loc84)) +#loc105 = loc(callsite(#loc77 at #loc84)) +#loc106 = loc(callsite(#loc78 at #loc84)) +#loc107 = loc(callsite(#loc79 at #loc84)) +#loc108 = loc(callsite(#loc33 at #loc84)) +#loc109 = loc(callsite(#loc34 at #loc84)) +#loc110 = loc(callsite(#loc88 at 
#loc21)) +#loc111 = loc(callsite(#loc89 at #loc21)) +#loc112 = loc(callsite(#loc94 at #loc21)) +#loc113 = loc(callsite(#loc96 at #loc21)) +#loc114 = loc(callsite(#loc88 at #loc84)) +#loc115 = loc(callsite(#loc89 at #loc84)) +#loc116 = loc(callsite(#loc94 at #loc84)) +#loc117 = loc(callsite(#loc96 at #loc84)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/BZGMO2ROUNNVUMEUFCQSSF6FGNZS2MEIPAEECDHYOJVVX3KB75AA/triton_red_fused_argmax_1.ttir b/SpecForge-ext/cache/compiled_kernels/triton/7/BZGMO2ROUNNVUMEUFCQSSF6FGNZS2MEIPAEECDHYOJVVX3KB75AA/triton_red_fused_argmax_1.ttir new file mode 100644 index 0000000000000000000000000000000000000000..2dac5e9d3001cb78b652fbaf9b814377bc652d6d --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/BZGMO2ROUNNVUMEUFCQSSF6FGNZS2MEIPAEECDHYOJVVX3KB75AA/triton_red_fused_argmax_1.ttir @@ -0,0 +1,213 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":18:0) +#loc1 = loc(unknown) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":45:75) +#loc47 = loc("in_ptr0"(#loc)) +#loc48 = loc("out_ptr0"(#loc)) +#loc49 = loc("ks0"(#loc)) +#loc50 = loc("ks1"(#loc)) +#loc51 = loc("xnumel"(#loc)) +#loc52 = loc("r0_numel"(#loc)) +#loc53 = loc(callsite(#loc1 at #loc2)) +module { + tt.func public @triton_red_fused_argmax_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %true = arith.constant true loc(#loc53) + %cst = arith.constant dense : tensor<64x64xi1> loc(#loc1) + %c32000_i32 = arith.constant 32000 : i32 loc(#loc3) + %c0_i32 = arith.constant 0 : i32 loc(#loc3) + %cst_0 = arith.constant 
dense<0.000000e+00> : tensor<64x64xf32> loc(#loc1) + %cst_1 = arith.constant dense<32000> : tensor<64x1xi64> loc(#loc1) + %cst_2 = arith.constant dense<32000> : tensor<1x64xi32> loc(#loc1) + %_tmp2_index = arith.constant dense<2147483647> : tensor<64x64xi32> loc(#loc54) + %_tmp2 = arith.constant dense<0xFF800000> : tensor<64x64xf32> loc(#loc55) + %c64_i32 = arith.constant 64 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc56) + %xoffset_3 = arith.muli %xoffset, %c64_i32 : i32 loc(#loc57) + %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc58) + %xindex_4 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc59) + %xindex_5 = tt.splat %xoffset_3 : i32 -> tensor<64x1xi32> loc(#loc60) + %xindex_6 = arith.addi %xindex_5, %xindex_4 : tensor<64x1xi32> loc(#loc60) + %xmask = tt.splat %xnumel : i32 -> tensor<64x1xi32> loc(#loc61) + %xmask_7 = arith.cmpi slt, %xindex_6, %xmask : tensor<64x1xi32> loc(#loc61) + %r0_base = tt.expand_dims %xindex {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc62) + %x0 = arith.extsi %xindex_6 : tensor<64x1xi32> to tensor<64x1xi64> loc(#loc63) + %x0_8 = tt.splat %ks0 : i64 -> tensor<64x1xi64> loc(#loc63) + %x0_9 = arith.remsi %x0, %x0_8 : tensor<64x1xi64> loc(#loc63) + %x1 = arith.divsi %x0, %x0_8 : tensor<64x1xi64> loc(#loc64) + %_tmp2_index_10:2 = scf.for %r0_offset = %c0_i32 to %c32000_i32 step %c64_i32 iter_args(%_tmp2_11 = %_tmp2, %_tmp2_index_12 = %_tmp2_index) -> (tensor<64x64xf32>, tensor<64x64xi32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x64xi32> loc(#loc66) + %r0_index_13 = arith.addi %r0_index, %r0_base : tensor<1x64xi32> loc(#loc66) + %r0_mask = arith.cmpi slt, %r0_index_13, %cst_2 : tensor<1x64xi32> loc(#loc67) + %tmp0 = arith.muli %x0_9, %cst_1 : tensor<64x1xi64> loc(#loc68) + %tmp0_14 = arith.extsi %r0_index_13 : tensor<1x64xi32> to tensor<1x64xi64> loc(#loc69) + %tmp0_15 = tt.broadcast %tmp0_14 : tensor<1x64xi64> 
-> tensor<64x64xi64> loc(#loc69) + %tmp0_16 = tt.broadcast %tmp0 : tensor<64x1xi64> -> tensor<64x64xi64> loc(#loc69) + %tmp0_17 = arith.addi %tmp0_15, %tmp0_16 : tensor<64x64xi64> loc(#loc69) + %tmp0_18 = tt.splat %ks1 : i64 -> tensor<64x1xi64> loc(#loc70) + %tmp0_19 = arith.muli %tmp0_18, %x1 : tensor<64x1xi64> loc(#loc70) + %tmp0_20 = tt.broadcast %tmp0_19 : tensor<64x1xi64> -> tensor<64x64xi64> loc(#loc71) + %tmp0_21 = arith.addi %tmp0_17, %tmp0_20 : tensor<64x64xi64> loc(#loc71) + %tmp0_22 = tt.splat %in_ptr0 : !tt.ptr -> tensor<64x64x!tt.ptr> loc(#loc72) + %tmp0_23 = tt.addptr %tmp0_22, %tmp0_21 : tensor<64x64x!tt.ptr>, tensor<64x64xi64> loc(#loc72) + %tmp0_24 = tt.broadcast %r0_mask : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc73) + %tmp0_25 = tt.broadcast %xmask_7 : tensor<64x1xi1> -> tensor<64x64xi1> loc(#loc73) + %tmp0_26 = arith.andi %tmp0_24, %tmp0_25 : tensor<64x64xi1> loc(#loc73) + %tmp0_27 = tt.load %tmp0_23, %tmp0_26, %cst_0 evictionPolicy = evict_first : tensor<64x64x!tt.ptr> loc(#loc74) + %mask = arith.cmpf ogt, %_tmp2_11, %tmp0_27 : tensor<64x64xf32> loc(#loc116) + %equal = arith.cmpf oeq, %_tmp2_11, %tmp0_27 : tensor<64x64xf32> loc(#loc117) + %a_isnan = arith.cmpf une, %_tmp2_11, %_tmp2_11 : tensor<64x64xf32> loc(#loc96) + %b_isnan = arith.cmpf une, %tmp0_27, %tmp0_27 : tensor<64x64xf32> loc(#loc97) + %mask_28 = arith.xori %b_isnan, %cst : tensor<64x64xi1> loc(#loc98) + %mask_29 = arith.andi %a_isnan, %mask_28 : tensor<64x64xi1> loc(#loc99) + %mask_30 = arith.ori %mask, %mask_29 : tensor<64x64xi1> loc(#loc118) + %equal_31 = arith.andi %a_isnan, %b_isnan : tensor<64x64xi1> loc(#loc101) + %equal_32 = arith.ori %equal, %equal_31 : tensor<64x64xi1> loc(#loc119) + %mask_33 = tt.broadcast %r0_index_13 : tensor<1x64xi32> -> tensor<64x64xi32> loc(#loc103) + %mask_34 = arith.cmpi slt, %_tmp2_index_12, %mask_33 : tensor<64x64xi32> loc(#loc103) + %mask_35 = arith.andi %equal_32, %mask_34 : tensor<64x64xi1> loc(#loc104) + %mask_36 = arith.ori %mask_30, 
%mask_35 : tensor<64x64xi1> loc(#loc105) + %4 = arith.select %mask_36, %_tmp2_11, %tmp0_27 : tensor<64x64xi1>, tensor<64x64xf32> loc(#loc87) + %5 = arith.select %mask_36, %_tmp2_index_12, %mask_33 : tensor<64x64xi1>, tensor<64x64xi32> loc(#loc88) + %_tmp2_37 = arith.select %tmp0_26, %4, %_tmp2_11 : tensor<64x64xi1>, tensor<64x64xf32> loc(#loc89) + %_tmp2_index_38 = arith.select %tmp0_26, %5, %_tmp2_index_12 : tensor<64x64xi1>, tensor<64x64xi32> loc(#loc90) + scf.yield %_tmp2_37, %_tmp2_index_38 : tensor<64x64xf32>, tensor<64x64xi32> loc(#loc41) + } loc(#loc93) + %0:2 = "tt.reduce"(%_tmp2_index_10#0, %_tmp2_index_10#1) <{axis = 1 : i32}> ({ + ^bb0(%arg6: f32 loc(callsite(#loc1 at #loc2)), %arg7: i32 loc(callsite(#loc1 at #loc2)), %arg8: f32 loc(callsite(#loc1 at #loc2)), %arg9: i32 loc(callsite(#loc1 at #loc2))): + %mask = arith.cmpf ogt, %arg6, %arg8 : f32 loc(#loc120) + %equal = arith.cmpf oeq, %arg6, %arg8 : f32 loc(#loc121) + %a_isnan = arith.cmpf une, %arg6, %arg6 : f32 loc(#loc106) + %b_isnan = arith.cmpf une, %arg8, %arg8 : f32 loc(#loc107) + %mask_11 = arith.xori %b_isnan, %true : i1 loc(#loc108) + %mask_12 = arith.andi %a_isnan, %mask_11 : i1 loc(#loc109) + %mask_13 = arith.ori %mask, %mask_12 : i1 loc(#loc122) + %equal_14 = arith.andi %a_isnan, %b_isnan : i1 loc(#loc110) + %equal_15 = arith.ori %equal, %equal_14 : i1 loc(#loc123) + %mask_16 = arith.cmpi slt, %arg7, %arg9 : i32 loc(#loc111) + %mask_17 = arith.andi %equal_15, %mask_16 : i1 loc(#loc112) + %mask_18 = arith.ori %mask_13, %mask_17 : i1 loc(#loc113) + %4 = arith.select %mask_18, %arg6, %arg8 : f32 loc(#loc114) + %5 = arith.select %mask_18, %arg7, %arg9 : i32 loc(#loc115) + tt.reduce.return %4, %5 : f32, i32 loc(#loc91) + }) : (tensor<64x64xf32>, tensor<64x64xi32>) -> (tensor<64xf32>, tensor<64xi32>) loc(#loc91) + %tmp2 = tt.expand_dims %0#1 {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc92) + %1 = tt.splat %out_ptr0 : !tt.ptr -> tensor<64x1x!tt.ptr> loc(#loc44) + %2 = tt.addptr %1, 
%xindex_6 : tensor<64x1x!tt.ptr>, tensor<64x1xi32> loc(#loc44) + %3 = arith.extsi %tmp2 : tensor<64x1xi32> to tensor<64x1xi64> loc(#loc45) + tt.store %2, %3, %xmask_7 : tensor<64x1x!tt.ptr> loc(#loc45) + tt.return loc(#loc46) + } loc(#loc) +} loc(#loc) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":32:40) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":30:58) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":29:55) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":22:28) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":22:33) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":23:36) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":23:44) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":23:23) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":24:21) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":25:37) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":27:19) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":28:19) +#loc15 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":33:31) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":34:29) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:47) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:41) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:56) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:52) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:34) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:71) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:61) +#loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":144:21) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":41:38) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":145:23) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":147:29) +#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":148:29) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:31) +#loc30 = 
loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:27) +#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:16) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:27) +#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:17) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:31) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:21) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:12) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:35) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:69) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":43:54) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":44:66) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":44:8) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:42) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":46:20) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":47:25) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":47:36) +#loc46 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":47:4) +#loc54 = loc("_tmp2_index"(#loc4)) +#loc55 = loc("_tmp2"(#loc5)) +#loc56 = loc("xoffset"(#loc6)) +#loc57 = loc("xoffset"(#loc7)) +#loc58 = loc("xindex"(#loc8)) +#loc59 = loc("xindex"(#loc9)) +#loc60 = loc("xindex"(#loc10)) +#loc61 = loc("xmask"(#loc11)) +#loc62 = loc("r0_base"(#loc12)) +#loc63 = loc("x0"(#loc13)) +#loc64 = loc("x1"(#loc14)) +#loc65 = loc("_tmp2"(#loc3)) +#loc66 = loc("r0_index"(#loc15)) +#loc67 = loc("r0_mask"(#loc16)) +#loc68 = loc("tmp0"(#loc17)) +#loc69 = loc("tmp0"(#loc18)) +#loc70 = loc("tmp0"(#loc19)) +#loc71 = loc("tmp0"(#loc20)) +#loc72 = loc("tmp0"(#loc21)) +#loc73 = loc("tmp0"(#loc22)) +#loc74 = loc("tmp0"(#loc23)) +#loc75 = loc("mask"(#loc24)) +#loc76 = loc("equal"(#loc26)) +#loc77 = loc("a_isnan"(#loc27)) +#loc78 = loc("b_isnan"(#loc28)) +#loc79 = loc("mask"(#loc29)) +#loc80 = loc("mask"(#loc30)) +#loc81 = loc("mask"(#loc31)) +#loc82 = loc("equal"(#loc32)) +#loc83 = loc("equal"(#loc33)) +#loc84 = loc("mask"(#loc34)) +#loc85 = loc("mask"(#loc35)) +#loc86 = loc("mask"(#loc36)) +#loc87 = loc(callsite(#loc37 at #loc25)) +#loc88 = loc(callsite(#loc38 at #loc25)) +#loc89 = loc("_tmp2"(#loc39)) +#loc90 = loc("_tmp2_index"(#loc40)) +#loc91 = loc(callsite(#loc42 at #loc2)) +#loc92 = loc("tmp2"(#loc43)) +#loc93 = loc("_tmp2_index"(#loc65)) +#loc94 = loc("mask"(#loc75)) +#loc95 = loc("equal"(#loc76)) +#loc96 = loc(callsite(#loc77 at #loc25)) +#loc97 = loc(callsite(#loc78 at #loc25)) +#loc98 = loc(callsite(#loc79 at #loc25)) +#loc99 = loc(callsite(#loc80 at #loc25)) +#loc100 = loc("mask"(#loc81)) +#loc101 = loc(callsite(#loc82 at #loc25)) +#loc102 = loc("equal"(#loc83)) +#loc103 = loc(callsite(#loc84 at #loc25)) +#loc104 = loc(callsite(#loc85 at #loc25)) +#loc105 = loc(callsite(#loc86 at #loc25)) +#loc106 = loc(callsite(#loc77 at #loc91)) +#loc107 = loc(callsite(#loc78 at #loc91)) +#loc108 = loc(callsite(#loc79 at #loc91)) 
+#loc109 = loc(callsite(#loc80 at #loc91)) +#loc110 = loc(callsite(#loc82 at #loc91)) +#loc111 = loc(callsite(#loc84 at #loc91)) +#loc112 = loc(callsite(#loc85 at #loc91)) +#loc113 = loc(callsite(#loc86 at #loc91)) +#loc114 = loc(callsite(#loc37 at #loc91)) +#loc115 = loc(callsite(#loc38 at #loc91)) +#loc116 = loc(callsite(#loc94 at #loc25)) +#loc117 = loc(callsite(#loc95 at #loc25)) +#loc118 = loc(callsite(#loc100 at #loc25)) +#loc119 = loc(callsite(#loc102 at #loc25)) +#loc120 = loc(callsite(#loc94 at #loc91)) +#loc121 = loc(callsite(#loc95 at #loc91)) +#loc122 = loc(callsite(#loc100 at #loc91)) +#loc123 = loc(callsite(#loc102 at #loc91)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/__grp__triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.json b/SpecForge-ext/cache/compiled_kernels/triton/7/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/__grp__triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.json new file mode 100644 index 0000000000000000000000000000000000000000..2bbba8fd12d0f47d217b532c2a9bf1381c3819b7 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/__grp__triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.source", 
"triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ttir", "triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ttgir", "triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.llir", "triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ptx", "triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.cubin", 
"triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.cubin b/SpecForge-ext/cache/compiled_kernels/triton/7/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.cubin new file mode 100644 index 0000000000000000000000000000000000000000..dcbb8a33ec7e90ecf9307f7897532c82e07bd4a4 Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/7/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.llir b/SpecForge-ext/cache/compiled_kernels/triton/7/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.llir new file mode 100644 index 0000000000000000000000000000000000000000..ac8213d6264040f8488f8318d3436c5a53088f98 --- /dev/null +++ 
b/SpecForge-ext/cache/compiled_kernels/triton/7/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.llir @@ -0,0 +1,318 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@global_smem = external addrspace(3) global [0 x i8], align 16 + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i64 %3, i64 %4, i64 %5, i64 %6, i64 %7, i64 %8, i32 %9, i32 %10, ptr addrspace(1) readnone captures(none) %11, ptr addrspace(1) readnone captures(none) %12) local_unnamed_addr #0 !dbg !4 { + %14 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %15 = icmp slt i32 %14, %9, !dbg !8 + %16 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %17 = and i32 %16, 384, !dbg !9 + %18 = zext nneg i32 %14 to i64, !dbg !10 + %.frozen = freeze i64 %3, !dbg !10 + %19 = sdiv i64 %18, %.frozen, !dbg !10 + %20 = srem i64 %19, %4, !dbg !11 + %21 = mul i64 %19, %.frozen, !dbg !12 + %.decomposed = sub i64 %18, %21, !dbg !12 + %22 = sdiv i64 %18, %7, !dbg !13 + %23 = shl nsw i64 %20, 7, !dbg !14 + %24 = shl nuw nsw i64 %.decomposed, 7, !dbg !15 + %25 = getelementptr i64, ptr addrspace(1) %0, i64 %22, !dbg !16 + %26 = and i32 %16, 127 + %27 = zext nneg i32 %26 to i64 + %28 = or disjoint i64 %24, %27 + %29 = icmp slt i64 %28, %6 + %30 = icmp sge i64 %28, %8 + %31 = tail call i64 @llvm.smin.i64(i64 %8, i64 0) + %32 = sub nsw i64 %.decomposed, %20 + %33 = shl nsw i64 %32, 7 + %34 = zext nneg i32 %17 to i64, !dbg !17 + %35 = zext nneg i32 %26 to i64, !dbg !17 + %36 = zext nneg i32 %16 to i64, !dbg !17 + %37 = insertelement <2 x i1> poison, i1 %15, 
i64 0, !dbg !18 + %38 = shufflevector <2 x i1> %37, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !18 + %39 = insertelement <2 x i1> poison, i1 %29, i64 0, !dbg !19 + %40 = shufflevector <2 x i1> %39, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !19 + %41 = insertelement <2 x i64> poison, i64 %23, i64 0, !dbg !20 + %42 = shufflevector <2 x i64> %41, <2 x i64> poison, <2 x i32> zeroinitializer, !dbg !20 + %43 = insertelement <2 x i64> poison, i64 %5, i64 0, !dbg !21 + %44 = shufflevector <2 x i64> %43, <2 x i64> poison, <2 x i32> zeroinitializer, !dbg !21 + %45 = insertelement <2 x i64> poison, i64 %28, i64 0, !dbg !22 + %46 = shufflevector <2 x i64> %45, <2 x i64> poison, <2 x i32> zeroinitializer, !dbg !22 + %47 = insertelement <2 x i1> poison, i1 %30, i64 0, !dbg !23 + %48 = shufflevector <2 x i1> %47, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !23 + %49 = insertelement <2 x i64> poison, i64 %33, i64 0, !dbg !24 + %50 = shufflevector <2 x i64> %49, <2 x i64> poison, <2 x i32> zeroinitializer, !dbg !24 + %51 = insertelement <2 x i64> poison, i64 %8, i64 0, !dbg !25 + %52 = shufflevector <2 x i64> %51, <2 x i64> poison, <2 x i32> zeroinitializer, !dbg !25 + br label %53, !dbg !17 + +53: ; preds = %13, %53 + %indvars.iv = phi i64 [ 0, %13 ], [ %indvars.iv.next, %53 ] + %54 = phi <2 x i64> [ zeroinitializer, %13 ], [ %113, %53 ] + %55 = or disjoint i64 %indvars.iv, %34, !dbg !26 + %56 = or disjoint i64 %indvars.iv, %36, !dbg !26 + %57 = lshr exact i64 %55, 7, !dbg !27 + %58 = lshr i64 %56, 7, !dbg !27 + %59 = trunc nuw nsw i64 %58 to i32, !dbg !27 + %60 = or i32 %59, 4, !dbg !27 + %61 = zext nneg i32 %60 to i64, !dbg !20 + %62 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !28 + %63 = sub nsw i64 %35, %57, !dbg !29 + %64 = sub nsw i32 %26, %60, !dbg !29 + %65 = sext i32 %64 to i64, !dbg !30 + %66 = insertelement <2 x i64> poison, i64 %57, i64 0, !dbg !20 + %67 = insertelement <2 
x i64> %66, i64 %61, i64 1, !dbg !20 + %68 = or disjoint <2 x i64> %42, %67, !dbg !20 + %69 = icmp slt <2 x i64> %68, %44, !dbg !21 + %70 = and <2 x i1> %40, %69, !dbg !19 + %71 = icmp sge <2 x i64> %68, %46, !dbg !22 + %72 = extractelement <2 x i1> %70, i64 0, !dbg !31 + %73 = and i1 %15, %72, !dbg !31 + %74 = extractelement <2 x i1> %70, i64 1, !dbg !31 + %75 = and i1 %15, %74, !dbg !31 + %76 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %25, i64 %62, i1 %73) #5, !dbg !28 + %77 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !28 + %78 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %25, i64 %77, i1 %75) #5, !dbg !28 + %79 = insertelement <2 x i64> poison, i64 %76, i64 0, !dbg !32 + %80 = insertelement <2 x i64> %79, i64 %78, i64 1, !dbg !32 + %81 = icmp slt <2 x i64> %46, %80, !dbg !32 + %82 = icmp slt <2 x i64> %68, %80, !dbg !33 + %83 = and <2 x i1> %81, %82, !dbg !34 + %84 = and <2 x i1> %71, %83, !dbg !35 + %85 = srem i64 %28, %8, !dbg !36 + %.not = icmp eq i64 %85, 0, !dbg !37 + %86 = select i1 %.not, i64 0, i64 %31, !dbg !38 + %87 = add nsw i64 %86, %85, !dbg !38 + %88 = insertelement <2 x i64> poison, i64 %87, i64 0, !dbg !39 + %89 = shufflevector <2 x i64> %88, <2 x i64> poison, <2 x i32> zeroinitializer, !dbg !39 + %90 = icmp slt <2 x i64> %89, %80, !dbg !39 + %91 = insertelement <2 x i64> poison, i64 %63, i64 0, !dbg !24 + %92 = insertelement <2 x i64> %91, i64 %65, i64 1, !dbg !24 + %93 = add nsw <2 x i64> %50, %92, !dbg !24 + %94 = srem <2 x i64> %93, %52, !dbg !25 + %95 = icmp ne <2 x i64> %94, zeroinitializer, !dbg !40 + %96 = extractelement <2 x i64> %94, i64 0, !dbg !41 + %97 = xor i64 %96, %8, !dbg !41 + %98 = extractelement <2 x i64> %94, i64 1, !dbg !41 + 
%99 = xor i64 %98, %8, !dbg !41 + %100 = insertelement <2 x i64> poison, i64 %97, i64 0, !dbg !41 + %101 = insertelement <2 x i64> %100, i64 %99, i64 1, !dbg !41 + %102 = icmp slt <2 x i64> %101, zeroinitializer, !dbg !41 + %103 = and <2 x i1> %95, %102, !dbg !42 + %104 = select <2 x i1> %103, <2 x i64> %52, <2 x i64> zeroinitializer, !dbg !43 + %105 = sub <2 x i64> zeroinitializer, %104, !dbg !44 + %106 = icmp eq <2 x i64> %94, %105, !dbg !44 + %107 = and <2 x i1> %90, %106, !dbg !23 + %108 = and <2 x i1> %48, %107, !dbg !23 + %109 = or <2 x i1> %84, %108, !dbg !45 + %110 = select <2 x i1> %38, <2 x i1> %70, <2 x i1> zeroinitializer, !dbg !18 + %111 = select <2 x i1> %110, <2 x i1> %109, <2 x i1> zeroinitializer, !dbg !18 + %112 = zext <2 x i1> %111 to <2 x i64>, !dbg !18 + %113 = add <2 x i64> %54, %112, !dbg !18 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1024, !dbg !17 + %114 = icmp samesign ult i64 %indvars.iv, 15360, !dbg !17 + br i1 %114, label %53, label %115, !dbg !17 + +115: ; preds = %53 + %116 = and i32 %16, 31, !dbg !9 + %117 = lshr i32 %16, 5, !dbg !9 + %shift = shufflevector <2 x i64> %113, <2 x i64> poison, <2 x i32> , !dbg !46 + %foldExtExtBinop = add <2 x i64> %113, %shift, !dbg !46 + %118 = extractelement <2 x i64> %foldExtExtBinop, i64 0, !dbg !46 + %119 = bitcast <2 x i64> %foldExtExtBinop to <4 x i32>, !dbg !50 + %120 = extractelement <4 x i32> %119, i64 1, !dbg !50 + %121 = trunc i64 %118 to i32, !dbg !50 + %122 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %121, i32 16, i32 31), !dbg !50 + %123 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %120, i32 16, i32 31), !dbg !50 + %124 = insertelement <2 x i32> poison, i32 %122, i64 0, !dbg !50 + %125 = insertelement <2 x i32> %124, i32 %123, i64 1, !dbg !50 + %126 = bitcast <2 x i32> %125 to i64, !dbg !50 + %127 = add i64 %118, %126, !dbg !46 + %extelt.offset1 = lshr i64 %127, 32, !dbg !50 + %128 = trunc nuw i64 %extelt.offset1 to i32, !dbg !50 + %129 = trunc i64 %127 
to i32, !dbg !50 + %130 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %129, i32 8, i32 31), !dbg !50 + %131 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %128, i32 8, i32 31), !dbg !50 + %132 = insertelement <2 x i32> poison, i32 %130, i64 0, !dbg !50 + %133 = insertelement <2 x i32> %132, i32 %131, i64 1, !dbg !50 + %134 = bitcast <2 x i32> %133 to i64, !dbg !50 + %135 = add i64 %127, %134, !dbg !46 + %extelt.offset2 = lshr i64 %135, 32, !dbg !50 + %136 = trunc nuw i64 %extelt.offset2 to i32, !dbg !50 + %137 = trunc i64 %135 to i32, !dbg !50 + %138 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %137, i32 4, i32 31), !dbg !50 + %139 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %136, i32 4, i32 31), !dbg !50 + %140 = insertelement <2 x i32> poison, i32 %138, i64 0, !dbg !50 + %141 = insertelement <2 x i32> %140, i32 %139, i64 1, !dbg !50 + %142 = bitcast <2 x i32> %141 to i64, !dbg !50 + %143 = add i64 %135, %142, !dbg !46 + %extelt.offset3 = lshr i64 %143, 32, !dbg !50 + %144 = trunc nuw i64 %extelt.offset3 to i32, !dbg !50 + %145 = trunc i64 %143 to i32, !dbg !50 + %146 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %145, i32 2, i32 31), !dbg !50 + %147 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %144, i32 2, i32 31), !dbg !50 + %148 = insertelement <2 x i32> poison, i32 %146, i64 0, !dbg !50 + %149 = insertelement <2 x i32> %148, i32 %147, i64 1, !dbg !50 + %150 = bitcast <2 x i32> %149 to i64, !dbg !50 + %151 = add i64 %143, %150, !dbg !46 + %extelt.offset4 = lshr i64 %151, 32, !dbg !50 + %152 = trunc nuw i64 %extelt.offset4 to i32, !dbg !50 + %153 = trunc i64 %151 to i32, !dbg !50 + %154 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %153, i32 1, i32 31), !dbg !50 + %155 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %152, i32 1, i32 31), !dbg !50 + %156 = insertelement <2 x i32> poison, i32 %154, i64 0, !dbg !50 + %157 = insertelement <2 x i32> %156, i32 %155, i64 1, 
!dbg !50 + %158 = bitcast <2 x i32> %157 to i64, !dbg !50 + %159 = add i64 %151, %158, !dbg !46 + %160 = and i32 %117, 15, !dbg !50 + %161 = icmp eq i32 %116, 0, !dbg !50 + %162 = getelementptr i64, ptr addrspace(3) @global_smem, i32 %160, !dbg !50 + %163 = insertelement <1 x i64> poison, i64 %159, i64 0, !dbg !50 + tail call void asm sideeffect "@$2 st.shared.b64 [ $0 + 0 ], $1;", "r,l,b"(ptr addrspace(3) %162, <1 x i64> %163, i1 %161) #5, !dbg !50 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !50 + %164 = icmp samesign ult i32 %16, 16, !dbg !50 + %165 = getelementptr i64, ptr addrspace(3) @global_smem, i32 %16, !dbg !50 + %166 = tail call i64 asm sideeffect "@$2 ld.shared.b64 $0, [ $1 + 0 ];", "=l,r,b"(ptr addrspace(3) %165, i1 %164) #5, !dbg !50 + %extelt.offset5 = lshr i64 %166, 32, !dbg !50 + %167 = trunc nuw i64 %extelt.offset5 to i32, !dbg !50 + %168 = trunc i64 %166 to i32, !dbg !50 + %169 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %168, i32 8, i32 31), !dbg !50 + %170 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %167, i32 8, i32 31), !dbg !50 + %171 = insertelement <2 x i32> poison, i32 %169, i64 0, !dbg !50 + %172 = insertelement <2 x i32> %171, i32 %170, i64 1, !dbg !50 + %173 = bitcast <2 x i32> %172 to i64, !dbg !50 + %174 = add i64 %166, %173, !dbg !46 + %extelt.offset6 = lshr i64 %174, 32, !dbg !50 + %175 = trunc nuw i64 %extelt.offset6 to i32, !dbg !50 + %176 = trunc i64 %174 to i32, !dbg !50 + %177 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %176, i32 4, i32 31), !dbg !50 + %178 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %175, i32 4, i32 31), !dbg !50 + %179 = insertelement <2 x i32> poison, i32 %177, i64 0, !dbg !50 + %180 = insertelement <2 x i32> %179, i32 %178, i64 1, !dbg !50 + %181 = bitcast <2 x i32> %180 to i64, !dbg !50 + %182 = add i64 %174, %181, !dbg !46 + %extelt.offset7 = lshr i64 %182, 32, !dbg !50 + %183 = trunc nuw i64 %extelt.offset7 to i32, !dbg !50 
+ %184 = trunc i64 %182 to i32, !dbg !50 + %185 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %184, i32 2, i32 31), !dbg !50 + %186 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %183, i32 2, i32 31), !dbg !50 + %187 = insertelement <2 x i32> poison, i32 %185, i64 0, !dbg !50 + %188 = insertelement <2 x i32> %187, i32 %186, i64 1, !dbg !50 + %189 = bitcast <2 x i32> %188 to i64, !dbg !50 + %190 = add i64 %182, %189, !dbg !46 + %extelt.offset8 = lshr i64 %190, 32, !dbg !50 + %191 = trunc nuw i64 %extelt.offset8 to i32, !dbg !50 + %192 = trunc i64 %190 to i32, !dbg !50 + %193 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %192, i32 1, i32 31), !dbg !50 + %194 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %191, i32 1, i32 31), !dbg !50 + %195 = insertelement <2 x i32> poison, i32 %193, i64 0, !dbg !50 + %196 = insertelement <2 x i32> %195, i32 %194, i64 1, !dbg !50 + %197 = bitcast <2 x i32> %196 to i64, !dbg !50 + %198 = add i64 %190, %197, !dbg !46 + %199 = icmp eq i32 %16, 0, !dbg !50 + %200 = insertelement <1 x i64> poison, i64 %198, i64 0, !dbg !50 + tail call void asm sideeffect "@$2 st.shared.b64 [ $0 + 0 ], $1;", "r,l,b"(ptr addrspace(3) %165, <1 x i64> %200, i1 %199) #5, !dbg !50 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !50 + %201 = load i64, ptr addrspace(3) @global_smem, align 16, !dbg !50 + %202 = add i64 %201, -1, !dbg !51 + %203 = icmp ult i64 %202, 16383, !dbg !51 + %204 = zext i1 %203 to i32, !dbg !52 + %205 = icmp eq i64 %201, 16384, !dbg !53 + %206 = zext i1 %205 to i32, !dbg !52 + %207 = getelementptr i32, ptr addrspace(1) %1, i64 %18, !dbg !54 + %208 = and i32 %16, 511, !dbg !55 + %209 = icmp eq i32 %208, 0, !dbg !55 + %210 = and i1 %209, %15, !dbg !55 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %204, ptr addrspace(1) %207, i1 %210) #5, !dbg !55 + %211 = getelementptr i32, ptr addrspace(1) %2, i64 %18, !dbg !56 + tail call void asm 
sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %206, ptr addrspace(1) %211, i1 %210) #5, !dbg !57 + ret void, !dbg !58 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3 + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare i64 @llvm.smin.i64(i64, i64) #4 + +attributes #0 = { nounwind "nvvm.reqntid"="512" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { convergent nocallback nounwind } +attributes #4 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #5 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1", linkageName: 
"triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 22, column: 28, scope: !4) +!8 = !DILocation(line: 24, column: 21, scope: !4) +!9 = !DILocation(line: 25, column: 37, scope: !4) +!10 = !DILocation(line: 27, column: 21, scope: !4) +!11 = !DILocation(line: 27, column: 28, scope: !4) +!12 = !DILocation(line: 28, column: 19, scope: !4) +!13 = !DILocation(line: 29, column: 19, scope: !4) +!14 = !DILocation(line: 39, column: 26, scope: !4) +!15 = !DILocation(line: 42, column: 26, scope: !4) +!16 = !DILocation(line: 49, column: 35, scope: !4) +!17 = !DILocation(line: 32, column: 40, scope: !4) +!18 = !DILocation(line: 86, column: 50, scope: !4) +!19 = !DILocation(line: 45, column: 22, scope: !4) +!20 = !DILocation(line: 39, column: 22, scope: !4) +!21 = !DILocation(line: 41, column: 22, scope: !4) +!22 = !DILocation(line: 48, column: 23, scope: !4) +!23 = !DILocation(line: 79, column: 24, scope: !4) +!24 = !DILocation(line: 69, column: 51, scope: !4) +!25 = !DILocation(line: 70, column: 25, scope: !4) +!26 = !DILocation(line: 33, column: 31, scope: !4) +!27 = !DILocation(line: 37, column: 27, scope: !4) +!28 = !DILocation(line: 49, column: 77, scope: !4) +!29 = !DILocation(line: 69, column: 24, scope: !4) +!30 = !DILocation(line: 69, column: 38, scope: !4) +!31 = !DILocation(line: 49, column: 94, scope: !4) +!32 = !DILocation(line: 50, column: 23, scope: !4) +!33 = !DILocation(line: 51, column: 23, scope: !4) +!34 = !DILocation(line: 52, column: 24, scope: !4) +!35 = !DILocation(line: 53, column: 23, scope: !4) +!36 = !DILocation(line: 58, column: 24, scope: !4) +!37 = !DILocation(line: 60, column: 25, scope: !4) +!38 = !DILocation(line: 66, column: 39, scope: !4) +!39 = !DILocation(line: 67, 
column: 24, scope: !4) +!40 = !DILocation(line: 71, column: 25, scope: !4) +!41 = !DILocation(line: 73, column: 25, scope: !4) +!42 = !DILocation(line: 74, column: 24, scope: !4) +!43 = !DILocation(line: 76, column: 39, scope: !4) +!44 = !DILocation(line: 78, column: 25, scope: !4) +!45 = !DILocation(line: 80, column: 24, scope: !4) +!46 = !DILocation(line: 261, column: 15, scope: !47, inlinedAt: !49) +!47 = distinct !DILexicalBlockFile(scope: !4, file: !48, discriminator: 0) +!48 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language") +!49 = !DILocation(line: 87, column: 27, scope: !4) +!50 = !DILocation(line: 291, column: 36, scope: !47, inlinedAt: !49) +!51 = !DILocation(line: 92, column: 20, scope: !4) +!52 = !DILocation(line: 0, scope: !4) +!53 = !DILocation(line: 95, column: 21, scope: !4) +!54 = !DILocation(line: 98, column: 25, scope: !4) +!55 = !DILocation(line: 98, column: 37, scope: !4) +!56 = !DILocation(line: 99, column: 25, scope: !4) +!57 = !DILocation(line: 99, column: 37, scope: !4) +!58 = !DILocation(line: 99, column: 4, scope: !4) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ptx b/SpecForge-ext/cache/compiled_kernels/triton/7/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ptx new file mode 100644 index 0000000000000000000000000000000000000000..041263f4601592b706bfab57f2eb4d0f2575a718 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ptx @@ -0,0 +1,736 @@ +// +// Generated by LLVM NVPTX 
Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1 // -- Begin function triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1 +.extern .shared .align 16 .b8 global_smem[]; + // @triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1 +.visible .entry triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1( + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_1, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_2, + .param .u64 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_3, + .param .u64 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_4, + .param .u64 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_5, + .param .u64 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_6, + .param .u64 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_7, + .param .u64 
triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_8, + .param .u32 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_9, + .param .u32 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_10, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_11, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_12 +) +.reqntid 512 +{ + .reg .pred %p<53>; + .reg .b32 %r<76>; + .reg .b64 %rd<162>; + .loc 1 18 0 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:18:0 + +// %bb.0: + ld.param.b64 %rd47, [triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_4]; +$L__tmp0: + .loc 1 22 28 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:22:28 + mov.u32 %r7, %ctaid.x; + .loc 1 27 21 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:27:21 + cvt.u64.u32 %rd1, %r7; + ld.param.b64 %rd52, [triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_3]; + and.b64 %rd53, %rd52, -4294967296; + setp.ne.b64 %p11, %rd53, 0; + cvt.u32.u64 %r74, %rd1; + @%p11 bra $L__BB0_2; + bra.uni $L__BB0_1; +$L__BB0_2: + div.s64 %rd153, %rd1, %rd52; + bra.uni $L__BB0_3; +$L__BB0_1: + cvt.u32.u64 %r8, %rd52; + div.u32 %r10, %r74, %r8; + cvt.u64.u32 %rd153, %r10; +$L__BB0_3: + .loc 1 0 21 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:0:21 + ld.param.b64 %rd50, 
[triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_7]; + .loc 1 27 28 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:27:28 + or.b64 %rd54, %rd153, %rd47; + and.b64 %rd55, %rd54, -4294967296; + setp.ne.b64 %p12, %rd55, 0; + @%p12 bra $L__BB0_5; + bra.uni $L__BB0_4; +$L__BB0_5: + rem.s64 %rd154, %rd153, %rd47; + bra.uni $L__BB0_6; +$L__BB0_4: + cvt.u32.u64 %r11, %rd47; + cvt.u32.u64 %r12, %rd153; + rem.u32 %r13, %r12, %r11; + cvt.u64.u32 %rd154, %r13; +$L__BB0_6: + .loc 1 0 28 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:0:28 + ld.param.b32 %r6, [triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_9]; + ld.param.b64 %rd51, [triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_8]; + ld.param.b64 %rd49, [triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_6]; + ld.param.b64 %rd44, [triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_0]; + mov.u32 %r1, %tid.x; + .loc 1 28 19 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:28:19 + mul.lo.s64 %rd56, %rd153, %rd52; + sub.s64 %rd9, %rd1, %rd56; + .loc 1 29 19 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:29:19 + and.b64 %rd57, %rd50, -4294967296; + setp.ne.b64 %p13, %rd57, 0; + @%p13 bra $L__BB0_8; + bra.uni $L__BB0_7; +$L__BB0_8: + div.s64 %rd155, %rd1, %rd50; + bra.uni $L__BB0_9; +$L__BB0_7: + cvt.u32.u64 %r14, %rd50; + div.u32 %r16, %r74, %r14; + cvt.u64.u32 %rd155, %r16; +$L__BB0_9: + .loc 1 0 19 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:0:19 + ld.param.b64 %rd48, 
[triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_5]; + ld.param.b64 %rd46, [triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_2]; + ld.param.b64 %rd45, [triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_1]; + .loc 1 24 21 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:24:21 + setp.lt.s32 %p1, %r74, %r6; + .loc 1 39 26 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:39:26 + shl.b64 %rd16, %rd154, 7; + .loc 1 42 26 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:42:26 + shl.b64 %rd61, %rd9, 7; + .loc 1 49 35 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:49:35 + shl.b64 %rd62, %rd155, 3; + add.s64 %rd70, %rd44, %rd62; + and.b32 %r2, %r1, 127; + cvt.u64.u32 %rd63, %r2; + or.b64 %rd20, %rd61, %rd63; + setp.lt.s64 %p3, %rd20, %rd49; + setp.ge.s64 %p5, %rd20, %rd51; + min.s64 %rd15, %rd51, 0; + sub.s64 %rd64, %rd9, %rd154; + shl.b64 %rd22, %rd64, 7; + .loc 1 32 40 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:32:40 + cvt.u64.u32 %rd65, %r1; + shr.u64 %rd66, %rd65, 7; + cvt.u32.u64 %r75, %rd66; + shr.u32 %r18, %r1, 7; + cvt.u64.u32 %rd67, %r18; + and.b64 %rd157, %rd67, 3; + sub.s64 %rd156, %rd63, %rd157; + mov.b64 %rd159, 0; + mov.b64 %rd158, -1024; + mov.b64 %rd160, %rd159; + bra.uni $L__BB0_10; +$L__BB0_12: // in Loop: Header=BB0_10 Depth=1 + .loc 1 58 24 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:58:24 + rem.s64 %rd161, %rd20, %rd51; +$L__BB0_13: // in Loop: Header=BB0_10 Depth=1 + .loc 1 0 0 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:0 + sub.s32 %r21, %r2, %r20; + cvt.s64.s32 %rd33, %r21; + .loc 1 60 25 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:60:25 + setp.eq.b64 %p24, %rd161, 0; + .loc 1 66 39 // 
cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:66:39 + selp.b64 %rd83, 0, %rd15, %p24; + add.s64 %rd84, %rd83, %rd161; + .loc 1 67 24 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:67:24 + setp.lt.s64 %p25, %rd84, %rd69; + setp.lt.s64 %p26, %rd84, %rd73; + .loc 1 69 51 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:69:51 + add.s64 %rd85, %rd22, %rd33; + add.s64 %rd86, %rd22, %rd156; + .loc 1 70 25 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:70:25 + rem.s64 %rd87, %rd86, %rd51; + rem.s64 %rd88, %rd85, %rd51; + .loc 1 71 25 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:71:25 + setp.ne.b64 %p27, %rd88, 0; + setp.ne.b64 %p28, %rd87, 0; + .loc 1 73 25 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:73:25 + xor.b64 %rd89, %rd87, %rd51; + xor.b64 %rd90, %rd88, %rd51; + .loc 1 76 39 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:76:39 + shr.s64 %rd91, %rd89, 63; + and.b64 %rd92, %rd91, %rd51; + selp.b64 %rd93, %rd92, 0, %p28; + shr.s64 %rd94, %rd90, 63; + and.b64 %rd95, %rd94, %rd51; + selp.b64 %rd96, %rd95, 0, %p27; + .loc 1 78 25 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:78:25 + neg.s64 %rd97, %rd96; + neg.s64 %rd98, %rd93; + setp.eq.b64 %p29, %rd87, %rd98; + setp.eq.b64 %p30, %rd88, %rd97; + .loc 1 79 24 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:79:24 + and.pred %p31, %p26, %p30; + and.pred %p33, %p25, %p29; + and.pred %p35, %p5, %p33; + and.pred %p36, %p5, %p31; + .loc 1 80 24 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:80:24 + or.pred %p37, %p10, %p36; + or.pred %p38, %p9, %p35; + .loc 1 86 50 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:86:50 + and.pred %p41, %p14, %p38; + and.pred %p42, %p15, %p37; + selp.b64 %rd99, 1, 0, %p42; + selp.b64 %rd100, 1, 0, %p41; + add.s64 %rd159, %rd159, %rd100; + add.s64 %rd160, %rd160, %rd99; + .loc 1 32 40 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:32:40 + add.s64 %rd158, 
%rd158, 1024; + add.s32 %r75, %r75, 8; + add.s64 %rd157, %rd157, 8; + add.s64 %rd156, %rd156, -8; + setp.lt.u64 %p43, %rd158, 15360; + @%p43 bra $L__BB0_10; + bra.uni $L__BB0_14; +$L__BB0_10: // =>This Inner Loop Header: Depth=1 + .loc 1 37 27 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:37:27 + or.b32 %r20, %r75, 4; + .loc 1 39 22 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:39:22 + cvt.u64.u32 %rd76, %r20; + .loc 1 49 77 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:49:77 + // begin inline asm + mov.u64 %rd68, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd68, 1.0; + // end inline asm + .loc 1 39 22 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:39:22 + or.b64 %rd77, %rd16, %rd76; + or.b64 %rd78, %rd16, %rd157; + .loc 1 41 22 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:41:22 + setp.lt.s64 %p17, %rd78, %rd48; + setp.lt.s64 %p18, %rd77, %rd48; + .loc 1 45 22 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:45:22 + and.pred %p8, %p3, %p18; + and.pred %p7, %p3, %p17; + .loc 1 48 23 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:48:23 + setp.ge.s64 %p19, %rd77, %rd20; + setp.ge.s64 %p20, %rd78, %rd20; + .loc 1 49 94 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:49:94 + and.pred %p14, %p1, %p7; + and.pred %p15, %p1, %p8; + .loc 1 49 77 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:49:77 + // begin inline asm + mov.u64 %rd69, 0x0; + @%p14 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd69 }, [ %rd70 + 0 ], %rd68; + // end inline asm + // begin inline asm + mov.u64 %rd72, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd72, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd73, 0x0; + @%p15 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd73 }, [ %rd70 + 0 ], %rd72; + // end inline asm + .loc 1 52 24 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:52:24 + max.s64 %rd79, %rd20, %rd77; + setp.lt.s64 %p21, %rd79, %rd73; + max.s64 
%rd80, %rd20, %rd78; + setp.lt.s64 %p22, %rd80, %rd69; + .loc 1 53 23 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:53:23 + and.pred %p9, %p20, %p22; + and.pred %p10, %p19, %p21; + .loc 1 58 24 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:58:24 + or.b64 %rd81, %rd20, %rd51; + and.b64 %rd82, %rd81, -4294967296; + setp.ne.b64 %p23, %rd82, 0; + @%p23 bra $L__BB0_12; +// %bb.11: // in Loop: Header=BB0_10 Depth=1 + cvt.u32.u64 %r22, %rd51; + cvt.u32.u64 %r23, %rd20; + rem.u32 %r24, %r23, %r22; + cvt.u64.u32 %rd161, %r24; + bra.uni $L__BB0_13; +$L__BB0_14: + .loc 1 25 37 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:25:37 + and.b32 %r31, %r1, 31; +$L__tmp1: + .loc 2 261 15 // standard.py:261:15 @[ cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:87:27 ] + add.s64 %rd106, %rd159, %rd160; + mov.b64 {_, %r32}, %rd106; + .loc 2 291 36 // standard.py:291:36 @[ cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:87:27 ] + cvt.u32.u64 %r33, %rd106; + shfl.sync.bfly.b32 %r34, %r33, 16, 31, -1; + shfl.sync.bfly.b32 %r35, %r32, 16, 31, -1; + cvt.u64.u32 %rd107, %r34; + cvt.u64.u32 %rd108, %r35; + shl.b64 %rd109, %rd108, 32; + or.b64 %rd110, %rd107, %rd109; + .loc 2 261 15 // standard.py:261:15 @[ cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:87:27 ] + add.s64 %rd111, %rd106, %rd110; + .loc 2 291 36 // standard.py:291:36 @[ cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:87:27 ] + mov.b64 {_, %r36}, %rd111; + cvt.u32.u64 %r37, %rd111; + shfl.sync.bfly.b32 %r38, %r37, 8, 31, -1; + shfl.sync.bfly.b32 %r39, %r36, 8, 31, -1; + cvt.u64.u32 %rd112, %r38; + cvt.u64.u32 %rd113, %r39; + shl.b64 %rd114, %rd113, 32; + or.b64 %rd115, %rd112, %rd114; + .loc 2 261 15 // standard.py:261:15 @[ cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:87:27 ] + add.s64 %rd116, %rd111, %rd115; + .loc 2 291 36 // standard.py:291:36 @[ cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:87:27 ] + mov.b64 {_, %r40}, %rd116; + 
cvt.u32.u64 %r41, %rd116; + shfl.sync.bfly.b32 %r42, %r41, 4, 31, -1; + shfl.sync.bfly.b32 %r43, %r40, 4, 31, -1; + cvt.u64.u32 %rd117, %r42; + cvt.u64.u32 %rd118, %r43; + shl.b64 %rd119, %rd118, 32; + or.b64 %rd120, %rd117, %rd119; + .loc 2 261 15 // standard.py:261:15 @[ cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:87:27 ] + add.s64 %rd121, %rd116, %rd120; + .loc 2 291 36 // standard.py:291:36 @[ cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:87:27 ] + mov.b64 {_, %r44}, %rd121; + cvt.u32.u64 %r45, %rd121; + shfl.sync.bfly.b32 %r46, %r45, 2, 31, -1; + shfl.sync.bfly.b32 %r47, %r44, 2, 31, -1; + cvt.u64.u32 %rd122, %r46; + cvt.u64.u32 %rd123, %r47; + shl.b64 %rd124, %rd123, 32; + or.b64 %rd125, %rd122, %rd124; + .loc 2 261 15 // standard.py:261:15 @[ cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:87:27 ] + add.s64 %rd126, %rd121, %rd125; + .loc 2 291 36 // standard.py:291:36 @[ cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:87:27 ] + mov.b64 {_, %r48}, %rd126; + cvt.u32.u64 %r49, %rd126; + shfl.sync.bfly.b32 %r50, %r49, 1, 31, -1; + shfl.sync.bfly.b32 %r51, %r48, 1, 31, -1; + cvt.u64.u32 %rd127, %r50; + cvt.u64.u32 %rd128, %r51; + shl.b64 %rd129, %rd128, 32; + or.b64 %rd130, %rd127, %rd129; + .loc 2 261 15 // standard.py:261:15 @[ cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:87:27 ] + add.s64 %rd101, %rd126, %rd130; + .loc 2 291 36 // standard.py:291:36 @[ cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:87:27 ] + setp.eq.b32 %p44, %r31, 0; + shr.u32 %r52, %r1, 2; + and.b32 %r53, %r52, 120; + mov.b32 %r54, global_smem; + add.s32 %r25, %r54, %r53; + // begin inline asm + @%p44 st.shared.b64 [ %r25 + 0 ], %rd101; + // end inline asm + bar.sync 0; + setp.lt.u32 %p45, %r1, 16; + shl.b32 %r55, %r1, 3; + add.s32 %r26, %r54, %r55; + // begin inline asm + @%p45 ld.shared.b64 %rd102, [ %r26 + 0 ]; + // end inline asm + mov.b64 {_, %r56}, %rd102; + cvt.u32.u64 %r57, %rd102; + shfl.sync.bfly.b32 %r58, %r57, 8, 31, -1; + 
shfl.sync.bfly.b32 %r59, %r56, 8, 31, -1; + cvt.u64.u32 %rd131, %r58; + cvt.u64.u32 %rd132, %r59; + shl.b64 %rd133, %rd132, 32; + or.b64 %rd134, %rd131, %rd133; + .loc 2 261 15 // standard.py:261:15 @[ cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:87:27 ] + add.s64 %rd135, %rd102, %rd134; + .loc 2 291 36 // standard.py:291:36 @[ cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:87:27 ] + mov.b64 {_, %r60}, %rd135; + cvt.u32.u64 %r61, %rd135; + shfl.sync.bfly.b32 %r62, %r61, 4, 31, -1; + shfl.sync.bfly.b32 %r63, %r60, 4, 31, -1; + cvt.u64.u32 %rd136, %r62; + cvt.u64.u32 %rd137, %r63; + shl.b64 %rd138, %rd137, 32; + or.b64 %rd139, %rd136, %rd138; + .loc 2 261 15 // standard.py:261:15 @[ cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:87:27 ] + add.s64 %rd140, %rd135, %rd139; + .loc 2 291 36 // standard.py:291:36 @[ cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:87:27 ] + mov.b64 {_, %r64}, %rd140; + cvt.u32.u64 %r65, %rd140; + shfl.sync.bfly.b32 %r66, %r65, 2, 31, -1; + shfl.sync.bfly.b32 %r67, %r64, 2, 31, -1; + cvt.u64.u32 %rd141, %r66; + cvt.u64.u32 %rd142, %r67; + shl.b64 %rd143, %rd142, 32; + or.b64 %rd144, %rd141, %rd143; + .loc 2 261 15 // standard.py:261:15 @[ cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:87:27 ] + add.s64 %rd145, %rd140, %rd144; + .loc 2 291 36 // standard.py:291:36 @[ cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:87:27 ] + mov.b64 {_, %r68}, %rd145; + cvt.u32.u64 %r69, %rd145; + shfl.sync.bfly.b32 %r70, %r69, 1, 31, -1; + shfl.sync.bfly.b32 %r71, %r68, 1, 31, -1; + cvt.u64.u32 %rd146, %r70; + cvt.u64.u32 %rd147, %r71; + shl.b64 %rd148, %rd147, 32; + or.b64 %rd149, %rd146, %rd148; + .loc 2 261 15 // standard.py:261:15 @[ cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:87:27 ] + add.s64 %rd103, %rd145, %rd149; + .loc 2 291 36 // standard.py:291:36 @[ cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:87:27 ] + setp.eq.b32 %p46, %r1, 0; + // begin inline asm + @%p46 
st.shared.b64 [ %r26 + 0 ], %rd103; + // end inline asm + bar.sync 0; + ld.shared.b64 %rd150, [global_smem]; +$L__tmp2: + .loc 1 92 20 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:92:20 + add.s64 %rd151, %rd150, -1; + setp.lt.u64 %p50, %rd151, 16383; + .loc 1 0 0 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:0 + selp.b32 %r28, 1, 0, %p50; + .loc 1 95 21 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:95:21 + setp.eq.b64 %p51, %rd150, 16384; + .loc 1 0 0 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:0 + selp.b32 %r29, 1, 0, %p51; + .loc 1 98 25 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:98:25 + shl.b64 %rd152, %rd1, 2; + add.s64 %rd104, %rd45, %rd152; + .loc 1 98 37 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:98:37 + and.b32 %r72, %r1, 511; + setp.eq.b32 %p52, %r72, 0; + and.pred %p47, %p52, %p1; + // begin inline asm + @%p47 st.global.b32 [ %rd104 + 0 ], { %r28 }; + // end inline asm + .loc 1 99 25 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:99:25 + add.s64 %rd105, %rd46, %rd152; + .loc 1 99 37 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:99:37 + // begin inline asm + @%p47 st.global.b32 [ %rd105 + 0 ], { %r29 }; + // end inline asm + .loc 1 99 4 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:99:4 + ret; +$L__tmp3: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 
0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 307 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. 
Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x12c DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 106 +.b8 120 +.b8 116 +.b8 101 +.b8 122 +.b8 120 +.b8 52 +.b8 52 +.b8 118 +.b8 109 +.b8 113 +.b8 104 +.b8 54 +.b8 50 +.b8 50 +.b8 102 +.b8 51 +.b8 116 +.b8 112 +.b8 109 +.b8 97 +.b8 107 +.b8 108 +.b8 111 +.b8 102 +.b8 53 +.b8 54 +.b8 98 +.b8 114 +.b8 52 +.b8 101 +.b8 121 +.b8 108 +.b8 116 +.b8 51 +.b8 110 +.b8 122 +.b8 52 +.b8 97 +.b8 52 +.b8 54 +.b8 107 +.b8 97 +.b8 118 +.b8 118 +.b8 122 +.b8 50 +.b8 103 +.b8 119 +.b8 113 +.b8 119 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 106 +.b8 120 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x7d DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 95 +.b8 116 +.b8 111 +.b8 95 +.b8 99 +.b8 111 +.b8 112 +.b8 121 +.b8 95 +.b8 97 +.b8 114 +.b8 97 +.b8 110 +.b8 103 +.b8 101 +.b8 95 +.b8 98 +.b8 105 +.b8 116 +.b8 119 +.b8 105 +.b8 115 +.b8 101 +.b8 95 +.b8 97 +.b8 110 +.b8 100 +.b8 95 +.b8 98 +.b8 105 +.b8 116 +.b8 119 +.b8 105 +.b8 115 +.b8 101 +.b8 95 +.b8 111 +.b8 114 +.b8 95 +.b8 99 +.b8 111 +.b8 110 +.b8 115 +.b8 116 +.b8 97 +.b8 110 +.b8 116 +.b8 95 +.b8 112 +.b8 97 +.b8 100 +.b8 95 +.b8 110 +.b8 100 +.b8 95 +.b8 101 +.b8 113 +.b8 95 +.b8 103 +.b8 101 +.b8 95 
+.b8 103 +.b8 116 +.b8 95 +.b8 105 +.b8 110 +.b8 100 +.b8 101 +.b8 120 +.b8 95 +.b8 108 +.b8 116 +.b8 95 +.b8 112 +.b8 101 +.b8 114 +.b8 109 +.b8 117 +.b8 116 +.b8 101 +.b8 95 +.b8 114 +.b8 101 +.b8 109 +.b8 97 +.b8 105 +.b8 110 +.b8 100 +.b8 101 +.b8 114 +.b8 95 +.b8 115 +.b8 117 +.b8 98 +.b8 95 +.b8 115 +.b8 117 +.b8 109 +.b8 95 +.b8 118 +.b8 105 +.b8 101 +.b8 119 +.b8 95 +.b8 49 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0x108:0x2e DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0x11d:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp2 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 87 // DW_AT_call_line +.b8 27 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.source b/SpecForge-ext/cache/compiled_kernels/triton/7/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.source new file mode 100644 index 0000000000000000000000000000000000000000..f4310c5630ea33fa36d53f0b2611e3a0d9195306 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.source @@ -0,0 +1,418 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":18:0) +#loc97 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0) 
+#loc99 = loc(unknown) +#loc102 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0) +#loc106 = loc("in_ptr0"(#loc)) +#loc107 = loc("out_ptr1"(#loc)) +#loc108 = loc("out_ptr2"(#loc)) +#loc109 = loc("ks0"(#loc)) +#loc110 = loc("ks1"(#loc)) +#loc111 = loc("ks2"(#loc)) +#loc112 = loc("ks3"(#loc)) +#loc113 = loc("ks4"(#loc)) +#loc114 = loc("ks5"(#loc)) +#loc115 = loc("xnumel"(#loc)) +#loc116 = loc("r0_numel"(#loc)) +#loc207 = loc("input"(#loc97)) +#loc208 = loc("a"(#loc102)) +#loc209 = loc("b"(#loc102)) +module { + tt.func public @triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %ks2: i64 loc("ks2"(#loc)), %ks3: i64 loc("ks3"(#loc)), %ks4: i64 loc("ks4"(#loc)), %ks5: i64 loc("ks5"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %r0_numel_0 = arith.constant 16384 : i32 loc(#loc117) + %xoffset = tt.get_program_id x : i32 loc(#loc118) + %xoffset_1 = arith.constant 1 : i32 loc(#loc119) + %xoffset_2 = arith.constant 1 : i32 loc(#loc119) + %xoffset_3 = arith.muli %xoffset, %xoffset_2 : i32 loc(#loc119) + %xindex = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc120) + %xindex_4 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc121) + %xindex_5 = tt.splat %xoffset_3 : i32 -> tensor<1x1xi32> loc(#loc122) + %xindex_6 = arith.addi %xindex_5, %xindex_4 : tensor<1x1xi32> loc(#loc122) + %xmask = tt.splat %xnumel : i32 -> tensor<1x1xi32> loc(#loc123) + %xmask_7 = arith.cmpi slt, %xindex_6, %xmask : tensor<1x1xi32> loc(#loc123) + %r0_base = tt.make_range {end 
= 1024 : i32, start = 0 : i32} : tensor<1024xi32> loc(#loc124) + %r0_base_8 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<1024xi32> -> tensor<1x1024xi32> loc(#loc125) + %x1 = arith.extsi %xindex_6 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc126) + %x1_9 = tt.splat %ks0 : i64 -> tensor<1x1xi64> loc(#loc126) + %x1_10 = arith.divsi %x1, %x1_9 : tensor<1x1xi64> loc(#loc126) + %x1_11 = tt.splat %ks1 : i64 -> tensor<1x1xi64> loc(#loc127) + %x1_12 = arith.remsi %x1_10, %x1_11 : tensor<1x1xi64> loc(#loc127) + %x0 = arith.extsi %xindex_6 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc128) + %x0_13 = tt.splat %ks0 : i64 -> tensor<1x1xi64> loc(#loc128) + %x0_14 = arith.remsi %x0, %x0_13 : tensor<1x1xi64> loc(#loc128) + %x2 = arith.extsi %xindex_6 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc129) + %x2_15 = tt.splat %ks4 : i64 -> tensor<1x1xi64> loc(#loc129) + %x2_16 = arith.divsi %x2, %x2_15 : tensor<1x1xi64> loc(#loc129) + %_tmp46 = arith.constant 0 : i64 loc(#loc130) + %_tmp46_17 = arith.constant dense<0> : tensor<1x1024xi64> loc(#loc130) + %c0_i32 = arith.constant 0 : i32 loc(#loc15) + %c1024_i32 = arith.constant 1024 : i32 loc(#loc15) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc15) + %1 = arith.bitcast %r0_numel_0 : i32 to i32 loc(#loc15) + %2 = arith.bitcast %c1024_i32 : i32 to i32 loc(#loc15) + %3 = ub.poison : i32 loc(#loc15) + %_tmp46_18 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp46_22 = %_tmp46_17) -> (tensor<1x1024xi64>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x1024xi32> loc(#loc132) + %r0_index_23 = arith.addi %r0_index, %r0_base_8 : tensor<1x1024xi32> loc(#loc132) + %r0_mask = arith.constant dense<16384> : tensor<1x1024xi32> loc(#loc133) + %r0_mask_24 = arith.cmpi slt, %r0_index_23, %r0_mask : tensor<1x1024xi32> loc(#loc133) + %r0_4 = arith.constant 128 : i32 loc(#loc134) + %r0_4_25 = arith.constant 128 : i32 loc(#loc134) + %r0_4_26 = arith.constant dense<128> : tensor<1x1024xi32> loc(#loc134) + %r0_4_27 = arith.divsi 
%r0_index_23, %r0_4_26 : tensor<1x1024xi32> loc(#loc134) + %r0_3 = arith.constant 128 : i32 loc(#loc135) + %r0_3_28 = arith.constant 128 : i32 loc(#loc135) + %r0_3_29 = arith.constant dense<128> : tensor<1x1024xi32> loc(#loc135) + %r0_3_30 = arith.remsi %r0_index_23, %r0_3_29 : tensor<1x1024xi32> loc(#loc135) + %tmp0 = arith.constant 128 : i32 loc(#loc136) + %tmp0_31 = arith.constant 128 : i64 loc(#loc136) + %tmp0_32 = arith.constant dense<128> : tensor<1x1xi64> loc(#loc136) + %tmp0_33 = arith.muli %tmp0_32, %x1_12 : tensor<1x1xi64> loc(#loc136) + %tmp0_34 = arith.extsi %r0_4_27 : tensor<1x1024xi32> to tensor<1x1024xi64> loc(#loc137) + %tmp0_35 = tt.broadcast %tmp0_33 : tensor<1x1xi64> -> tensor<1x1024xi64> loc(#loc137) + %tmp0_36 = arith.addi %tmp0_34, %tmp0_35 : tensor<1x1024xi64> loc(#loc137) + %tmp2 = tt.splat %ks2 : i64 -> tensor<1x1024xi64> loc(#loc138) + %tmp2_37 = arith.cmpi slt, %tmp0_36, %tmp2 : tensor<1x1024xi64> loc(#loc138) + %tmp3 = arith.constant 128 : i32 loc(#loc139) + %tmp3_38 = arith.constant 128 : i64 loc(#loc139) + %tmp3_39 = arith.constant dense<128> : tensor<1x1xi64> loc(#loc139) + %tmp3_40 = arith.muli %tmp3_39, %x0_14 : tensor<1x1xi64> loc(#loc139) + %tmp3_41 = arith.extsi %r0_3_30 : tensor<1x1024xi32> to tensor<1x1024xi64> loc(#loc140) + %tmp3_42 = tt.broadcast %tmp3_40 : tensor<1x1xi64> -> tensor<1x1024xi64> loc(#loc140) + %tmp3_43 = arith.addi %tmp3_41, %tmp3_42 : tensor<1x1024xi64> loc(#loc140) + %tmp5 = tt.splat %ks3 : i64 -> tensor<1x1024xi64> loc(#loc141) + %tmp5_44 = arith.cmpi slt, %tmp3_43, %tmp5 : tensor<1x1024xi64> loc(#loc141) + %tmp6 = arith.andi %tmp2_37, %tmp5_44 : tensor<1x1024xi1> loc(#loc142) + %tmp7 = arith.constant 128 : i32 loc(#loc143) + %tmp7_45 = arith.constant 128 : i64 loc(#loc143) + %tmp7_46 = arith.constant dense<128> : tensor<1x1xi64> loc(#loc143) + %tmp7_47 = arith.muli %tmp7_46, %x1_12 : tensor<1x1xi64> loc(#loc143) + %tmp7_48 = arith.extsi %r0_4_27 : tensor<1x1024xi32> to tensor<1x1024xi64> loc(#loc144) + 
%tmp7_49 = tt.broadcast %tmp7_47 : tensor<1x1xi64> -> tensor<1x1024xi64> loc(#loc144) + %tmp7_50 = arith.addi %tmp7_48, %tmp7_49 : tensor<1x1024xi64> loc(#loc144) + %tmp8 = arith.constant 128 : i32 loc(#loc145) + %tmp8_51 = arith.constant 128 : i64 loc(#loc145) + %tmp8_52 = arith.constant dense<128> : tensor<1x1xi64> loc(#loc145) + %tmp8_53 = arith.muli %tmp8_52, %x0_14 : tensor<1x1xi64> loc(#loc145) + %tmp8_54 = arith.extsi %r0_3_30 : tensor<1x1024xi32> to tensor<1x1024xi64> loc(#loc146) + %tmp8_55 = tt.broadcast %tmp8_53 : tensor<1x1xi64> -> tensor<1x1024xi64> loc(#loc146) + %tmp8_56 = arith.addi %tmp8_54, %tmp8_55 : tensor<1x1024xi64> loc(#loc146) + %tmp9 = arith.cmpi sge, %tmp7_50, %tmp8_56 : tensor<1x1024xi64> loc(#loc147) + %tmp10 = tt.broadcast %x2_16 : tensor<1x1xi64> -> tensor<1x1024xi64> loc(#loc148) + %tmp10_57 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x1024x!tt.ptr> loc(#loc149) + %tmp10_58 = tt.addptr %tmp10_57, %tmp10 : tensor<1x1024x!tt.ptr>, tensor<1x1024xi64> loc(#loc149) + %tmp10_59 = arith.andi %r0_mask_24, %tmp6 : tensor<1x1024xi1> loc(#loc150) + %tmp10_60 = tt.broadcast %xmask_7 : tensor<1x1xi1> -> tensor<1x1024xi1> loc(#loc151) + %tmp10_61 = arith.andi %tmp10_59, %tmp10_60 : tensor<1x1024xi1> loc(#loc151) + %tmp10_62 = arith.constant 0.000000e+00 : f32 loc(#loc152) + %tmp10_63 = arith.constant dense<0.000000e+00> : tensor<1x1024xf32> loc(#loc152) + %tmp10_64 = arith.fptosi %tmp10_63 : tensor<1x1024xf32> to tensor<1x1024xi64> loc(#loc152) + %tmp10_65 = tt.load %tmp10_58, %tmp10_61, %tmp10_64 evictionPolicy = evict_last : tensor<1x1024x!tt.ptr> loc(#loc152) + %tmp11 = arith.cmpi slt, %tmp8_56, %tmp10_65 : tensor<1x1024xi64> loc(#loc153) + %tmp12 = arith.cmpi slt, %tmp7_50, %tmp10_65 : tensor<1x1024xi64> loc(#loc154) + %tmp13 = arith.andi %tmp11, %tmp12 : tensor<1x1024xi1> loc(#loc155) + %tmp14 = arith.andi %tmp9, %tmp13 : tensor<1x1024xi1> loc(#loc156) + %tmp15 = arith.constant false loc(#loc157) + %tmp15_66 = arith.constant dense : 
tensor<1x1xi1> loc(#loc157) + %tmp16 = arith.constant dense : tensor<1x1024xi1> loc(#loc158) + %tmp16_67 = arith.ori %tmp16, %tmp14 : tensor<1x1024xi1> loc(#loc158) + %tmp17 = tt.splat %ks5 : i64 -> tensor<1x1024xi64> loc(#loc159) + %tmp18 = arith.cmpi sge, %tmp8_56, %tmp17 : tensor<1x1024xi64> loc(#loc160) + %tmp19 = arith.remsi %tmp8_56, %tmp17 : tensor<1x1024xi64> loc(#loc161) + %tmp20 = arith.constant 0 : i32 loc(#loc162) + %tmp20_68 = arith.constant dense<0> : tensor<1x1xi32> loc(#loc162) + %tmp21 = arith.extsi %tmp20_68 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc163) + %tmp21_69 = tt.broadcast %tmp21 : tensor<1x1xi64> -> tensor<1x1024xi64> loc(#loc163) + %tmp21_70 = arith.cmpi ne, %tmp19, %tmp21_69 : tensor<1x1024xi64> loc(#loc163) + %tmp22 = arith.constant 0 : i32 loc(#loc164) + %tmp22_71 = arith.extsi %tmp22 : i32 to i64 loc(#loc164) + %tmp22_72 = tt.splat %tmp22_71 : i64 -> tensor<1x1024xi64> loc(#loc164) + %tmp22_73 = arith.cmpi slt, %tmp19, %tmp22_72 : tensor<1x1024xi64> loc(#loc164) + %tmp23 = arith.constant 0 : i32 loc(#loc165) + %tmp23_74 = arith.extsi %tmp23 : i32 to i64 loc(#loc165) + %tmp23_75 = tt.splat %tmp23_74 : i64 -> tensor<1x1024xi64> loc(#loc165) + %tmp23_76 = arith.cmpi slt, %tmp17, %tmp23_75 : tensor<1x1024xi64> loc(#loc165) + %tmp24 = arith.cmpi ne, %tmp22_73, %tmp23_76 : tensor<1x1024xi1> loc(#loc166) + %tmp25 = arith.andi %tmp21_70, %tmp24 : tensor<1x1024xi1> loc(#loc167) + %tmp26 = arith.addi %tmp19, %tmp17 : tensor<1x1024xi64> loc(#loc168) + %tmp27 = arith.select %tmp25, %tmp26, %tmp19 : tensor<1x1024xi1>, tensor<1x1024xi64> loc(#loc169) + %tmp28 = arith.cmpi slt, %tmp27, %tmp10_65 : tensor<1x1024xi64> loc(#loc170) + %tmp29 = arith.andi %tmp18, %tmp28 : tensor<1x1024xi1> loc(#loc171) + %tmp30 = arith.constant -1 : i32 loc(#loc172) + %tmp30_77 = arith.constant -1 : i32 loc(#loc172) + %tmp30_78 = arith.constant dense<-1> : tensor<1x1024xi32> loc(#loc172) + %tmp30_79 = arith.muli %tmp30_78, %r0_4_27 : tensor<1x1024xi32> loc(#loc172) + 
%tmp30_80 = arith.addi %r0_3_30, %tmp30_79 : tensor<1x1024xi32> loc(#loc173) + %tmp30_81 = arith.constant -128 : i32 loc(#loc174) + %tmp30_82 = arith.constant -128 : i64 loc(#loc174) + %tmp30_83 = arith.constant dense<-128> : tensor<1x1xi64> loc(#loc174) + %tmp30_84 = arith.muli %tmp30_83, %x1_12 : tensor<1x1xi64> loc(#loc174) + %tmp30_85 = arith.extsi %tmp30_80 : tensor<1x1024xi32> to tensor<1x1024xi64> loc(#loc175) + %tmp30_86 = tt.broadcast %tmp30_84 : tensor<1x1xi64> -> tensor<1x1024xi64> loc(#loc175) + %tmp30_87 = arith.addi %tmp30_85, %tmp30_86 : tensor<1x1024xi64> loc(#loc175) + %tmp30_88 = arith.constant 128 : i32 loc(#loc176) + %tmp30_89 = arith.constant 128 : i64 loc(#loc176) + %tmp30_90 = arith.constant dense<128> : tensor<1x1xi64> loc(#loc176) + %tmp30_91 = arith.muli %tmp30_90, %x0_14 : tensor<1x1xi64> loc(#loc176) + %tmp30_92 = tt.broadcast %tmp30_91 : tensor<1x1xi64> -> tensor<1x1024xi64> loc(#loc177) + %tmp30_93 = arith.addi %tmp30_87, %tmp30_92 : tensor<1x1024xi64> loc(#loc177) + %tmp31 = arith.remsi %tmp30_93, %tmp17 : tensor<1x1024xi64> loc(#loc178) + %tmp32 = arith.extsi %tmp20_68 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc179) + %tmp32_94 = tt.broadcast %tmp32 : tensor<1x1xi64> -> tensor<1x1024xi64> loc(#loc179) + %tmp32_95 = arith.cmpi ne, %tmp31, %tmp32_94 : tensor<1x1024xi64> loc(#loc179) + %tmp33 = arith.constant 0 : i32 loc(#loc180) + %tmp33_96 = arith.extsi %tmp33 : i32 to i64 loc(#loc180) + %tmp33_97 = tt.splat %tmp33_96 : i64 -> tensor<1x1024xi64> loc(#loc180) + %tmp33_98 = arith.cmpi slt, %tmp31, %tmp33_97 : tensor<1x1024xi64> loc(#loc180) + %tmp34 = arith.cmpi ne, %tmp33_98, %tmp23_76 : tensor<1x1024xi1> loc(#loc181) + %tmp35 = arith.andi %tmp32_95, %tmp34 : tensor<1x1024xi1> loc(#loc182) + %tmp36 = arith.addi %tmp31, %tmp17 : tensor<1x1024xi64> loc(#loc183) + %tmp37 = arith.select %tmp35, %tmp36, %tmp31 : tensor<1x1024xi1>, tensor<1x1024xi64> loc(#loc184) + %tmp38 = arith.constant 0 : i64 loc(#loc185) + %tmp38_99 = arith.constant 
dense<0> : tensor<1x1xi64> loc(#loc185) + %tmp39 = arith.constant dense<0> : tensor<1x1024xi64> loc(#loc186) + %tmp39_100 = arith.cmpi eq, %tmp37, %tmp39 : tensor<1x1024xi64> loc(#loc186) + %tmp40 = arith.andi %tmp29, %tmp39_100 : tensor<1x1024xi1> loc(#loc187) + %tmp41 = arith.ori %tmp16_67, %tmp40 : tensor<1x1024xi1> loc(#loc188) + %tmp42 = arith.constant false loc(#loc189) + %tmp42_101 = arith.constant dense : tensor<1x1024xi1> loc(#loc189) + %tmp43 = arith.select %tmp6, %tmp41, %tmp42_101 : tensor<1x1024xi1>, tensor<1x1024xi1> loc(#loc190) + %tmp44 = arith.extui %tmp43 : tensor<1x1024xi1> to tensor<1x1024xi64> loc(#loc191) + %tmp47 = arith.addi %_tmp46_22, %tmp44 : tensor<1x1024xi64> loc(#loc192) + %_tmp46_102 = tt.broadcast %xmask_7 : tensor<1x1xi1> -> tensor<1x1024xi1> loc(#loc193) + %_tmp46_103 = arith.andi %r0_mask_24, %_tmp46_102 : tensor<1x1024xi1> loc(#loc193) + %_tmp46_104 = arith.select %_tmp46_103, %tmp47, %_tmp46_22 : tensor<1x1024xi1>, tensor<1x1024xi64> loc(#loc194) + scf.yield %_tmp46_104 : tensor<1x1024xi64> loc(#loc79) + } loc(#loc131) + %tmp46 = tt.call @"triton.language.standard.sum__i64S1_1024S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp46_18) : (tensor<1x1024xi64>) -> tensor<1xi64> loc(#loc195) + %tmp46_19 = tt.expand_dims %tmp46 {axis = 1 : i32} : tensor<1xi64> -> tensor<1x1xi64> loc(#loc196) + %tmp48 = arith.constant 0 : i64 loc(#loc197) + %tmp48_20 = arith.constant dense<0> : tensor<1x1xi64> loc(#loc197) + %tmp49 = arith.cmpi sgt, %tmp46_19, %tmp48_20 : tensor<1x1xi64> loc(#loc198) + %tmp50 = arith.constant 16384 : i64 loc(#loc199) + %tmp50_21 = arith.constant dense<16384> : tensor<1x1xi64> loc(#loc199) + %tmp51 = arith.cmpi slt, %tmp46_19, %tmp50_21 : tensor<1x1xi64> loc(#loc200) + %tmp52 = arith.andi %tmp49, %tmp51 : tensor<1x1xi1> loc(#loc201) + %tmp53 = arith.extui %tmp52 : tensor<1x1xi1> to tensor<1x1xi8> loc(#loc202) + %tmp54 = arith.extsi %tmp53 : tensor<1x1xi8> to tensor<1x1xi32> loc(#loc203) + %tmp55 = arith.cmpi 
eq, %tmp46_19, %tmp50_21 : tensor<1x1xi64> loc(#loc204) + %tmp56 = arith.extui %tmp55 : tensor<1x1xi1> to tensor<1x1xi8> loc(#loc205) + %tmp57 = arith.extsi %tmp56 : tensor<1x1xi8> to tensor<1x1xi32> loc(#loc206) + %4 = tt.splat %out_ptr1 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc92) + %5 = tt.addptr %4, %xindex_6 : tensor<1x1x!tt.ptr>, tensor<1x1xi32> loc(#loc92) + tt.store %5, %tmp54, %xmask_7 : tensor<1x1x!tt.ptr> loc(#loc93) + %6 = tt.splat %out_ptr2 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc94) + %7 = tt.addptr %6, %xindex_6 : tensor<1x1x!tt.ptr>, tensor<1x1xi32> loc(#loc94) + tt.store %7, %tmp57, %xmask_7 : tensor<1x1x!tt.ptr> loc(#loc95) + tt.return loc(#loc96) + } loc(#loc) + tt.func private @"triton.language.standard.sum__i64S1_1024S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<1x1024xi64> loc("input"(#loc97))) -> tensor<1xi64> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i64 loc(unknown), %arg2: i64 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i64_i64__(%arg1, %arg2) : (i64, i64) -> i64 loc(#loc98) + tt.reduce.return %2 : i64 loc(#loc98) + }) : (tensor<1x1024xi64>) -> tensor<1xi64> loc(#loc98) + tt.return %0 : tensor<1xi64> loc(#loc100) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1xi64> loc(#loc101) + tt.return %1 : tensor<1xi64> loc(#loc101) + } loc(#loc97) + tt.func private @triton.language.standard._sum_combine__i64_i64__(%a: i64 loc("a"(#loc102)), %b: i64 loc("b"(#loc102))) -> i64 attributes {noinline = false} { + %0 = arith.addi %a, %b : i64 loc(#loc103) + tt.return %0 : i64 loc(#loc104) + ^bb1: // no predecessors + %1 = ub.poison : i64 loc(#loc105) + tt.return %1 : i64 loc(#loc105) + } loc(#loc102) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":19:15) +#loc2 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":22:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":22:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":23:36) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":23:44) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":23:23) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":24:21) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":25:27) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":25:37) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":27:21) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":27:28) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":28:19) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":29:19) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":30:44) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":32:40) +#loc16 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":33:31) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":34:29) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":37:27) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":38:27) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":39:26) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":39:22) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":41:22) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":42:26) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":42:22) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":44:22) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":45:22) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":46:26) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":46:22) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":47:26) +#loc30 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":47:22) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":48:23) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":49:55) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":49:35) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":49:87) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":49:94) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":49:77) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":50:23) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":51:23) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":52:24) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":53:23) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":54:39) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":55:24) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":56:37) +#loc44 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":57:24) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":58:24) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":59:35) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":60:25) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":61:92) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":62:92) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":63:25) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":64:24) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":65:24) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":66:39) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":67:24) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":68:24) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":69:29) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":69:24) +#loc58 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":69:45) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":69:38) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":69:55) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":69:51) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":70:25) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":71:25) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":72:92) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":73:25) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":74:24) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":75:24) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":76:39) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":77:35) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":78:25) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":79:24) +#loc72 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":80:24) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":81:44) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":82:38) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":83:25) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":85:25) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":86:36) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":86:50) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":86:8) +#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":87:27) +#loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":87:30) +#loc82 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":88:31) +#loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":89:20) +#loc84 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":90:35) +#loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":91:20) +#loc86 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":92:20) +#loc87 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":93:21) +#loc88 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":94:21) +#loc89 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":95:21) +#loc90 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":96:21) +#loc91 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":97:21) +#loc92 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":98:25) +#loc93 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":98:37) +#loc94 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":99:25) +#loc95 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":99:37) +#loc96 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":99:4) +#loc98 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc100 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11) +#loc101 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4) +#loc103 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc104 = 
loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11) +#loc105 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4) +#loc117 = loc("r0_numel"(#loc1)) +#loc118 = loc("xoffset"(#loc2)) +#loc119 = loc("xoffset"(#loc3)) +#loc120 = loc("xindex"(#loc4)) +#loc121 = loc("xindex"(#loc5)) +#loc122 = loc("xindex"(#loc6)) +#loc123 = loc("xmask"(#loc7)) +#loc124 = loc("r0_base"(#loc8)) +#loc125 = loc("r0_base"(#loc9)) +#loc126 = loc("x1"(#loc10)) +#loc127 = loc("x1"(#loc11)) +#loc128 = loc("x0"(#loc12)) +#loc129 = loc("x2"(#loc13)) +#loc130 = loc("_tmp46"(#loc14)) +#loc131 = loc("_tmp46"(#loc15)) +#loc132 = loc("r0_index"(#loc16)) +#loc133 = loc("r0_mask"(#loc17)) +#loc134 = loc("r0_4"(#loc18)) +#loc135 = loc("r0_3"(#loc19)) +#loc136 = loc("tmp0"(#loc20)) +#loc137 = loc("tmp0"(#loc21)) +#loc138 = loc("tmp2"(#loc22)) +#loc139 = loc("tmp3"(#loc23)) +#loc140 = loc("tmp3"(#loc24)) +#loc141 = loc("tmp5"(#loc25)) +#loc142 = loc("tmp6"(#loc26)) +#loc143 = loc("tmp7"(#loc27)) +#loc144 = loc("tmp7"(#loc28)) +#loc145 = loc("tmp8"(#loc29)) +#loc146 = loc("tmp8"(#loc30)) +#loc147 = loc("tmp9"(#loc31)) +#loc148 = loc("tmp10"(#loc32)) +#loc149 = loc("tmp10"(#loc33)) +#loc150 = loc("tmp10"(#loc34)) +#loc151 = loc("tmp10"(#loc35)) +#loc152 = loc("tmp10"(#loc36)) +#loc153 = loc("tmp11"(#loc37)) +#loc154 = loc("tmp12"(#loc38)) +#loc155 = loc("tmp13"(#loc39)) +#loc156 = loc("tmp14"(#loc40)) +#loc157 = loc("tmp15"(#loc41)) +#loc158 = loc("tmp16"(#loc42)) +#loc159 = loc("tmp17"(#loc43)) +#loc160 = loc("tmp18"(#loc44)) +#loc161 = loc("tmp19"(#loc45)) +#loc162 = loc("tmp20"(#loc46)) +#loc163 = loc("tmp21"(#loc47)) +#loc164 = loc("tmp22"(#loc48)) +#loc165 = loc("tmp23"(#loc49)) +#loc166 = loc("tmp24"(#loc50)) +#loc167 = loc("tmp25"(#loc51)) +#loc168 = loc("tmp26"(#loc52)) +#loc169 = loc("tmp27"(#loc53)) +#loc170 = loc("tmp28"(#loc54)) +#loc171 = loc("tmp29"(#loc55)) +#loc172 = loc("tmp30"(#loc56)) +#loc173 = 
loc("tmp30"(#loc57)) +#loc174 = loc("tmp30"(#loc58)) +#loc175 = loc("tmp30"(#loc59)) +#loc176 = loc("tmp30"(#loc60)) +#loc177 = loc("tmp30"(#loc61)) +#loc178 = loc("tmp31"(#loc62)) +#loc179 = loc("tmp32"(#loc63)) +#loc180 = loc("tmp33"(#loc64)) +#loc181 = loc("tmp34"(#loc65)) +#loc182 = loc("tmp35"(#loc66)) +#loc183 = loc("tmp36"(#loc67)) +#loc184 = loc("tmp37"(#loc68)) +#loc185 = loc("tmp38"(#loc69)) +#loc186 = loc("tmp39"(#loc70)) +#loc187 = loc("tmp40"(#loc71)) +#loc188 = loc("tmp41"(#loc72)) +#loc189 = loc("tmp42"(#loc73)) +#loc190 = loc("tmp43"(#loc74)) +#loc191 = loc("tmp44"(#loc75)) +#loc192 = loc("tmp47"(#loc76)) +#loc193 = loc("_tmp46"(#loc77)) +#loc194 = loc("_tmp46"(#loc78)) +#loc195 = loc("tmp46"(#loc80)) +#loc196 = loc("tmp46"(#loc81)) +#loc197 = loc("tmp48"(#loc82)) +#loc198 = loc("tmp49"(#loc83)) +#loc199 = loc("tmp50"(#loc84)) +#loc200 = loc("tmp51"(#loc85)) +#loc201 = loc("tmp52"(#loc86)) +#loc202 = loc("tmp53"(#loc87)) +#loc203 = loc("tmp54"(#loc88)) +#loc204 = loc("tmp55"(#loc89)) +#loc205 = loc("tmp56"(#loc90)) +#loc206 = loc("tmp57"(#loc91)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/7/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..78153bcf0956e11ca07bb403bab4c65ac32caf71 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ttgir @@ -0,0 +1,280 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 
32], warpsPerCTA = [1, 16], order = [0, 1]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":18:0) +#loc1 = loc(unknown) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":87:27) +#loc79 = loc("in_ptr0"(#loc)) +#loc80 = loc("out_ptr1"(#loc)) +#loc81 = loc("out_ptr2"(#loc)) +#loc82 = loc("ks0"(#loc)) +#loc83 = loc("ks1"(#loc)) +#loc84 = loc("ks2"(#loc)) +#loc85 = loc("ks3"(#loc)) +#loc86 = loc("ks4"(#loc)) +#loc87 = loc("ks5"(#loc)) +#loc88 = loc("xnumel"(#loc)) +#loc89 = loc("r0_numel"(#loc)) +#loc149 = loc("tmp46"(#loc63)) +#loc164 = loc(callsite(#loc1 at #loc149)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 16 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %ks2: i64 loc("ks2"(#loc)), %ks3: i64 loc("ks3"(#loc)), %ks4: i64 loc("ks4"(#loc)), %ks5: i64 loc("ks5"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<128> : tensor<1x1024xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<16384> : tensor<1x1024xi32, #blocked> loc(#loc1) + %c-128_i64 = arith.constant -128 : i64 loc(#loc1) + %c0_i64 = arith.constant 0 : i64 loc(#loc1) + %c128_i64 = arith.constant 128 : i64 loc(#loc1) + %c1024_i32 = arith.constant 1024 : i32 loc(#loc1) + %c16384_i32 = arith.constant 16384 : i32 loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %cst_1 = 
arith.constant dense<16384> : tensor<1x1xi64, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<0> : tensor<1x1xi64, #blocked> loc(#loc1) + %cst_3 = arith.constant dense : tensor<1x1024xi1, #blocked> loc(#loc1) + %cst_4 = arith.constant dense<0> : tensor<1x1024xi64, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc90) + %xmask = arith.cmpi slt, %xoffset, %xnumel : i32 loc(#loc91) + %r0_base = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc92) + %r0_base_5 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<1024xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x1024xi32, #blocked> loc(#loc92) + %x1 = arith.extsi %xoffset : i32 to i64 loc(#loc93) + %x1_6 = arith.divsi %x1, %ks0 : i64 loc(#loc93) + %x1_7 = arith.remsi %x1_6, %ks1 : i64 loc(#loc94) + %x0 = arith.remsi %x1, %ks0 : i64 loc(#loc95) + %x2 = arith.divsi %x1, %ks4 : i64 loc(#loc96) + %tmp0 = arith.muli %x1_7, %c128_i64 : i64 loc(#loc97) + %tmp0_8 = tt.splat %tmp0 : i64 -> tensor<1x1024xi64, #blocked> loc(#loc159) + %tmp2 = tt.splat %ks2 : i64 -> tensor<1x1024xi64, #blocked> loc(#loc99) + %tmp3 = arith.muli %x0, %c128_i64 : i64 loc(#loc100) + %tmp3_9 = tt.splat %tmp3 : i64 -> tensor<1x1024xi64, #blocked> loc(#loc160) + %tmp5 = tt.splat %ks3 : i64 -> tensor<1x1024xi64, #blocked> loc(#loc102) + %tmp10 = tt.addptr %in_ptr0, %x2 : !tt.ptr, i64 loc(#loc103) + %tmp10_10 = tt.splat %xmask : i1 -> tensor<1x1024xi1, #blocked> loc(#loc161) + %tmp10_11 = tt.splat %tmp10 : !tt.ptr -> tensor<1x1024x!tt.ptr, #blocked> loc(#loc105) + %tmp17 = tt.splat %ks5 : i64 -> tensor<1x1024xi64, #blocked> loc(#loc106) + %tmp23 = arith.cmpi slt, %ks5, %c0_i64 : i64 loc(#loc107) + %tmp23_12 = tt.splat %tmp23 : i1 -> tensor<1x1024xi1, #blocked> loc(#loc107) + %tmp30 = arith.muli %x1_7, %c-128_i64 : i64 loc(#loc108) + %tmp30_13 = tt.splat %tmp30 : i64 -> tensor<1x1024xi64, #blocked> loc(#loc162) + %_tmp46 = scf.for %_tmp46_15 = %c0_i32 to 
%c16384_i32 step %c1024_i32 iter_args(%arg12 = %cst_4) -> (tensor<1x1024xi64, #blocked>) : i32 { + %r0_index = tt.splat %_tmp46_15 : i32 -> tensor<1x1024xi32, #blocked> loc(#loc111) + %r0_index_16 = arith.addi %r0_index, %r0_base_5 : tensor<1x1024xi32, #blocked> loc(#loc111) + %r0_mask = arith.cmpi slt, %r0_index_16, %cst_0 : tensor<1x1024xi32, #blocked> loc(#loc112) + %r0_4 = arith.divsi %r0_index_16, %cst : tensor<1x1024xi32, #blocked> loc(#loc113) + %r0_3 = arith.remsi %r0_index_16, %cst : tensor<1x1024xi32, #blocked> loc(#loc114) + %tmp0_17 = arith.extsi %r0_4 : tensor<1x1024xi32, #blocked> to tensor<1x1024xi64, #blocked> loc(#loc98) + %tmp0_18 = arith.addi %tmp0_17, %tmp0_8 : tensor<1x1024xi64, #blocked> loc(#loc98) + %tmp2_19 = arith.cmpi slt, %tmp0_18, %tmp2 : tensor<1x1024xi64, #blocked> loc(#loc99) + %tmp3_20 = arith.extsi %r0_3 : tensor<1x1024xi32, #blocked> to tensor<1x1024xi64, #blocked> loc(#loc101) + %tmp3_21 = arith.addi %tmp3_20, %tmp3_9 : tensor<1x1024xi64, #blocked> loc(#loc101) + %tmp5_22 = arith.cmpi slt, %tmp3_21, %tmp5 : tensor<1x1024xi64, #blocked> loc(#loc102) + %tmp6 = arith.andi %tmp2_19, %tmp5_22 : tensor<1x1024xi1, #blocked> loc(#loc115) + %tmp9 = arith.cmpi sge, %tmp0_18, %tmp3_21 : tensor<1x1024xi64, #blocked> loc(#loc116) + %tmp10_23 = arith.andi %r0_mask, %tmp6 : tensor<1x1024xi1, #blocked> loc(#loc117) + %tmp10_24 = arith.andi %tmp10_23, %tmp10_10 : tensor<1x1024xi1, #blocked> loc(#loc104) + %tmp10_25 = tt.load %tmp10_11, %tmp10_24, %cst_4 evictionPolicy = evict_last : tensor<1x1024x!tt.ptr, #blocked> loc(#loc105) + %tmp11 = arith.cmpi slt, %tmp3_21, %tmp10_25 : tensor<1x1024xi64, #blocked> loc(#loc118) + %tmp12 = arith.cmpi slt, %tmp0_18, %tmp10_25 : tensor<1x1024xi64, #blocked> loc(#loc119) + %tmp13 = arith.andi %tmp11, %tmp12 : tensor<1x1024xi1, #blocked> loc(#loc120) + %tmp14 = arith.andi %tmp9, %tmp13 : tensor<1x1024xi1, #blocked> loc(#loc121) + %tmp18 = arith.cmpi sge, %tmp3_21, %tmp17 : tensor<1x1024xi64, #blocked> 
loc(#loc122) + %tmp19 = arith.remsi %tmp3_21, %tmp17 : tensor<1x1024xi64, #blocked> loc(#loc123) + %tmp21 = arith.cmpi ne, %tmp19, %cst_4 : tensor<1x1024xi64, #blocked> loc(#loc124) + %tmp22 = arith.cmpi slt, %tmp19, %cst_4 : tensor<1x1024xi64, #blocked> loc(#loc125) + %tmp24 = arith.cmpi ne, %tmp22, %tmp23_12 : tensor<1x1024xi1, #blocked> loc(#loc126) + %tmp25 = arith.andi %tmp21, %tmp24 : tensor<1x1024xi1, #blocked> loc(#loc127) + %tmp26 = arith.addi %tmp19, %tmp17 : tensor<1x1024xi64, #blocked> loc(#loc128) + %tmp27 = arith.select %tmp25, %tmp26, %tmp19 : tensor<1x1024xi1, #blocked>, tensor<1x1024xi64, #blocked> loc(#loc129) + %tmp28 = arith.cmpi slt, %tmp27, %tmp10_25 : tensor<1x1024xi64, #blocked> loc(#loc130) + %tmp29 = arith.andi %tmp18, %tmp28 : tensor<1x1024xi1, #blocked> loc(#loc131) + %tmp30_26 = arith.subi %r0_3, %r0_4 : tensor<1x1024xi32, #blocked> loc(#loc132) + %tmp30_27 = arith.extsi %tmp30_26 : tensor<1x1024xi32, #blocked> to tensor<1x1024xi64, #blocked> loc(#loc109) + %tmp30_28 = arith.addi %tmp30_27, %tmp30_13 : tensor<1x1024xi64, #blocked> loc(#loc109) + %tmp30_29 = arith.addi %tmp30_28, %tmp3_9 : tensor<1x1024xi64, #blocked> loc(#loc133) + %tmp31 = arith.remsi %tmp30_29, %tmp17 : tensor<1x1024xi64, #blocked> loc(#loc134) + %tmp32 = arith.cmpi ne, %tmp31, %cst_4 : tensor<1x1024xi64, #blocked> loc(#loc135) + %tmp33 = arith.cmpi slt, %tmp31, %cst_4 : tensor<1x1024xi64, #blocked> loc(#loc136) + %tmp34 = arith.cmpi ne, %tmp33, %tmp23_12 : tensor<1x1024xi1, #blocked> loc(#loc137) + %tmp35 = arith.andi %tmp32, %tmp34 : tensor<1x1024xi1, #blocked> loc(#loc138) + %tmp36 = arith.addi %tmp31, %tmp17 : tensor<1x1024xi64, #blocked> loc(#loc139) + %tmp37 = arith.select %tmp35, %tmp36, %tmp31 : tensor<1x1024xi1, #blocked>, tensor<1x1024xi64, #blocked> loc(#loc140) + %tmp39 = arith.cmpi eq, %tmp37, %cst_4 : tensor<1x1024xi64, #blocked> loc(#loc141) + %tmp40 = arith.andi %tmp29, %tmp39 : tensor<1x1024xi1, #blocked> loc(#loc142) + %tmp41 = arith.ori %tmp14, 
%tmp40 : tensor<1x1024xi1, #blocked> loc(#loc143) + %tmp43 = arith.select %tmp6, %tmp41, %cst_3 : tensor<1x1024xi1, #blocked>, tensor<1x1024xi1, #blocked> loc(#loc144) + %tmp44 = arith.extui %tmp43 : tensor<1x1024xi1, #blocked> to tensor<1x1024xi64, #blocked> loc(#loc145) + %tmp47 = arith.addi %arg12, %tmp44 : tensor<1x1024xi64, #blocked> loc(#loc146) + %_tmp46_30 = arith.andi %r0_mask, %tmp10_10 : tensor<1x1024xi1, #blocked> loc(#loc147) + %_tmp46_31 = arith.select %_tmp46_30, %tmp47, %arg12 : tensor<1x1024xi1, #blocked>, tensor<1x1024xi64, #blocked> loc(#loc148) + scf.yield %_tmp46_31 : tensor<1x1024xi64, #blocked> loc(#loc61) + } loc(#loc110) + %tmp46 = "tt.reduce"(%_tmp46) <{axis = 1 : i32}> ({ + ^bb0(%tmp46_15: i64 loc(callsite(#loc1 at #loc149)), %tmp46_16: i64 loc(callsite(#loc1 at #loc149))): + %tmp46_17 = arith.addi %tmp46_15, %tmp46_16 : i64 loc(#loc167) + tt.reduce.return %tmp46_17 : i64 loc(#loc163) + }) : (tensor<1x1024xi64, #blocked>) -> tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc163) + %tmp46_14 = tt.expand_dims %tmp46 {axis = 1 : i32} : tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1x1xi64, #blocked> loc(#loc150) + %tmp49 = arith.cmpi sgt, %tmp46_14, %cst_2 : tensor<1x1xi64, #blocked> loc(#loc151) + %tmp51 = arith.cmpi slt, %tmp46_14, %cst_1 : tensor<1x1xi64, #blocked> loc(#loc152) + %tmp52 = arith.andi %tmp49, %tmp51 : tensor<1x1xi1, #blocked> loc(#loc153) + %tmp54 = arith.extui %tmp52 : tensor<1x1xi1, #blocked> to tensor<1x1xi32, #blocked> loc(#loc165) + %tmp55 = arith.cmpi eq, %tmp46_14, %cst_1 : tensor<1x1xi64, #blocked> loc(#loc156) + %tmp57 = arith.extui %tmp55 : tensor<1x1xi1, #blocked> to tensor<1x1xi32, #blocked> loc(#loc166) + %0 = tt.addptr %out_ptr1, %xoffset : !tt.ptr, i32 loc(#loc74) + %1 = tt.splat %0 : !tt.ptr -> tensor<1x1x!tt.ptr, #blocked> loc(#loc75) + %2 = tt.splat %xmask : i1 -> tensor<1x1xi1, #blocked> loc(#loc75) + tt.store %1, %tmp54, %2 : tensor<1x1x!tt.ptr, #blocked> loc(#loc75) + 
%3 = tt.addptr %out_ptr2, %xoffset : !tt.ptr, i32 loc(#loc76) + %4 = tt.splat %3 : !tt.ptr -> tensor<1x1x!tt.ptr, #blocked> loc(#loc77) + tt.store %4, %tmp57, %2 : tensor<1x1x!tt.ptr, #blocked> loc(#loc77) + tt.return loc(#loc78) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":22:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":24:21) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":25:37) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":27:21) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":27:28) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":28:19) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":29:19) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":39:26) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":39:22) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":41:22) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":42:26) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":42:22) +#loc14 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":44:22) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":49:35) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":49:94) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":49:77) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":56:37) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":62:92) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":69:45) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":69:38) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":32:40) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":33:31) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":34:29) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":37:27) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":38:27) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":45:22) +#loc28 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":48:23) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":49:87) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":50:23) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":51:23) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":52:24) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":53:23) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":57:24) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":58:24) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":60:25) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":61:92) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":63:25) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":64:24) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":65:24) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":66:39) +#loc42 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":67:24) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":68:24) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":69:24) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":69:51) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":70:25) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":71:25) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":72:92) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":73:25) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":74:24) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":75:24) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":76:39) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":78:25) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":79:24) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":80:24) +#loc56 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":82:38) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":83:25) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":85:25) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":86:36) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":86:50) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":86:8) +#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":87:30) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":89:20) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":91:20) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":92:20) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":94:21) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":93:21) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":95:21) +#loc72 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":97:21) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":96:21) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":98:25) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":98:37) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":99:25) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":99:37) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":99:4) +#loc90 = loc("xoffset"(#loc2)) +#loc91 = loc("xmask"(#loc3)) +#loc92 = loc("r0_base"(#loc4)) +#loc93 = loc("x1"(#loc5)) +#loc94 = loc("x1"(#loc6)) +#loc95 = loc("x0"(#loc7)) +#loc96 = loc("x2"(#loc8)) +#loc97 = loc("tmp0"(#loc9)) +#loc98 = loc("tmp0"(#loc10)) +#loc99 = loc("tmp2"(#loc11)) +#loc100 = loc("tmp3"(#loc12)) +#loc101 = loc("tmp3"(#loc13)) +#loc102 = loc("tmp5"(#loc14)) +#loc103 = loc("tmp10"(#loc15)) +#loc104 = loc("tmp10"(#loc16)) +#loc105 = loc("tmp10"(#loc17)) +#loc106 = loc("tmp17"(#loc18)) +#loc107 = loc("tmp23"(#loc19)) +#loc108 = loc("tmp30"(#loc20)) +#loc109 = loc("tmp30"(#loc21)) +#loc110 = loc("_tmp46"(#loc22)) +#loc111 = loc("r0_index"(#loc23)) +#loc112 = loc("r0_mask"(#loc24)) +#loc113 = loc("r0_4"(#loc25)) +#loc114 = loc("r0_3"(#loc26)) +#loc115 = loc("tmp6"(#loc27)) +#loc116 = loc("tmp9"(#loc28)) +#loc117 = loc("tmp10"(#loc29)) +#loc118 = loc("tmp11"(#loc30)) +#loc119 = loc("tmp12"(#loc31)) +#loc120 = loc("tmp13"(#loc32)) +#loc121 = loc("tmp14"(#loc33)) +#loc122 = loc("tmp18"(#loc34)) +#loc123 = 
loc("tmp19"(#loc35)) +#loc124 = loc("tmp21"(#loc36)) +#loc125 = loc("tmp22"(#loc37)) +#loc126 = loc("tmp24"(#loc38)) +#loc127 = loc("tmp25"(#loc39)) +#loc128 = loc("tmp26"(#loc40)) +#loc129 = loc("tmp27"(#loc41)) +#loc130 = loc("tmp28"(#loc42)) +#loc131 = loc("tmp29"(#loc43)) +#loc132 = loc("tmp30"(#loc44)) +#loc133 = loc("tmp30"(#loc45)) +#loc134 = loc("tmp31"(#loc46)) +#loc135 = loc("tmp32"(#loc47)) +#loc136 = loc("tmp33"(#loc48)) +#loc137 = loc("tmp34"(#loc49)) +#loc138 = loc("tmp35"(#loc50)) +#loc139 = loc("tmp36"(#loc51)) +#loc140 = loc("tmp37"(#loc52)) +#loc141 = loc("tmp39"(#loc53)) +#loc142 = loc("tmp40"(#loc54)) +#loc143 = loc("tmp41"(#loc55)) +#loc144 = loc("tmp43"(#loc56)) +#loc145 = loc("tmp44"(#loc57)) +#loc146 = loc("tmp47"(#loc58)) +#loc147 = loc("_tmp46"(#loc59)) +#loc148 = loc("_tmp46"(#loc60)) +#loc150 = loc("tmp46"(#loc65)) +#loc151 = loc("tmp49"(#loc66)) +#loc152 = loc("tmp51"(#loc67)) +#loc153 = loc("tmp52"(#loc68)) +#loc154 = loc("tmp54"(#loc69)) +#loc155 = loc("tmp53"(#loc70)) +#loc156 = loc("tmp55"(#loc71)) +#loc157 = loc("tmp57"(#loc72)) +#loc158 = loc("tmp56"(#loc73)) +#loc159 = loc(fused[#loc98, #loc97]) +#loc160 = loc(fused[#loc101, #loc100]) +#loc161 = loc(fused[#loc104, #loc91]) +#loc162 = loc(fused[#loc109, #loc108]) +#loc163 = loc(callsite(#loc62 at #loc149)) +#loc165 = loc(fused[#loc154, #loc155]) +#loc166 = loc(fused[#loc157, #loc158]) +#loc167 = loc(callsite(#loc64 at #loc163)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ttir b/SpecForge-ext/cache/compiled_kernels/triton/7/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ttir new file mode 100644 index 
0000000000000000000000000000000000000000..d0d58d9eeb306d1a4655afa9eebab4bf88d8df84 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ttir @@ -0,0 +1,283 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":18:0) +#loc1 = loc(unknown) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":87:27) +#loc81 = loc("in_ptr0"(#loc)) +#loc82 = loc("out_ptr1"(#loc)) +#loc83 = loc("out_ptr2"(#loc)) +#loc84 = loc("ks0"(#loc)) +#loc85 = loc("ks1"(#loc)) +#loc86 = loc("ks2"(#loc)) +#loc87 = loc("ks3"(#loc)) +#loc88 = loc("ks4"(#loc)) +#loc89 = loc("ks5"(#loc)) +#loc90 = loc("xnumel"(#loc)) +#loc91 = loc("r0_numel"(#loc)) +#loc153 = loc("tmp46"(#loc65)) +#loc168 = loc(callsite(#loc1 at #loc153)) +module { + tt.func public @triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %ks2: i64 loc("ks2"(#loc)), %ks3: i64 loc("ks3"(#loc)), %ks4: i64 loc("ks4"(#loc)), %ks5: i64 loc("ks5"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %c-128_i64 = arith.constant -128 : i64 loc(#loc1) + %c0_i64 = arith.constant 0 : i64 loc(#loc1) + %c128_i64 = arith.constant 128 : i64 loc(#loc1) + %c1024_i32 = arith.constant 1024 : i32 loc(#loc2) + %c16384_i32 = arith.constant 16384 : i32 loc(#loc2) + %c0_i32 = arith.constant 0 : i32 loc(#loc2) + 
%tmp50 = arith.constant dense<16384> : tensor<1x1xi64> loc(#loc92) + %cst = arith.constant dense<0> : tensor<1x1xi64> loc(#loc1) + %cst_0 = arith.constant dense : tensor<1x1024xi1> loc(#loc1) + %cst_1 = arith.constant dense<128> : tensor<1x1024xi32> loc(#loc1) + %cst_2 = arith.constant dense<16384> : tensor<1x1024xi32> loc(#loc1) + %cst_3 = arith.constant dense<0> : tensor<1x1024xi64> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc93) + %xmask = arith.cmpi slt, %xoffset, %xnumel : i32 loc(#loc94) + %xmask_4 = tt.splat %xmask : i1 -> tensor<1x1xi1> loc(#loc94) + %r0_base = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32> loc(#loc95) + %r0_base_5 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<1024xi32> -> tensor<1x1024xi32> loc(#loc96) + %x1 = arith.extsi %xoffset : i32 to i64 loc(#loc97) + %x1_6 = arith.divsi %x1, %ks0 : i64 loc(#loc97) + %x1_7 = arith.remsi %x1_6, %ks1 : i64 loc(#loc98) + %x0 = arith.remsi %x1, %ks0 : i64 loc(#loc99) + %x2 = arith.divsi %x1, %ks4 : i64 loc(#loc100) + %_tmp46 = scf.for %r0_offset = %c0_i32 to %c16384_i32 step %c1024_i32 iter_args(%_tmp46_9 = %cst_3) -> (tensor<1x1024xi64>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x1024xi32> loc(#loc102) + %r0_index_10 = arith.addi %r0_index, %r0_base_5 : tensor<1x1024xi32> loc(#loc102) + %r0_mask = arith.cmpi slt, %r0_index_10, %cst_2 : tensor<1x1024xi32> loc(#loc103) + %r0_4 = arith.divsi %r0_index_10, %cst_1 : tensor<1x1024xi32> loc(#loc104) + %r0_3 = arith.remsi %r0_index_10, %cst_1 : tensor<1x1024xi32> loc(#loc105) + %tmp0 = arith.muli %x1_7, %c128_i64 : i64 loc(#loc106) + %tmp0_11 = arith.extsi %r0_4 : tensor<1x1024xi32> to tensor<1x1024xi64> loc(#loc107) + %tmp0_12 = tt.splat %tmp0 : i64 -> tensor<1x1024xi64> loc(#loc163) + %tmp0_13 = arith.addi %tmp0_11, %tmp0_12 : tensor<1x1024xi64> loc(#loc107) + %tmp2 = tt.splat %ks2 : i64 -> tensor<1x1024xi64> loc(#loc108) + %tmp2_14 = arith.cmpi slt, %tmp0_13, %tmp2 : tensor<1x1024xi64> loc(#loc108) + 
%tmp3 = arith.muli %x0, %c128_i64 : i64 loc(#loc109) + %tmp3_15 = arith.extsi %r0_3 : tensor<1x1024xi32> to tensor<1x1024xi64> loc(#loc110) + %tmp3_16 = tt.splat %tmp3 : i64 -> tensor<1x1024xi64> loc(#loc164) + %tmp3_17 = arith.addi %tmp3_15, %tmp3_16 : tensor<1x1024xi64> loc(#loc110) + %tmp5 = tt.splat %ks3 : i64 -> tensor<1x1024xi64> loc(#loc111) + %tmp5_18 = arith.cmpi slt, %tmp3_17, %tmp5 : tensor<1x1024xi64> loc(#loc111) + %tmp6 = arith.andi %tmp2_14, %tmp5_18 : tensor<1x1024xi1> loc(#loc112) + %tmp9 = arith.cmpi sge, %tmp0_13, %tmp3_17 : tensor<1x1024xi64> loc(#loc113) + %tmp10 = tt.addptr %in_ptr0, %x2 : !tt.ptr, i64 loc(#loc114) + %tmp10_19 = tt.splat %tmp10 : !tt.ptr -> tensor<1x1024x!tt.ptr> loc(#loc114) + %tmp10_20 = arith.andi %r0_mask, %tmp6 : tensor<1x1024xi1> loc(#loc115) + %tmp10_21 = tt.splat %xmask : i1 -> tensor<1x1024xi1> loc(#loc165) + %tmp10_22 = arith.andi %tmp10_20, %tmp10_21 : tensor<1x1024xi1> loc(#loc116) + %tmp10_23 = tt.load %tmp10_19, %tmp10_22, %cst_3 evictionPolicy = evict_last : tensor<1x1024x!tt.ptr> loc(#loc117) + %tmp11 = arith.cmpi slt, %tmp3_17, %tmp10_23 : tensor<1x1024xi64> loc(#loc118) + %tmp12 = arith.cmpi slt, %tmp0_13, %tmp10_23 : tensor<1x1024xi64> loc(#loc119) + %tmp13 = arith.andi %tmp11, %tmp12 : tensor<1x1024xi1> loc(#loc120) + %tmp14 = arith.andi %tmp9, %tmp13 : tensor<1x1024xi1> loc(#loc121) + %tmp17 = tt.splat %ks5 : i64 -> tensor<1x1024xi64> loc(#loc122) + %tmp18 = arith.cmpi sge, %tmp3_17, %tmp17 : tensor<1x1024xi64> loc(#loc123) + %tmp19 = arith.remsi %tmp3_17, %tmp17 : tensor<1x1024xi64> loc(#loc124) + %tmp21 = arith.cmpi ne, %tmp19, %cst_3 : tensor<1x1024xi64> loc(#loc125) + %tmp22 = arith.cmpi slt, %tmp19, %cst_3 : tensor<1x1024xi64> loc(#loc126) + %tmp23 = arith.cmpi slt, %ks5, %c0_i64 : i64 loc(#loc127) + %tmp23_24 = tt.splat %tmp23 : i1 -> tensor<1x1024xi1> loc(#loc127) + %tmp24 = arith.cmpi ne, %tmp22, %tmp23_24 : tensor<1x1024xi1> loc(#loc128) + %tmp25 = arith.andi %tmp21, %tmp24 : tensor<1x1024xi1> 
loc(#loc129) + %tmp26 = arith.addi %tmp19, %tmp17 : tensor<1x1024xi64> loc(#loc130) + %tmp27 = arith.select %tmp25, %tmp26, %tmp19 : tensor<1x1024xi1>, tensor<1x1024xi64> loc(#loc131) + %tmp28 = arith.cmpi slt, %tmp27, %tmp10_23 : tensor<1x1024xi64> loc(#loc132) + %tmp29 = arith.andi %tmp18, %tmp28 : tensor<1x1024xi1> loc(#loc133) + %tmp30 = arith.subi %r0_3, %r0_4 : tensor<1x1024xi32> loc(#loc134) + %tmp30_25 = arith.muli %x1_7, %c-128_i64 : i64 loc(#loc135) + %tmp30_26 = arith.extsi %tmp30 : tensor<1x1024xi32> to tensor<1x1024xi64> loc(#loc136) + %tmp30_27 = tt.splat %tmp30_25 : i64 -> tensor<1x1024xi64> loc(#loc166) + %tmp30_28 = arith.addi %tmp30_26, %tmp30_27 : tensor<1x1024xi64> loc(#loc136) + %tmp30_29 = arith.addi %tmp30_28, %tmp3_16 : tensor<1x1024xi64> loc(#loc137) + %tmp31 = arith.remsi %tmp30_29, %tmp17 : tensor<1x1024xi64> loc(#loc138) + %tmp32 = arith.cmpi ne, %tmp31, %cst_3 : tensor<1x1024xi64> loc(#loc139) + %tmp33 = arith.cmpi slt, %tmp31, %cst_3 : tensor<1x1024xi64> loc(#loc140) + %tmp34 = arith.cmpi ne, %tmp33, %tmp23_24 : tensor<1x1024xi1> loc(#loc141) + %tmp35 = arith.andi %tmp32, %tmp34 : tensor<1x1024xi1> loc(#loc142) + %tmp36 = arith.addi %tmp31, %tmp17 : tensor<1x1024xi64> loc(#loc143) + %tmp37 = arith.select %tmp35, %tmp36, %tmp31 : tensor<1x1024xi1>, tensor<1x1024xi64> loc(#loc144) + %tmp39 = arith.cmpi eq, %tmp37, %cst_3 : tensor<1x1024xi64> loc(#loc145) + %tmp40 = arith.andi %tmp29, %tmp39 : tensor<1x1024xi1> loc(#loc146) + %tmp41 = arith.ori %tmp14, %tmp40 : tensor<1x1024xi1> loc(#loc147) + %tmp43 = arith.select %tmp6, %tmp41, %cst_0 : tensor<1x1024xi1>, tensor<1x1024xi1> loc(#loc148) + %tmp44 = arith.extui %tmp43 : tensor<1x1024xi1> to tensor<1x1024xi64> loc(#loc149) + %tmp47 = arith.addi %_tmp46_9, %tmp44 : tensor<1x1024xi64> loc(#loc150) + %_tmp46_30 = arith.andi %r0_mask, %tmp10_21 : tensor<1x1024xi1> loc(#loc151) + %_tmp46_31 = arith.select %_tmp46_30, %tmp47, %_tmp46_9 : tensor<1x1024xi1>, tensor<1x1024xi64> loc(#loc152) + 
scf.yield %_tmp46_31 : tensor<1x1024xi64> loc(#loc63) + } loc(#loc101) + %tmp46 = "tt.reduce"(%_tmp46) <{axis = 1 : i32}> ({ + ^bb0(%tmp46_9: i64 loc(callsite(#loc1 at #loc153)), %tmp46_10: i64 loc(callsite(#loc1 at #loc153))): + %tmp46_11 = arith.addi %tmp46_9, %tmp46_10 : i64 loc(#loc171) + tt.reduce.return %tmp46_11 : i64 loc(#loc167) + }) : (tensor<1x1024xi64>) -> tensor<1xi64> loc(#loc167) + %tmp46_8 = tt.expand_dims %tmp46 {axis = 1 : i32} : tensor<1xi64> -> tensor<1x1xi64> loc(#loc154) + %tmp49 = arith.cmpi sgt, %tmp46_8, %cst : tensor<1x1xi64> loc(#loc155) + %tmp51 = arith.cmpi slt, %tmp46_8, %tmp50 : tensor<1x1xi64> loc(#loc156) + %tmp52 = arith.andi %tmp49, %tmp51 : tensor<1x1xi1> loc(#loc157) + %tmp54 = arith.extui %tmp52 : tensor<1x1xi1> to tensor<1x1xi32> loc(#loc169) + %tmp55 = arith.cmpi eq, %tmp46_8, %tmp50 : tensor<1x1xi64> loc(#loc160) + %tmp57 = arith.extui %tmp55 : tensor<1x1xi1> to tensor<1x1xi32> loc(#loc170) + %0 = tt.addptr %out_ptr1, %xoffset : !tt.ptr, i32 loc(#loc76) + %1 = tt.splat %0 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc76) + tt.store %1, %tmp54, %xmask_4 : tensor<1x1x!tt.ptr> loc(#loc77) + %2 = tt.addptr %out_ptr2, %xoffset : !tt.ptr, i32 loc(#loc78) + %3 = tt.splat %2 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc78) + tt.store %3, %tmp57, %xmask_4 : tensor<1x1x!tt.ptr> loc(#loc79) + tt.return loc(#loc80) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":32:40) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":90:35) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":22:28) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":24:21) +#loc6 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":25:27) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":25:37) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":27:21) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":27:28) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":28:19) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":29:19) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":33:31) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":34:29) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":37:27) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":38:27) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":39:26) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":39:22) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":41:22) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":42:26) +#loc20 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":42:22) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":44:22) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":45:22) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":48:23) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":49:35) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":49:87) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":49:94) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":49:77) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":50:23) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":51:23) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":52:24) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":53:23) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":56:37) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":57:24) +#loc34 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":58:24) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":60:25) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":61:92) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":62:92) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":63:25) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":64:24) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":65:24) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":66:39) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":67:24) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":68:24) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":69:24) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":69:45) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":69:38) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":69:51) +#loc48 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":70:25) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":71:25) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":72:92) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":73:25) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":74:24) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":75:24) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":76:39) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":78:25) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":79:24) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":80:24) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":82:38) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":83:25) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":85:25) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":86:36) +#loc62 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":86:50) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":86:8) +#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc66 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":87:30) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":89:20) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":91:20) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":92:20) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":94:21) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":93:21) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":95:21) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":97:21) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":96:21) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":98:25) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":98:37) +#loc78 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":99:25) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":99:37) +#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":99:4) +#loc92 = loc("tmp50"(#loc3)) +#loc93 = loc("xoffset"(#loc4)) +#loc94 = loc("xmask"(#loc5)) +#loc95 = loc("r0_base"(#loc6)) +#loc96 = loc("r0_base"(#loc7)) +#loc97 = loc("x1"(#loc8)) +#loc98 = loc("x1"(#loc9)) +#loc99 = loc("x0"(#loc10)) +#loc100 = loc("x2"(#loc11)) +#loc101 = loc("_tmp46"(#loc2)) +#loc102 = loc("r0_index"(#loc12)) +#loc103 = loc("r0_mask"(#loc13)) +#loc104 = loc("r0_4"(#loc14)) +#loc105 = loc("r0_3"(#loc15)) +#loc106 = loc("tmp0"(#loc16)) +#loc107 = loc("tmp0"(#loc17)) +#loc108 = loc("tmp2"(#loc18)) +#loc109 = loc("tmp3"(#loc19)) +#loc110 = loc("tmp3"(#loc20)) +#loc111 = loc("tmp5"(#loc21)) +#loc112 = loc("tmp6"(#loc22)) +#loc113 = loc("tmp9"(#loc23)) +#loc114 = loc("tmp10"(#loc24)) +#loc115 = loc("tmp10"(#loc25)) +#loc116 = loc("tmp10"(#loc26)) +#loc117 = loc("tmp10"(#loc27)) +#loc118 = loc("tmp11"(#loc28)) +#loc119 = loc("tmp12"(#loc29)) +#loc120 = loc("tmp13"(#loc30)) +#loc121 = loc("tmp14"(#loc31)) +#loc122 = loc("tmp17"(#loc32)) +#loc123 = loc("tmp18"(#loc33)) +#loc124 = loc("tmp19"(#loc34)) +#loc125 = loc("tmp21"(#loc35)) +#loc126 = loc("tmp22"(#loc36)) +#loc127 = loc("tmp23"(#loc37)) +#loc128 = loc("tmp24"(#loc38)) +#loc129 = loc("tmp25"(#loc39)) +#loc130 = loc("tmp26"(#loc40)) +#loc131 = loc("tmp27"(#loc41)) +#loc132 = loc("tmp28"(#loc42)) +#loc133 = loc("tmp29"(#loc43)) +#loc134 = loc("tmp30"(#loc44)) +#loc135 = loc("tmp30"(#loc45)) +#loc136 = loc("tmp30"(#loc46)) +#loc137 = loc("tmp30"(#loc47)) +#loc138 = loc("tmp31"(#loc48)) +#loc139 = loc("tmp32"(#loc49)) +#loc140 = loc("tmp33"(#loc50)) +#loc141 = loc("tmp34"(#loc51)) +#loc142 = 
loc("tmp35"(#loc52)) +#loc143 = loc("tmp36"(#loc53)) +#loc144 = loc("tmp37"(#loc54)) +#loc145 = loc("tmp39"(#loc55)) +#loc146 = loc("tmp40"(#loc56)) +#loc147 = loc("tmp41"(#loc57)) +#loc148 = loc("tmp43"(#loc58)) +#loc149 = loc("tmp44"(#loc59)) +#loc150 = loc("tmp47"(#loc60)) +#loc151 = loc("_tmp46"(#loc61)) +#loc152 = loc("_tmp46"(#loc62)) +#loc154 = loc("tmp46"(#loc67)) +#loc155 = loc("tmp49"(#loc68)) +#loc156 = loc("tmp51"(#loc69)) +#loc157 = loc("tmp52"(#loc70)) +#loc158 = loc("tmp54"(#loc71)) +#loc159 = loc("tmp53"(#loc72)) +#loc160 = loc("tmp55"(#loc73)) +#loc161 = loc("tmp57"(#loc74)) +#loc162 = loc("tmp56"(#loc75)) +#loc163 = loc(fused[#loc107, #loc106]) +#loc164 = loc(fused[#loc110, #loc109]) +#loc165 = loc(fused[#loc116, #loc94]) +#loc166 = loc(fused[#loc136, #loc135]) +#loc167 = loc(callsite(#loc64 at #loc153)) +#loc169 = loc(fused[#loc158, #loc159]) +#loc170 = loc(fused[#loc161, #loc162]) +#loc171 = loc(callsite(#loc66 at #loc167)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/__grp__triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json b/SpecForge-ext/cache/compiled_kernels/triton/7/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/__grp__triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json new file mode 100644 index 0000000000000000000000000000000000000000..4aa56bd5d086fdf4fff3480e9d0945d7d0dfcbee --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/__grp__triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json @@ -0,0 +1 @@ +{"child_paths": {"triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.source", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttir": 
"/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttir", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttgir", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.llir", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ptx", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.cubin", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.cubin b/SpecForge-ext/cache/compiled_kernels/triton/7/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.cubin new file mode 100644 index 0000000000000000000000000000000000000000..db63e12618867d09d9d5cad1f8c4c6f77111b103 Binary files /dev/null and 
b/SpecForge-ext/cache/compiled_kernels/triton/7/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json b/SpecForge-ext/cache/compiled_kernels/triton/7/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json new file mode 100644 index 0000000000000000000000000000000000000000..d148fe77f7ed5dfed5ec9d0346e7b78ee43e0163 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json @@ -0,0 +1 @@ +{"hash": "20789ed347536015de365783d5f81371fbd0587b607eff6d47e815134479247e", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 2, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3"} \ No newline at end of file diff --git 
a/SpecForge-ext/cache/compiled_kernels/triton/7/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.llir b/SpecForge-ext/cache/compiled_kernels/triton/7/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.llir new file mode 100644 index 0000000000000000000000000000000000000000..e89865a4b84908b042b07830a44508a635610cf3 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.llir @@ -0,0 +1,393 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +; Function Attrs: nounwind +define ptx_kernel void @triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i32 %3, i32 %4, ptr addrspace(1) readnone captures(none) %5, ptr addrspace(1) readnone captures(none) %6) local_unnamed_addr #0 !dbg !4 { + %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %9 = icmp samesign ult i32 %8, 32, !dbg !8 + %10 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %11 = and i32 %10, 15, !dbg !9 + %12 = and i32 %8, 15, !dbg !10 + %13 = lshr i32 %8, 4, !dbg !11 + %14 = mul nuw nsw i32 %11, 17, !dbg !12 + %15 = add nuw nsw i32 %14, %12, !dbg !13 + %16 = mul i32 %13, 272, !dbg !14 + %17 = add i32 %15, %16, !dbg !15 + %18 = sext i32 %17 to i64, !dbg !16 + %19 = getelementptr i32, ptr addrspace(1) %0, i64 %18, !dbg !16 + %20 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %19, i1 %9) #3, !dbg !17 + %21 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %19, i1 %9) #3, !dbg !17 + %22 = lshr i32 %10, 1, 
!dbg !18 + %.lobit = and i32 %22, 1, !dbg !18 + %23 = and i32 %10, 1, !dbg !18 + %24 = lshr i32 %10, 2, !dbg !18 + %.lobit1 = and i32 %24, 1, !dbg !18 + %25 = lshr i32 %10, 3, !dbg !18 + %.lobit2 = and i32 %25, 1, !dbg !18 + %26 = xor i32 %23, 1, !dbg !22 + %27 = xor i32 %.lobit, 1, !dbg !22 + %28 = xor i32 %.lobit1, 1, !dbg !22 + %29 = xor i32 %.lobit2, 1, !dbg !22 + %30 = mul nuw nsw i32 %20, %26, !dbg !23 + %31 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %30, i32 1, i32 31), !dbg !24 + %32 = add i32 %31, %30, !dbg !27 + %33 = mul nuw nsw i32 %20, %23, !dbg !28 + %34 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %33, i32 1, i32 31), !dbg !24 + %35 = add i32 %34, %33, !dbg !27 + %36 = mul nuw nsw i32 %26, %11, !dbg !29 + %37 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %36, i32 1, i32 31), !dbg !24 + %38 = add i32 %37, %36, !dbg !27 + %39 = mul nuw nsw i32 %11, %23, !dbg !30 + %40 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %39, i32 1, i32 31), !dbg !24 + %41 = add i32 %40, %39, !dbg !27 + %42 = trunc i32 %22 to i1, !dbg !31 + %43 = icmp sge i32 %32, %35, !dbg !31 + %44 = icmp ne i32 %32, %35, !dbg !31 + %45 = icmp sle i32 %38, %41, !dbg !31 + %46 = or i1 %44, %45, !dbg !31 + %47 = and i1 %43, %46, !dbg !31 + %.not = xor i1 %47, %42, !dbg !31 + %48 = xor i32 %35, %32, !dbg !32 + %49 = select i1 %.not, i32 0, i32 %48, !dbg !33 + %50 = xor i32 %49, %20, !dbg !34 + %51 = xor i32 %41, %38, !dbg !35 + %52 = select i1 %.not, i32 0, i32 %51, !dbg !36 + %53 = xor i32 %52, %11, !dbg !37 + %54 = mul nuw nsw i32 %50, %27, !dbg !23 + %55 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %54, i32 2, i32 31), !dbg !24 + %56 = add i32 %54, %55, !dbg !27 + %57 = mul nuw nsw i32 %50, %.lobit, !dbg !28 + %58 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %57, i32 2, i32 31), !dbg !24 + %59 = add i32 %57, %58, !dbg !27 + %60 = mul nuw nsw i32 %53, %27, !dbg !29 + %61 = tail call i32 
@llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %60, i32 2, i32 31), !dbg !24 + %62 = add i32 %60, %61, !dbg !27 + %63 = mul nuw nsw i32 %53, %.lobit, !dbg !30 + %64 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %63, i32 2, i32 31), !dbg !24 + %65 = add i32 %63, %64, !dbg !27 + %66 = trunc i32 %24 to i1, !dbg !31 + %67 = icmp sge i32 %56, %59, !dbg !31 + %68 = icmp ne i32 %56, %59, !dbg !31 + %69 = icmp sle i32 %62, %65, !dbg !31 + %70 = or i1 %68, %69, !dbg !31 + %71 = and i1 %67, %70, !dbg !31 + %.not3 = xor i1 %71, %66, !dbg !31 + %72 = xor i32 %56, %59, !dbg !32 + %73 = select i1 %.not3, i32 0, i32 %72, !dbg !33 + %74 = xor i32 %73, %50, !dbg !34 + %75 = xor i32 %62, %65, !dbg !35 + %76 = select i1 %.not3, i32 0, i32 %75, !dbg !36 + %77 = xor i32 %76, %53, !dbg !37 + %78 = mul nuw nsw i32 %74, %26, !dbg !23 + %79 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %78, i32 1, i32 31), !dbg !24 + %80 = add i32 %78, %79, !dbg !27 + %81 = mul nuw nsw i32 %74, %23, !dbg !28 + %82 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %81, i32 1, i32 31), !dbg !24 + %83 = add i32 %81, %82, !dbg !27 + %84 = mul nuw nsw i32 %77, %26, !dbg !29 + %85 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %84, i32 1, i32 31), !dbg !24 + %86 = add i32 %84, %85, !dbg !27 + %87 = mul nuw nsw i32 %77, %23, !dbg !30 + %88 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %87, i32 1, i32 31), !dbg !24 + %89 = add i32 %87, %88, !dbg !27 + %90 = icmp sge i32 %80, %83, !dbg !31 + %91 = icmp ne i32 %80, %83, !dbg !31 + %92 = icmp sle i32 %86, %89, !dbg !31 + %93 = or i1 %91, %92, !dbg !31 + %94 = and i1 %90, %93, !dbg !31 + %.not4 = xor i1 %94, %66, !dbg !31 + %95 = xor i32 %80, %83, !dbg !32 + %96 = select i1 %.not4, i32 0, i32 %95, !dbg !33 + %97 = xor i32 %96, %74, !dbg !34 + %98 = xor i32 %86, %89, !dbg !35 + %99 = select i1 %.not4, i32 0, i32 %98, !dbg !36 + %100 = xor i32 %99, %77, !dbg !37 + %101 = mul nuw nsw i32 %97, %28, !dbg !23 + %102 = tail 
call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %101, i32 4, i32 31), !dbg !24 + %103 = add i32 %101, %102, !dbg !27 + %104 = mul nuw nsw i32 %97, %.lobit1, !dbg !28 + %105 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %104, i32 4, i32 31), !dbg !24 + %106 = add i32 %104, %105, !dbg !27 + %107 = mul nuw nsw i32 %100, %28, !dbg !29 + %108 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %107, i32 4, i32 31), !dbg !24 + %109 = add i32 %107, %108, !dbg !27 + %110 = mul nuw nsw i32 %100, %.lobit1, !dbg !30 + %111 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %110, i32 4, i32 31), !dbg !24 + %112 = add i32 %110, %111, !dbg !27 + %113 = trunc i32 %25 to i1, !dbg !31 + %114 = icmp sge i32 %103, %106, !dbg !31 + %115 = icmp ne i32 %103, %106, !dbg !31 + %116 = icmp sle i32 %109, %112, !dbg !31 + %117 = or i1 %115, %116, !dbg !31 + %118 = and i1 %114, %117, !dbg !31 + %.not5 = xor i1 %118, %113, !dbg !31 + %119 = xor i32 %103, %106, !dbg !32 + %120 = select i1 %.not5, i32 0, i32 %119, !dbg !33 + %121 = xor i32 %120, %97, !dbg !34 + %122 = xor i32 %109, %112, !dbg !35 + %123 = select i1 %.not5, i32 0, i32 %122, !dbg !36 + %124 = xor i32 %123, %100, !dbg !37 + %125 = mul nuw nsw i32 %121, %27, !dbg !23 + %126 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %125, i32 2, i32 31), !dbg !24 + %127 = add i32 %125, %126, !dbg !27 + %128 = mul nuw nsw i32 %121, %.lobit, !dbg !28 + %129 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %128, i32 2, i32 31), !dbg !24 + %130 = add i32 %128, %129, !dbg !27 + %131 = mul nuw nsw i32 %124, %27, !dbg !29 + %132 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %131, i32 2, i32 31), !dbg !24 + %133 = add i32 %131, %132, !dbg !27 + %134 = mul nuw nsw i32 %124, %.lobit, !dbg !30 + %135 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %134, i32 2, i32 31), !dbg !24 + %136 = add i32 %134, %135, !dbg !27 + %137 = icmp sge i32 %127, %130, !dbg !31 + %138 = icmp ne i32 %127, 
%130, !dbg !31 + %139 = icmp sle i32 %133, %136, !dbg !31 + %140 = or i1 %138, %139, !dbg !31 + %141 = and i1 %137, %140, !dbg !31 + %.not6 = xor i1 %141, %113, !dbg !31 + %142 = xor i32 %127, %130, !dbg !32 + %143 = select i1 %.not6, i32 0, i32 %142, !dbg !33 + %144 = xor i32 %143, %121, !dbg !34 + %145 = xor i32 %133, %136, !dbg !35 + %146 = select i1 %.not6, i32 0, i32 %145, !dbg !36 + %147 = xor i32 %146, %124, !dbg !37 + %148 = mul nuw nsw i32 %144, %26, !dbg !23 + %149 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %148, i32 1, i32 31), !dbg !24 + %150 = add i32 %148, %149, !dbg !27 + %151 = mul nuw nsw i32 %144, %23, !dbg !28 + %152 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %151, i32 1, i32 31), !dbg !24 + %153 = add i32 %151, %152, !dbg !27 + %154 = mul nuw nsw i32 %147, %26, !dbg !29 + %155 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %154, i32 1, i32 31), !dbg !24 + %156 = add i32 %154, %155, !dbg !27 + %157 = mul nuw nsw i32 %147, %23, !dbg !30 + %158 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %157, i32 1, i32 31), !dbg !24 + %159 = add i32 %157, %158, !dbg !27 + %160 = icmp sge i32 %150, %153, !dbg !31 + %161 = icmp ne i32 %150, %153, !dbg !31 + %162 = icmp sle i32 %156, %159, !dbg !31 + %163 = or i1 %161, %162, !dbg !31 + %164 = and i1 %160, %163, !dbg !31 + %.not7 = xor i1 %164, %113, !dbg !31 + %165 = xor i32 %150, %153, !dbg !32 + %166 = select i1 %.not7, i32 0, i32 %165, !dbg !33 + %167 = xor i32 %166, %144, !dbg !34 + %168 = xor i32 %156, %159, !dbg !35 + %169 = select i1 %.not7, i32 0, i32 %168, !dbg !36 + %170 = xor i32 %169, %147, !dbg !37 + %171 = mul nuw nsw i32 %167, %29, !dbg !23 + %172 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %171, i32 8, i32 31), !dbg !24 + %173 = add i32 %171, %172, !dbg !27 + %174 = mul nuw nsw i32 %167, %.lobit2, !dbg !28 + %175 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %174, i32 8, i32 31), !dbg !24 + %176 = add i32 %174, %175, 
!dbg !27 + %177 = mul nuw nsw i32 %170, %29, !dbg !29 + %178 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %177, i32 8, i32 31), !dbg !24 + %179 = add i32 %177, %178, !dbg !27 + %180 = mul nuw nsw i32 %170, %.lobit2, !dbg !30 + %181 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %180, i32 8, i32 31), !dbg !24 + %182 = add i32 %180, %181, !dbg !27 + %183 = icmp slt i32 %173, %176, !dbg !38 + %184 = icmp eq i32 %173, %176, !dbg !39 + %185 = icmp sgt i32 %179, %182, !dbg !40 + %186 = and i1 %184, %185, !dbg !41 + %187 = or i1 %183, %186, !dbg !42 + %188 = xor i32 %173, %176, !dbg !32 + %189 = select i1 %187, i32 %188, i32 0, !dbg !33 + %190 = xor i32 %189, %167, !dbg !34 + %191 = xor i32 %179, %182, !dbg !35 + %192 = select i1 %187, i32 %191, i32 0, !dbg !36 + %193 = xor i32 %192, %170, !dbg !37 + %194 = mul nuw nsw i32 %190, %28, !dbg !23 + %195 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %194, i32 4, i32 31), !dbg !24 + %196 = add i32 %194, %195, !dbg !27 + %197 = mul nuw nsw i32 %190, %.lobit1, !dbg !28 + %198 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %197, i32 4, i32 31), !dbg !24 + %199 = add i32 %197, %198, !dbg !27 + %200 = mul nuw nsw i32 %193, %28, !dbg !29 + %201 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %200, i32 4, i32 31), !dbg !24 + %202 = add i32 %200, %201, !dbg !27 + %203 = mul nuw nsw i32 %193, %.lobit1, !dbg !30 + %204 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %203, i32 4, i32 31), !dbg !24 + %205 = add i32 %203, %204, !dbg !27 + %206 = icmp slt i32 %196, %199, !dbg !38 + %207 = icmp eq i32 %196, %199, !dbg !39 + %208 = icmp sgt i32 %202, %205, !dbg !40 + %209 = and i1 %207, %208, !dbg !41 + %210 = or i1 %206, %209, !dbg !42 + %211 = xor i32 %196, %199, !dbg !32 + %212 = select i1 %210, i32 %211, i32 0, !dbg !33 + %213 = xor i32 %212, %190, !dbg !34 + %214 = xor i32 %202, %205, !dbg !35 + %215 = select i1 %210, i32 %214, i32 0, !dbg !36 + %216 = xor i32 %215, %193, 
!dbg !37 + %217 = mul nuw nsw i32 %213, %27, !dbg !23 + %218 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %217, i32 2, i32 31), !dbg !24 + %219 = add i32 %217, %218, !dbg !27 + %220 = mul nuw nsw i32 %213, %.lobit, !dbg !28 + %221 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %220, i32 2, i32 31), !dbg !24 + %222 = add i32 %220, %221, !dbg !27 + %223 = mul nuw nsw i32 %216, %27, !dbg !29 + %224 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %223, i32 2, i32 31), !dbg !24 + %225 = add i32 %223, %224, !dbg !27 + %226 = mul nuw nsw i32 %216, %.lobit, !dbg !30 + %227 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %226, i32 2, i32 31), !dbg !24 + %228 = add i32 %226, %227, !dbg !27 + %229 = icmp slt i32 %219, %222, !dbg !38 + %230 = icmp eq i32 %219, %222, !dbg !39 + %231 = icmp sgt i32 %225, %228, !dbg !40 + %232 = and i1 %230, %231, !dbg !41 + %233 = or i1 %229, %232, !dbg !42 + %234 = xor i32 %219, %222, !dbg !32 + %235 = select i1 %233, i32 %234, i32 0, !dbg !33 + %236 = xor i32 %235, %213, !dbg !34 + %237 = xor i32 %225, %228, !dbg !35 + %238 = select i1 %233, i32 %237, i32 0, !dbg !36 + %239 = xor i32 %238, %216, !dbg !37 + %240 = mul nuw nsw i32 %236, %26, !dbg !23 + %241 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %240, i32 1, i32 31), !dbg !24 + %242 = add i32 %240, %241, !dbg !27 + %243 = mul nuw nsw i32 %236, %23, !dbg !28 + %244 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %243, i32 1, i32 31), !dbg !24 + %245 = add i32 %243, %244, !dbg !27 + %246 = mul nuw nsw i32 %239, %26, !dbg !29 + %247 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %246, i32 1, i32 31), !dbg !24 + %248 = add i32 %246, %247, !dbg !27 + %249 = mul nuw nsw i32 %239, %23, !dbg !30 + %250 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %249, i32 1, i32 31), !dbg !24 + %251 = add i32 %249, %250, !dbg !27 + %252 = icmp slt i32 %242, %245, !dbg !38 + %253 = icmp eq i32 %242, %245, !dbg !39 + %254 = 
icmp sgt i32 %248, %251, !dbg !40 + %255 = and i1 %253, %254, !dbg !41 + %256 = or i1 %252, %255, !dbg !42 + %257 = xor i32 %248, %251, !dbg !35 + %258 = select i1 %256, i32 %257, i32 0, !dbg !36 + %259 = xor i32 %258, %239, !dbg !37 + %narrow = select i1 %9, i32 %21, i32 0, !dbg !43 + %260 = sext i32 %narrow to i64, !dbg !43 + %261 = ashr i32 %narrow, 31, !dbg !44 + %262 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %narrow, i32 8, i32 31), !dbg !44 + %263 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %261, i32 8, i32 31), !dbg !44 + %264 = insertelement <2 x i32> poison, i32 %262, i64 0, !dbg !44 + %265 = insertelement <2 x i32> %264, i32 %263, i64 1, !dbg !44 + %266 = bitcast <2 x i32> %265 to i64, !dbg !44 + %267 = add i64 %266, %260, !dbg !46 + %extelt.offset = lshr i64 %267, 32, !dbg !44 + %268 = trunc nuw i64 %extelt.offset to i32, !dbg !44 + %269 = trunc i64 %267 to i32, !dbg !44 + %270 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %269, i32 4, i32 31), !dbg !44 + %271 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %268, i32 4, i32 31), !dbg !44 + %272 = insertelement <2 x i32> poison, i32 %270, i64 0, !dbg !44 + %273 = insertelement <2 x i32> %272, i32 %271, i64 1, !dbg !44 + %274 = bitcast <2 x i32> %273 to i64, !dbg !44 + %275 = add i64 %267, %274, !dbg !46 + %extelt.offset8 = lshr i64 %275, 32, !dbg !44 + %276 = trunc nuw i64 %extelt.offset8 to i32, !dbg !44 + %277 = trunc i64 %275 to i32, !dbg !44 + %278 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %277, i32 2, i32 31), !dbg !44 + %279 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %276, i32 2, i32 31), !dbg !44 + %280 = insertelement <2 x i32> poison, i32 %278, i64 0, !dbg !44 + %281 = insertelement <2 x i32> %280, i32 %279, i64 1, !dbg !44 + %282 = bitcast <2 x i32> %281 to i64, !dbg !44 + %283 = add i64 %275, %282, !dbg !46 + %extelt.offset9 = lshr i64 %283, 32, !dbg !44 + %284 = trunc nuw i64 %extelt.offset9 to i32, !dbg !44 + 
%285 = trunc i64 %283 to i32, !dbg !44 + %286 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %285, i32 1, i32 31), !dbg !44 + %287 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %284, i32 1, i32 31), !dbg !44 + %288 = insertelement <2 x i32> poison, i32 %286, i64 0, !dbg !44 + %289 = insertelement <2 x i32> %288, i32 %287, i64 1, !dbg !44 + %290 = bitcast <2 x i32> %289 to i64, !dbg !44 + %291 = add i64 %283, %290, !dbg !46 + %292 = trunc i64 %291 to i32, !dbg !47 + %293 = shl i32 %8, 4, !dbg !48 + %294 = or disjoint i32 %11, %293, !dbg !49 + %295 = sext i32 %294 to i64, !dbg !50 + %296 = getelementptr i32, ptr addrspace(1) %1, i64 %295, !dbg !50 + %297 = and i32 %10, 48, !dbg !51 + %298 = icmp eq i32 %297, 0, !dbg !51 + %299 = and i1 %9, %298, !dbg !51 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %259, ptr addrspace(1) %296, i1 %299) #3, !dbg !51 + %300 = zext nneg i32 %8 to i64, !dbg !52 + %301 = getelementptr i32, ptr addrspace(1) %2, i64 %300, !dbg !52 + %302 = and i32 %10, 63, !dbg !53 + %303 = icmp eq i32 %302, 0, !dbg !53 + %304 = and i1 %9, %303, !dbg !53 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %292, ptr addrspace(1) %301, i1 %304) #3, !dbg !53 + ret void, !dbg !54 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +attributes #0 = { nounwind "nvvm.reqntid"="64" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = 
{ convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3", linkageName: "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 24, column: 28, scope: !4) +!8 = !DILocation(line: 26, column: 21, scope: !4) +!9 = !DILocation(line: 27, column: 38, scope: !4) +!10 = !DILocation(line: 33, column: 19, scope: !4) +!11 = !DILocation(line: 34, column: 19, scope: !4) +!12 = !DILocation(line: 36, column: 38, scope: !4) +!13 = !DILocation(line: 36, column: 35, scope: !4) +!14 = !DILocation(line: 36, column: 49, scope: !4) +!15 = !DILocation(line: 36, column: 45, scope: !4) +!16 = !DILocation(line: 36, column: 30, scope: !4) +!17 = !DILocation(line: 36, column: 54, scope: !4) +!18 = !DILocation(line: 627, column: 44, scope: !19, inlinedAt: !21) +!19 = distinct !DILexicalBlockFile(scope: !4, file: !20, discriminator: 0) +!20 = !DIFile(filename: "triton_helpers.py", directory: "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime") +!21 = !DILocation(line: 41, column: 67, scope: !4) +!22 = !DILocation(line: 537, column: 21, scope: !19, inlinedAt: !21) +!23 = !DILocation(line: 538, column: 40, scope: !19, inlinedAt: !21) +!24 = !DILocation(line: 291, column: 36, scope: !25, 
inlinedAt: !21) +!25 = distinct !DILexicalBlockFile(scope: !4, file: !26, discriminator: 0) +!26 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language") +!27 = !DILocation(line: 261, column: 15, scope: !25, inlinedAt: !21) +!28 = !DILocation(line: 539, column: 41, scope: !19, inlinedAt: !21) +!29 = !DILocation(line: 548, column: 23, scope: !19, inlinedAt: !21) +!30 = !DILocation(line: 551, column: 23, scope: !19, inlinedAt: !21) +!31 = !DILocation(line: 599, column: 28, scope: !19, inlinedAt: !21) +!32 = !DILocation(line: 600, column: 38, scope: !19, inlinedAt: !21) +!33 = !DILocation(line: 600, column: 46, scope: !19, inlinedAt: !21) +!34 = !DILocation(line: 600, column: 15, scope: !19, inlinedAt: !21) +!35 = !DILocation(line: 601, column: 48, scope: !19, inlinedAt: !21) +!36 = !DILocation(line: 601, column: 59, scope: !19, inlinedAt: !21) +!37 = !DILocation(line: 601, column: 22, scope: !19, inlinedAt: !21) +!38 = !DILocation(line: 574, column: 22, scope: !19, inlinedAt: !21) +!39 = !DILocation(line: 591, column: 21, scope: !19, inlinedAt: !21) +!40 = !DILocation(line: 594, column: 40, scope: !19, inlinedAt: !21) +!41 = !DILocation(line: 594, column: 29, scope: !19, inlinedAt: !21) +!42 = !DILocation(line: 594, column: 23, scope: !19, inlinedAt: !21) +!43 = !DILocation(line: 44, column: 34, scope: !4) +!44 = !DILocation(line: 291, column: 36, scope: !25, inlinedAt: !45) +!45 = !DILocation(line: 45, column: 26, scope: !4) +!46 = !DILocation(line: 261, column: 15, scope: !25, inlinedAt: !45) +!47 = !DILocation(line: 48, column: 21, scope: !4) +!48 = !DILocation(line: 49, column: 35, scope: !4) +!49 = !DILocation(line: 49, column: 32, scope: !4) +!50 = !DILocation(line: 49, column: 25, scope: !4) +!51 = !DILocation(line: 49, column: 47, scope: !4) +!52 = !DILocation(line: 50, column: 25, scope: !4) +!53 = !DILocation(line: 50, column: 37, scope: !4) +!54 = !DILocation(line: 50, column: 4, scope: !4) diff 
--git a/SpecForge-ext/cache/compiled_kernels/triton/7/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ptx b/SpecForge-ext/cache/compiled_kernels/triton/7/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ptx new file mode 100644 index 0000000000000000000000000000000000000000..6105b6226d45ccd001902679811f96c27203be1a --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ptx @@ -0,0 +1,863 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3 // -- Begin function triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3 + // @triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3 +.visible .entry triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3( + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_0, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_1, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_2, + .param .u32 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_3, + .param .u32 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_4, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_5, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_6 +) +.reqntid 64 +{ + .reg .pred %p<64>; + .reg .b32 %r<224>; + .reg .b64 %rd<26>; + .loc 1 18 0 // cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:18:0 + +// %bb.0: + 
ld.param.b64 %rd5, [triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_0]; + ld.param.b64 %rd6, [triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_1]; +$L__tmp0: + .loc 1 24 28 // cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:24:28 + mov.u32 %r5, %ctaid.x; + .loc 1 26 21 // cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:26:21 + setp.lt.u32 %p1, %r5, 32; + ld.param.b64 %rd7, [triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_2]; + .loc 1 27 38 // cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:27:38 + mov.u32 %r6, %tid.x; + and.b32 %r7, %r6, 15; + .loc 1 33 19 // cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:33:19 + and.b32 %r8, %r5, 15; + .loc 1 34 19 // cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:34:19 + shr.u32 %r9, %r5, 4; + .loc 1 36 35 // cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:36:35 + mad.lo.s32 %r10, %r7, 17, %r8; + .loc 1 36 45 // cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:36:45 + mad.lo.s32 %r11, %r9, 272, %r10; + .loc 1 36 30 // cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:36:30 + mad.wide.s32 %rd1, %r11, 4, %rd5; + .loc 1 36 54 // cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:36:54 + // begin inline asm + mov.u32 %r1, 0x0; + @%p1 ld.global.b32 { %r1 }, [ %rd1 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r2, 0x0; + @%p1 ld.global.b32 { %r2 }, [ %rd1 + 0 ]; + // end inline asm +$L__tmp1: + .loc 2 627 44 // triton_helpers.py:627:44 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + shr.u32 %r12, %r6, 1; + bfe.u32 %r13, %r6, 1, 1; + and.b32 %r14, %r6, 1; + shr.u32 %r15, %r6, 2; + bfe.u32 %r16, %r6, 2, 1; + shr.u32 %r17, %r6, 3; + bfe.u32 %r18, %r6, 3, 1; + .loc 2 537 21 // triton_helpers.py:537:21 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + xor.b32 %r19, %r14, 1; + xor.b32 %r20, %r13, 1; + xor.b32 %r21, %r16, 1; + xor.b32 %r22, %r18, 1; + .loc 2 538 40 // 
triton_helpers.py:538:40 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + mul.lo.s32 %r23, %r1, %r19; + .loc 3 291 36 // standard.py:291:36 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + shfl.sync.bfly.b32 %r24, %r23, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + add.s32 %r25, %r24, %r23; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + mul.lo.s32 %r26, %r1, %r14; + .loc 3 291 36 // standard.py:291:36 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + shfl.sync.bfly.b32 %r27, %r26, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + add.s32 %r28, %r27, %r26; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + mul.lo.s32 %r29, %r19, %r7; + .loc 3 291 36 // standard.py:291:36 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + shfl.sync.bfly.b32 %r30, %r29, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + add.s32 %r31, %r30, %r29; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + mul.lo.s32 %r32, %r7, %r14; + .loc 3 291 36 // standard.py:291:36 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + shfl.sync.bfly.b32 %r33, %r32, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + add.s32 %r34, %r33, %r32; + .loc 2 599 28 // triton_helpers.py:599:28 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + and.b32 %r35, %r12, 1; + setp.ne.b32 %p5, %r35, 0; + setp.ge.s32 %p6, %r25, %r28; + setp.ne.b32 %p7, %r25, %r28; + setp.le.s32 %p8, %r31, %r34; + or.pred %p9, %p7, %p8; + and.pred %p10, %p6, %p9; + xor.pred %p11, 
%p10, %p5; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + xor.b32 %r36, %r28, %r25; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + selp.b32 %r37, 0, %r36, %p11; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + xor.b32 %r38, %r37, %r1; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + xor.b32 %r39, %r34, %r31; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + selp.b32 %r40, 0, %r39, %p11; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + xor.b32 %r41, %r40, %r7; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + mul.lo.s32 %r42, %r38, %r20; + .loc 3 291 36 // standard.py:291:36 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + shfl.sync.bfly.b32 %r43, %r42, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + add.s32 %r44, %r42, %r43; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + mul.lo.s32 %r45, %r38, %r13; + .loc 3 291 36 // standard.py:291:36 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + shfl.sync.bfly.b32 %r46, %r45, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + add.s32 %r47, %r45, %r46; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + mul.lo.s32 %r48, %r41, %r20; + .loc 3 291 36 // standard.py:291:36 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + shfl.sync.bfly.b32 %r49, %r48, 2, 31, -1; + .loc 3 
261 15 // standard.py:261:15 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + add.s32 %r50, %r48, %r49; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + mul.lo.s32 %r51, %r41, %r13; + .loc 3 291 36 // standard.py:291:36 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + shfl.sync.bfly.b32 %r52, %r51, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + add.s32 %r53, %r51, %r52; + .loc 2 599 28 // triton_helpers.py:599:28 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + and.b32 %r54, %r15, 1; + setp.ne.b32 %p12, %r54, 0; + setp.ge.s32 %p13, %r44, %r47; + setp.ne.b32 %p14, %r44, %r47; + setp.le.s32 %p15, %r50, %r53; + or.pred %p16, %p14, %p15; + and.pred %p17, %p13, %p16; + xor.pred %p18, %p17, %p12; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + xor.b32 %r55, %r44, %r47; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + selp.b32 %r56, 0, %r55, %p18; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + xor.b32 %r57, %r56, %r38; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + xor.b32 %r58, %r50, %r53; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + selp.b32 %r59, 0, %r58, %p18; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + xor.b32 %r60, %r59, %r41; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + mul.lo.s32 %r61, %r57, %r19; + .loc 3 291 36 // standard.py:291:36 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + 
shfl.sync.bfly.b32 %r62, %r61, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + add.s32 %r63, %r61, %r62; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + mul.lo.s32 %r64, %r57, %r14; + .loc 3 291 36 // standard.py:291:36 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + shfl.sync.bfly.b32 %r65, %r64, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + add.s32 %r66, %r64, %r65; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + mul.lo.s32 %r67, %r60, %r19; + .loc 3 291 36 // standard.py:291:36 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + shfl.sync.bfly.b32 %r68, %r67, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + add.s32 %r69, %r67, %r68; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + mul.lo.s32 %r70, %r60, %r14; + .loc 3 291 36 // standard.py:291:36 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + shfl.sync.bfly.b32 %r71, %r70, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + add.s32 %r72, %r70, %r71; + .loc 2 599 28 // triton_helpers.py:599:28 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + setp.ge.s32 %p19, %r63, %r66; + setp.ne.b32 %p20, %r63, %r66; + setp.le.s32 %p21, %r69, %r72; + or.pred %p22, %p20, %p21; + and.pred %p23, %p19, %p22; + xor.pred %p24, %p23, %p12; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + xor.b32 %r73, %r63, %r66; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + 
selp.b32 %r74, 0, %r73, %p24; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + xor.b32 %r75, %r74, %r57; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + xor.b32 %r76, %r69, %r72; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + selp.b32 %r77, 0, %r76, %p24; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + xor.b32 %r78, %r77, %r60; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + mul.lo.s32 %r79, %r75, %r21; + .loc 3 291 36 // standard.py:291:36 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + shfl.sync.bfly.b32 %r80, %r79, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + add.s32 %r81, %r79, %r80; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + mul.lo.s32 %r82, %r75, %r16; + .loc 3 291 36 // standard.py:291:36 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + shfl.sync.bfly.b32 %r83, %r82, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + add.s32 %r84, %r82, %r83; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + mul.lo.s32 %r85, %r78, %r21; + .loc 3 291 36 // standard.py:291:36 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + shfl.sync.bfly.b32 %r86, %r85, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + add.s32 %r87, %r85, %r86; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + mul.lo.s32 %r88, %r78, 
%r16; + .loc 3 291 36 // standard.py:291:36 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + shfl.sync.bfly.b32 %r89, %r88, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + add.s32 %r90, %r88, %r89; + .loc 2 599 28 // triton_helpers.py:599:28 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + and.b32 %r91, %r17, 1; + setp.ne.b32 %p25, %r91, 0; + setp.ge.s32 %p26, %r81, %r84; + setp.ne.b32 %p27, %r81, %r84; + setp.le.s32 %p28, %r87, %r90; + or.pred %p29, %p27, %p28; + and.pred %p30, %p26, %p29; + xor.pred %p31, %p30, %p25; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + xor.b32 %r92, %r81, %r84; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + selp.b32 %r93, 0, %r92, %p31; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + xor.b32 %r94, %r93, %r75; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + xor.b32 %r95, %r87, %r90; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + selp.b32 %r96, 0, %r95, %p31; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + xor.b32 %r97, %r96, %r78; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + mul.lo.s32 %r98, %r94, %r20; + .loc 3 291 36 // standard.py:291:36 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + shfl.sync.bfly.b32 %r99, %r98, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + add.s32 %r100, %r98, %r99; + .loc 2 539 41 // triton_helpers.py:539:41 @[ 
cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + mul.lo.s32 %r101, %r94, %r13; + .loc 3 291 36 // standard.py:291:36 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + shfl.sync.bfly.b32 %r102, %r101, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + add.s32 %r103, %r101, %r102; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + mul.lo.s32 %r104, %r97, %r20; + .loc 3 291 36 // standard.py:291:36 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + shfl.sync.bfly.b32 %r105, %r104, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + add.s32 %r106, %r104, %r105; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + mul.lo.s32 %r107, %r97, %r13; + .loc 3 291 36 // standard.py:291:36 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + shfl.sync.bfly.b32 %r108, %r107, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + add.s32 %r109, %r107, %r108; + .loc 2 599 28 // triton_helpers.py:599:28 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + setp.ge.s32 %p32, %r100, %r103; + setp.ne.b32 %p33, %r100, %r103; + setp.le.s32 %p34, %r106, %r109; + or.pred %p35, %p33, %p34; + and.pred %p36, %p32, %p35; + xor.pred %p37, %p36, %p25; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + xor.b32 %r110, %r100, %r103; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + selp.b32 %r111, 0, %r110, %p37; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + xor.b32 %r112, %r111, %r94; + .loc 2 601 48 // 
triton_helpers.py:601:48 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + xor.b32 %r113, %r106, %r109; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + selp.b32 %r114, 0, %r113, %p37; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + xor.b32 %r115, %r114, %r97; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + mul.lo.s32 %r116, %r112, %r19; + .loc 3 291 36 // standard.py:291:36 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + shfl.sync.bfly.b32 %r117, %r116, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + add.s32 %r118, %r116, %r117; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + mul.lo.s32 %r119, %r112, %r14; + .loc 3 291 36 // standard.py:291:36 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + shfl.sync.bfly.b32 %r120, %r119, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + add.s32 %r121, %r119, %r120; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + mul.lo.s32 %r122, %r115, %r19; + .loc 3 291 36 // standard.py:291:36 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + shfl.sync.bfly.b32 %r123, %r122, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + add.s32 %r124, %r122, %r123; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + mul.lo.s32 %r125, %r115, %r14; + .loc 3 291 36 // standard.py:291:36 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + shfl.sync.bfly.b32 %r126, %r125, 1, 31, -1; + 
.loc 3 261 15 // standard.py:261:15 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + add.s32 %r127, %r125, %r126; + .loc 2 599 28 // triton_helpers.py:599:28 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + setp.ge.s32 %p38, %r118, %r121; + setp.ne.b32 %p39, %r118, %r121; + setp.le.s32 %p40, %r124, %r127; + or.pred %p41, %p39, %p40; + and.pred %p42, %p38, %p41; + xor.pred %p43, %p42, %p25; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + xor.b32 %r128, %r118, %r121; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + selp.b32 %r129, 0, %r128, %p43; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + xor.b32 %r130, %r129, %r112; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + xor.b32 %r131, %r124, %r127; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + selp.b32 %r132, 0, %r131, %p43; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + xor.b32 %r133, %r132, %r115; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + mul.lo.s32 %r134, %r130, %r22; + .loc 3 291 36 // standard.py:291:36 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + shfl.sync.bfly.b32 %r135, %r134, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + add.s32 %r136, %r134, %r135; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + mul.lo.s32 %r137, %r130, %r18; + .loc 3 291 36 // standard.py:291:36 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + shfl.sync.bfly.b32 %r138, 
%r137, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + add.s32 %r139, %r137, %r138; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + mul.lo.s32 %r140, %r133, %r22; + .loc 3 291 36 // standard.py:291:36 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + shfl.sync.bfly.b32 %r141, %r140, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + add.s32 %r142, %r140, %r141; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + mul.lo.s32 %r143, %r133, %r18; + .loc 3 291 36 // standard.py:291:36 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + shfl.sync.bfly.b32 %r144, %r143, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + add.s32 %r145, %r143, %r144; + .loc 2 574 22 // triton_helpers.py:574:22 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + setp.lt.s32 %p44, %r136, %r139; + .loc 2 591 21 // triton_helpers.py:591:21 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + setp.eq.b32 %p45, %r136, %r139; + .loc 2 594 40 // triton_helpers.py:594:40 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + setp.gt.s32 %p46, %r142, %r145; + .loc 2 594 29 // triton_helpers.py:594:29 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + and.pred %p47, %p45, %p46; + .loc 2 594 23 // triton_helpers.py:594:23 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + or.pred %p48, %p44, %p47; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + xor.b32 %r146, %r136, %r139; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + selp.b32 
%r147, %r146, 0, %p48; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + xor.b32 %r148, %r147, %r130; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + xor.b32 %r149, %r142, %r145; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + selp.b32 %r150, %r149, 0, %p48; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + xor.b32 %r151, %r150, %r133; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + mul.lo.s32 %r152, %r148, %r21; + .loc 3 291 36 // standard.py:291:36 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + shfl.sync.bfly.b32 %r153, %r152, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + add.s32 %r154, %r152, %r153; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + mul.lo.s32 %r155, %r148, %r16; + .loc 3 291 36 // standard.py:291:36 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + shfl.sync.bfly.b32 %r156, %r155, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + add.s32 %r157, %r155, %r156; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + mul.lo.s32 %r158, %r151, %r21; + .loc 3 291 36 // standard.py:291:36 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + shfl.sync.bfly.b32 %r159, %r158, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + add.s32 %r160, %r158, %r159; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + 
mul.lo.s32 %r161, %r151, %r16; + .loc 3 291 36 // standard.py:291:36 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + shfl.sync.bfly.b32 %r162, %r161, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + add.s32 %r163, %r161, %r162; + .loc 2 574 22 // triton_helpers.py:574:22 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + setp.lt.s32 %p49, %r154, %r157; + .loc 2 591 21 // triton_helpers.py:591:21 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + setp.eq.b32 %p50, %r154, %r157; + .loc 2 594 40 // triton_helpers.py:594:40 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + setp.gt.s32 %p51, %r160, %r163; + .loc 2 594 29 // triton_helpers.py:594:29 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + and.pred %p52, %p50, %p51; + .loc 2 594 23 // triton_helpers.py:594:23 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + or.pred %p53, %p49, %p52; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + xor.b32 %r164, %r154, %r157; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + selp.b32 %r165, %r164, 0, %p53; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + xor.b32 %r166, %r165, %r148; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + xor.b32 %r167, %r160, %r163; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + selp.b32 %r168, %r167, 0, %p53; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + xor.b32 %r169, %r168, %r151; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 
] + mul.lo.s32 %r170, %r166, %r20; + .loc 3 291 36 // standard.py:291:36 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + shfl.sync.bfly.b32 %r171, %r170, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + add.s32 %r172, %r170, %r171; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + mul.lo.s32 %r173, %r166, %r13; + .loc 3 291 36 // standard.py:291:36 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + shfl.sync.bfly.b32 %r174, %r173, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + add.s32 %r175, %r173, %r174; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + mul.lo.s32 %r176, %r169, %r20; + .loc 3 291 36 // standard.py:291:36 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + shfl.sync.bfly.b32 %r177, %r176, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + add.s32 %r178, %r176, %r177; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + mul.lo.s32 %r179, %r169, %r13; + .loc 3 291 36 // standard.py:291:36 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + shfl.sync.bfly.b32 %r180, %r179, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + add.s32 %r181, %r179, %r180; + .loc 2 574 22 // triton_helpers.py:574:22 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + setp.lt.s32 %p54, %r172, %r175; + .loc 2 591 21 // triton_helpers.py:591:21 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + setp.eq.b32 %p55, %r172, %r175; + .loc 2 594 40 // triton_helpers.py:594:40 @[ 
cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + setp.gt.s32 %p56, %r178, %r181; + .loc 2 594 29 // triton_helpers.py:594:29 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + and.pred %p57, %p55, %p56; + .loc 2 594 23 // triton_helpers.py:594:23 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + or.pred %p58, %p54, %p57; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + xor.b32 %r182, %r172, %r175; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + selp.b32 %r183, %r182, 0, %p58; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + xor.b32 %r184, %r183, %r166; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + xor.b32 %r185, %r178, %r181; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + selp.b32 %r186, %r185, 0, %p58; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + xor.b32 %r187, %r186, %r169; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + mul.lo.s32 %r188, %r184, %r19; + .loc 3 291 36 // standard.py:291:36 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + shfl.sync.bfly.b32 %r189, %r188, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + add.s32 %r190, %r188, %r189; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + mul.lo.s32 %r191, %r184, %r14; + .loc 3 291 36 // standard.py:291:36 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + shfl.sync.bfly.b32 %r192, %r191, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 
@[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + add.s32 %r193, %r191, %r192; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + mul.lo.s32 %r194, %r187, %r19; + .loc 3 291 36 // standard.py:291:36 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + shfl.sync.bfly.b32 %r195, %r194, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + add.s32 %r196, %r194, %r195; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + mul.lo.s32 %r197, %r187, %r14; + .loc 3 291 36 // standard.py:291:36 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + shfl.sync.bfly.b32 %r198, %r197, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + add.s32 %r199, %r197, %r198; + .loc 2 574 22 // triton_helpers.py:574:22 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + setp.lt.s32 %p59, %r190, %r193; + .loc 2 591 21 // triton_helpers.py:591:21 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + setp.eq.b32 %p60, %r190, %r193; + .loc 2 594 40 // triton_helpers.py:594:40 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + setp.gt.s32 %p61, %r196, %r199; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + xor.b32 %r200, %r196, %r199; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + selp.b32 %r201, %r200, 0, %p61; + selp.b32 %r202, %r201, 0, %p60; + selp.b32 %r203, %r200, %r202, %p59; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:41:67 ] + xor.b32 %r3, %r203, %r187; +$L__tmp2: + .loc 1 44 34 // cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:44:34 + selp.b32 
%r204, %r2, 0, %p1; + cvt.s64.s32 %rd8, %r204; +$L__tmp3: + .loc 3 291 36 // standard.py:291:36 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:45:26 ] + shr.s32 %r205, %r204, 31; + shfl.sync.bfly.b32 %r206, %r204, 8, 31, -1; + shfl.sync.bfly.b32 %r207, %r205, 8, 31, -1; + cvt.u64.u32 %rd9, %r206; + cvt.u64.u32 %rd10, %r207; + shl.b64 %rd11, %rd10, 32; + or.b64 %rd12, %rd9, %rd11; + .loc 3 261 15 // standard.py:261:15 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:45:26 ] + add.s64 %rd13, %rd12, %rd8; + .loc 3 291 36 // standard.py:291:36 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:45:26 ] + mov.b64 {_, %r208}, %rd13; + cvt.u32.u64 %r209, %rd13; + shfl.sync.bfly.b32 %r210, %r209, 4, 31, -1; + shfl.sync.bfly.b32 %r211, %r208, 4, 31, -1; + cvt.u64.u32 %rd14, %r210; + cvt.u64.u32 %rd15, %r211; + shl.b64 %rd16, %rd15, 32; + or.b64 %rd17, %rd14, %rd16; + .loc 3 261 15 // standard.py:261:15 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:45:26 ] + add.s64 %rd18, %rd13, %rd17; + .loc 3 291 36 // standard.py:291:36 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:45:26 ] + mov.b64 {_, %r212}, %rd18; + cvt.u32.u64 %r213, %rd18; + shfl.sync.bfly.b32 %r214, %r213, 2, 31, -1; + shfl.sync.bfly.b32 %r215, %r212, 2, 31, -1; + cvt.u64.u32 %rd19, %r214; + cvt.u64.u32 %rd20, %r215; + shl.b64 %rd21, %rd20, 32; + or.b64 %rd22, %rd19, %rd21; + .loc 3 261 15 // standard.py:261:15 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:45:26 ] + add.s64 %rd23, %rd18, %rd22; + .loc 3 291 36 // standard.py:291:36 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:45:26 ] + mov.b64 {_, %r216}, %rd23; + cvt.u32.u64 %r217, %rd23; + shfl.sync.bfly.b32 %r218, %r217, 1, 31, -1; + shfl.sync.bfly.b32 %r219, %r216, 1, 31, -1; + cvt.u64.u32 %rd24, %r218; + .loc 3 261 15 // standard.py:261:15 @[ cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:45:26 ] + add.s64 %rd25, %rd23, %rd24; +$L__tmp4: + .loc 1 48 21 // 
cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:48:21 + cvt.u32.u64 %r4, %rd25; + .loc 1 49 35 // cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:49:35 + shl.b32 %r220, %r5, 4; + .loc 1 49 32 // cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:49:32 + or.b32 %r221, %r7, %r220; + .loc 1 49 25 // cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:49:25 + mad.wide.s32 %rd3, %r221, 4, %rd6; + .loc 1 49 47 // cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:49:47 + and.b32 %r222, %r6, 48; + setp.eq.b32 %p62, %r222, 0; + and.pred %p3, %p1, %p62; + // begin inline asm + @%p3 st.global.b32 [ %rd3 + 0 ], { %r3 }; + // end inline asm + .loc 1 50 25 // cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:50:25 + mad.wide.u32 %rd4, %r5, 4, %rd7; + .loc 1 50 37 // cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:50:37 + and.b32 %r223, %r6, 63; + setp.eq.b32 %p63, %r223, 0; + and.pred %p4, %p1, %p63; + // begin inline asm + @%p4 st.global.b32 [ %rd4 + 0 ], { %r4 }; + // end inline asm + .loc 1 50 4 // cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py:50:4 + ret; +$L__tmp5: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py" + .file 3 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // 
DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 267 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x104 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 112 +.b8 101 +.b8 99 +.b8 122 +.b8 52 +.b8 52 +.b8 51 +.b8 119 +.b8 110 +.b8 110 +.b8 101 +.b8 111 +.b8 103 +.b8 99 +.b8 54 +.b8 53 +.b8 111 +.b8 105 +.b8 99 +.b8 97 +.b8 117 +.b8 97 +.b8 117 +.b8 111 +.b8 121 +.b8 116 +.b8 119 +.b8 121 +.b8 55 +.b8 107 +.b8 54 +.b8 114 +.b8 121 +.b8 101 +.b8 121 +.b8 118 +.b8 50 +.b8 52 +.b8 108 +.b8 97 +.b8 99 +.b8 122 +.b8 109 +.b8 117 +.b8 120 +.b8 54 +.b8 112 +.b8 100 +.b8 105 +.b8 50 +.b8 98 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 
+.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 112 +.b8 101 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x3d DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 112 +.b8 101 +.b8 114 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 95 +.b8 116 +.b8 111 +.b8 95 +.b8 99 +.b8 111 +.b8 112 +.b8 121 +.b8 95 +.b8 99 +.b8 108 +.b8 111 +.b8 110 +.b8 101 +.b8 95 +.b8 115 +.b8 108 +.b8 105 +.b8 99 +.b8 101 +.b8 95 +.b8 115 +.b8 111 +.b8 114 +.b8 116 +.b8 95 +.b8 115 +.b8 117 +.b8 109 +.b8 95 +.b8 116 +.b8 114 +.b8 97 +.b8 110 +.b8 115 +.b8 112 +.b8 111 +.b8 115 +.b8 101 +.b8 95 +.b8 51 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0xc8:0x46 DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0xdd:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp2 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 41 // DW_AT_call_line +.b8 67 // DW_AT_call_column +.b8 4 // Abbrev [4] 0xf5:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp3 // DW_AT_low_pc +.b64 $L__tmp4 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 45 // DW_AT_call_line +.b8 26 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.source b/SpecForge-ext/cache/compiled_kernels/triton/7/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.source new file mode 100644 index 0000000000000000000000000000000000000000..29bef016bf1a91e7e2b846f17c9fb3bca8fc431e 
--- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.source @@ -0,0 +1,1216 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":18:0) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":640:0) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":607:0) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":518:0) +#loc87 = loc(unknown) +#loc112 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0) +#loc116 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0) +#loc121 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":86:0) +#loc125 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":63:0) +#loc134 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":131:0) +#loc139 = loc("in_ptr0"(#loc)) +#loc140 = loc("out_ptr2"(#loc)) +#loc141 = loc("out_ptr3"(#loc)) +#loc142 = loc("xnumel"(#loc)) +#loc143 = loc("r0_numel"(#loc)) +#loc172 = loc("x"(#loc37)) +#loc173 = loc("idxs"(#loc37)) +#loc174 = loc("x"(#loc41)) +#loc175 = loc("idxs"(#loc41)) +#loc180 = loc("x"(#loc49)) +#loc181 = loc("idxs"(#loc49)) +#loc182 = loc("flip"(#loc49)) +#loc238 = loc("input"(#loc112)) +#loc239 = loc("a"(#loc116)) +#loc240 = loc("b"(#loc116)) +#loc242 = loc("x"(#loc121)) +#loc243 = loc("x"(#loc125)) +#loc244 = loc("input"(#loc134)) +module { + tt.func public @triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} 
loc("out_ptr2"(#loc)), %out_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr3"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 32 : i32 loc(#loc144) + %r0_numel_1 = arith.constant 16 : i32 loc(#loc145) + %xoffset = tt.get_program_id x : i32 loc(#loc146) + %xoffset_2 = arith.constant 1 : i32 loc(#loc147) + %xoffset_3 = arith.constant 1 : i32 loc(#loc147) + %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc147) + %xindex = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc148) + %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc149) + %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<1x1xi32> loc(#loc150) + %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<1x1xi32> loc(#loc150) + %xmask = arith.constant dense<32> : tensor<1x1xi32> loc(#loc151) + %xmask_8 = arith.cmpi slt, %xindex_7, %xmask : tensor<1x1xi32> loc(#loc151) + %r0_index = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32> loc(#loc152) + %r0_index_9 = tt.expand_dims %r0_index {axis = 0 : i32} : tensor<16xi32> -> tensor<1x16xi32> loc(#loc153) + %r0_offset = arith.constant 0 : i32 loc(#loc154) + %r0_mask = arith.constant true loc(#loc155) + %r0_mask_10 = arith.constant dense : tensor<1x16xi1> loc(#loc155) + %x0 = arith.constant 16 : i32 loc(#loc156) + %x0_11 = arith.constant 16 : i32 loc(#loc156) + %x0_12 = arith.constant dense<16> : tensor<1x1xi32> loc(#loc156) + %x0_13 = arith.remsi %xindex_7, %x0_12 : tensor<1x1xi32> loc(#loc156) + %x1 = arith.constant 16 : i32 loc(#loc157) + %x1_14 = arith.constant 16 : i32 loc(#loc157) + %x1_15 = arith.constant dense<16> : tensor<1x1xi32> loc(#loc157) + %x1_16 = arith.divsi %xindex_7, %x1_15 : tensor<1x1xi32> loc(#loc157) + %tmp0 = arith.constant 17 : i32 loc(#loc158) + %tmp0_17 = arith.constant 17 : i32 loc(#loc158) + 
%tmp0_18 = arith.constant dense<17> : tensor<1x16xi32> loc(#loc158) + %tmp0_19 = arith.muli %tmp0_18, %r0_index_9 : tensor<1x16xi32> loc(#loc158) + %tmp0_20 = tt.broadcast %x0_13 : tensor<1x1xi32> -> tensor<1x16xi32> loc(#loc159) + %tmp0_21 = arith.addi %tmp0_20, %tmp0_19 : tensor<1x16xi32> loc(#loc159) + %tmp0_22 = arith.constant 272 : i32 loc(#loc160) + %tmp0_23 = arith.constant 272 : i32 loc(#loc160) + %tmp0_24 = arith.constant dense<272> : tensor<1x1xi32> loc(#loc160) + %tmp0_25 = arith.muli %tmp0_24, %x1_16 : tensor<1x1xi32> loc(#loc160) + %tmp0_26 = tt.broadcast %tmp0_25 : tensor<1x1xi32> -> tensor<1x16xi32> loc(#loc161) + %tmp0_27 = arith.addi %tmp0_21, %tmp0_26 : tensor<1x16xi32> loc(#loc161) + %tmp0_28 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x16x!tt.ptr> loc(#loc162) + %tmp0_29 = tt.addptr %tmp0_28, %tmp0_27 : tensor<1x16x!tt.ptr>, tensor<1x16xi32> loc(#loc162) + %tmp0_30 = arith.constant 0.000000e+00 : f32 loc(#loc163) + %tmp0_31 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x16xi1> loc(#loc163) + %tmp0_32 = arith.constant dense<0.000000e+00> : tensor<1x16xf32> loc(#loc163) + %tmp0_33 = arith.fptosi %tmp0_32 : tensor<1x16xf32> to tensor<1x16xi32> loc(#loc163) + %tmp0_34 = tt.load %tmp0_29, %tmp0_31, %tmp0_33 : tensor<1x16x!tt.ptr> loc(#loc163) + %tmp2 = arith.trunci %r0_index_9 : tensor<1x16xi32> to tensor<1x16xi16> loc(#loc164) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers.sort_with_index__i32S1_16S_i16S1_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_True_"(%tmp0_34, %tmp2) : (tensor<1x16xi32>, tensor<1x16xi16>) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc22) + %tmp7 = arith.extsi %tmp0_34 : tensor<1x16xi32> to tensor<1x16xi64> loc(#loc165) + %tmp10 = arith.constant 0 : i32 loc(#loc166) + %tmp10_35 = arith.constant 0 : i64 loc(#loc166) + %tmp10_36 = arith.constant dense<0> : tensor<1x16xi64> loc(#loc166) + %tmp10_37 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x16xi1> loc(#loc166) + 
%tmp10_38 = arith.select %tmp10_37, %tmp7, %tmp10_36 : tensor<1x16xi1>, tensor<1x16xi64> loc(#loc166) + %tmp11 = tt.call @"triton.language.standard.sum__i64S1_16S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%tmp10_38) : (tensor<1x16xi64>) -> tensor<1xi64> loc(#loc167) + %tmp11_39 = tt.expand_dims %tmp11 {axis = 1 : i32} : tensor<1xi64> -> tensor<1x1xi64> loc(#loc168) + %tmp12 = arith.extsi %0#1 : tensor<1x16xi32> to tensor<1x16xi64> loc(#loc169) + %tmp13 = arith.trunci %tmp12 : tensor<1x16xi64> to tensor<1x16xi32> loc(#loc170) + %tmp14 = arith.trunci %tmp11_39 : tensor<1x1xi64> to tensor<1x1xi32> loc(#loc171) + %c16_i32 = arith.constant 16 : i32 loc(#loc30) + %c16_i32_40 = arith.constant 16 : i32 loc(#loc30) + %cst = arith.constant dense<16> : tensor<1x1xi32> loc(#loc30) + %1 = arith.muli %cst, %xindex_7 : tensor<1x1xi32> loc(#loc30) + %2 = tt.broadcast %1 : tensor<1x1xi32> -> tensor<1x16xi32> loc(#loc31) + %3 = arith.addi %r0_index_9, %2 : tensor<1x16xi32> loc(#loc31) + %4 = tt.splat %out_ptr2 : !tt.ptr -> tensor<1x16x!tt.ptr> loc(#loc32) + %5 = tt.addptr %4, %3 : tensor<1x16x!tt.ptr>, tensor<1x16xi32> loc(#loc32) + %6 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x16xi1> loc(#loc33) + tt.store %5, %tmp13, %6 : tensor<1x16x!tt.ptr> loc(#loc33) + %7 = tt.splat %out_ptr3 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc34) + %8 = tt.addptr %7, %xindex_7 : tensor<1x1x!tt.ptr>, tensor<1x1xi32> loc(#loc34) + tt.store %8, %tmp14, %xmask_8 : tensor<1x1x!tt.ptr> loc(#loc35) + tt.return loc(#loc36) + } loc(#loc) + tt.func private @"torch._inductor.runtime.triton_helpers.sort_with_index__i32S1_16S_i16S1_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc37)), %idxs: tensor<1x16xi16> loc("idxs"(#loc37))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %0:2 = tt.call 
@"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S1_16S_i16S1_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs) : (tensor<1x16xi32>, tensor<1x16xi16>) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc38) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(3,)cconstexpr_2__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1) : (tensor<1x16xi32>, tensor<1x16xi32>) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc38) + %2:2 = tt.call @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(3,)cconstexpr_3__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%1#0, %1#1) : (tensor<1x16xi32>, tensor<1x16xi32>) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc38) + %3:2 = tt.call @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(3,)cconstexpr_4__(4,)cconstexpr_False__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%2#0, %2#1) : (tensor<1x16xi32>, tensor<1x16xi32>) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc38) + tt.return %3#0, %3#1 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc39) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1x16xi32> loc(#loc40) + %5 = ub.poison : tensor<1x16xi32> loc(#loc40) + tt.return %4, %5 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc40) + } loc(#loc37) + tt.func private @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S1_16S_i16S1_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc41)), %idxs: tensor<1x16xi16> loc("idxs"(#loc41))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %flip = tt.make_range {end = 2 
: i32, start = 0 : i32} : tensor<2xi32> loc(#loc176) + %flip_0 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc177) + %flip_1 = tt.expand_dims %flip_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc177) + %flip_2 = tt.broadcast %flip_1 : tensor<1x2x1xi32> -> tensor<4x2x2xi32> loc(#loc178) + %flip_3 = tt.reshape %flip_2 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc179) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i16S1_16S_i32S1_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip_3) : (tensor<1x16xi32>, tensor<1x16xi16>, tensor<1x16xi32>) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc46) + tt.return %0#0, %0#1 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc47) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1x16xi32> loc(#loc48) + %2 = ub.poison : tensor<1x16xi32> loc(#loc48) + tt.return %1, %2 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc48) + } loc(#loc41) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i16S1_16S_i32S1_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc49)), %idxs: tensor<1x16xi16> loc("idxs"(#loc49)), %flip: tensor<1x16xi32> loc("flip"(#loc49))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc183) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc184) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc185) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc185) + %left_mask = arith.constant 1 : i32 loc(#loc186) + %left_mask_2 = arith.constant 1 : i32 loc(#loc186) + %left_mask_3 = 
arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc186) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc186) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<8x2x1xi32> loc(#loc187) + %ileft_5 = arith.muli %y, %ileft : tensor<8x2x1xi32> loc(#loc187) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc188) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc189) + %ileft_8 = tt.broadcast %ileft_7 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc190) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<8x2x1xi32> loc(#loc191) + %iright_9 = arith.muli %y, %iright : tensor<8x2x1xi32> loc(#loc191) + %iright_10 = tt.call @"triton.language.standard.sum__i32S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc192) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc193) + %iright_12 = tt.broadcast %iright_11 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc194) + %ileft_13 = tt.reshape %ileft_8 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc195) + %iright_14 = tt.reshape %iright_12 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc196) + %y_idx = tt.reshape %idxs : tensor<1x16xi16> -> tensor<8x2x1xi16> loc(#loc197) + %left_idx = arith.trunci %left_mask_4 : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc198) + %left_idx_15 = tt.broadcast %left_idx : tensor<1x2x1xi16> -> tensor<8x2x1xi16> loc(#loc199) + %left_idx_16 = arith.muli %y_idx, %left_idx_15 : tensor<8x2x1xi16> loc(#loc199) + %left_idx_17 = tt.call @"triton.language.standard.sum__i16S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_16) : (tensor<8x2x1xi16>) -> tensor<8x1xi32> loc(#loc200) + %left_idx_18 = tt.expand_dims %left_idx_17 {axis = 1 
: i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc201) + %left_idx_19 = tt.broadcast %left_idx_18 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc202) + %right_idx = arith.trunci %right_mask_1 : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc203) + %right_idx_20 = tt.broadcast %right_idx : tensor<1x2x1xi16> -> tensor<8x2x1xi16> loc(#loc204) + %right_idx_21 = arith.muli %y_idx, %right_idx_20 : tensor<8x2x1xi16> loc(#loc204) + %right_idx_22 = tt.call @"triton.language.standard.sum__i16S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_21) : (tensor<8x2x1xi16>) -> tensor<8x1xi32> loc(#loc205) + %right_idx_23 = tt.expand_dims %right_idx_22 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc206) + %right_idx_24 = tt.broadcast %right_idx_23 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc207) + %left_idx_25 = tt.reshape %left_idx_19 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc208) + %right_idx_26 = tt.reshape %right_idx_24 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc209) + %left_valid_mask = arith.constant true loc(#loc210) + %left_valid_mask_27 = arith.constant dense : tensor<1x16xi1> loc(#loc210) + %right_valid_mask = arith.constant true loc(#loc211) + %right_valid_mask_28 = arith.constant dense : tensor<1x16xi1> loc(#loc211) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<1x16xi32> loc(#loc212) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<1x16xi32> loc(#loc213) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc246) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc82) + %1 = scf.if %0 -> (tensor<1x16xi1>) { + %cond_49 = arith.constant true loc(#loc215) + %cond_50 = arith.constant dense : tensor<1x16xi1> loc(#loc215) + %cond_51 = arith.xori %left_isnan, %cond_50 : tensor<1x16xi1> loc(#loc215) + %cond_52 = arith.andi %right_isnan, %cond_51 : tensor<1x16xi1> loc(#loc216) + %cond_53 = arith.ori 
%cond, %cond_52 : tensor<1x16xi1> loc(#loc247) + scf.yield %cond_53 : tensor<1x16xi1> loc(#loc247) + } else { + scf.yield %cond : tensor<1x16xi1> loc(#loc87) + } loc(#loc83) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc248) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc89) + %3 = scf.if %2 -> (tensor<1x16xi1>) { + %eq_49 = arith.andi %left_isnan, %right_isnan : tensor<1x16xi1> loc(#loc219) + %eq_50 = arith.ori %eq, %eq_49 : tensor<1x16xi1> loc(#loc249) + scf.yield %eq_50 : tensor<1x16xi1> loc(#loc249) + } else { + scf.yield %eq : tensor<1x16xi1> loc(#loc87) + } loc(#loc90) + %cond_29 = arith.cmpi sgt, %left_idx_25, %right_idx_26 : tensor<1x16xi32> loc(#loc221) + %cond_30 = arith.andi %3, %cond_29 : tensor<1x16xi1> loc(#loc222) + %cond_31 = arith.ori %1, %cond_30 : tensor<1x16xi1> loc(#loc223) + %cond_32 = arith.cmpi ugt, %right_valid_mask_28, %left_valid_mask_27 : tensor<1x16xi1> loc(#loc224) + %cond_33 = arith.cmpi eq, %right_valid_mask_28, %left_valid_mask_27 : tensor<1x16xi1> loc(#loc225) + %cond_34 = arith.andi %cond_33, %cond_31 : tensor<1x16xi1> loc(#loc226) + %cond_35 = arith.ori %cond_32, %cond_34 : tensor<1x16xi1> loc(#loc227) + %cond_36 = arith.extui %cond_35 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc228) + %cond_37 = arith.xori %cond_36, %flip : tensor<1x16xi32> loc(#loc228) + %cond_38 = arith.constant 0 : i32 loc(#loc229) + %cond_39 = arith.constant dense<0> : tensor<1x16xi32> loc(#loc229) + %cond_40 = arith.cmpi ne, %cond_37, %cond_39 : tensor<1x16xi32> loc(#loc229) + %ret = arith.xori %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc230) + %ret_41 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%x) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc231) + %ret_42 = arith.select %cond_40, %ret, %ret_41 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc232) + %ret_43 = arith.xori %x, %ret_42 : tensor<1x16xi32> loc(#loc233) + %new_idxs = 
arith.xori %left_idx_25, %right_idx_26 : tensor<1x16xi32> loc(#loc234) + %new_idxs_44 = tt.call @triton.language.standard.zeros_like__i16S1_16S__(%idxs) : (tensor<1x16xi16>) -> tensor<1x16xi16> loc(#loc235) + %new_idxs_45 = arith.extsi %new_idxs_44 : tensor<1x16xi16> to tensor<1x16xi32> loc(#loc236) + %new_idxs_46 = arith.select %cond_40, %new_idxs, %new_idxs_45 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc236) + %new_idxs_47 = arith.extsi %idxs : tensor<1x16xi16> to tensor<1x16xi32> loc(#loc237) + %new_idxs_48 = arith.xori %new_idxs_47, %new_idxs_46 : tensor<1x16xi32> loc(#loc237) + tt.return %ret_43, %new_idxs_48 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc110) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1x16xi32> loc(#loc111) + %5 = ub.poison : tensor<1x16xi32> loc(#loc111) + tt.return %4, %5 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc111) + } loc(#loc49) + tt.func private @"triton.language.standard.sum__i32S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<8x2x1xi32> loc("input"(#loc112))) -> tensor<8x1xi32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc113) + tt.reduce.return %2 : i32 loc(#loc113) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc113) + tt.return %0 : tensor<8x1xi32> loc(#loc114) + ^bb1: // no predecessors + %1 = ub.poison : tensor<8x1xi32> loc(#loc115) + tt.return %1 : tensor<8x1xi32> loc(#loc115) + } loc(#loc112) + tt.func private @triton.language.standard._sum_combine__i32_i32__(%a: i32 loc("a"(#loc116)), %b: i32 loc("b"(#loc116))) -> i32 attributes {noinline = false} { + %0 = arith.addi %a, %b : i32 loc(#loc117) + tt.return %0 : i32 loc(#loc118) + ^bb1: // no predecessors + %1 = ub.poison : i32 loc(#loc119) + tt.return %1 : i32 loc(#loc119) + } loc(#loc116) + tt.func private 
@"triton.language.standard.sum__i16S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<8x2x1xi16> loc("input"(#loc112))) -> tensor<8x1xi32> attributes {noinline = false} { + %input_0 = arith.extsi %input : tensor<8x2x1xi16> to tensor<8x2x1xi32> loc(#loc241) + %0 = "tt.reduce"(%input_0) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc113) + tt.reduce.return %2 : i32 loc(#loc113) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc113) + tt.return %0 : tensor<8x1xi32> loc(#loc114) + ^bb1: // no predecessors + %1 = ub.poison : tensor<8x1xi32> loc(#loc115) + tt.return %1 : tensor<8x1xi32> loc(#loc115) + } loc(#loc112) + tt.func private @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%x: tensor<1x16xi32> loc("x"(#loc121))) -> i1 attributes {noinline = false} { + %0 = tt.call @torch._inductor.runtime.triton_helpers.promote_to_tensor__i32S1_16S__(%x) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc122) + %false = arith.constant false loc(#loc123) + tt.return %false : i1 loc(#loc123) + ^bb1: // no predecessors + %1 = ub.poison : i1 loc(#loc124) + tt.return %1 : i1 loc(#loc124) + } loc(#loc121) + tt.func private @torch._inductor.runtime.triton_helpers.promote_to_tensor__i32S1_16S__(%x: tensor<1x16xi32> loc("x"(#loc125))) -> tensor<1x16xi32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc126) + %1 = arith.extui %0 : tensor<1xi1> to tensor<1xi32> loc(#loc127) + %2 = tt.expand_dims %1 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc127) + %3 = tt.broadcast %2 : tensor<1x1xi32> -> tensor<1x16xi32> loc(#loc127) + %4 = arith.addi %x, %3 : tensor<1x16xi32> loc(#loc127) + tt.return %4 : tensor<1x16xi32> loc(#loc128) + ^bb1: // no predecessors + %5 = ub.poison : tensor<1x16xi32> 
loc(#loc129) + tt.return %5 : tensor<1x16xi32> loc(#loc129) + } loc(#loc125) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() -> tensor<1xi1> attributes {noinline = false} { + %false = arith.constant false loc(#loc131) + %cst = arith.constant dense : tensor<1xi1> loc(#loc131) + tt.return %cst : tensor<1xi1> loc(#loc132) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1xi1> loc(#loc133) + tt.return %0 : tensor<1xi1> loc(#loc133) + } loc(#loc130) + tt.func private @triton.language.standard.zeros_like__i32S1_16S__(%input: tensor<1x16xi32> loc("input"(#loc134))) -> tensor<1x16xi32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_16__(1,)cconstexpr_int32_"() : () -> tensor<1x16xi32> loc(#loc135) + tt.return %0 : tensor<1x16xi32> loc(#loc136) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1x16xi32> loc(#loc137) + tt.return %1 : tensor<1x16xi32> loc(#loc137) + } loc(#loc134) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_16__(1,)cconstexpr_int32_"() -> tensor<1x16xi32> attributes {noinline = false} { + %c0_i32 = arith.constant 0 : i32 loc(#loc131) + %cst = arith.constant dense<0> : tensor<1x16xi32> loc(#loc131) + tt.return %cst : tensor<1x16xi32> loc(#loc132) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1x16xi32> loc(#loc133) + tt.return %0 : tensor<1x16xi32> loc(#loc133) + } loc(#loc130) + tt.func private @triton.language.standard.zeros_like__i16S1_16S__(%input: tensor<1x16xi16> loc("input"(#loc134))) -> tensor<1x16xi16> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_16__(1,)cconstexpr_int16_"() : () -> tensor<1x16xi16> loc(#loc135) + tt.return %0 : tensor<1x16xi16> loc(#loc136) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1x16xi16> loc(#loc137) + tt.return %1 : tensor<1x16xi16> loc(#loc137) + } loc(#loc134) + 
tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_16__(1,)cconstexpr_int16_"() -> tensor<1x16xi16> attributes {noinline = false} { + %c0_i16 = arith.constant 0 : i16 loc(#loc131) + %cst = arith.constant dense<0> : tensor<1x16xi16> loc(#loc131) + tt.return %cst : tensor<1x16xi16> loc(#loc132) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1x16xi16> loc(#loc133) + tt.return %0 : tensor<1x16xi16> loc(#loc133) + } loc(#loc130) + tt.func private @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(3,)cconstexpr_2__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc41)), %idxs: tensor<1x16xi32> loc("idxs"(#loc41))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc176) + %flip_0 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc177) + %flip_1 = tt.expand_dims %flip_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc177) + %flip_2 = tt.broadcast %flip_1 : tensor<1x2x1xi32> -> tensor<2x2x4xi32> loc(#loc178) + %flip_3 = tt.reshape %flip_2 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc179) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip_3) : (tensor<1x16xi32>, tensor<1x16xi32>, tensor<1x16xi32>) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc46) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1, %flip_3) : (tensor<1x16xi32>, tensor<1x16xi32>, tensor<1x16xi32>) -> (tensor<1x16xi32>, tensor<1x16xi32>) 
loc(#loc46) + tt.return %1#0, %1#1 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc47) + ^bb1: // no predecessors + %2 = ub.poison : tensor<1x16xi32> loc(#loc48) + %3 = ub.poison : tensor<1x16xi32> loc(#loc48) + tt.return %2, %3 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc48) + } loc(#loc41) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc49)), %idxs: tensor<1x16xi32> loc("idxs"(#loc49)), %flip: tensor<1x16xi32> loc("flip"(#loc49))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<1x16xi32> -> tensor<4x2x2xi32> loc(#loc183) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc184) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc185) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc185) + %left_mask = arith.constant 1 : i32 loc(#loc186) + %left_mask_2 = arith.constant 1 : i32 loc(#loc186) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc186) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc186) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<4x2x2xi32> loc(#loc187) + %ileft_5 = arith.muli %y, %ileft : tensor<4x2x2xi32> loc(#loc187) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S4_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc188) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc189) + %ileft_8 = tt.broadcast %ileft_7 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc190) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<4x2x2xi32> loc(#loc191) + 
%iright_9 = arith.muli %y, %iright : tensor<4x2x2xi32> loc(#loc191) + %iright_10 = tt.call @"triton.language.standard.sum__i32S4_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc192) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc193) + %iright_12 = tt.broadcast %iright_11 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc194) + %ileft_13 = tt.reshape %ileft_8 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc195) + %iright_14 = tt.reshape %iright_12 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc196) + %y_idx = tt.reshape %idxs : tensor<1x16xi32> -> tensor<4x2x2xi32> loc(#loc197) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<4x2x2xi32> loc(#loc199) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<4x2x2xi32> loc(#loc199) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S4_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc200) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc201) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc202) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<4x2x2xi32> loc(#loc204) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<4x2x2xi32> loc(#loc204) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S4_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc205) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc206) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc207) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc208) + %right_idx_24 = tt.reshape 
%right_idx_22 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc209) + %left_valid_mask = arith.constant true loc(#loc210) + %left_valid_mask_25 = arith.constant dense : tensor<1x16xi1> loc(#loc210) + %right_valid_mask = arith.constant true loc(#loc211) + %right_valid_mask_26 = arith.constant dense : tensor<1x16xi1> loc(#loc211) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<1x16xi32> loc(#loc212) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<1x16xi32> loc(#loc213) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc246) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc82) + %1 = scf.if %0 -> (tensor<1x16xi1>) { + %cond_45 = arith.constant true loc(#loc215) + %cond_46 = arith.constant dense : tensor<1x16xi1> loc(#loc215) + %cond_47 = arith.xori %left_isnan, %cond_46 : tensor<1x16xi1> loc(#loc215) + %cond_48 = arith.andi %right_isnan, %cond_47 : tensor<1x16xi1> loc(#loc216) + %cond_49 = arith.ori %cond, %cond_48 : tensor<1x16xi1> loc(#loc247) + scf.yield %cond_49 : tensor<1x16xi1> loc(#loc247) + } else { + scf.yield %cond : tensor<1x16xi1> loc(#loc87) + } loc(#loc83) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc248) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc89) + %3 = scf.if %2 -> (tensor<1x16xi1>) { + %eq_45 = arith.andi %left_isnan, %right_isnan : tensor<1x16xi1> loc(#loc219) + %eq_46 = arith.ori %eq, %eq_45 : tensor<1x16xi1> loc(#loc249) + scf.yield %eq_46 : tensor<1x16xi1> loc(#loc249) + } else { + scf.yield %eq : tensor<1x16xi1> loc(#loc87) + } loc(#loc90) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc221) + %cond_28 = arith.andi %3, %cond_27 : tensor<1x16xi1> loc(#loc222) + %cond_29 = arith.ori %1, %cond_28 : tensor<1x16xi1> loc(#loc223) + %cond_30 = arith.cmpi ugt, 
%right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc224) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc225) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<1x16xi1> loc(#loc226) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<1x16xi1> loc(#loc227) + %cond_34 = arith.extui %cond_33 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc228) + %cond_35 = arith.xori %cond_34, %flip : tensor<1x16xi32> loc(#loc228) + %cond_36 = arith.constant 0 : i32 loc(#loc229) + %cond_37 = arith.constant dense<0> : tensor<1x16xi32> loc(#loc229) + %cond_38 = arith.cmpi ne, %cond_35, %cond_37 : tensor<1x16xi32> loc(#loc229) + %ret = arith.xori %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc230) + %ret_39 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%x) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc231) + %ret_40 = arith.select %cond_38, %ret, %ret_39 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc232) + %ret_41 = arith.xori %x, %ret_40 : tensor<1x16xi32> loc(#loc233) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc234) + %new_idxs_42 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%idxs) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc235) + %new_idxs_43 = arith.select %cond_38, %new_idxs, %new_idxs_42 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc236) + %new_idxs_44 = arith.xori %idxs, %new_idxs_43 : tensor<1x16xi32> loc(#loc237) + tt.return %ret_41, %new_idxs_44 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc110) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1x16xi32> loc(#loc111) + %5 = ub.poison : tensor<1x16xi32> loc(#loc111) + tt.return %4, %5 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc111) + } loc(#loc49) + tt.func private @"triton.language.standard.sum__i32S4_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<4x2x2xi32> loc("input"(#loc112))) -> tensor<4x2xi32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 
1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc113) + tt.reduce.return %2 : i32 loc(#loc113) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc113) + tt.return %0 : tensor<4x2xi32> loc(#loc114) + ^bb1: // no predecessors + %1 = ub.poison : tensor<4x2xi32> loc(#loc115) + tt.return %1 : tensor<4x2xi32> loc(#loc115) + } loc(#loc112) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc49)), %idxs: tensor<1x16xi32> loc("idxs"(#loc49)), %flip: tensor<1x16xi32> loc("flip"(#loc49))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc183) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc184) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc185) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc185) + %left_mask = arith.constant 1 : i32 loc(#loc186) + %left_mask_2 = arith.constant 1 : i32 loc(#loc186) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc186) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc186) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<8x2x1xi32> loc(#loc187) + %ileft_5 = arith.muli %y, %ileft : tensor<8x2x1xi32> loc(#loc187) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc188) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc189) + 
%ileft_8 = tt.broadcast %ileft_7 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc190) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<8x2x1xi32> loc(#loc191) + %iright_9 = arith.muli %y, %iright : tensor<8x2x1xi32> loc(#loc191) + %iright_10 = tt.call @"triton.language.standard.sum__i32S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc192) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc193) + %iright_12 = tt.broadcast %iright_11 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc194) + %ileft_13 = tt.reshape %ileft_8 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc195) + %iright_14 = tt.reshape %iright_12 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc196) + %y_idx = tt.reshape %idxs : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc197) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<8x2x1xi32> loc(#loc199) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<8x2x1xi32> loc(#loc199) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc200) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc201) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc202) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<8x2x1xi32> loc(#loc204) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<8x2x1xi32> loc(#loc204) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc205) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc206) + %right_idx_22 = tt.broadcast %right_idx_21 : 
tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc207) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc208) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc209) + %left_valid_mask = arith.constant true loc(#loc210) + %left_valid_mask_25 = arith.constant dense : tensor<1x16xi1> loc(#loc210) + %right_valid_mask = arith.constant true loc(#loc211) + %right_valid_mask_26 = arith.constant dense : tensor<1x16xi1> loc(#loc211) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<1x16xi32> loc(#loc212) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<1x16xi32> loc(#loc213) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc246) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc82) + %1 = scf.if %0 -> (tensor<1x16xi1>) { + %cond_45 = arith.constant true loc(#loc215) + %cond_46 = arith.constant dense : tensor<1x16xi1> loc(#loc215) + %cond_47 = arith.xori %left_isnan, %cond_46 : tensor<1x16xi1> loc(#loc215) + %cond_48 = arith.andi %right_isnan, %cond_47 : tensor<1x16xi1> loc(#loc216) + %cond_49 = arith.ori %cond, %cond_48 : tensor<1x16xi1> loc(#loc247) + scf.yield %cond_49 : tensor<1x16xi1> loc(#loc247) + } else { + scf.yield %cond : tensor<1x16xi1> loc(#loc87) + } loc(#loc83) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc248) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc89) + %3 = scf.if %2 -> (tensor<1x16xi1>) { + %eq_45 = arith.andi %left_isnan, %right_isnan : tensor<1x16xi1> loc(#loc219) + %eq_46 = arith.ori %eq, %eq_45 : tensor<1x16xi1> loc(#loc249) + scf.yield %eq_46 : tensor<1x16xi1> loc(#loc249) + } else { + scf.yield %eq : tensor<1x16xi1> loc(#loc87) + } loc(#loc90) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc221) + 
%cond_28 = arith.andi %3, %cond_27 : tensor<1x16xi1> loc(#loc222) + %cond_29 = arith.ori %1, %cond_28 : tensor<1x16xi1> loc(#loc223) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc224) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc225) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<1x16xi1> loc(#loc226) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<1x16xi1> loc(#loc227) + %cond_34 = arith.extui %cond_33 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc228) + %cond_35 = arith.xori %cond_34, %flip : tensor<1x16xi32> loc(#loc228) + %cond_36 = arith.constant 0 : i32 loc(#loc229) + %cond_37 = arith.constant dense<0> : tensor<1x16xi32> loc(#loc229) + %cond_38 = arith.cmpi ne, %cond_35, %cond_37 : tensor<1x16xi32> loc(#loc229) + %ret = arith.xori %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc230) + %ret_39 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%x) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc231) + %ret_40 = arith.select %cond_38, %ret, %ret_39 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc232) + %ret_41 = arith.xori %x, %ret_40 : tensor<1x16xi32> loc(#loc233) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc234) + %new_idxs_42 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%idxs) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc235) + %new_idxs_43 = arith.select %cond_38, %new_idxs, %new_idxs_42 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc236) + %new_idxs_44 = arith.xori %idxs, %new_idxs_43 : tensor<1x16xi32> loc(#loc237) + tt.return %ret_41, %new_idxs_44 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc110) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1x16xi32> loc(#loc111) + %5 = ub.poison : tensor<1x16xi32> loc(#loc111) + tt.return %4, %5 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc111) + } loc(#loc49) + tt.func private 
@"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(3,)cconstexpr_3__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc41)), %idxs: tensor<1x16xi32> loc("idxs"(#loc41))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc176) + %flip_0 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc177) + %flip_1 = tt.expand_dims %flip_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc177) + %flip_2 = tt.broadcast %flip_1 : tensor<1x2x1xi32> -> tensor<1x2x8xi32> loc(#loc178) + %flip_3 = tt.reshape %flip_2 : tensor<1x2x8xi32> -> tensor<1x16xi32> loc(#loc179) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip_3) : (tensor<1x16xi32>, tensor<1x16xi32>, tensor<1x16xi32>) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc46) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1, %flip_3) : (tensor<1x16xi32>, tensor<1x16xi32>, tensor<1x16xi32>) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc46) + %2:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%1#0, %1#1, %flip_3) : (tensor<1x16xi32>, tensor<1x16xi32>, tensor<1x16xi32>) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc46) + tt.return %2#0, %2#1 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc47) + ^bb1: // no predecessors + %3 = ub.poison : 
tensor<1x16xi32> loc(#loc48) + %4 = ub.poison : tensor<1x16xi32> loc(#loc48) + tt.return %3, %4 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc48) + } loc(#loc41) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc49)), %idxs: tensor<1x16xi32> loc("idxs"(#loc49)), %flip: tensor<1x16xi32> loc("flip"(#loc49))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<1x16xi32> -> tensor<2x2x4xi32> loc(#loc183) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc184) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc185) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc185) + %left_mask = arith.constant 1 : i32 loc(#loc186) + %left_mask_2 = arith.constant 1 : i32 loc(#loc186) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc186) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc186) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<2x2x4xi32> loc(#loc187) + %ileft_5 = arith.muli %y, %ileft : tensor<2x2x4xi32> loc(#loc187) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S2_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc188) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc189) + %ileft_8 = tt.broadcast %ileft_7 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc190) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<2x2x4xi32> loc(#loc191) + %iright_9 = arith.muli %y, %iright : tensor<2x2x4xi32> loc(#loc191) + %iright_10 = tt.call 
@"triton.language.standard.sum__i32S2_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc192) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc193) + %iright_12 = tt.broadcast %iright_11 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc194) + %ileft_13 = tt.reshape %ileft_8 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc195) + %iright_14 = tt.reshape %iright_12 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc196) + %y_idx = tt.reshape %idxs : tensor<1x16xi32> -> tensor<2x2x4xi32> loc(#loc197) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<2x2x4xi32> loc(#loc199) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<2x2x4xi32> loc(#loc199) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S2_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc200) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc201) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc202) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<2x2x4xi32> loc(#loc204) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<2x2x4xi32> loc(#loc204) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S2_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc205) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc206) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc207) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc208) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc209) + %left_valid_mask = 
arith.constant true loc(#loc210) + %left_valid_mask_25 = arith.constant dense : tensor<1x16xi1> loc(#loc210) + %right_valid_mask = arith.constant true loc(#loc211) + %right_valid_mask_26 = arith.constant dense : tensor<1x16xi1> loc(#loc211) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<1x16xi32> loc(#loc212) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<1x16xi32> loc(#loc213) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc246) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc82) + %1 = scf.if %0 -> (tensor<1x16xi1>) { + %cond_45 = arith.constant true loc(#loc215) + %cond_46 = arith.constant dense : tensor<1x16xi1> loc(#loc215) + %cond_47 = arith.xori %left_isnan, %cond_46 : tensor<1x16xi1> loc(#loc215) + %cond_48 = arith.andi %right_isnan, %cond_47 : tensor<1x16xi1> loc(#loc216) + %cond_49 = arith.ori %cond, %cond_48 : tensor<1x16xi1> loc(#loc247) + scf.yield %cond_49 : tensor<1x16xi1> loc(#loc247) + } else { + scf.yield %cond : tensor<1x16xi1> loc(#loc87) + } loc(#loc83) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc248) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc89) + %3 = scf.if %2 -> (tensor<1x16xi1>) { + %eq_45 = arith.andi %left_isnan, %right_isnan : tensor<1x16xi1> loc(#loc219) + %eq_46 = arith.ori %eq, %eq_45 : tensor<1x16xi1> loc(#loc249) + scf.yield %eq_46 : tensor<1x16xi1> loc(#loc249) + } else { + scf.yield %eq : tensor<1x16xi1> loc(#loc87) + } loc(#loc90) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc221) + %cond_28 = arith.andi %3, %cond_27 : tensor<1x16xi1> loc(#loc222) + %cond_29 = arith.ori %1, %cond_28 : tensor<1x16xi1> loc(#loc223) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc224) + %cond_31 = arith.cmpi eq, 
%right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc225) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<1x16xi1> loc(#loc226) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<1x16xi1> loc(#loc227) + %cond_34 = arith.extui %cond_33 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc228) + %cond_35 = arith.xori %cond_34, %flip : tensor<1x16xi32> loc(#loc228) + %cond_36 = arith.constant 0 : i32 loc(#loc229) + %cond_37 = arith.constant dense<0> : tensor<1x16xi32> loc(#loc229) + %cond_38 = arith.cmpi ne, %cond_35, %cond_37 : tensor<1x16xi32> loc(#loc229) + %ret = arith.xori %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc230) + %ret_39 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%x) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc231) + %ret_40 = arith.select %cond_38, %ret, %ret_39 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc232) + %ret_41 = arith.xori %x, %ret_40 : tensor<1x16xi32> loc(#loc233) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc234) + %new_idxs_42 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%idxs) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc235) + %new_idxs_43 = arith.select %cond_38, %new_idxs, %new_idxs_42 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc236) + %new_idxs_44 = arith.xori %idxs, %new_idxs_43 : tensor<1x16xi32> loc(#loc237) + tt.return %ret_41, %new_idxs_44 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc110) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1x16xi32> loc(#loc111) + %5 = ub.poison : tensor<1x16xi32> loc(#loc111) + tt.return %4, %5 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc111) + } loc(#loc49) + tt.func private @"triton.language.standard.sum__i32S2_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<2x2x4xi32> loc("input"(#loc112))) -> tensor<2x4xi32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call 
@triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc113) + tt.reduce.return %2 : i32 loc(#loc113) + }) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc113) + tt.return %0 : tensor<2x4xi32> loc(#loc114) + ^bb1: // no predecessors + %1 = ub.poison : tensor<2x4xi32> loc(#loc115) + tt.return %1 : tensor<2x4xi32> loc(#loc115) + } loc(#loc112) + tt.func private @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(3,)cconstexpr_4__(4,)cconstexpr_False__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc41)), %idxs: tensor<1x16xi32> loc("idxs"(#loc41))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %flip = arith.constant false loc(#loc245) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_0__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip) : (tensor<1x16xi32>, tensor<1x16xi32>, i1) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc46) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1, %flip) : (tensor<1x16xi32>, tensor<1x16xi32>, i1) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc46) + %2:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%1#0, %1#1, %flip) : (tensor<1x16xi32>, tensor<1x16xi32>, i1) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc46) + %3:2 = tt.call 
@"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%2#0, %2#1, %flip) : (tensor<1x16xi32>, tensor<1x16xi32>, i1) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc46) + tt.return %3#0, %3#1 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc47) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1x16xi32> loc(#loc48) + %5 = ub.poison : tensor<1x16xi32> loc(#loc48) + tt.return %4, %5 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc48) + } loc(#loc41) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_0__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc49)), %idxs: tensor<1x16xi32> loc("idxs"(#loc49)), %flip: i1 loc("flip"(#loc49))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<1x16xi32> -> tensor<1x2x8xi32> loc(#loc183) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc184) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc185) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc185) + %left_mask = arith.constant 1 : i32 loc(#loc186) + %left_mask_2 = arith.constant 1 : i32 loc(#loc186) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc186) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc186) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<1x2x8xi32> loc(#loc187) + %ileft_5 = arith.muli %y, %ileft : tensor<1x2x8xi32> loc(#loc187) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S1_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<1x2x8xi32>) -> tensor<1x8xi32> loc(#loc188) + %ileft_7 = 
tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<1x8xi32> -> tensor<1x1x8xi32> loc(#loc189) + %ileft_8 = tt.broadcast %ileft_7 : tensor<1x1x8xi32> -> tensor<1x2x8xi32> loc(#loc190) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<1x2x8xi32> loc(#loc191) + %iright_9 = arith.muli %y, %iright : tensor<1x2x8xi32> loc(#loc191) + %iright_10 = tt.call @"triton.language.standard.sum__i32S1_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<1x2x8xi32>) -> tensor<1x8xi32> loc(#loc192) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<1x8xi32> -> tensor<1x1x8xi32> loc(#loc193) + %iright_12 = tt.broadcast %iright_11 : tensor<1x1x8xi32> -> tensor<1x2x8xi32> loc(#loc194) + %ileft_13 = tt.reshape %ileft_8 : tensor<1x2x8xi32> -> tensor<1x16xi32> loc(#loc195) + %iright_14 = tt.reshape %iright_12 : tensor<1x2x8xi32> -> tensor<1x16xi32> loc(#loc196) + %y_idx = tt.reshape %idxs : tensor<1x16xi32> -> tensor<1x2x8xi32> loc(#loc197) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<1x2x8xi32> loc(#loc199) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<1x2x8xi32> loc(#loc199) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S1_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<1x2x8xi32>) -> tensor<1x8xi32> loc(#loc200) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<1x8xi32> -> tensor<1x1x8xi32> loc(#loc201) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<1x1x8xi32> -> tensor<1x2x8xi32> loc(#loc202) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<1x2x8xi32> loc(#loc204) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<1x2x8xi32> loc(#loc204) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S1_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<1x2x8xi32>) -> tensor<1x8xi32> loc(#loc205) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : 
tensor<1x8xi32> -> tensor<1x1x8xi32> loc(#loc206) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<1x1x8xi32> -> tensor<1x2x8xi32> loc(#loc207) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<1x2x8xi32> -> tensor<1x16xi32> loc(#loc208) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<1x2x8xi32> -> tensor<1x16xi32> loc(#loc209) + %left_valid_mask = arith.constant true loc(#loc210) + %left_valid_mask_25 = arith.constant dense : tensor<1x16xi1> loc(#loc210) + %right_valid_mask = arith.constant true loc(#loc211) + %right_valid_mask_26 = arith.constant dense : tensor<1x16xi1> loc(#loc211) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<1x16xi32> loc(#loc212) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<1x16xi32> loc(#loc213) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc246) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc82) + %1 = scf.if %0 -> (tensor<1x16xi1>) { + %cond_42 = arith.constant true loc(#loc215) + %cond_43 = arith.constant dense : tensor<1x16xi1> loc(#loc215) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<1x16xi1> loc(#loc215) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<1x16xi1> loc(#loc216) + %cond_46 = arith.ori %cond, %cond_45 : tensor<1x16xi1> loc(#loc247) + scf.yield %cond_46 : tensor<1x16xi1> loc(#loc247) + } else { + scf.yield %cond : tensor<1x16xi1> loc(#loc87) + } loc(#loc83) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc248) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc89) + %3 = scf.if %2 -> (tensor<1x16xi1>) { + %eq_42 = arith.andi %left_isnan, %right_isnan : tensor<1x16xi1> loc(#loc219) + %eq_43 = arith.ori %eq, %eq_42 : tensor<1x16xi1> loc(#loc249) + scf.yield %eq_43 : tensor<1x16xi1> loc(#loc249) + } else { + scf.yield %eq : tensor<1x16xi1> loc(#loc87) + } loc(#loc90) + 
%cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc221) + %cond_28 = arith.andi %3, %cond_27 : tensor<1x16xi1> loc(#loc222) + %cond_29 = arith.ori %1, %cond_28 : tensor<1x16xi1> loc(#loc223) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc224) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc225) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<1x16xi1> loc(#loc226) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<1x16xi1> loc(#loc227) + %cond_34 = tt.splat %flip : i1 -> tensor<1x16xi1> loc(#loc228) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<1x16xi1> loc(#loc228) + %ret = arith.xori %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc230) + %ret_36 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%x) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc231) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc232) + %ret_38 = arith.xori %x, %ret_37 : tensor<1x16xi32> loc(#loc233) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc234) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%idxs) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc235) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc236) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<1x16xi32> loc(#loc237) + tt.return %ret_38, %new_idxs_41 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc110) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1x16xi32> loc(#loc111) + %5 = ub.poison : tensor<1x16xi32> loc(#loc111) + tt.return %4, %5 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc111) + } loc(#loc49) + tt.func private @"triton.language.standard.sum__i32S1_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<1x2x8xi32> loc("input"(#loc112))) -> tensor<1x8xi32> attributes {noinline = false} { 
+ %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc113) + tt.reduce.return %2 : i32 loc(#loc113) + }) : (tensor<1x2x8xi32>) -> tensor<1x8xi32> loc(#loc113) + tt.return %0 : tensor<1x8xi32> loc(#loc114) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1x8xi32> loc(#loc115) + tt.return %1 : tensor<1x8xi32> loc(#loc115) + } loc(#loc112) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc49)), %idxs: tensor<1x16xi32> loc("idxs"(#loc49)), %flip: i1 loc("flip"(#loc49))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<1x16xi32> -> tensor<2x2x4xi32> loc(#loc183) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc184) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc185) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc185) + %left_mask = arith.constant 1 : i32 loc(#loc186) + %left_mask_2 = arith.constant 1 : i32 loc(#loc186) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc186) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc186) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<2x2x4xi32> loc(#loc187) + %ileft_5 = arith.muli %y, %ileft : tensor<2x2x4xi32> loc(#loc187) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S2_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc188) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> 
loc(#loc189) + %ileft_8 = tt.broadcast %ileft_7 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc190) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<2x2x4xi32> loc(#loc191) + %iright_9 = arith.muli %y, %iright : tensor<2x2x4xi32> loc(#loc191) + %iright_10 = tt.call @"triton.language.standard.sum__i32S2_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc192) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc193) + %iright_12 = tt.broadcast %iright_11 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc194) + %ileft_13 = tt.reshape %ileft_8 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc195) + %iright_14 = tt.reshape %iright_12 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc196) + %y_idx = tt.reshape %idxs : tensor<1x16xi32> -> tensor<2x2x4xi32> loc(#loc197) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<2x2x4xi32> loc(#loc199) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<2x2x4xi32> loc(#loc199) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S2_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc200) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc201) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc202) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<2x2x4xi32> loc(#loc204) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<2x2x4xi32> loc(#loc204) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S2_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc205) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc206) + %right_idx_22 = tt.broadcast 
%right_idx_21 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc207) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc208) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc209) + %left_valid_mask = arith.constant true loc(#loc210) + %left_valid_mask_25 = arith.constant dense : tensor<1x16xi1> loc(#loc210) + %right_valid_mask = arith.constant true loc(#loc211) + %right_valid_mask_26 = arith.constant dense : tensor<1x16xi1> loc(#loc211) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<1x16xi32> loc(#loc212) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<1x16xi32> loc(#loc213) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc246) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc82) + %1 = scf.if %0 -> (tensor<1x16xi1>) { + %cond_42 = arith.constant true loc(#loc215) + %cond_43 = arith.constant dense : tensor<1x16xi1> loc(#loc215) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<1x16xi1> loc(#loc215) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<1x16xi1> loc(#loc216) + %cond_46 = arith.ori %cond, %cond_45 : tensor<1x16xi1> loc(#loc247) + scf.yield %cond_46 : tensor<1x16xi1> loc(#loc247) + } else { + scf.yield %cond : tensor<1x16xi1> loc(#loc87) + } loc(#loc83) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc248) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc89) + %3 = scf.if %2 -> (tensor<1x16xi1>) { + %eq_42 = arith.andi %left_isnan, %right_isnan : tensor<1x16xi1> loc(#loc219) + %eq_43 = arith.ori %eq, %eq_42 : tensor<1x16xi1> loc(#loc249) + scf.yield %eq_43 : tensor<1x16xi1> loc(#loc249) + } else { + scf.yield %eq : tensor<1x16xi1> loc(#loc87) + } loc(#loc90) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<1x16xi32> 
loc(#loc221) + %cond_28 = arith.andi %3, %cond_27 : tensor<1x16xi1> loc(#loc222) + %cond_29 = arith.ori %1, %cond_28 : tensor<1x16xi1> loc(#loc223) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc224) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc225) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<1x16xi1> loc(#loc226) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<1x16xi1> loc(#loc227) + %cond_34 = tt.splat %flip : i1 -> tensor<1x16xi1> loc(#loc228) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<1x16xi1> loc(#loc228) + %ret = arith.xori %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc230) + %ret_36 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%x) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc231) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc232) + %ret_38 = arith.xori %x, %ret_37 : tensor<1x16xi32> loc(#loc233) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc234) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%idxs) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc235) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc236) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<1x16xi32> loc(#loc237) + tt.return %ret_38, %new_idxs_41 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc110) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1x16xi32> loc(#loc111) + %5 = ub.poison : tensor<1x16xi32> loc(#loc111) + tt.return %4, %5 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc111) + } loc(#loc49) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc49)), %idxs: tensor<1x16xi32> 
loc("idxs"(#loc49)), %flip: i1 loc("flip"(#loc49))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<1x16xi32> -> tensor<4x2x2xi32> loc(#loc183) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc184) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc185) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc185) + %left_mask = arith.constant 1 : i32 loc(#loc186) + %left_mask_2 = arith.constant 1 : i32 loc(#loc186) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc186) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc186) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<4x2x2xi32> loc(#loc187) + %ileft_5 = arith.muli %y, %ileft : tensor<4x2x2xi32> loc(#loc187) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S4_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc188) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc189) + %ileft_8 = tt.broadcast %ileft_7 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc190) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<4x2x2xi32> loc(#loc191) + %iright_9 = arith.muli %y, %iright : tensor<4x2x2xi32> loc(#loc191) + %iright_10 = tt.call @"triton.language.standard.sum__i32S4_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc192) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc193) + %iright_12 = tt.broadcast %iright_11 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc194) + %ileft_13 = tt.reshape %ileft_8 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc195) + %iright_14 = tt.reshape %iright_12 : 
tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc196) + %y_idx = tt.reshape %idxs : tensor<1x16xi32> -> tensor<4x2x2xi32> loc(#loc197) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<4x2x2xi32> loc(#loc199) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<4x2x2xi32> loc(#loc199) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S4_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc200) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc201) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc202) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<4x2x2xi32> loc(#loc204) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<4x2x2xi32> loc(#loc204) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S4_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc205) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc206) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc207) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc208) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc209) + %left_valid_mask = arith.constant true loc(#loc210) + %left_valid_mask_25 = arith.constant dense : tensor<1x16xi1> loc(#loc210) + %right_valid_mask = arith.constant true loc(#loc211) + %right_valid_mask_26 = arith.constant dense : tensor<1x16xi1> loc(#loc211) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<1x16xi32> loc(#loc212) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<1x16xi32> loc(#loc213) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc246) + %0 = 
tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc82) + %1 = scf.if %0 -> (tensor<1x16xi1>) { + %cond_42 = arith.constant true loc(#loc215) + %cond_43 = arith.constant dense : tensor<1x16xi1> loc(#loc215) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<1x16xi1> loc(#loc215) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<1x16xi1> loc(#loc216) + %cond_46 = arith.ori %cond, %cond_45 : tensor<1x16xi1> loc(#loc247) + scf.yield %cond_46 : tensor<1x16xi1> loc(#loc247) + } else { + scf.yield %cond : tensor<1x16xi1> loc(#loc87) + } loc(#loc83) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc248) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc89) + %3 = scf.if %2 -> (tensor<1x16xi1>) { + %eq_42 = arith.andi %left_isnan, %right_isnan : tensor<1x16xi1> loc(#loc219) + %eq_43 = arith.ori %eq, %eq_42 : tensor<1x16xi1> loc(#loc249) + scf.yield %eq_43 : tensor<1x16xi1> loc(#loc249) + } else { + scf.yield %eq : tensor<1x16xi1> loc(#loc87) + } loc(#loc90) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc221) + %cond_28 = arith.andi %3, %cond_27 : tensor<1x16xi1> loc(#loc222) + %cond_29 = arith.ori %1, %cond_28 : tensor<1x16xi1> loc(#loc223) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc224) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc225) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<1x16xi1> loc(#loc226) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<1x16xi1> loc(#loc227) + %cond_34 = tt.splat %flip : i1 -> tensor<1x16xi1> loc(#loc228) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<1x16xi1> loc(#loc228) + %ret = arith.xori %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc230) + %ret_36 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%x) 
: (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc231) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc232) + %ret_38 = arith.xori %x, %ret_37 : tensor<1x16xi32> loc(#loc233) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc234) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%idxs) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc235) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc236) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<1x16xi32> loc(#loc237) + tt.return %ret_38, %new_idxs_41 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc110) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1x16xi32> loc(#loc111) + %5 = ub.poison : tensor<1x16xi32> loc(#loc111) + tt.return %4, %5 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc111) + } loc(#loc49) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc49)), %idxs: tensor<1x16xi32> loc("idxs"(#loc49)), %flip: i1 loc("flip"(#loc49))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc183) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc184) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc185) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc185) + %left_mask = arith.constant 1 : i32 loc(#loc186) + %left_mask_2 = arith.constant 1 : i32 loc(#loc186) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc186) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc186) + %ileft = 
tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<8x2x1xi32> loc(#loc187) + %ileft_5 = arith.muli %y, %ileft : tensor<8x2x1xi32> loc(#loc187) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc188) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc189) + %ileft_8 = tt.broadcast %ileft_7 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc190) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<8x2x1xi32> loc(#loc191) + %iright_9 = arith.muli %y, %iright : tensor<8x2x1xi32> loc(#loc191) + %iright_10 = tt.call @"triton.language.standard.sum__i32S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc192) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc193) + %iright_12 = tt.broadcast %iright_11 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc194) + %ileft_13 = tt.reshape %ileft_8 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc195) + %iright_14 = tt.reshape %iright_12 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc196) + %y_idx = tt.reshape %idxs : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc197) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<8x2x1xi32> loc(#loc199) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<8x2x1xi32> loc(#loc199) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc200) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc201) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc202) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<8x2x1xi32> loc(#loc204) + 
%right_idx_19 = arith.muli %y_idx, %right_idx : tensor<8x2x1xi32> loc(#loc204) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc205) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc206) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc207) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc208) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc209) + %left_valid_mask = arith.constant true loc(#loc210) + %left_valid_mask_25 = arith.constant dense : tensor<1x16xi1> loc(#loc210) + %right_valid_mask = arith.constant true loc(#loc211) + %right_valid_mask_26 = arith.constant dense : tensor<1x16xi1> loc(#loc211) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<1x16xi32> loc(#loc212) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<1x16xi32> loc(#loc213) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc246) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc82) + %1 = scf.if %0 -> (tensor<1x16xi1>) { + %cond_42 = arith.constant true loc(#loc215) + %cond_43 = arith.constant dense : tensor<1x16xi1> loc(#loc215) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<1x16xi1> loc(#loc215) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<1x16xi1> loc(#loc216) + %cond_46 = arith.ori %cond, %cond_45 : tensor<1x16xi1> loc(#loc247) + scf.yield %cond_46 : tensor<1x16xi1> loc(#loc247) + } else { + scf.yield %cond : tensor<1x16xi1> loc(#loc87) + } loc(#loc83) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc248) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : 
(tensor<1x16xi32>) -> i1 loc(#loc89) + %3 = scf.if %2 -> (tensor<1x16xi1>) { + %eq_42 = arith.andi %left_isnan, %right_isnan : tensor<1x16xi1> loc(#loc219) + %eq_43 = arith.ori %eq, %eq_42 : tensor<1x16xi1> loc(#loc249) + scf.yield %eq_43 : tensor<1x16xi1> loc(#loc249) + } else { + scf.yield %eq : tensor<1x16xi1> loc(#loc87) + } loc(#loc90) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc221) + %cond_28 = arith.andi %3, %cond_27 : tensor<1x16xi1> loc(#loc222) + %cond_29 = arith.ori %1, %cond_28 : tensor<1x16xi1> loc(#loc223) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc224) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc225) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<1x16xi1> loc(#loc226) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<1x16xi1> loc(#loc227) + %cond_34 = tt.splat %flip : i1 -> tensor<1x16xi1> loc(#loc228) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<1x16xi1> loc(#loc228) + %ret = arith.xori %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc230) + %ret_36 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%x) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc231) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc232) + %ret_38 = arith.xori %x, %ret_37 : tensor<1x16xi32> loc(#loc233) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc234) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%idxs) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc235) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc236) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<1x16xi32> loc(#loc237) + tt.return %ret_38, %new_idxs_41 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc110) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1x16xi32> loc(#loc111) + 
%5 = ub.poison : tensor<1x16xi32> loc(#loc111) + tt.return %4, %5 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc111) + } loc(#loc49) + tt.func private @"triton.language.standard.sum__i64S1_16S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<1x16xi64> loc("input"(#loc112))) -> tensor<1xi64> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i64 loc(unknown), %arg2: i64 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i64_i64__(%arg1, %arg2) : (i64, i64) -> i64 loc(#loc113) + tt.reduce.return %2 : i64 loc(#loc113) + }) : (tensor<1x16xi64>) -> tensor<1xi64> loc(#loc113) + tt.return %0 : tensor<1xi64> loc(#loc114) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1xi64> loc(#loc115) + tt.return %1 : tensor<1xi64> loc(#loc115) + } loc(#loc112) + tt.func private @triton.language.standard._sum_combine__i64_i64__(%a: i64 loc("a"(#loc116)), %b: i64 loc("b"(#loc116))) -> i64 attributes {noinline = false} { + %0 = arith.addi %a, %b : i64 loc(#loc117) + tt.return %0 : i64 loc(#loc118) + ^bb1: // no predecessors + %1 = ub.poison : i64 loc(#loc119) + tt.return %1 : i64 loc(#loc119) + } loc(#loc116) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":19:13) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":20:15) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":24:28) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":24:33) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":25:36) +#loc6 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":25:44) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":25:23) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":26:21) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":27:28) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":27:38) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":28:16) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":29:48) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":33:19) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":34:19) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":36:38) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":36:35) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":36:49) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":36:45) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":36:30) +#loc20 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":36:54) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":38:19) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":41:67) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":42:19) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":44:34) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":45:26) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":45:29) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":46:20) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":47:21) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":48:21) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":49:35) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":49:32) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":49:25) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":49:47) +#loc34 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":50:25) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":50:37) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":50:4) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":662:12) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":668:11) +#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":668:4) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:41) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:44) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:60) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:68) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":634:73) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":636:11) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":636:4) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":533:22) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":536:30) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":536:33) +#loc53 = 
loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":537:21) +#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:40) +#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:51) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:65) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:78) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:41) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:53) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:67) +#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:80) +#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":540:30) +#loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":541:32) +#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":546:29) +#loc65 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:36) +#loc66 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:23) +#loc67 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:50) +#loc68 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:53) +#loc69 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:66) +#loc70 = 
loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:37) +#loc71 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:23) +#loc72 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:51) +#loc73 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:54) +#loc74 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:67) +#loc75 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":553:36) +#loc76 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":554:38) +#loc77 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":558:49) +#loc78 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":559:50) +#loc79 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":570:25) +#loc80 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":571:27) +#loc81 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":574:22) +#loc82 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":575:23) +#loc83 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":575:11) +#loc84 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":579:47) +#loc85 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":579:46) +#loc86 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":579:31) +#loc88 = 
loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":591:21) +#loc89 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":592:23) +#loc90 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":592:11) +#loc91 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":593:36) +#loc92 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":593:23) +#loc93 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:40) +#loc94 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:29) +#loc95 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:23) +#loc96 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":596:31) +#loc97 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":597:29) +#loc98 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":597:48) +#loc99 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":597:8) +#loc100 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:19) +#loc101 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:28) +#loc102 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:38) +#loc103 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:60) +#loc104 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:46) +#loc105 = 
loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:15) +#loc106 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:48) +#loc107 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:73) +#loc108 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:59) +#loc109 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:22) +#loc110 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":603:11) +#loc111 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":603:4) +#loc113 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc114 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11) +#loc115 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4) +#loc117 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc118 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11) +#loc119 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4) +#loc120 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":290:25) +#loc122 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:29) +#loc123 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:11) +#loc124 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:4) +#loc126 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:30) +#loc127 = 
loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:15) +#loc128 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:11) +#loc129 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:4) +#loc130 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":118:0) +#loc131 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:31) +#loc132 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:11) +#loc133 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:4) +#loc135 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":138:30) +#loc136 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":138:11) +#loc137 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":138:4) +#loc138 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":630:15) +#loc144 = loc("xnumel"(#loc1)) +#loc145 = loc("r0_numel"(#loc2)) +#loc146 = loc("xoffset"(#loc3)) +#loc147 = loc("xoffset"(#loc4)) +#loc148 = loc("xindex"(#loc5)) +#loc149 = loc("xindex"(#loc6)) +#loc150 = loc("xindex"(#loc7)) +#loc151 = loc("xmask"(#loc8)) +#loc152 = loc("r0_index"(#loc9)) +#loc153 = loc("r0_index"(#loc10)) +#loc154 = loc("r0_offset"(#loc11)) +#loc155 = loc("r0_mask"(#loc12)) +#loc156 = loc("x0"(#loc13)) +#loc157 = loc("x1"(#loc14)) +#loc158 = loc("tmp0"(#loc15)) +#loc159 = loc("tmp0"(#loc16)) +#loc160 = loc("tmp0"(#loc17)) +#loc161 = loc("tmp0"(#loc18)) +#loc162 = loc("tmp0"(#loc19)) +#loc163 = loc("tmp0"(#loc20)) +#loc164 = loc("tmp2"(#loc21)) +#loc165 = loc("tmp7"(#loc23)) +#loc166 = loc("tmp10"(#loc24)) +#loc167 = loc("tmp11"(#loc25)) +#loc168 = loc("tmp11"(#loc26)) +#loc169 = 
loc("tmp12"(#loc27)) +#loc170 = loc("tmp13"(#loc28)) +#loc171 = loc("tmp14"(#loc29)) +#loc176 = loc("flip"(#loc42)) +#loc177 = loc("flip"(#loc43)) +#loc178 = loc("flip"(#loc44)) +#loc179 = loc("flip"(#loc45)) +#loc183 = loc("y"(#loc50)) +#loc184 = loc("right_mask"(#loc51)) +#loc185 = loc("right_mask"(#loc52)) +#loc186 = loc("left_mask"(#loc53)) +#loc187 = loc("ileft"(#loc54)) +#loc188 = loc("ileft"(#loc55)) +#loc189 = loc("ileft"(#loc56)) +#loc190 = loc("ileft"(#loc57)) +#loc191 = loc("iright"(#loc58)) +#loc192 = loc("iright"(#loc59)) +#loc193 = loc("iright"(#loc60)) +#loc194 = loc("iright"(#loc61)) +#loc195 = loc("ileft"(#loc62)) +#loc196 = loc("iright"(#loc63)) +#loc197 = loc("y_idx"(#loc64)) +#loc198 = loc("left_idx"(#loc65)) +#loc199 = loc("left_idx"(#loc66)) +#loc200 = loc("left_idx"(#loc67)) +#loc201 = loc("left_idx"(#loc68)) +#loc202 = loc("left_idx"(#loc69)) +#loc203 = loc("right_idx"(#loc70)) +#loc204 = loc("right_idx"(#loc71)) +#loc205 = loc("right_idx"(#loc72)) +#loc206 = loc("right_idx"(#loc73)) +#loc207 = loc("right_idx"(#loc74)) +#loc208 = loc("left_idx"(#loc75)) +#loc209 = loc("right_idx"(#loc76)) +#loc210 = loc("left_valid_mask"(#loc77)) +#loc211 = loc("right_valid_mask"(#loc78)) +#loc212 = loc("left_isnan"(#loc79)) +#loc213 = loc("right_isnan"(#loc80)) +#loc214 = loc("cond"(#loc81)) +#loc215 = loc("cond"(#loc84)) +#loc216 = loc("cond"(#loc85)) +#loc217 = loc("cond"(#loc86)) +#loc218 = loc("eq"(#loc88)) +#loc219 = loc("eq"(#loc91)) +#loc220 = loc("eq"(#loc92)) +#loc221 = loc("cond"(#loc93)) +#loc222 = loc("cond"(#loc94)) +#loc223 = loc("cond"(#loc95)) +#loc224 = loc("cond"(#loc96)) +#loc225 = loc("cond"(#loc97)) +#loc226 = loc("cond"(#loc98)) +#loc227 = loc("cond"(#loc99)) +#loc228 = loc("cond"(#loc100)) +#loc229 = loc("cond"(#loc101)) +#loc230 = loc("ret"(#loc102)) +#loc231 = loc("ret"(#loc103)) +#loc232 = loc("ret"(#loc104)) +#loc233 = loc("ret"(#loc105)) +#loc234 = loc("new_idxs"(#loc106)) +#loc235 = loc("new_idxs"(#loc107)) +#loc236 = 
loc("new_idxs"(#loc108)) +#loc237 = loc("new_idxs"(#loc109)) +#loc241 = loc("input"(#loc120)) +#loc245 = loc("flip"(#loc138)) +#loc246 = loc("cond"(#loc214)) +#loc247 = loc("cond"(#loc217)) +#loc248 = loc("eq"(#loc218)) +#loc249 = loc("eq"(#loc220)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/7/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..dde8978e2afc3ed69d5ec1da625c8d7565e88448 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttgir @@ -0,0 +1,812 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 2], order = [0, 1]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 1, 1], threadsPerWarp = [2, 2, 8], warpsPerCTA = [2, 1, 1], order = [2, 1, 0]}> +#blocked2 = #ttg.blocked<{sizePerThread = [1, 1, 1], threadsPerWarp = [4, 2, 4], warpsPerCTA = [2, 1, 1], order = [2, 1, 0]}> +#blocked3 = #ttg.blocked<{sizePerThread = [1, 1, 1], threadsPerWarp = [8, 2, 2], warpsPerCTA = [2, 1, 1], order = [2, 1, 0]}> +#blocked4 = #ttg.blocked<{sizePerThread = [1, 1, 1], threadsPerWarp = [16, 2, 1], warpsPerCTA = [2, 1, 1], order = [2, 1, 0]}> +#blocked5 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [2, 16], warpsPerCTA = [2, 1], order = [1, 0]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":18:0) +#loc1 = loc(unknown) +#loc15 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":662:12) +#loc16 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":41:67) +#loc20 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":634:73) +#loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:51) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:53) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:50) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:51) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":45:26) +#loc73 = loc("in_ptr0"(#loc)) +#loc74 = loc("out_ptr2"(#loc)) +#loc75 = loc("out_ptr3"(#loc)) +#loc76 = loc("xnumel"(#loc)) +#loc77 = loc("r0_numel"(#loc)) +#loc91 = loc(callsite(#loc15 at #loc16)) +#loc97 = loc("ileft"(#loc24)) +#loc101 = loc("iright"(#loc29)) +#loc110 = loc("left_idx"(#loc38)) +#loc115 = loc("right_idx"(#loc43)) +#loc135 = loc("tmp11"(#loc63)) +#loc145 = loc(callsite(#loc20 at #loc91)) +#loc149 = loc(callsite(#loc1 at #loc135)) +#loc153 = loc(callsite(#loc97 at #loc145)) +#loc157 = loc(callsite(#loc101 at #loc145)) +#loc165 = loc(callsite(#loc110 at #loc145)) +#loc170 = loc(callsite(#loc115 at #loc145)) +#loc190 = loc(callsite(#loc1 at #loc153)) +#loc192 = loc(callsite(#loc1 at #loc157)) +#loc195 = loc(callsite(#loc1 at #loc165)) +#loc198 = loc(callsite(#loc1 at #loc170)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 2 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), 
%out_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr3"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<17> : tensor<1x16xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<0> : tensor<1x16xi64, #blocked> loc(#loc1) + %cst_1 = arith.constant dense<1> : tensor<1x2x1xi32, #blocked1> loc(#loc1) + %cst_2 = arith.constant dense<1> : tensor<1x2x1xi32, #blocked2> loc(#loc1) + %cst_3 = arith.constant dense<1> : tensor<1x2x1xi32, #blocked3> loc(#loc1) + %cst_4 = arith.constant dense<0> : tensor<1x16xi32, #blocked> loc(#loc1) + %c272_i32 = arith.constant 272 : i32 loc(#loc1) + %c16_i32 = arith.constant 16 : i32 loc(#loc1) + %c32_i32 = arith.constant 32 : i32 loc(#loc1) + %cst_5 = arith.constant dense<1> : tensor<1x2x1xi32, #blocked4> loc(#loc1) + %cst_6 = arith.constant dense<0> : tensor<1x16xi32, #blocked5> loc(#loc1) + %cst_7 = arith.constant dense<17> : tensor<1x16xi32, #blocked5> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc78) + %xmask = arith.cmpi slt, %xoffset, %c32_i32 : i32 loc(#loc79) + %r0_index = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked5}>> loc(#loc80) + %r0_index_8 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc80) + %r0_index_9 = tt.expand_dims %r0_index {axis = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked5}>> -> tensor<1x16xi32, #blocked5> loc(#loc80) + %r0_index_10 = tt.expand_dims %r0_index_8 {axis = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x16xi32, #blocked> loc(#loc80) + %x0 = arith.remsi %xoffset, %c16_i32 : i32 loc(#loc81) + %x1 = arith.divsi %xoffset, %c16_i32 : i32 loc(#loc82) + %tmp0 = arith.muli %r0_index_9, %cst_7 : tensor<1x16xi32, #blocked5> loc(#loc83) + %tmp0_11 = arith.muli 
%r0_index_10, %cst : tensor<1x16xi32, #blocked> loc(#loc83) + %tmp0_12 = tt.splat %x0 : i32 -> tensor<1x16xi32, #blocked5> loc(#loc139) + %tmp0_13 = tt.splat %x0 : i32 -> tensor<1x16xi32, #blocked> loc(#loc139) + %tmp0_14 = arith.addi %tmp0_12, %tmp0 : tensor<1x16xi32, #blocked5> loc(#loc84) + %tmp0_15 = arith.addi %tmp0_13, %tmp0_11 : tensor<1x16xi32, #blocked> loc(#loc84) + %tmp0_16 = arith.muli %x1, %c272_i32 : i32 loc(#loc85) + %tmp0_17 = tt.splat %tmp0_16 : i32 -> tensor<1x16xi32, #blocked5> loc(#loc140) + %tmp0_18 = tt.splat %tmp0_16 : i32 -> tensor<1x16xi32, #blocked> loc(#loc140) + %tmp0_19 = arith.addi %tmp0_14, %tmp0_17 : tensor<1x16xi32, #blocked5> loc(#loc86) + %tmp0_20 = arith.addi %tmp0_15, %tmp0_18 : tensor<1x16xi32, #blocked> loc(#loc86) + %tmp0_21 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x16x!tt.ptr, #blocked5> loc(#loc87) + %tmp0_22 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x16x!tt.ptr, #blocked> loc(#loc87) + %tmp0_23 = tt.addptr %tmp0_21, %tmp0_19 : tensor<1x16x!tt.ptr, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc87) + %tmp0_24 = tt.addptr %tmp0_22, %tmp0_20 : tensor<1x16x!tt.ptr, #blocked>, tensor<1x16xi32, #blocked> loc(#loc87) + %tmp0_25 = tt.splat %xmask : i1 -> tensor<1x16xi1, #blocked> loc(#loc141) + %tmp0_26 = tt.splat %xmask : i1 -> tensor<1x16xi1, #blocked5> loc(#loc141) + %tmp0_27 = tt.load %tmp0_23, %tmp0_26, %cst_6 : tensor<1x16x!tt.ptr, #blocked5> loc(#loc88) + %tmp0_28 = tt.load %tmp0_24, %tmp0_25, %cst_4 : tensor<1x16x!tt.ptr, #blocked> loc(#loc88) + %tmp2 = arith.trunci %r0_index_9 : tensor<1x16xi32, #blocked5> to tensor<1x16xi16, #blocked5> loc(#loc89) + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked3}>}>> loc(#loc142) + %flip_29 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked4}>}>> loc(#loc142) + %flip_30 = tt.make_range {end = 2 : i32, start = 0 : 
i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked2}>}>> loc(#loc142) + %flip_31 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked1}>}>> loc(#loc142) + %flip_32 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked3}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked3}>> loc(#loc142) + %flip_33 = tt.expand_dims %flip_29 {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked4}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked4}>> loc(#loc142) + %flip_34 = tt.expand_dims %flip_30 {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked2}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked2}>> loc(#loc142) + %flip_35 = tt.expand_dims %flip_31 {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked1}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked1}>> loc(#loc142) + %flip_36 = tt.expand_dims %flip_32 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked3}>> -> tensor<1x2x1xi32, #blocked3> loc(#loc142) + %flip_37 = tt.expand_dims %flip_33 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked4}>> -> tensor<1x2x1xi32, #blocked4> loc(#loc142) + %flip_38 = tt.expand_dims %flip_34 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked2}>> -> tensor<1x2x1xi32, #blocked2> loc(#loc142) + %flip_39 = tt.expand_dims %flip_35 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked1}>> -> tensor<1x2x1xi32, #blocked1> loc(#loc142) + %flip_40 = tt.broadcast %flip_36 : tensor<1x2x1xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc143) + %flip_41 = tt.reshape %flip_40 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> 
loc(#loc144) + %y = tt.reshape %tmp0_27 : tensor<1x16xi32, #blocked5> -> tensor<8x2x1xi32, #blocked4> loc(#loc150) + %left_mask = arith.subi %cst_5, %flip_37 : tensor<1x2x1xi32, #blocked4> loc(#loc151) + %left_mask_42 = arith.subi %cst_3, %flip_36 : tensor<1x2x1xi32, #blocked3> loc(#loc151) + %left_mask_43 = arith.subi %cst_2, %flip_38 : tensor<1x2x1xi32, #blocked2> loc(#loc151) + %left_mask_44 = arith.subi %cst_1, %flip_39 : tensor<1x2x1xi32, #blocked1> loc(#loc151) + %ileft = tt.broadcast %left_mask : tensor<1x2x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc152) + %ileft_45 = arith.muli %y, %ileft : tensor<8x2x1xi32, #blocked4> loc(#loc152) + %ileft_46 = "tt.reduce"(%ileft_45) <{axis = 1 : i32}> ({ + ^bb0(%ileft_396: i32 loc(callsite(#loc1 at #loc153)), %ileft_397: i32 loc(callsite(#loc1 at #loc153))): + %ileft_398 = arith.addi %ileft_396, %ileft_397 : i32 loc(#loc199) + tt.reduce.return %ileft_398 : i32 loc(#loc189) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc189) + %ileft_47 = tt.expand_dims %ileft_46 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc154) + %ileft_48 = tt.broadcast %ileft_47 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc155) + %iright = tt.broadcast %flip_37 : tensor<1x2x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc156) + %iright_49 = arith.muli %y, %iright : tensor<8x2x1xi32, #blocked4> loc(#loc156) + %iright_50 = "tt.reduce"(%iright_49) <{axis = 1 : i32}> ({ + ^bb0(%iright_396: i32 loc(callsite(#loc1 at #loc157)), %iright_397: i32 loc(callsite(#loc1 at #loc157))): + %iright_398 = arith.addi %iright_396, %iright_397 : i32 loc(#loc200) + tt.reduce.return %iright_398 : i32 loc(#loc191) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc191) + %iright_51 = tt.expand_dims %iright_50 {axis = 1 : i32} : 
tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc158) + %iright_52 = tt.broadcast %iright_51 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc159) + %ileft_53 = tt.reshape %ileft_48 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc160) + %iright_54 = tt.reshape %iright_52 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc161) + %y_idx = tt.reshape %tmp2 : tensor<1x16xi16, #blocked5> -> tensor<8x2x1xi16, #blocked4> loc(#loc162) + %left_idx = arith.trunci %left_mask : tensor<1x2x1xi32, #blocked4> to tensor<1x2x1xi16, #blocked4> loc(#loc163) + %left_idx_55 = tt.broadcast %left_idx : tensor<1x2x1xi16, #blocked4> -> tensor<8x2x1xi16, #blocked4> loc(#loc164) + %left_idx_56 = arith.muli %y_idx, %left_idx_55 : tensor<8x2x1xi16, #blocked4> loc(#loc164) + %input = arith.extsi %left_idx_56 : tensor<8x2x1xi16, #blocked4> to tensor<8x2x1xi32, #blocked4> loc(#loc193) + %left_idx_57 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_396: i32 loc(callsite(#loc1 at #loc165)), %left_idx_397: i32 loc(callsite(#loc1 at #loc165))): + %left_idx_398 = arith.addi %left_idx_396, %left_idx_397 : i32 loc(#loc201) + tt.reduce.return %left_idx_398 : i32 loc(#loc194) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc194) + %left_idx_58 = tt.expand_dims %left_idx_57 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc166) + %left_idx_59 = tt.broadcast %left_idx_58 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc167) + %right_idx = arith.trunci %flip_37 : tensor<1x2x1xi32, #blocked4> to tensor<1x2x1xi16, #blocked4> loc(#loc168) + %right_idx_60 = tt.broadcast %right_idx : tensor<1x2x1xi16, #blocked4> -> tensor<8x2x1xi16, #blocked4> loc(#loc169) + %right_idx_61 = arith.muli %y_idx, %right_idx_60 : tensor<8x2x1xi16, #blocked4> 
loc(#loc169) + %input_62 = arith.extsi %right_idx_61 : tensor<8x2x1xi16, #blocked4> to tensor<8x2x1xi32, #blocked4> loc(#loc196) + %right_idx_63 = "tt.reduce"(%input_62) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_396: i32 loc(callsite(#loc1 at #loc170)), %right_idx_397: i32 loc(callsite(#loc1 at #loc170))): + %right_idx_398 = arith.addi %right_idx_396, %right_idx_397 : i32 loc(#loc202) + tt.reduce.return %right_idx_398 : i32 loc(#loc197) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc197) + %right_idx_64 = tt.expand_dims %right_idx_63 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc171) + %right_idx_65 = tt.broadcast %right_idx_64 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc172) + %left_idx_66 = tt.reshape %left_idx_59 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc173) + %right_idx_67 = tt.reshape %right_idx_65 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc174) + %cond = arith.cmpi slt, %ileft_53, %iright_54 : tensor<1x16xi32, #blocked5> loc(#loc175) + %eq = arith.cmpi eq, %ileft_53, %iright_54 : tensor<1x16xi32, #blocked5> loc(#loc176) + %cond_68 = arith.cmpi sgt, %left_idx_66, %right_idx_67 : tensor<1x16xi32, #blocked5> loc(#loc177) + %cond_69 = arith.andi %eq, %cond_68 : tensor<1x16xi1, #blocked5> loc(#loc178) + %cond_70 = arith.ori %cond, %cond_69 : tensor<1x16xi1, #blocked5> loc(#loc179) + %cond_71 = arith.extui %cond_70 : tensor<1x16xi1, #blocked5> to tensor<1x16xi32, #blocked5> loc(#loc180) + %cond_72 = arith.xori %cond_71, %flip_41 : tensor<1x16xi32, #blocked5> loc(#loc180) + %cond_73 = arith.cmpi ne, %cond_72, %cst_6 : tensor<1x16xi32, #blocked5> loc(#loc181) + %ret = arith.xori %ileft_53, %iright_54 : tensor<1x16xi32, #blocked5> loc(#loc182) + %ret_74 = arith.select %cond_73, %ret, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> 
loc(#loc183) + %ret_75 = arith.xori %tmp0_27, %ret_74 : tensor<1x16xi32, #blocked5> loc(#loc184) + %new_idxs = arith.xori %left_idx_66, %right_idx_67 : tensor<1x16xi32, #blocked5> loc(#loc185) + %new_idxs_76 = arith.select %cond_73, %new_idxs, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc186) + %new_idxs_77 = arith.extsi %tmp2 : tensor<1x16xi16, #blocked5> to tensor<1x16xi32, #blocked5> loc(#loc187) + %new_idxs_78 = arith.xori %new_idxs_77, %new_idxs_76 : tensor<1x16xi32, #blocked5> loc(#loc187) + %flip_79 = tt.broadcast %flip_38 : tensor<1x2x1xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc143) + %flip_80 = tt.reshape %flip_79 : tensor<2x2x4xi32, #blocked2> -> tensor<1x16xi32, #blocked5> loc(#loc144) + %y_81 = tt.reshape %ret_75 : tensor<1x16xi32, #blocked5> -> tensor<4x2x2xi32, #blocked3> loc(#loc150) + %ileft_82 = tt.broadcast %left_mask_42 : tensor<1x2x1xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc152) + %ileft_83 = arith.muli %y_81, %ileft_82 : tensor<4x2x2xi32, #blocked3> loc(#loc152) + %ileft_84 = "tt.reduce"(%ileft_83) <{axis = 1 : i32}> ({ + ^bb0(%ileft_396: i32 loc(callsite(#loc1 at #loc153)), %ileft_397: i32 loc(callsite(#loc1 at #loc153))): + %ileft_398 = arith.addi %ileft_396, %ileft_397 : i32 loc(#loc199) + tt.reduce.return %ileft_398 : i32 loc(#loc189) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc189) + %ileft_85 = tt.expand_dims %ileft_84 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc154) + %ileft_86 = tt.broadcast %ileft_85 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc155) + %iright_87 = arith.muli %y_81, %flip_40 : tensor<4x2x2xi32, #blocked3> loc(#loc156) + %iright_88 = "tt.reduce"(%iright_87) <{axis = 1 : i32}> ({ + ^bb0(%iright_396: i32 loc(callsite(#loc1 at #loc157)), %iright_397: i32 loc(callsite(#loc1 at #loc157))): + %iright_398 = 
arith.addi %iright_396, %iright_397 : i32 loc(#loc200) + tt.reduce.return %iright_398 : i32 loc(#loc191) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc191) + %iright_89 = tt.expand_dims %iright_88 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc158) + %iright_90 = tt.broadcast %iright_89 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc159) + %ileft_91 = tt.reshape %ileft_86 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc160) + %iright_92 = tt.reshape %iright_90 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc161) + %y_idx_93 = tt.reshape %new_idxs_78 : tensor<1x16xi32, #blocked5> -> tensor<4x2x2xi32, #blocked3> loc(#loc162) + %left_idx_94 = arith.muli %y_idx_93, %ileft_82 : tensor<4x2x2xi32, #blocked3> loc(#loc164) + %left_idx_95 = "tt.reduce"(%left_idx_94) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_396: i32 loc(callsite(#loc1 at #loc165)), %left_idx_397: i32 loc(callsite(#loc1 at #loc165))): + %left_idx_398 = arith.addi %left_idx_396, %left_idx_397 : i32 loc(#loc201) + tt.reduce.return %left_idx_398 : i32 loc(#loc194) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc194) + %left_idx_96 = tt.expand_dims %left_idx_95 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc166) + %left_idx_97 = tt.broadcast %left_idx_96 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc167) + %right_idx_98 = arith.muli %y_idx_93, %flip_40 : tensor<4x2x2xi32, #blocked3> loc(#loc169) + %right_idx_99 = "tt.reduce"(%right_idx_98) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_396: i32 loc(callsite(#loc1 at #loc170)), %right_idx_397: i32 loc(callsite(#loc1 at #loc170))): + %right_idx_398 = arith.addi %right_idx_396, %right_idx_397 : i32 loc(#loc202) + 
tt.reduce.return %right_idx_398 : i32 loc(#loc197) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc197) + %right_idx_100 = tt.expand_dims %right_idx_99 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc171) + %right_idx_101 = tt.broadcast %right_idx_100 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc172) + %left_idx_102 = tt.reshape %left_idx_97 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc173) + %right_idx_103 = tt.reshape %right_idx_101 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc174) + %cond_104 = arith.cmpi slt, %ileft_91, %iright_92 : tensor<1x16xi32, #blocked5> loc(#loc175) + %eq_105 = arith.cmpi eq, %ileft_91, %iright_92 : tensor<1x16xi32, #blocked5> loc(#loc176) + %cond_106 = arith.cmpi sgt, %left_idx_102, %right_idx_103 : tensor<1x16xi32, #blocked5> loc(#loc177) + %cond_107 = arith.andi %eq_105, %cond_106 : tensor<1x16xi1, #blocked5> loc(#loc178) + %cond_108 = arith.ori %cond_104, %cond_107 : tensor<1x16xi1, #blocked5> loc(#loc179) + %cond_109 = arith.extui %cond_108 : tensor<1x16xi1, #blocked5> to tensor<1x16xi32, #blocked5> loc(#loc180) + %cond_110 = arith.xori %cond_109, %flip_80 : tensor<1x16xi32, #blocked5> loc(#loc180) + %cond_111 = arith.cmpi ne, %cond_110, %cst_6 : tensor<1x16xi32, #blocked5> loc(#loc181) + %ret_112 = arith.xori %ileft_91, %iright_92 : tensor<1x16xi32, #blocked5> loc(#loc182) + %ret_113 = arith.select %cond_111, %ret_112, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc183) + %ret_114 = arith.xori %ret_75, %ret_113 : tensor<1x16xi32, #blocked5> loc(#loc184) + %new_idxs_115 = arith.xori %left_idx_102, %right_idx_103 : tensor<1x16xi32, #blocked5> loc(#loc185) + %new_idxs_116 = arith.select %cond_111, %new_idxs_115, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc186) + %new_idxs_117 = 
arith.xori %new_idxs_78, %new_idxs_116 : tensor<1x16xi32, #blocked5> loc(#loc187) + %y_118 = tt.reshape %ret_114 : tensor<1x16xi32, #blocked5> -> tensor<8x2x1xi32, #blocked4> loc(#loc150) + %ileft_119 = arith.muli %y_118, %ileft : tensor<8x2x1xi32, #blocked4> loc(#loc152) + %ileft_120 = "tt.reduce"(%ileft_119) <{axis = 1 : i32}> ({ + ^bb0(%ileft_396: i32 loc(callsite(#loc1 at #loc153)), %ileft_397: i32 loc(callsite(#loc1 at #loc153))): + %ileft_398 = arith.addi %ileft_396, %ileft_397 : i32 loc(#loc199) + tt.reduce.return %ileft_398 : i32 loc(#loc189) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc189) + %ileft_121 = tt.expand_dims %ileft_120 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc154) + %ileft_122 = tt.broadcast %ileft_121 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc155) + %iright_123 = arith.muli %y_118, %iright : tensor<8x2x1xi32, #blocked4> loc(#loc156) + %iright_124 = "tt.reduce"(%iright_123) <{axis = 1 : i32}> ({ + ^bb0(%iright_396: i32 loc(callsite(#loc1 at #loc157)), %iright_397: i32 loc(callsite(#loc1 at #loc157))): + %iright_398 = arith.addi %iright_396, %iright_397 : i32 loc(#loc200) + tt.reduce.return %iright_398 : i32 loc(#loc191) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc191) + %iright_125 = tt.expand_dims %iright_124 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc158) + %iright_126 = tt.broadcast %iright_125 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc159) + %ileft_127 = tt.reshape %ileft_122 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc160) + %iright_128 = tt.reshape %iright_126 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc161) + %y_idx_129 = tt.reshape %new_idxs_117 : 
tensor<1x16xi32, #blocked5> -> tensor<8x2x1xi32, #blocked4> loc(#loc162) + %left_idx_130 = arith.muli %y_idx_129, %ileft : tensor<8x2x1xi32, #blocked4> loc(#loc164) + %left_idx_131 = "tt.reduce"(%left_idx_130) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_396: i32 loc(callsite(#loc1 at #loc165)), %left_idx_397: i32 loc(callsite(#loc1 at #loc165))): + %left_idx_398 = arith.addi %left_idx_396, %left_idx_397 : i32 loc(#loc201) + tt.reduce.return %left_idx_398 : i32 loc(#loc194) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc194) + %left_idx_132 = tt.expand_dims %left_idx_131 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc166) + %left_idx_133 = tt.broadcast %left_idx_132 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc167) + %right_idx_134 = arith.muli %y_idx_129, %iright : tensor<8x2x1xi32, #blocked4> loc(#loc169) + %right_idx_135 = "tt.reduce"(%right_idx_134) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_396: i32 loc(callsite(#loc1 at #loc170)), %right_idx_397: i32 loc(callsite(#loc1 at #loc170))): + %right_idx_398 = arith.addi %right_idx_396, %right_idx_397 : i32 loc(#loc202) + tt.reduce.return %right_idx_398 : i32 loc(#loc197) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc197) + %right_idx_136 = tt.expand_dims %right_idx_135 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc171) + %right_idx_137 = tt.broadcast %right_idx_136 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc172) + %left_idx_138 = tt.reshape %left_idx_133 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc173) + %right_idx_139 = tt.reshape %right_idx_137 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc174) + %cond_140 = arith.cmpi slt, %ileft_127, %iright_128 : 
tensor<1x16xi32, #blocked5> loc(#loc175) + %eq_141 = arith.cmpi eq, %ileft_127, %iright_128 : tensor<1x16xi32, #blocked5> loc(#loc176) + %cond_142 = arith.cmpi sgt, %left_idx_138, %right_idx_139 : tensor<1x16xi32, #blocked5> loc(#loc177) + %cond_143 = arith.andi %eq_141, %cond_142 : tensor<1x16xi1, #blocked5> loc(#loc178) + %cond_144 = arith.ori %cond_140, %cond_143 : tensor<1x16xi1, #blocked5> loc(#loc179) + %cond_145 = arith.extui %cond_144 : tensor<1x16xi1, #blocked5> to tensor<1x16xi32, #blocked5> loc(#loc180) + %cond_146 = arith.xori %cond_145, %flip_80 : tensor<1x16xi32, #blocked5> loc(#loc180) + %cond_147 = arith.cmpi ne, %cond_146, %cst_6 : tensor<1x16xi32, #blocked5> loc(#loc181) + %ret_148 = arith.xori %ileft_127, %iright_128 : tensor<1x16xi32, #blocked5> loc(#loc182) + %ret_149 = arith.select %cond_147, %ret_148, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc183) + %ret_150 = arith.xori %ret_114, %ret_149 : tensor<1x16xi32, #blocked5> loc(#loc184) + %new_idxs_151 = arith.xori %left_idx_138, %right_idx_139 : tensor<1x16xi32, #blocked5> loc(#loc185) + %new_idxs_152 = arith.select %cond_147, %new_idxs_151, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc186) + %new_idxs_153 = arith.xori %new_idxs_117, %new_idxs_152 : tensor<1x16xi32, #blocked5> loc(#loc187) + %flip_154 = tt.broadcast %flip_39 : tensor<1x2x1xi32, #blocked1> -> tensor<1x2x8xi32, #blocked1> loc(#loc143) + %flip_155 = tt.reshape %flip_154 : tensor<1x2x8xi32, #blocked1> -> tensor<1x16xi32, #blocked5> loc(#loc144) + %y_156 = tt.reshape %ret_150 : tensor<1x16xi32, #blocked5> -> tensor<2x2x4xi32, #blocked2> loc(#loc150) + %ileft_157 = tt.broadcast %left_mask_43 : tensor<1x2x1xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc152) + %ileft_158 = arith.muli %y_156, %ileft_157 : tensor<2x2x4xi32, #blocked2> loc(#loc152) + %ileft_159 = "tt.reduce"(%ileft_158) <{axis = 1 : i32}> ({ + ^bb0(%ileft_396: i32 loc(callsite(#loc1 at #loc153)), 
%ileft_397: i32 loc(callsite(#loc1 at #loc153))): + %ileft_398 = arith.addi %ileft_396, %ileft_397 : i32 loc(#loc199) + tt.reduce.return %ileft_398 : i32 loc(#loc189) + }) : (tensor<2x2x4xi32, #blocked2>) -> tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc189) + %ileft_160 = tt.expand_dims %ileft_159 {axis = 1 : i32} : tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<2x1x4xi32, #blocked2> loc(#loc154) + %ileft_161 = tt.broadcast %ileft_160 : tensor<2x1x4xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc155) + %iright_162 = arith.muli %y_156, %flip_79 : tensor<2x2x4xi32, #blocked2> loc(#loc156) + %iright_163 = "tt.reduce"(%iright_162) <{axis = 1 : i32}> ({ + ^bb0(%iright_396: i32 loc(callsite(#loc1 at #loc157)), %iright_397: i32 loc(callsite(#loc1 at #loc157))): + %iright_398 = arith.addi %iright_396, %iright_397 : i32 loc(#loc200) + tt.reduce.return %iright_398 : i32 loc(#loc191) + }) : (tensor<2x2x4xi32, #blocked2>) -> tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc191) + %iright_164 = tt.expand_dims %iright_163 {axis = 1 : i32} : tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<2x1x4xi32, #blocked2> loc(#loc158) + %iright_165 = tt.broadcast %iright_164 : tensor<2x1x4xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc159) + %ileft_166 = tt.reshape %ileft_161 : tensor<2x2x4xi32, #blocked2> -> tensor<1x16xi32, #blocked5> loc(#loc160) + %iright_167 = tt.reshape %iright_165 : tensor<2x2x4xi32, #blocked2> -> tensor<1x16xi32, #blocked5> loc(#loc161) + %y_idx_168 = tt.reshape %new_idxs_153 : tensor<1x16xi32, #blocked5> -> tensor<2x2x4xi32, #blocked2> loc(#loc162) + %left_idx_169 = arith.muli %y_idx_168, %ileft_157 : tensor<2x2x4xi32, #blocked2> loc(#loc164) + %left_idx_170 = "tt.reduce"(%left_idx_169) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_396: i32 loc(callsite(#loc1 at #loc165)), %left_idx_397: i32 loc(callsite(#loc1 at #loc165))): + %left_idx_398 = arith.addi %left_idx_396, 
%left_idx_397 : i32 loc(#loc201) + tt.reduce.return %left_idx_398 : i32 loc(#loc194) + }) : (tensor<2x2x4xi32, #blocked2>) -> tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc194) + %left_idx_171 = tt.expand_dims %left_idx_170 {axis = 1 : i32} : tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<2x1x4xi32, #blocked2> loc(#loc166) + %left_idx_172 = tt.broadcast %left_idx_171 : tensor<2x1x4xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc167) + %right_idx_173 = arith.muli %y_idx_168, %flip_79 : tensor<2x2x4xi32, #blocked2> loc(#loc169) + %right_idx_174 = "tt.reduce"(%right_idx_173) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_396: i32 loc(callsite(#loc1 at #loc170)), %right_idx_397: i32 loc(callsite(#loc1 at #loc170))): + %right_idx_398 = arith.addi %right_idx_396, %right_idx_397 : i32 loc(#loc202) + tt.reduce.return %right_idx_398 : i32 loc(#loc197) + }) : (tensor<2x2x4xi32, #blocked2>) -> tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc197) + %right_idx_175 = tt.expand_dims %right_idx_174 {axis = 1 : i32} : tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<2x1x4xi32, #blocked2> loc(#loc171) + %right_idx_176 = tt.broadcast %right_idx_175 : tensor<2x1x4xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc172) + %left_idx_177 = tt.reshape %left_idx_172 : tensor<2x2x4xi32, #blocked2> -> tensor<1x16xi32, #blocked5> loc(#loc173) + %right_idx_178 = tt.reshape %right_idx_176 : tensor<2x2x4xi32, #blocked2> -> tensor<1x16xi32, #blocked5> loc(#loc174) + %cond_179 = arith.cmpi slt, %ileft_166, %iright_167 : tensor<1x16xi32, #blocked5> loc(#loc175) + %eq_180 = arith.cmpi eq, %ileft_166, %iright_167 : tensor<1x16xi32, #blocked5> loc(#loc176) + %cond_181 = arith.cmpi sgt, %left_idx_177, %right_idx_178 : tensor<1x16xi32, #blocked5> loc(#loc177) + %cond_182 = arith.andi %eq_180, %cond_181 : tensor<1x16xi1, #blocked5> loc(#loc178) + %cond_183 = arith.ori %cond_179, %cond_182 : tensor<1x16xi1, #blocked5> 
loc(#loc179) + %cond_184 = arith.extui %cond_183 : tensor<1x16xi1, #blocked5> to tensor<1x16xi32, #blocked5> loc(#loc180) + %cond_185 = arith.xori %cond_184, %flip_155 : tensor<1x16xi32, #blocked5> loc(#loc180) + %cond_186 = arith.cmpi ne, %cond_185, %cst_6 : tensor<1x16xi32, #blocked5> loc(#loc181) + %ret_187 = arith.xori %ileft_166, %iright_167 : tensor<1x16xi32, #blocked5> loc(#loc182) + %ret_188 = arith.select %cond_186, %ret_187, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc183) + %ret_189 = arith.xori %ret_150, %ret_188 : tensor<1x16xi32, #blocked5> loc(#loc184) + %new_idxs_190 = arith.xori %left_idx_177, %right_idx_178 : tensor<1x16xi32, #blocked5> loc(#loc185) + %new_idxs_191 = arith.select %cond_186, %new_idxs_190, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc186) + %new_idxs_192 = arith.xori %new_idxs_153, %new_idxs_191 : tensor<1x16xi32, #blocked5> loc(#loc187) + %y_193 = tt.reshape %ret_189 : tensor<1x16xi32, #blocked5> -> tensor<4x2x2xi32, #blocked3> loc(#loc150) + %ileft_194 = arith.muli %y_193, %ileft_82 : tensor<4x2x2xi32, #blocked3> loc(#loc152) + %ileft_195 = "tt.reduce"(%ileft_194) <{axis = 1 : i32}> ({ + ^bb0(%ileft_396: i32 loc(callsite(#loc1 at #loc153)), %ileft_397: i32 loc(callsite(#loc1 at #loc153))): + %ileft_398 = arith.addi %ileft_396, %ileft_397 : i32 loc(#loc199) + tt.reduce.return %ileft_398 : i32 loc(#loc189) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc189) + %ileft_196 = tt.expand_dims %ileft_195 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc154) + %ileft_197 = tt.broadcast %ileft_196 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc155) + %iright_198 = arith.muli %y_193, %flip_40 : tensor<4x2x2xi32, #blocked3> loc(#loc156) + %iright_199 = "tt.reduce"(%iright_198) <{axis = 1 : i32}> ({ + ^bb0(%iright_396: i32 
loc(callsite(#loc1 at #loc157)), %iright_397: i32 loc(callsite(#loc1 at #loc157))): + %iright_398 = arith.addi %iright_396, %iright_397 : i32 loc(#loc200) + tt.reduce.return %iright_398 : i32 loc(#loc191) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc191) + %iright_200 = tt.expand_dims %iright_199 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc158) + %iright_201 = tt.broadcast %iright_200 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc159) + %ileft_202 = tt.reshape %ileft_197 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc160) + %iright_203 = tt.reshape %iright_201 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc161) + %y_idx_204 = tt.reshape %new_idxs_192 : tensor<1x16xi32, #blocked5> -> tensor<4x2x2xi32, #blocked3> loc(#loc162) + %left_idx_205 = arith.muli %y_idx_204, %ileft_82 : tensor<4x2x2xi32, #blocked3> loc(#loc164) + %left_idx_206 = "tt.reduce"(%left_idx_205) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_396: i32 loc(callsite(#loc1 at #loc165)), %left_idx_397: i32 loc(callsite(#loc1 at #loc165))): + %left_idx_398 = arith.addi %left_idx_396, %left_idx_397 : i32 loc(#loc201) + tt.reduce.return %left_idx_398 : i32 loc(#loc194) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc194) + %left_idx_207 = tt.expand_dims %left_idx_206 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc166) + %left_idx_208 = tt.broadcast %left_idx_207 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc167) + %right_idx_209 = arith.muli %y_idx_204, %flip_40 : tensor<4x2x2xi32, #blocked3> loc(#loc169) + %right_idx_210 = "tt.reduce"(%right_idx_209) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_396: i32 loc(callsite(#loc1 at #loc170)), %right_idx_397: i32 
loc(callsite(#loc1 at #loc170))): + %right_idx_398 = arith.addi %right_idx_396, %right_idx_397 : i32 loc(#loc202) + tt.reduce.return %right_idx_398 : i32 loc(#loc197) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc197) + %right_idx_211 = tt.expand_dims %right_idx_210 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc171) + %right_idx_212 = tt.broadcast %right_idx_211 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc172) + %left_idx_213 = tt.reshape %left_idx_208 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc173) + %right_idx_214 = tt.reshape %right_idx_212 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc174) + %cond_215 = arith.cmpi slt, %ileft_202, %iright_203 : tensor<1x16xi32, #blocked5> loc(#loc175) + %eq_216 = arith.cmpi eq, %ileft_202, %iright_203 : tensor<1x16xi32, #blocked5> loc(#loc176) + %cond_217 = arith.cmpi sgt, %left_idx_213, %right_idx_214 : tensor<1x16xi32, #blocked5> loc(#loc177) + %cond_218 = arith.andi %eq_216, %cond_217 : tensor<1x16xi1, #blocked5> loc(#loc178) + %cond_219 = arith.ori %cond_215, %cond_218 : tensor<1x16xi1, #blocked5> loc(#loc179) + %cond_220 = arith.extui %cond_219 : tensor<1x16xi1, #blocked5> to tensor<1x16xi32, #blocked5> loc(#loc180) + %cond_221 = arith.xori %cond_220, %flip_155 : tensor<1x16xi32, #blocked5> loc(#loc180) + %cond_222 = arith.cmpi ne, %cond_221, %cst_6 : tensor<1x16xi32, #blocked5> loc(#loc181) + %ret_223 = arith.xori %ileft_202, %iright_203 : tensor<1x16xi32, #blocked5> loc(#loc182) + %ret_224 = arith.select %cond_222, %ret_223, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc183) + %ret_225 = arith.xori %ret_189, %ret_224 : tensor<1x16xi32, #blocked5> loc(#loc184) + %new_idxs_226 = arith.xori %left_idx_213, %right_idx_214 : tensor<1x16xi32, #blocked5> loc(#loc185) + %new_idxs_227 = 
arith.select %cond_222, %new_idxs_226, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc186) + %new_idxs_228 = arith.xori %new_idxs_192, %new_idxs_227 : tensor<1x16xi32, #blocked5> loc(#loc187) + %y_229 = tt.reshape %ret_225 : tensor<1x16xi32, #blocked5> -> tensor<8x2x1xi32, #blocked4> loc(#loc150) + %ileft_230 = arith.muli %y_229, %ileft : tensor<8x2x1xi32, #blocked4> loc(#loc152) + %ileft_231 = "tt.reduce"(%ileft_230) <{axis = 1 : i32}> ({ + ^bb0(%ileft_396: i32 loc(callsite(#loc1 at #loc153)), %ileft_397: i32 loc(callsite(#loc1 at #loc153))): + %ileft_398 = arith.addi %ileft_396, %ileft_397 : i32 loc(#loc199) + tt.reduce.return %ileft_398 : i32 loc(#loc189) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc189) + %ileft_232 = tt.expand_dims %ileft_231 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc154) + %ileft_233 = tt.broadcast %ileft_232 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc155) + %iright_234 = arith.muli %y_229, %iright : tensor<8x2x1xi32, #blocked4> loc(#loc156) + %iright_235 = "tt.reduce"(%iright_234) <{axis = 1 : i32}> ({ + ^bb0(%iright_396: i32 loc(callsite(#loc1 at #loc157)), %iright_397: i32 loc(callsite(#loc1 at #loc157))): + %iright_398 = arith.addi %iright_396, %iright_397 : i32 loc(#loc200) + tt.reduce.return %iright_398 : i32 loc(#loc191) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc191) + %iright_236 = tt.expand_dims %iright_235 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc158) + %iright_237 = tt.broadcast %iright_236 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc159) + %ileft_238 = tt.reshape %ileft_233 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc160) + %iright_239 = tt.reshape 
%iright_237 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc161) + %y_idx_240 = tt.reshape %new_idxs_228 : tensor<1x16xi32, #blocked5> -> tensor<8x2x1xi32, #blocked4> loc(#loc162) + %left_idx_241 = arith.muli %y_idx_240, %ileft : tensor<8x2x1xi32, #blocked4> loc(#loc164) + %left_idx_242 = "tt.reduce"(%left_idx_241) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_396: i32 loc(callsite(#loc1 at #loc165)), %left_idx_397: i32 loc(callsite(#loc1 at #loc165))): + %left_idx_398 = arith.addi %left_idx_396, %left_idx_397 : i32 loc(#loc201) + tt.reduce.return %left_idx_398 : i32 loc(#loc194) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc194) + %left_idx_243 = tt.expand_dims %left_idx_242 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc166) + %left_idx_244 = tt.broadcast %left_idx_243 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc167) + %right_idx_245 = arith.muli %y_idx_240, %iright : tensor<8x2x1xi32, #blocked4> loc(#loc169) + %right_idx_246 = "tt.reduce"(%right_idx_245) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_396: i32 loc(callsite(#loc1 at #loc170)), %right_idx_397: i32 loc(callsite(#loc1 at #loc170))): + %right_idx_398 = arith.addi %right_idx_396, %right_idx_397 : i32 loc(#loc202) + tt.reduce.return %right_idx_398 : i32 loc(#loc197) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc197) + %right_idx_247 = tt.expand_dims %right_idx_246 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc171) + %right_idx_248 = tt.broadcast %right_idx_247 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc172) + %left_idx_249 = tt.reshape %left_idx_244 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc173) + %right_idx_250 = tt.reshape %right_idx_248 : 
tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc174) + %cond_251 = arith.cmpi slt, %ileft_238, %iright_239 : tensor<1x16xi32, #blocked5> loc(#loc175) + %eq_252 = arith.cmpi eq, %ileft_238, %iright_239 : tensor<1x16xi32, #blocked5> loc(#loc176) + %cond_253 = arith.cmpi sgt, %left_idx_249, %right_idx_250 : tensor<1x16xi32, #blocked5> loc(#loc177) + %cond_254 = arith.andi %eq_252, %cond_253 : tensor<1x16xi1, #blocked5> loc(#loc178) + %cond_255 = arith.ori %cond_251, %cond_254 : tensor<1x16xi1, #blocked5> loc(#loc179) + %cond_256 = arith.extui %cond_255 : tensor<1x16xi1, #blocked5> to tensor<1x16xi32, #blocked5> loc(#loc180) + %cond_257 = arith.xori %cond_256, %flip_155 : tensor<1x16xi32, #blocked5> loc(#loc180) + %cond_258 = arith.cmpi ne, %cond_257, %cst_6 : tensor<1x16xi32, #blocked5> loc(#loc181) + %ret_259 = arith.xori %ileft_238, %iright_239 : tensor<1x16xi32, #blocked5> loc(#loc182) + %ret_260 = arith.select %cond_258, %ret_259, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc183) + %ret_261 = arith.xori %ret_225, %ret_260 : tensor<1x16xi32, #blocked5> loc(#loc184) + %new_idxs_262 = arith.xori %left_idx_249, %right_idx_250 : tensor<1x16xi32, #blocked5> loc(#loc185) + %new_idxs_263 = arith.select %cond_258, %new_idxs_262, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc186) + %new_idxs_264 = arith.xori %new_idxs_228, %new_idxs_263 : tensor<1x16xi32, #blocked5> loc(#loc187) + %y_265 = tt.reshape %ret_261 : tensor<1x16xi32, #blocked5> -> tensor<1x2x8xi32, #blocked1> loc(#loc150) + %ileft_266 = tt.broadcast %left_mask_44 : tensor<1x2x1xi32, #blocked1> -> tensor<1x2x8xi32, #blocked1> loc(#loc152) + %ileft_267 = arith.muli %y_265, %ileft_266 : tensor<1x2x8xi32, #blocked1> loc(#loc152) + %ileft_268 = "tt.reduce"(%ileft_267) <{axis = 1 : i32}> ({ + ^bb0(%ileft_396: i32 loc(callsite(#loc1 at #loc153)), %ileft_397: i32 loc(callsite(#loc1 at #loc153))): + %ileft_398 = arith.addi %ileft_396, %ileft_397 : 
i32 loc(#loc199) + tt.reduce.return %ileft_398 : i32 loc(#loc189) + }) : (tensor<1x2x8xi32, #blocked1>) -> tensor<1x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc189) + %ileft_269 = tt.expand_dims %ileft_268 {axis = 1 : i32} : tensor<1x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<1x1x8xi32, #blocked1> loc(#loc154) + %ileft_270 = tt.broadcast %ileft_269 : tensor<1x1x8xi32, #blocked1> -> tensor<1x2x8xi32, #blocked1> loc(#loc155) + %iright_271 = arith.muli %y_265, %flip_154 : tensor<1x2x8xi32, #blocked1> loc(#loc156) + %iright_272 = "tt.reduce"(%iright_271) <{axis = 1 : i32}> ({ + ^bb0(%iright_396: i32 loc(callsite(#loc1 at #loc157)), %iright_397: i32 loc(callsite(#loc1 at #loc157))): + %iright_398 = arith.addi %iright_396, %iright_397 : i32 loc(#loc200) + tt.reduce.return %iright_398 : i32 loc(#loc191) + }) : (tensor<1x2x8xi32, #blocked1>) -> tensor<1x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc191) + %iright_273 = tt.expand_dims %iright_272 {axis = 1 : i32} : tensor<1x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<1x1x8xi32, #blocked1> loc(#loc158) + %iright_274 = tt.broadcast %iright_273 : tensor<1x1x8xi32, #blocked1> -> tensor<1x2x8xi32, #blocked1> loc(#loc159) + %ileft_275 = tt.reshape %ileft_270 : tensor<1x2x8xi32, #blocked1> -> tensor<1x16xi32, #blocked5> loc(#loc160) + %iright_276 = tt.reshape %iright_274 : tensor<1x2x8xi32, #blocked1> -> tensor<1x16xi32, #blocked5> loc(#loc161) + %y_idx_277 = tt.reshape %new_idxs_264 : tensor<1x16xi32, #blocked5> -> tensor<1x2x8xi32, #blocked1> loc(#loc162) + %left_idx_278 = arith.muli %y_idx_277, %ileft_266 : tensor<1x2x8xi32, #blocked1> loc(#loc164) + %left_idx_279 = "tt.reduce"(%left_idx_278) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_396: i32 loc(callsite(#loc1 at #loc165)), %left_idx_397: i32 loc(callsite(#loc1 at #loc165))): + %left_idx_398 = arith.addi %left_idx_396, %left_idx_397 : i32 loc(#loc201) + tt.reduce.return %left_idx_398 : i32 loc(#loc194) + }) : 
(tensor<1x2x8xi32, #blocked1>) -> tensor<1x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc194) + %left_idx_280 = tt.expand_dims %left_idx_279 {axis = 1 : i32} : tensor<1x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<1x1x8xi32, #blocked1> loc(#loc166) + %left_idx_281 = tt.broadcast %left_idx_280 : tensor<1x1x8xi32, #blocked1> -> tensor<1x2x8xi32, #blocked1> loc(#loc167) + %right_idx_282 = arith.muli %y_idx_277, %flip_154 : tensor<1x2x8xi32, #blocked1> loc(#loc169) + %right_idx_283 = "tt.reduce"(%right_idx_282) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_396: i32 loc(callsite(#loc1 at #loc170)), %right_idx_397: i32 loc(callsite(#loc1 at #loc170))): + %right_idx_398 = arith.addi %right_idx_396, %right_idx_397 : i32 loc(#loc202) + tt.reduce.return %right_idx_398 : i32 loc(#loc197) + }) : (tensor<1x2x8xi32, #blocked1>) -> tensor<1x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc197) + %right_idx_284 = tt.expand_dims %right_idx_283 {axis = 1 : i32} : tensor<1x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<1x1x8xi32, #blocked1> loc(#loc171) + %right_idx_285 = tt.broadcast %right_idx_284 : tensor<1x1x8xi32, #blocked1> -> tensor<1x2x8xi32, #blocked1> loc(#loc172) + %left_idx_286 = tt.reshape %left_idx_281 : tensor<1x2x8xi32, #blocked1> -> tensor<1x16xi32, #blocked5> loc(#loc173) + %right_idx_287 = tt.reshape %right_idx_285 : tensor<1x2x8xi32, #blocked1> -> tensor<1x16xi32, #blocked5> loc(#loc174) + %cond_288 = arith.cmpi slt, %ileft_275, %iright_276 : tensor<1x16xi32, #blocked5> loc(#loc175) + %eq_289 = arith.cmpi eq, %ileft_275, %iright_276 : tensor<1x16xi32, #blocked5> loc(#loc176) + %cond_290 = arith.cmpi sgt, %left_idx_286, %right_idx_287 : tensor<1x16xi32, #blocked5> loc(#loc177) + %cond_291 = arith.andi %eq_289, %cond_290 : tensor<1x16xi1, #blocked5> loc(#loc178) + %cond_292 = arith.ori %cond_288, %cond_291 : tensor<1x16xi1, #blocked5> loc(#loc179) + %ret_293 = arith.xori %ileft_275, %iright_276 : tensor<1x16xi32, #blocked5> 
loc(#loc182) + %ret_294 = arith.select %cond_292, %ret_293, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc183) + %ret_295 = arith.xori %ret_261, %ret_294 : tensor<1x16xi32, #blocked5> loc(#loc184) + %new_idxs_296 = arith.xori %left_idx_286, %right_idx_287 : tensor<1x16xi32, #blocked5> loc(#loc185) + %new_idxs_297 = arith.select %cond_292, %new_idxs_296, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc186) + %new_idxs_298 = arith.xori %new_idxs_264, %new_idxs_297 : tensor<1x16xi32, #blocked5> loc(#loc187) + %y_299 = tt.reshape %ret_295 : tensor<1x16xi32, #blocked5> -> tensor<2x2x4xi32, #blocked2> loc(#loc150) + %ileft_300 = arith.muli %y_299, %ileft_157 : tensor<2x2x4xi32, #blocked2> loc(#loc152) + %ileft_301 = "tt.reduce"(%ileft_300) <{axis = 1 : i32}> ({ + ^bb0(%ileft_396: i32 loc(callsite(#loc1 at #loc153)), %ileft_397: i32 loc(callsite(#loc1 at #loc153))): + %ileft_398 = arith.addi %ileft_396, %ileft_397 : i32 loc(#loc199) + tt.reduce.return %ileft_398 : i32 loc(#loc189) + }) : (tensor<2x2x4xi32, #blocked2>) -> tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc189) + %ileft_302 = tt.expand_dims %ileft_301 {axis = 1 : i32} : tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<2x1x4xi32, #blocked2> loc(#loc154) + %ileft_303 = tt.broadcast %ileft_302 : tensor<2x1x4xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc155) + %iright_304 = arith.muli %y_299, %flip_79 : tensor<2x2x4xi32, #blocked2> loc(#loc156) + %iright_305 = "tt.reduce"(%iright_304) <{axis = 1 : i32}> ({ + ^bb0(%iright_396: i32 loc(callsite(#loc1 at #loc157)), %iright_397: i32 loc(callsite(#loc1 at #loc157))): + %iright_398 = arith.addi %iright_396, %iright_397 : i32 loc(#loc200) + tt.reduce.return %iright_398 : i32 loc(#loc191) + }) : (tensor<2x2x4xi32, #blocked2>) -> tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc191) + %iright_306 = tt.expand_dims %iright_305 {axis = 1 : i32} : 
tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<2x1x4xi32, #blocked2> loc(#loc158) + %iright_307 = tt.broadcast %iright_306 : tensor<2x1x4xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc159) + %ileft_308 = tt.reshape %ileft_303 : tensor<2x2x4xi32, #blocked2> -> tensor<1x16xi32, #blocked5> loc(#loc160) + %iright_309 = tt.reshape %iright_307 : tensor<2x2x4xi32, #blocked2> -> tensor<1x16xi32, #blocked5> loc(#loc161) + %y_idx_310 = tt.reshape %new_idxs_298 : tensor<1x16xi32, #blocked5> -> tensor<2x2x4xi32, #blocked2> loc(#loc162) + %left_idx_311 = arith.muli %y_idx_310, %ileft_157 : tensor<2x2x4xi32, #blocked2> loc(#loc164) + %left_idx_312 = "tt.reduce"(%left_idx_311) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_396: i32 loc(callsite(#loc1 at #loc165)), %left_idx_397: i32 loc(callsite(#loc1 at #loc165))): + %left_idx_398 = arith.addi %left_idx_396, %left_idx_397 : i32 loc(#loc201) + tt.reduce.return %left_idx_398 : i32 loc(#loc194) + }) : (tensor<2x2x4xi32, #blocked2>) -> tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc194) + %left_idx_313 = tt.expand_dims %left_idx_312 {axis = 1 : i32} : tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<2x1x4xi32, #blocked2> loc(#loc166) + %left_idx_314 = tt.broadcast %left_idx_313 : tensor<2x1x4xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc167) + %right_idx_315 = arith.muli %y_idx_310, %flip_79 : tensor<2x2x4xi32, #blocked2> loc(#loc169) + %right_idx_316 = "tt.reduce"(%right_idx_315) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_396: i32 loc(callsite(#loc1 at #loc170)), %right_idx_397: i32 loc(callsite(#loc1 at #loc170))): + %right_idx_398 = arith.addi %right_idx_396, %right_idx_397 : i32 loc(#loc202) + tt.reduce.return %right_idx_398 : i32 loc(#loc197) + }) : (tensor<2x2x4xi32, #blocked2>) -> tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc197) + %right_idx_317 = tt.expand_dims %right_idx_316 {axis = 1 : i32} : tensor<2x4xi32, #ttg.slice<{dim = 1, 
parent = #blocked2}>> -> tensor<2x1x4xi32, #blocked2> loc(#loc171) + %right_idx_318 = tt.broadcast %right_idx_317 : tensor<2x1x4xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc172) + %left_idx_319 = tt.reshape %left_idx_314 : tensor<2x2x4xi32, #blocked2> -> tensor<1x16xi32, #blocked5> loc(#loc173) + %right_idx_320 = tt.reshape %right_idx_318 : tensor<2x2x4xi32, #blocked2> -> tensor<1x16xi32, #blocked5> loc(#loc174) + %cond_321 = arith.cmpi slt, %ileft_308, %iright_309 : tensor<1x16xi32, #blocked5> loc(#loc175) + %eq_322 = arith.cmpi eq, %ileft_308, %iright_309 : tensor<1x16xi32, #blocked5> loc(#loc176) + %cond_323 = arith.cmpi sgt, %left_idx_319, %right_idx_320 : tensor<1x16xi32, #blocked5> loc(#loc177) + %cond_324 = arith.andi %eq_322, %cond_323 : tensor<1x16xi1, #blocked5> loc(#loc178) + %cond_325 = arith.ori %cond_321, %cond_324 : tensor<1x16xi1, #blocked5> loc(#loc179) + %ret_326 = arith.xori %ileft_308, %iright_309 : tensor<1x16xi32, #blocked5> loc(#loc182) + %ret_327 = arith.select %cond_325, %ret_326, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc183) + %ret_328 = arith.xori %ret_295, %ret_327 : tensor<1x16xi32, #blocked5> loc(#loc184) + %new_idxs_329 = arith.xori %left_idx_319, %right_idx_320 : tensor<1x16xi32, #blocked5> loc(#loc185) + %new_idxs_330 = arith.select %cond_325, %new_idxs_329, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc186) + %new_idxs_331 = arith.xori %new_idxs_298, %new_idxs_330 : tensor<1x16xi32, #blocked5> loc(#loc187) + %y_332 = tt.reshape %ret_328 : tensor<1x16xi32, #blocked5> -> tensor<4x2x2xi32, #blocked3> loc(#loc150) + %ileft_333 = arith.muli %y_332, %ileft_82 : tensor<4x2x2xi32, #blocked3> loc(#loc152) + %ileft_334 = "tt.reduce"(%ileft_333) <{axis = 1 : i32}> ({ + ^bb0(%ileft_396: i32 loc(callsite(#loc1 at #loc153)), %ileft_397: i32 loc(callsite(#loc1 at #loc153))): + %ileft_398 = arith.addi %ileft_396, %ileft_397 : i32 loc(#loc199) + tt.reduce.return %ileft_398 : i32 
loc(#loc189) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc189) + %ileft_335 = tt.expand_dims %ileft_334 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc154) + %ileft_336 = tt.broadcast %ileft_335 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc155) + %iright_337 = arith.muli %y_332, %flip_40 : tensor<4x2x2xi32, #blocked3> loc(#loc156) + %iright_338 = "tt.reduce"(%iright_337) <{axis = 1 : i32}> ({ + ^bb0(%iright_396: i32 loc(callsite(#loc1 at #loc157)), %iright_397: i32 loc(callsite(#loc1 at #loc157))): + %iright_398 = arith.addi %iright_396, %iright_397 : i32 loc(#loc200) + tt.reduce.return %iright_398 : i32 loc(#loc191) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc191) + %iright_339 = tt.expand_dims %iright_338 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc158) + %iright_340 = tt.broadcast %iright_339 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc159) + %ileft_341 = tt.reshape %ileft_336 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc160) + %iright_342 = tt.reshape %iright_340 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc161) + %y_idx_343 = tt.reshape %new_idxs_331 : tensor<1x16xi32, #blocked5> -> tensor<4x2x2xi32, #blocked3> loc(#loc162) + %left_idx_344 = arith.muli %y_idx_343, %ileft_82 : tensor<4x2x2xi32, #blocked3> loc(#loc164) + %left_idx_345 = "tt.reduce"(%left_idx_344) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_396: i32 loc(callsite(#loc1 at #loc165)), %left_idx_397: i32 loc(callsite(#loc1 at #loc165))): + %left_idx_398 = arith.addi %left_idx_396, %left_idx_397 : i32 loc(#loc201) + tt.reduce.return %left_idx_398 : i32 loc(#loc194) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 
1, parent = #blocked3}>> loc(#loc194) + %left_idx_346 = tt.expand_dims %left_idx_345 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc166) + %left_idx_347 = tt.broadcast %left_idx_346 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc167) + %right_idx_348 = arith.muli %y_idx_343, %flip_40 : tensor<4x2x2xi32, #blocked3> loc(#loc169) + %right_idx_349 = "tt.reduce"(%right_idx_348) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_396: i32 loc(callsite(#loc1 at #loc170)), %right_idx_397: i32 loc(callsite(#loc1 at #loc170))): + %right_idx_398 = arith.addi %right_idx_396, %right_idx_397 : i32 loc(#loc202) + tt.reduce.return %right_idx_398 : i32 loc(#loc197) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc197) + %right_idx_350 = tt.expand_dims %right_idx_349 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc171) + %right_idx_351 = tt.broadcast %right_idx_350 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc172) + %left_idx_352 = tt.reshape %left_idx_347 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc173) + %right_idx_353 = tt.reshape %right_idx_351 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc174) + %cond_354 = arith.cmpi slt, %ileft_341, %iright_342 : tensor<1x16xi32, #blocked5> loc(#loc175) + %eq_355 = arith.cmpi eq, %ileft_341, %iright_342 : tensor<1x16xi32, #blocked5> loc(#loc176) + %cond_356 = arith.cmpi sgt, %left_idx_352, %right_idx_353 : tensor<1x16xi32, #blocked5> loc(#loc177) + %cond_357 = arith.andi %eq_355, %cond_356 : tensor<1x16xi1, #blocked5> loc(#loc178) + %cond_358 = arith.ori %cond_354, %cond_357 : tensor<1x16xi1, #blocked5> loc(#loc179) + %ret_359 = arith.xori %ileft_341, %iright_342 : tensor<1x16xi32, #blocked5> loc(#loc182) + %ret_360 = arith.select %cond_358, %ret_359, %cst_6 : 
tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc183) + %ret_361 = arith.xori %ret_328, %ret_360 : tensor<1x16xi32, #blocked5> loc(#loc184) + %new_idxs_362 = arith.xori %left_idx_352, %right_idx_353 : tensor<1x16xi32, #blocked5> loc(#loc185) + %new_idxs_363 = arith.select %cond_358, %new_idxs_362, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc186) + %new_idxs_364 = arith.xori %new_idxs_331, %new_idxs_363 : tensor<1x16xi32, #blocked5> loc(#loc187) + %y_365 = tt.reshape %ret_361 : tensor<1x16xi32, #blocked5> -> tensor<8x2x1xi32, #blocked4> loc(#loc150) + %ileft_366 = arith.muli %y_365, %ileft : tensor<8x2x1xi32, #blocked4> loc(#loc152) + %ileft_367 = "tt.reduce"(%ileft_366) <{axis = 1 : i32}> ({ + ^bb0(%ileft_396: i32 loc(callsite(#loc1 at #loc153)), %ileft_397: i32 loc(callsite(#loc1 at #loc153))): + %ileft_398 = arith.addi %ileft_396, %ileft_397 : i32 loc(#loc199) + tt.reduce.return %ileft_398 : i32 loc(#loc189) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc189) + %ileft_368 = tt.expand_dims %ileft_367 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc154) + %ileft_369 = tt.broadcast %ileft_368 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc155) + %iright_370 = arith.muli %y_365, %iright : tensor<8x2x1xi32, #blocked4> loc(#loc156) + %iright_371 = "tt.reduce"(%iright_370) <{axis = 1 : i32}> ({ + ^bb0(%iright_396: i32 loc(callsite(#loc1 at #loc157)), %iright_397: i32 loc(callsite(#loc1 at #loc157))): + %iright_398 = arith.addi %iright_396, %iright_397 : i32 loc(#loc200) + tt.reduce.return %iright_398 : i32 loc(#loc191) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc191) + %iright_372 = tt.expand_dims %iright_371 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, 
#blocked4> loc(#loc158) + %iright_373 = tt.broadcast %iright_372 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc159) + %ileft_374 = tt.reshape %ileft_369 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc160) + %iright_375 = tt.reshape %iright_373 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc161) + %y_idx_376 = tt.reshape %new_idxs_364 : tensor<1x16xi32, #blocked5> -> tensor<8x2x1xi32, #blocked4> loc(#loc162) + %left_idx_377 = arith.muli %y_idx_376, %ileft : tensor<8x2x1xi32, #blocked4> loc(#loc164) + %left_idx_378 = "tt.reduce"(%left_idx_377) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_396: i32 loc(callsite(#loc1 at #loc165)), %left_idx_397: i32 loc(callsite(#loc1 at #loc165))): + %left_idx_398 = arith.addi %left_idx_396, %left_idx_397 : i32 loc(#loc201) + tt.reduce.return %left_idx_398 : i32 loc(#loc194) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc194) + %left_idx_379 = tt.expand_dims %left_idx_378 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc166) + %left_idx_380 = tt.broadcast %left_idx_379 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc167) + %right_idx_381 = arith.muli %y_idx_376, %iright : tensor<8x2x1xi32, #blocked4> loc(#loc169) + %right_idx_382 = "tt.reduce"(%right_idx_381) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_396: i32 loc(callsite(#loc1 at #loc170)), %right_idx_397: i32 loc(callsite(#loc1 at #loc170))): + %right_idx_398 = arith.addi %right_idx_396, %right_idx_397 : i32 loc(#loc202) + tt.reduce.return %right_idx_398 : i32 loc(#loc197) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc197) + %right_idx_383 = tt.expand_dims %right_idx_382 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc171) + %right_idx_384 = 
tt.broadcast %right_idx_383 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc172) + %left_idx_385 = tt.reshape %left_idx_380 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc173) + %right_idx_386 = tt.reshape %right_idx_384 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc174) + %cond_387 = arith.cmpi slt, %ileft_374, %iright_375 : tensor<1x16xi32, #blocked5> loc(#loc175) + %eq_388 = arith.cmpi eq, %ileft_374, %iright_375 : tensor<1x16xi32, #blocked5> loc(#loc176) + %cond_389 = arith.cmpi sgt, %left_idx_385, %right_idx_386 : tensor<1x16xi32, #blocked5> loc(#loc177) + %cond_390 = arith.andi %eq_388, %cond_389 : tensor<1x16xi1, #blocked5> loc(#loc178) + %cond_391 = arith.ori %cond_387, %cond_390 : tensor<1x16xi1, #blocked5> loc(#loc179) + %new_idxs_392 = arith.xori %left_idx_385, %right_idx_386 : tensor<1x16xi32, #blocked5> loc(#loc185) + %new_idxs_393 = arith.select %cond_391, %new_idxs_392, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc186) + %new_idxs_394 = arith.xori %new_idxs_364, %new_idxs_393 : tensor<1x16xi32, #blocked5> loc(#loc187) + %tmp7 = arith.extsi %tmp0_28 : tensor<1x16xi32, #blocked> to tensor<1x16xi64, #blocked> loc(#loc133) + %tmp10 = arith.select %tmp0_25, %tmp7, %cst_0 : tensor<1x16xi1, #blocked>, tensor<1x16xi64, #blocked> loc(#loc134) + %tmp11 = "tt.reduce"(%tmp10) <{axis = 1 : i32}> ({ + ^bb0(%tmp11_396: i64 loc(callsite(#loc1 at #loc135)), %tmp11_397: i64 loc(callsite(#loc1 at #loc135))): + %tmp11_398 = arith.addi %tmp11_396, %tmp11_397 : i64 loc(#loc188) + tt.reduce.return %tmp11_398 : i64 loc(#loc148) + }) : (tensor<1x16xi64, #blocked>) -> tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc148) + %tmp11_395 = tt.expand_dims %tmp11 {axis = 1 : i32} : tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1x1xi64, #blocked> loc(#loc136) + %tmp14 = arith.trunci %tmp11_395 : tensor<1x1xi64, #blocked> to tensor<1x1xi32, #blocked> 
loc(#loc137) + %0 = arith.muli %xoffset, %c16_i32 : i32 loc(#loc66) + %1 = tt.splat %0 : i32 -> tensor<1x16xi32, #blocked5> loc(#loc138) + %2 = arith.addi %r0_index_9, %1 : tensor<1x16xi32, #blocked5> loc(#loc67) + %3 = tt.splat %out_ptr2 : !tt.ptr -> tensor<1x16x!tt.ptr, #blocked5> loc(#loc68) + %4 = tt.addptr %3, %2 : tensor<1x16x!tt.ptr, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc68) + tt.store %4, %new_idxs_394, %tmp0_26 : tensor<1x16x!tt.ptr, #blocked5> loc(#loc69) + %5 = tt.addptr %out_ptr3, %xoffset : !tt.ptr, i32 loc(#loc70) + %6 = tt.splat %5 : !tt.ptr -> tensor<1x1x!tt.ptr, #blocked> loc(#loc71) + %7 = tt.splat %xmask : i1 -> tensor<1x1xi1, #blocked> loc(#loc71) + tt.store %6, %tmp14, %7 : tensor<1x1x!tt.ptr, #blocked> loc(#loc71) + tt.return loc(#loc72) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":24:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":26:21) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":27:38) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":33:19) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":34:19) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":36:38) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":36:35) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":36:49) +#loc10 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":36:45) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":36:30) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":36:54) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":38:19) +#loc14 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:44) +#loc17 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:60) +#loc18 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:68) +#loc19 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":533:22) +#loc21 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":537:21) +#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:40) +#loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:65) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:78) +#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:41) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:67) +#loc31 = 
loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:80) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":540:30) +#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":541:32) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":546:29) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:36) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:23) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":290:25) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:53) +#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:66) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:37) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:23) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:54) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:67) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":553:36) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":554:38) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":574:22) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":591:21) +#loc50 = 
loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:40) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:29) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:23) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:19) +#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:28) +#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:38) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:46) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:15) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:48) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:59) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:22) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":42:19) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":44:34) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":45:29) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":48:21) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":49:35) +#loc67 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":49:32) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":49:25) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":49:47) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":50:25) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":50:37) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":50:4) +#loc78 = loc("xoffset"(#loc2)) +#loc79 = loc("xmask"(#loc3)) +#loc80 = loc("r0_index"(#loc4)) +#loc81 = loc("x0"(#loc5)) +#loc82 = loc("x1"(#loc6)) +#loc83 = loc("tmp0"(#loc7)) +#loc84 = loc("tmp0"(#loc8)) +#loc85 = loc("tmp0"(#loc9)) +#loc86 = loc("tmp0"(#loc10)) +#loc87 = loc("tmp0"(#loc11)) +#loc88 = loc("tmp0"(#loc12)) +#loc89 = loc("tmp2"(#loc13)) +#loc90 = loc("flip"(#loc14)) +#loc92 = loc("flip"(#loc17)) +#loc93 = loc("flip"(#loc18)) +#loc94 = loc("y"(#loc19)) +#loc95 = loc("left_mask"(#loc21)) +#loc96 = loc("ileft"(#loc22)) +#loc98 = loc("ileft"(#loc26)) +#loc99 = loc("ileft"(#loc27)) +#loc100 = loc("iright"(#loc28)) +#loc102 = loc("iright"(#loc30)) +#loc103 = loc("iright"(#loc31)) +#loc104 = loc("ileft"(#loc32)) +#loc105 = loc("iright"(#loc33)) +#loc106 = loc("y_idx"(#loc34)) +#loc107 = loc("left_idx"(#loc35)) +#loc108 = loc("left_idx"(#loc36)) +#loc109 = loc("input"(#loc37)) +#loc111 = loc("left_idx"(#loc39)) +#loc112 = loc("left_idx"(#loc40)) +#loc113 = loc("right_idx"(#loc41)) +#loc114 = loc("right_idx"(#loc42)) +#loc116 = loc("right_idx"(#loc44)) +#loc117 = loc("right_idx"(#loc45)) +#loc118 = loc("left_idx"(#loc46)) +#loc119 = loc("right_idx"(#loc47)) 
+#loc120 = loc("cond"(#loc48)) +#loc121 = loc("eq"(#loc49)) +#loc122 = loc("cond"(#loc50)) +#loc123 = loc("cond"(#loc51)) +#loc124 = loc("cond"(#loc52)) +#loc125 = loc("cond"(#loc53)) +#loc126 = loc("cond"(#loc54)) +#loc127 = loc("ret"(#loc55)) +#loc128 = loc("ret"(#loc56)) +#loc129 = loc("ret"(#loc57)) +#loc130 = loc("new_idxs"(#loc58)) +#loc131 = loc("new_idxs"(#loc59)) +#loc132 = loc("new_idxs"(#loc60)) +#loc133 = loc("tmp7"(#loc61)) +#loc134 = loc("tmp10"(#loc62)) +#loc136 = loc("tmp11"(#loc64)) +#loc137 = loc("tmp14"(#loc65)) +#loc138 = loc(fused[#loc67, #loc66]) +#loc139 = loc(fused[#loc84, #loc81]) +#loc140 = loc(fused[#loc86, #loc85]) +#loc141 = loc(fused[#loc88, #loc79]) +#loc142 = loc(callsite(#loc90 at #loc91)) +#loc143 = loc(callsite(#loc92 at #loc91)) +#loc144 = loc(callsite(#loc93 at #loc91)) +#loc146 = loc("cond"(#loc120)) +#loc147 = loc("eq"(#loc121)) +#loc148 = loc(callsite(#loc23 at #loc135)) +#loc150 = loc(callsite(#loc94 at #loc145)) +#loc151 = loc(callsite(#loc95 at #loc145)) +#loc152 = loc(callsite(#loc96 at #loc145)) +#loc154 = loc(callsite(#loc98 at #loc145)) +#loc155 = loc(callsite(#loc99 at #loc145)) +#loc156 = loc(callsite(#loc100 at #loc145)) +#loc158 = loc(callsite(#loc102 at #loc145)) +#loc159 = loc(callsite(#loc103 at #loc145)) +#loc160 = loc(callsite(#loc104 at #loc145)) +#loc161 = loc(callsite(#loc105 at #loc145)) +#loc162 = loc(callsite(#loc106 at #loc145)) +#loc163 = loc(callsite(#loc107 at #loc145)) +#loc164 = loc(callsite(#loc108 at #loc145)) +#loc166 = loc(callsite(#loc111 at #loc145)) +#loc167 = loc(callsite(#loc112 at #loc145)) +#loc168 = loc(callsite(#loc113 at #loc145)) +#loc169 = loc(callsite(#loc114 at #loc145)) +#loc171 = loc(callsite(#loc116 at #loc145)) +#loc172 = loc(callsite(#loc117 at #loc145)) +#loc173 = loc(callsite(#loc118 at #loc145)) +#loc174 = loc(callsite(#loc119 at #loc145)) +#loc175 = loc(callsite(#loc146 at #loc145)) +#loc176 = loc(callsite(#loc147 at #loc145)) +#loc177 = loc(callsite(#loc122 at #loc145)) 
+#loc178 = loc(callsite(#loc123 at #loc145)) +#loc179 = loc(callsite(#loc124 at #loc145)) +#loc180 = loc(callsite(#loc125 at #loc145)) +#loc181 = loc(callsite(#loc126 at #loc145)) +#loc182 = loc(callsite(#loc127 at #loc145)) +#loc183 = loc(callsite(#loc128 at #loc145)) +#loc184 = loc(callsite(#loc129 at #loc145)) +#loc185 = loc(callsite(#loc130 at #loc145)) +#loc186 = loc(callsite(#loc131 at #loc145)) +#loc187 = loc(callsite(#loc132 at #loc145)) +#loc188 = loc(callsite(#loc25 at #loc148)) +#loc189 = loc(callsite(#loc23 at #loc153)) +#loc191 = loc(callsite(#loc23 at #loc157)) +#loc193 = loc(callsite(#loc109 at #loc165)) +#loc194 = loc(callsite(#loc23 at #loc165)) +#loc196 = loc(callsite(#loc109 at #loc170)) +#loc197 = loc(callsite(#loc23 at #loc170)) +#loc199 = loc(callsite(#loc25 at #loc189)) +#loc200 = loc(callsite(#loc25 at #loc191)) +#loc201 = loc(callsite(#loc25 at #loc194)) +#loc202 = loc(callsite(#loc25 at #loc197)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttir b/SpecForge-ext/cache/compiled_kernels/triton/7/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttir new file mode 100644 index 0000000000000000000000000000000000000000..5ea7aa347a0b24d5774f48095ae95a3d658a69a3 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttir @@ -0,0 +1,784 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":18:0) +#loc2 = loc(unknown) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":41:67) +#loc18 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":662:12) 
+#loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":634:73) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:51) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:53) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:50) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:51) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":45:26) +#loc75 = loc("in_ptr0"(#loc)) +#loc76 = loc("out_ptr2"(#loc)) +#loc77 = loc("out_ptr3"(#loc)) +#loc78 = loc("xnumel"(#loc)) +#loc79 = loc("r0_numel"(#loc)) +#loc96 = loc(callsite(#loc18 at #loc4)) +#loc103 = loc("ileft"(#loc27)) +#loc107 = loc("iright"(#loc32)) +#loc116 = loc("left_idx"(#loc41)) +#loc121 = loc("right_idx"(#loc46)) +#loc140 = loc("tmp11"(#loc65)) +#loc151 = loc(callsite(#loc23 at #loc96)) +#loc155 = loc(callsite(#loc2 at #loc140)) +#loc159 = loc(callsite(#loc103 at #loc151)) +#loc163 = loc(callsite(#loc107 at #loc151)) +#loc171 = loc(callsite(#loc116 at #loc151)) +#loc176 = loc(callsite(#loc121 at #loc151)) +#loc196 = loc(callsite(#loc2 at #loc159)) +#loc198 = loc(callsite(#loc2 at #loc163)) +#loc201 = loc(callsite(#loc2 at #loc171)) +#loc204 = loc(callsite(#loc2 at #loc176)) +module { + tt.func public @triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %out_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr3"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %tmp0 = 
arith.constant 272 : i32 loc(#loc80) + %c16_i32 = arith.constant 16 : i32 loc(#loc2) + %xmask = arith.constant 32 : i32 loc(#loc81) + %cst = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc82) + %cst_0 = arith.constant dense<0> : tensor<1x16xi32> loc(#loc2) + %tmp10 = arith.constant dense<0> : tensor<1x16xi64> loc(#loc83) + %tmp0_1 = arith.constant dense<17> : tensor<1x16xi32> loc(#loc84) + %xoffset = tt.get_program_id x : i32 loc(#loc85) + %xmask_2 = arith.cmpi slt, %xoffset, %xmask : i32 loc(#loc81) + %xmask_3 = tt.splat %xmask_2 : i1 -> tensor<1x1xi1> loc(#loc81) + %r0_index = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32> loc(#loc86) + %r0_index_4 = tt.expand_dims %r0_index {axis = 0 : i32} : tensor<16xi32> -> tensor<1x16xi32> loc(#loc87) + %x0 = arith.remsi %xoffset, %c16_i32 : i32 loc(#loc88) + %x1 = arith.divsi %xoffset, %c16_i32 : i32 loc(#loc89) + %tmp0_5 = arith.muli %r0_index_4, %tmp0_1 : tensor<1x16xi32> loc(#loc84) + %tmp0_6 = tt.splat %x0 : i32 -> tensor<1x16xi32> loc(#loc144) + %tmp0_7 = arith.addi %tmp0_6, %tmp0_5 : tensor<1x16xi32> loc(#loc90) + %tmp0_8 = arith.muli %x1, %tmp0 : i32 loc(#loc80) + %tmp0_9 = tt.splat %tmp0_8 : i32 -> tensor<1x16xi32> loc(#loc145) + %tmp0_10 = arith.addi %tmp0_7, %tmp0_9 : tensor<1x16xi32> loc(#loc91) + %tmp0_11 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x16x!tt.ptr> loc(#loc92) + %tmp0_12 = tt.addptr %tmp0_11, %tmp0_10 : tensor<1x16x!tt.ptr>, tensor<1x16xi32> loc(#loc92) + %tmp0_13 = tt.splat %xmask_2 : i1 -> tensor<1x16xi1> loc(#loc146) + %tmp0_14 = tt.load %tmp0_12, %tmp0_13, %cst_0 : tensor<1x16x!tt.ptr> loc(#loc93) + %tmp2 = arith.trunci %r0_index_4 : tensor<1x16xi32> to tensor<1x16xi16> loc(#loc94) + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc147) + %flip_15 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc148) + %flip_16 = tt.expand_dims %flip_15 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc148) + 
%flip_17 = tt.broadcast %flip_16 : tensor<1x2x1xi32> -> tensor<4x2x2xi32> loc(#loc149) + %flip_18 = tt.reshape %flip_17 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc150) + %y = tt.reshape %tmp0_14 : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc156) + %left_mask = arith.subi %cst, %flip_16 : tensor<1x2x1xi32> loc(#loc157) + %ileft = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<8x2x1xi32> loc(#loc158) + %ileft_19 = arith.muli %y, %ileft : tensor<8x2x1xi32> loc(#loc158) + %ileft_20 = "tt.reduce"(%ileft_19) <{axis = 1 : i32}> ({ + ^bb0(%ileft_371: i32 loc(callsite(#loc2 at #loc159)), %ileft_372: i32 loc(callsite(#loc2 at #loc159))): + %ileft_373 = arith.addi %ileft_371, %ileft_372 : i32 loc(#loc205) + tt.reduce.return %ileft_373 : i32 loc(#loc195) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc195) + %ileft_21 = tt.expand_dims %ileft_20 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc160) + %ileft_22 = tt.broadcast %ileft_21 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc161) + %iright = tt.broadcast %flip_16 : tensor<1x2x1xi32> -> tensor<8x2x1xi32> loc(#loc162) + %iright_23 = arith.muli %y, %iright : tensor<8x2x1xi32> loc(#loc162) + %iright_24 = "tt.reduce"(%iright_23) <{axis = 1 : i32}> ({ + ^bb0(%iright_371: i32 loc(callsite(#loc2 at #loc163)), %iright_372: i32 loc(callsite(#loc2 at #loc163))): + %iright_373 = arith.addi %iright_371, %iright_372 : i32 loc(#loc206) + tt.reduce.return %iright_373 : i32 loc(#loc197) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc197) + %iright_25 = tt.expand_dims %iright_24 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc164) + %iright_26 = tt.broadcast %iright_25 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc165) + %ileft_27 = tt.reshape %ileft_22 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc166) + %iright_28 = tt.reshape %iright_26 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc167) + %y_idx = tt.reshape %tmp2 : tensor<1x16xi16> -> tensor<8x2x1xi16> loc(#loc168) + 
%left_idx = arith.trunci %left_mask : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc169) + %left_idx_29 = tt.broadcast %left_idx : tensor<1x2x1xi16> -> tensor<8x2x1xi16> loc(#loc170) + %left_idx_30 = arith.muli %y_idx, %left_idx_29 : tensor<8x2x1xi16> loc(#loc170) + %input = arith.extsi %left_idx_30 : tensor<8x2x1xi16> to tensor<8x2x1xi32> loc(#loc199) + %left_idx_31 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_371: i32 loc(callsite(#loc2 at #loc171)), %left_idx_372: i32 loc(callsite(#loc2 at #loc171))): + %left_idx_373 = arith.addi %left_idx_371, %left_idx_372 : i32 loc(#loc207) + tt.reduce.return %left_idx_373 : i32 loc(#loc200) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc200) + %left_idx_32 = tt.expand_dims %left_idx_31 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc172) + %left_idx_33 = tt.broadcast %left_idx_32 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc173) + %right_idx = arith.trunci %flip_16 : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc174) + %right_idx_34 = tt.broadcast %right_idx : tensor<1x2x1xi16> -> tensor<8x2x1xi16> loc(#loc175) + %right_idx_35 = arith.muli %y_idx, %right_idx_34 : tensor<8x2x1xi16> loc(#loc175) + %input_36 = arith.extsi %right_idx_35 : tensor<8x2x1xi16> to tensor<8x2x1xi32> loc(#loc202) + %right_idx_37 = "tt.reduce"(%input_36) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_371: i32 loc(callsite(#loc2 at #loc176)), %right_idx_372: i32 loc(callsite(#loc2 at #loc176))): + %right_idx_373 = arith.addi %right_idx_371, %right_idx_372 : i32 loc(#loc208) + tt.reduce.return %right_idx_373 : i32 loc(#loc203) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc203) + %right_idx_38 = tt.expand_dims %right_idx_37 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc177) + %right_idx_39 = tt.broadcast %right_idx_38 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc178) + %left_idx_40 = tt.reshape %left_idx_33 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc179) + %right_idx_41 = tt.reshape 
%right_idx_39 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc180) + %cond = arith.cmpi slt, %ileft_27, %iright_28 : tensor<1x16xi32> loc(#loc181) + %eq = arith.cmpi eq, %ileft_27, %iright_28 : tensor<1x16xi32> loc(#loc182) + %cond_42 = arith.cmpi sgt, %left_idx_40, %right_idx_41 : tensor<1x16xi32> loc(#loc183) + %cond_43 = arith.andi %eq, %cond_42 : tensor<1x16xi1> loc(#loc184) + %cond_44 = arith.ori %cond, %cond_43 : tensor<1x16xi1> loc(#loc185) + %cond_45 = arith.extui %cond_44 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc186) + %cond_46 = arith.xori %cond_45, %flip_18 : tensor<1x16xi32> loc(#loc186) + %cond_47 = arith.cmpi ne, %cond_46, %cst_0 : tensor<1x16xi32> loc(#loc187) + %ret = arith.xori %ileft_27, %iright_28 : tensor<1x16xi32> loc(#loc188) + %ret_48 = arith.select %cond_47, %ret, %cst_0 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc189) + %ret_49 = arith.xori %tmp0_14, %ret_48 : tensor<1x16xi32> loc(#loc190) + %new_idxs = arith.xori %left_idx_40, %right_idx_41 : tensor<1x16xi32> loc(#loc191) + %new_idxs_50 = arith.select %cond_47, %new_idxs, %cst_0 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc192) + %new_idxs_51 = arith.extsi %tmp2 : tensor<1x16xi16> to tensor<1x16xi32> loc(#loc193) + %new_idxs_52 = arith.xori %new_idxs_51, %new_idxs_50 : tensor<1x16xi32> loc(#loc193) + %flip_53 = tt.broadcast %flip_16 : tensor<1x2x1xi32> -> tensor<2x2x4xi32> loc(#loc149) + %flip_54 = tt.reshape %flip_53 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc150) + %y_55 = tt.reshape %ret_49 : tensor<1x16xi32> -> tensor<4x2x2xi32> loc(#loc156) + %ileft_56 = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<4x2x2xi32> loc(#loc158) + %ileft_57 = arith.muli %y_55, %ileft_56 : tensor<4x2x2xi32> loc(#loc158) + %ileft_58 = "tt.reduce"(%ileft_57) <{axis = 1 : i32}> ({ + ^bb0(%ileft_371: i32 loc(callsite(#loc2 at #loc159)), %ileft_372: i32 loc(callsite(#loc2 at #loc159))): + %ileft_373 = arith.addi %ileft_371, %ileft_372 : i32 loc(#loc205) + tt.reduce.return %ileft_373 : i32 
loc(#loc195) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc195) + %ileft_59 = tt.expand_dims %ileft_58 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc160) + %ileft_60 = tt.broadcast %ileft_59 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc161) + %iright_61 = arith.muli %y_55, %flip_17 : tensor<4x2x2xi32> loc(#loc162) + %iright_62 = "tt.reduce"(%iright_61) <{axis = 1 : i32}> ({ + ^bb0(%iright_371: i32 loc(callsite(#loc2 at #loc163)), %iright_372: i32 loc(callsite(#loc2 at #loc163))): + %iright_373 = arith.addi %iright_371, %iright_372 : i32 loc(#loc206) + tt.reduce.return %iright_373 : i32 loc(#loc197) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc197) + %iright_63 = tt.expand_dims %iright_62 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc164) + %iright_64 = tt.broadcast %iright_63 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc165) + %ileft_65 = tt.reshape %ileft_60 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc166) + %iright_66 = tt.reshape %iright_64 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc167) + %y_idx_67 = tt.reshape %new_idxs_52 : tensor<1x16xi32> -> tensor<4x2x2xi32> loc(#loc168) + %left_idx_68 = arith.muli %y_idx_67, %ileft_56 : tensor<4x2x2xi32> loc(#loc170) + %left_idx_69 = "tt.reduce"(%left_idx_68) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_371: i32 loc(callsite(#loc2 at #loc171)), %left_idx_372: i32 loc(callsite(#loc2 at #loc171))): + %left_idx_373 = arith.addi %left_idx_371, %left_idx_372 : i32 loc(#loc207) + tt.reduce.return %left_idx_373 : i32 loc(#loc200) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc200) + %left_idx_70 = tt.expand_dims %left_idx_69 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc172) + %left_idx_71 = tt.broadcast %left_idx_70 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc173) + %right_idx_72 = arith.muli %y_idx_67, %flip_17 : tensor<4x2x2xi32> loc(#loc175) + %right_idx_73 = "tt.reduce"(%right_idx_72) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_371: 
i32 loc(callsite(#loc2 at #loc176)), %right_idx_372: i32 loc(callsite(#loc2 at #loc176))): + %right_idx_373 = arith.addi %right_idx_371, %right_idx_372 : i32 loc(#loc208) + tt.reduce.return %right_idx_373 : i32 loc(#loc203) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc203) + %right_idx_74 = tt.expand_dims %right_idx_73 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc177) + %right_idx_75 = tt.broadcast %right_idx_74 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc178) + %left_idx_76 = tt.reshape %left_idx_71 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc179) + %right_idx_77 = tt.reshape %right_idx_75 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc180) + %cond_78 = arith.cmpi slt, %ileft_65, %iright_66 : tensor<1x16xi32> loc(#loc181) + %eq_79 = arith.cmpi eq, %ileft_65, %iright_66 : tensor<1x16xi32> loc(#loc182) + %cond_80 = arith.cmpi sgt, %left_idx_76, %right_idx_77 : tensor<1x16xi32> loc(#loc183) + %cond_81 = arith.andi %eq_79, %cond_80 : tensor<1x16xi1> loc(#loc184) + %cond_82 = arith.ori %cond_78, %cond_81 : tensor<1x16xi1> loc(#loc185) + %cond_83 = arith.extui %cond_82 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc186) + %cond_84 = arith.xori %cond_83, %flip_54 : tensor<1x16xi32> loc(#loc186) + %cond_85 = arith.cmpi ne, %cond_84, %cst_0 : tensor<1x16xi32> loc(#loc187) + %ret_86 = arith.xori %ileft_65, %iright_66 : tensor<1x16xi32> loc(#loc188) + %ret_87 = arith.select %cond_85, %ret_86, %cst_0 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc189) + %ret_88 = arith.xori %ret_49, %ret_87 : tensor<1x16xi32> loc(#loc190) + %new_idxs_89 = arith.xori %left_idx_76, %right_idx_77 : tensor<1x16xi32> loc(#loc191) + %new_idxs_90 = arith.select %cond_85, %new_idxs_89, %cst_0 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc192) + %new_idxs_91 = arith.xori %new_idxs_52, %new_idxs_90 : tensor<1x16xi32> loc(#loc193) + %y_92 = tt.reshape %ret_88 : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc156) + %ileft_93 = arith.muli %y_92, %ileft : tensor<8x2x1xi32> 
loc(#loc158) + %ileft_94 = "tt.reduce"(%ileft_93) <{axis = 1 : i32}> ({ + ^bb0(%ileft_371: i32 loc(callsite(#loc2 at #loc159)), %ileft_372: i32 loc(callsite(#loc2 at #loc159))): + %ileft_373 = arith.addi %ileft_371, %ileft_372 : i32 loc(#loc205) + tt.reduce.return %ileft_373 : i32 loc(#loc195) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc195) + %ileft_95 = tt.expand_dims %ileft_94 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc160) + %ileft_96 = tt.broadcast %ileft_95 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc161) + %iright_97 = arith.muli %y_92, %iright : tensor<8x2x1xi32> loc(#loc162) + %iright_98 = "tt.reduce"(%iright_97) <{axis = 1 : i32}> ({ + ^bb0(%iright_371: i32 loc(callsite(#loc2 at #loc163)), %iright_372: i32 loc(callsite(#loc2 at #loc163))): + %iright_373 = arith.addi %iright_371, %iright_372 : i32 loc(#loc206) + tt.reduce.return %iright_373 : i32 loc(#loc197) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc197) + %iright_99 = tt.expand_dims %iright_98 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc164) + %iright_100 = tt.broadcast %iright_99 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc165) + %ileft_101 = tt.reshape %ileft_96 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc166) + %iright_102 = tt.reshape %iright_100 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc167) + %y_idx_103 = tt.reshape %new_idxs_91 : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc168) + %left_idx_104 = arith.muli %y_idx_103, %ileft : tensor<8x2x1xi32> loc(#loc170) + %left_idx_105 = "tt.reduce"(%left_idx_104) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_371: i32 loc(callsite(#loc2 at #loc171)), %left_idx_372: i32 loc(callsite(#loc2 at #loc171))): + %left_idx_373 = arith.addi %left_idx_371, %left_idx_372 : i32 loc(#loc207) + tt.reduce.return %left_idx_373 : i32 loc(#loc200) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc200) + %left_idx_106 = tt.expand_dims %left_idx_105 {axis = 1 : i32} : tensor<8x1xi32> -> 
tensor<8x1x1xi32> loc(#loc172) + %left_idx_107 = tt.broadcast %left_idx_106 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc173) + %right_idx_108 = arith.muli %y_idx_103, %iright : tensor<8x2x1xi32> loc(#loc175) + %right_idx_109 = "tt.reduce"(%right_idx_108) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_371: i32 loc(callsite(#loc2 at #loc176)), %right_idx_372: i32 loc(callsite(#loc2 at #loc176))): + %right_idx_373 = arith.addi %right_idx_371, %right_idx_372 : i32 loc(#loc208) + tt.reduce.return %right_idx_373 : i32 loc(#loc203) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc203) + %right_idx_110 = tt.expand_dims %right_idx_109 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc177) + %right_idx_111 = tt.broadcast %right_idx_110 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc178) + %left_idx_112 = tt.reshape %left_idx_107 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc179) + %right_idx_113 = tt.reshape %right_idx_111 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc180) + %cond_114 = arith.cmpi slt, %ileft_101, %iright_102 : tensor<1x16xi32> loc(#loc181) + %eq_115 = arith.cmpi eq, %ileft_101, %iright_102 : tensor<1x16xi32> loc(#loc182) + %cond_116 = arith.cmpi sgt, %left_idx_112, %right_idx_113 : tensor<1x16xi32> loc(#loc183) + %cond_117 = arith.andi %eq_115, %cond_116 : tensor<1x16xi1> loc(#loc184) + %cond_118 = arith.ori %cond_114, %cond_117 : tensor<1x16xi1> loc(#loc185) + %cond_119 = arith.extui %cond_118 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc186) + %cond_120 = arith.xori %cond_119, %flip_54 : tensor<1x16xi32> loc(#loc186) + %cond_121 = arith.cmpi ne, %cond_120, %cst_0 : tensor<1x16xi32> loc(#loc187) + %ret_122 = arith.xori %ileft_101, %iright_102 : tensor<1x16xi32> loc(#loc188) + %ret_123 = arith.select %cond_121, %ret_122, %cst_0 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc189) + %ret_124 = arith.xori %ret_88, %ret_123 : tensor<1x16xi32> loc(#loc190) + %new_idxs_125 = arith.xori %left_idx_112, %right_idx_113 : tensor<1x16xi32> 
loc(#loc191) + %new_idxs_126 = arith.select %cond_121, %new_idxs_125, %cst_0 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc192) + %new_idxs_127 = arith.xori %new_idxs_91, %new_idxs_126 : tensor<1x16xi32> loc(#loc193) + %flip_128 = tt.broadcast %flip_16 : tensor<1x2x1xi32> -> tensor<1x2x8xi32> loc(#loc149) + %flip_129 = tt.reshape %flip_128 : tensor<1x2x8xi32> -> tensor<1x16xi32> loc(#loc150) + %y_130 = tt.reshape %ret_124 : tensor<1x16xi32> -> tensor<2x2x4xi32> loc(#loc156) + %ileft_131 = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<2x2x4xi32> loc(#loc158) + %ileft_132 = arith.muli %y_130, %ileft_131 : tensor<2x2x4xi32> loc(#loc158) + %ileft_133 = "tt.reduce"(%ileft_132) <{axis = 1 : i32}> ({ + ^bb0(%ileft_371: i32 loc(callsite(#loc2 at #loc159)), %ileft_372: i32 loc(callsite(#loc2 at #loc159))): + %ileft_373 = arith.addi %ileft_371, %ileft_372 : i32 loc(#loc205) + tt.reduce.return %ileft_373 : i32 loc(#loc195) + }) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc195) + %ileft_134 = tt.expand_dims %ileft_133 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc160) + %ileft_135 = tt.broadcast %ileft_134 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc161) + %iright_136 = arith.muli %y_130, %flip_53 : tensor<2x2x4xi32> loc(#loc162) + %iright_137 = "tt.reduce"(%iright_136) <{axis = 1 : i32}> ({ + ^bb0(%iright_371: i32 loc(callsite(#loc2 at #loc163)), %iright_372: i32 loc(callsite(#loc2 at #loc163))): + %iright_373 = arith.addi %iright_371, %iright_372 : i32 loc(#loc206) + tt.reduce.return %iright_373 : i32 loc(#loc197) + }) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc197) + %iright_138 = tt.expand_dims %iright_137 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc164) + %iright_139 = tt.broadcast %iright_138 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc165) + %ileft_140 = tt.reshape %ileft_135 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc166) + %iright_141 = tt.reshape %iright_139 : tensor<2x2x4xi32> -> tensor<1x16xi32> 
loc(#loc167) + %y_idx_142 = tt.reshape %new_idxs_127 : tensor<1x16xi32> -> tensor<2x2x4xi32> loc(#loc168) + %left_idx_143 = arith.muli %y_idx_142, %ileft_131 : tensor<2x2x4xi32> loc(#loc170) + %left_idx_144 = "tt.reduce"(%left_idx_143) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_371: i32 loc(callsite(#loc2 at #loc171)), %left_idx_372: i32 loc(callsite(#loc2 at #loc171))): + %left_idx_373 = arith.addi %left_idx_371, %left_idx_372 : i32 loc(#loc207) + tt.reduce.return %left_idx_373 : i32 loc(#loc200) + }) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc200) + %left_idx_145 = tt.expand_dims %left_idx_144 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc172) + %left_idx_146 = tt.broadcast %left_idx_145 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc173) + %right_idx_147 = arith.muli %y_idx_142, %flip_53 : tensor<2x2x4xi32> loc(#loc175) + %right_idx_148 = "tt.reduce"(%right_idx_147) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_371: i32 loc(callsite(#loc2 at #loc176)), %right_idx_372: i32 loc(callsite(#loc2 at #loc176))): + %right_idx_373 = arith.addi %right_idx_371, %right_idx_372 : i32 loc(#loc208) + tt.reduce.return %right_idx_373 : i32 loc(#loc203) + }) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc203) + %right_idx_149 = tt.expand_dims %right_idx_148 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc177) + %right_idx_150 = tt.broadcast %right_idx_149 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc178) + %left_idx_151 = tt.reshape %left_idx_146 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc179) + %right_idx_152 = tt.reshape %right_idx_150 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc180) + %cond_153 = arith.cmpi slt, %ileft_140, %iright_141 : tensor<1x16xi32> loc(#loc181) + %eq_154 = arith.cmpi eq, %ileft_140, %iright_141 : tensor<1x16xi32> loc(#loc182) + %cond_155 = arith.cmpi sgt, %left_idx_151, %right_idx_152 : tensor<1x16xi32> loc(#loc183) + %cond_156 = arith.andi %eq_154, %cond_155 : tensor<1x16xi1> loc(#loc184) + %cond_157 = 
arith.ori %cond_153, %cond_156 : tensor<1x16xi1> loc(#loc185) + %cond_158 = arith.extui %cond_157 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc186) + %cond_159 = arith.xori %cond_158, %flip_129 : tensor<1x16xi32> loc(#loc186) + %cond_160 = arith.cmpi ne, %cond_159, %cst_0 : tensor<1x16xi32> loc(#loc187) + %ret_161 = arith.xori %ileft_140, %iright_141 : tensor<1x16xi32> loc(#loc188) + %ret_162 = arith.select %cond_160, %ret_161, %cst_0 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc189) + %ret_163 = arith.xori %ret_124, %ret_162 : tensor<1x16xi32> loc(#loc190) + %new_idxs_164 = arith.xori %left_idx_151, %right_idx_152 : tensor<1x16xi32> loc(#loc191) + %new_idxs_165 = arith.select %cond_160, %new_idxs_164, %cst_0 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc192) + %new_idxs_166 = arith.xori %new_idxs_127, %new_idxs_165 : tensor<1x16xi32> loc(#loc193) + %y_167 = tt.reshape %ret_163 : tensor<1x16xi32> -> tensor<4x2x2xi32> loc(#loc156) + %ileft_168 = arith.muli %y_167, %ileft_56 : tensor<4x2x2xi32> loc(#loc158) + %ileft_169 = "tt.reduce"(%ileft_168) <{axis = 1 : i32}> ({ + ^bb0(%ileft_371: i32 loc(callsite(#loc2 at #loc159)), %ileft_372: i32 loc(callsite(#loc2 at #loc159))): + %ileft_373 = arith.addi %ileft_371, %ileft_372 : i32 loc(#loc205) + tt.reduce.return %ileft_373 : i32 loc(#loc195) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc195) + %ileft_170 = tt.expand_dims %ileft_169 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc160) + %ileft_171 = tt.broadcast %ileft_170 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc161) + %iright_172 = arith.muli %y_167, %flip_17 : tensor<4x2x2xi32> loc(#loc162) + %iright_173 = "tt.reduce"(%iright_172) <{axis = 1 : i32}> ({ + ^bb0(%iright_371: i32 loc(callsite(#loc2 at #loc163)), %iright_372: i32 loc(callsite(#loc2 at #loc163))): + %iright_373 = arith.addi %iright_371, %iright_372 : i32 loc(#loc206) + tt.reduce.return %iright_373 : i32 loc(#loc197) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc197) + 
%iright_174 = tt.expand_dims %iright_173 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc164) + %iright_175 = tt.broadcast %iright_174 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc165) + %ileft_176 = tt.reshape %ileft_171 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc166) + %iright_177 = tt.reshape %iright_175 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc167) + %y_idx_178 = tt.reshape %new_idxs_166 : tensor<1x16xi32> -> tensor<4x2x2xi32> loc(#loc168) + %left_idx_179 = arith.muli %y_idx_178, %ileft_56 : tensor<4x2x2xi32> loc(#loc170) + %left_idx_180 = "tt.reduce"(%left_idx_179) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_371: i32 loc(callsite(#loc2 at #loc171)), %left_idx_372: i32 loc(callsite(#loc2 at #loc171))): + %left_idx_373 = arith.addi %left_idx_371, %left_idx_372 : i32 loc(#loc207) + tt.reduce.return %left_idx_373 : i32 loc(#loc200) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc200) + %left_idx_181 = tt.expand_dims %left_idx_180 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc172) + %left_idx_182 = tt.broadcast %left_idx_181 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc173) + %right_idx_183 = arith.muli %y_idx_178, %flip_17 : tensor<4x2x2xi32> loc(#loc175) + %right_idx_184 = "tt.reduce"(%right_idx_183) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_371: i32 loc(callsite(#loc2 at #loc176)), %right_idx_372: i32 loc(callsite(#loc2 at #loc176))): + %right_idx_373 = arith.addi %right_idx_371, %right_idx_372 : i32 loc(#loc208) + tt.reduce.return %right_idx_373 : i32 loc(#loc203) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc203) + %right_idx_185 = tt.expand_dims %right_idx_184 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc177) + %right_idx_186 = tt.broadcast %right_idx_185 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc178) + %left_idx_187 = tt.reshape %left_idx_182 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc179) + %right_idx_188 = tt.reshape %right_idx_186 : tensor<4x2x2xi32> -> 
tensor<1x16xi32> loc(#loc180) + %cond_189 = arith.cmpi slt, %ileft_176, %iright_177 : tensor<1x16xi32> loc(#loc181) + %eq_190 = arith.cmpi eq, %ileft_176, %iright_177 : tensor<1x16xi32> loc(#loc182) + %cond_191 = arith.cmpi sgt, %left_idx_187, %right_idx_188 : tensor<1x16xi32> loc(#loc183) + %cond_192 = arith.andi %eq_190, %cond_191 : tensor<1x16xi1> loc(#loc184) + %cond_193 = arith.ori %cond_189, %cond_192 : tensor<1x16xi1> loc(#loc185) + %cond_194 = arith.extui %cond_193 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc186) + %cond_195 = arith.xori %cond_194, %flip_129 : tensor<1x16xi32> loc(#loc186) + %cond_196 = arith.cmpi ne, %cond_195, %cst_0 : tensor<1x16xi32> loc(#loc187) + %ret_197 = arith.xori %ileft_176, %iright_177 : tensor<1x16xi32> loc(#loc188) + %ret_198 = arith.select %cond_196, %ret_197, %cst_0 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc189) + %ret_199 = arith.xori %ret_163, %ret_198 : tensor<1x16xi32> loc(#loc190) + %new_idxs_200 = arith.xori %left_idx_187, %right_idx_188 : tensor<1x16xi32> loc(#loc191) + %new_idxs_201 = arith.select %cond_196, %new_idxs_200, %cst_0 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc192) + %new_idxs_202 = arith.xori %new_idxs_166, %new_idxs_201 : tensor<1x16xi32> loc(#loc193) + %y_203 = tt.reshape %ret_199 : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc156) + %ileft_204 = arith.muli %y_203, %ileft : tensor<8x2x1xi32> loc(#loc158) + %ileft_205 = "tt.reduce"(%ileft_204) <{axis = 1 : i32}> ({ + ^bb0(%ileft_371: i32 loc(callsite(#loc2 at #loc159)), %ileft_372: i32 loc(callsite(#loc2 at #loc159))): + %ileft_373 = arith.addi %ileft_371, %ileft_372 : i32 loc(#loc205) + tt.reduce.return %ileft_373 : i32 loc(#loc195) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc195) + %ileft_206 = tt.expand_dims %ileft_205 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc160) + %ileft_207 = tt.broadcast %ileft_206 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc161) + %iright_208 = arith.muli %y_203, %iright : 
tensor<8x2x1xi32> loc(#loc162) + %iright_209 = "tt.reduce"(%iright_208) <{axis = 1 : i32}> ({ + ^bb0(%iright_371: i32 loc(callsite(#loc2 at #loc163)), %iright_372: i32 loc(callsite(#loc2 at #loc163))): + %iright_373 = arith.addi %iright_371, %iright_372 : i32 loc(#loc206) + tt.reduce.return %iright_373 : i32 loc(#loc197) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc197) + %iright_210 = tt.expand_dims %iright_209 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc164) + %iright_211 = tt.broadcast %iright_210 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc165) + %ileft_212 = tt.reshape %ileft_207 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc166) + %iright_213 = tt.reshape %iright_211 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc167) + %y_idx_214 = tt.reshape %new_idxs_202 : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc168) + %left_idx_215 = arith.muli %y_idx_214, %ileft : tensor<8x2x1xi32> loc(#loc170) + %left_idx_216 = "tt.reduce"(%left_idx_215) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_371: i32 loc(callsite(#loc2 at #loc171)), %left_idx_372: i32 loc(callsite(#loc2 at #loc171))): + %left_idx_373 = arith.addi %left_idx_371, %left_idx_372 : i32 loc(#loc207) + tt.reduce.return %left_idx_373 : i32 loc(#loc200) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc200) + %left_idx_217 = tt.expand_dims %left_idx_216 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc172) + %left_idx_218 = tt.broadcast %left_idx_217 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc173) + %right_idx_219 = arith.muli %y_idx_214, %iright : tensor<8x2x1xi32> loc(#loc175) + %right_idx_220 = "tt.reduce"(%right_idx_219) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_371: i32 loc(callsite(#loc2 at #loc176)), %right_idx_372: i32 loc(callsite(#loc2 at #loc176))): + %right_idx_373 = arith.addi %right_idx_371, %right_idx_372 : i32 loc(#loc208) + tt.reduce.return %right_idx_373 : i32 loc(#loc203) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc203) + 
%right_idx_221 = tt.expand_dims %right_idx_220 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc177) + %right_idx_222 = tt.broadcast %right_idx_221 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc178) + %left_idx_223 = tt.reshape %left_idx_218 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc179) + %right_idx_224 = tt.reshape %right_idx_222 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc180) + %cond_225 = arith.cmpi slt, %ileft_212, %iright_213 : tensor<1x16xi32> loc(#loc181) + %eq_226 = arith.cmpi eq, %ileft_212, %iright_213 : tensor<1x16xi32> loc(#loc182) + %cond_227 = arith.cmpi sgt, %left_idx_223, %right_idx_224 : tensor<1x16xi32> loc(#loc183) + %cond_228 = arith.andi %eq_226, %cond_227 : tensor<1x16xi1> loc(#loc184) + %cond_229 = arith.ori %cond_225, %cond_228 : tensor<1x16xi1> loc(#loc185) + %cond_230 = arith.extui %cond_229 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc186) + %cond_231 = arith.xori %cond_230, %flip_129 : tensor<1x16xi32> loc(#loc186) + %cond_232 = arith.cmpi ne, %cond_231, %cst_0 : tensor<1x16xi32> loc(#loc187) + %ret_233 = arith.xori %ileft_212, %iright_213 : tensor<1x16xi32> loc(#loc188) + %ret_234 = arith.select %cond_232, %ret_233, %cst_0 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc189) + %ret_235 = arith.xori %ret_199, %ret_234 : tensor<1x16xi32> loc(#loc190) + %new_idxs_236 = arith.xori %left_idx_223, %right_idx_224 : tensor<1x16xi32> loc(#loc191) + %new_idxs_237 = arith.select %cond_232, %new_idxs_236, %cst_0 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc192) + %new_idxs_238 = arith.xori %new_idxs_202, %new_idxs_237 : tensor<1x16xi32> loc(#loc193) + %y_239 = tt.reshape %ret_235 : tensor<1x16xi32> -> tensor<1x2x8xi32> loc(#loc156) + %ileft_240 = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<1x2x8xi32> loc(#loc158) + %ileft_241 = arith.muli %y_239, %ileft_240 : tensor<1x2x8xi32> loc(#loc158) + %ileft_242 = "tt.reduce"(%ileft_241) <{axis = 1 : i32}> ({ + ^bb0(%ileft_371: i32 loc(callsite(#loc2 at #loc159)), 
%ileft_372: i32 loc(callsite(#loc2 at #loc159))): + %ileft_373 = arith.addi %ileft_371, %ileft_372 : i32 loc(#loc205) + tt.reduce.return %ileft_373 : i32 loc(#loc195) + }) : (tensor<1x2x8xi32>) -> tensor<1x8xi32> loc(#loc195) + %ileft_243 = tt.expand_dims %ileft_242 {axis = 1 : i32} : tensor<1x8xi32> -> tensor<1x1x8xi32> loc(#loc160) + %ileft_244 = tt.broadcast %ileft_243 : tensor<1x1x8xi32> -> tensor<1x2x8xi32> loc(#loc161) + %iright_245 = arith.muli %y_239, %flip_128 : tensor<1x2x8xi32> loc(#loc162) + %iright_246 = "tt.reduce"(%iright_245) <{axis = 1 : i32}> ({ + ^bb0(%iright_371: i32 loc(callsite(#loc2 at #loc163)), %iright_372: i32 loc(callsite(#loc2 at #loc163))): + %iright_373 = arith.addi %iright_371, %iright_372 : i32 loc(#loc206) + tt.reduce.return %iright_373 : i32 loc(#loc197) + }) : (tensor<1x2x8xi32>) -> tensor<1x8xi32> loc(#loc197) + %iright_247 = tt.expand_dims %iright_246 {axis = 1 : i32} : tensor<1x8xi32> -> tensor<1x1x8xi32> loc(#loc164) + %iright_248 = tt.broadcast %iright_247 : tensor<1x1x8xi32> -> tensor<1x2x8xi32> loc(#loc165) + %ileft_249 = tt.reshape %ileft_244 : tensor<1x2x8xi32> -> tensor<1x16xi32> loc(#loc166) + %iright_250 = tt.reshape %iright_248 : tensor<1x2x8xi32> -> tensor<1x16xi32> loc(#loc167) + %y_idx_251 = tt.reshape %new_idxs_238 : tensor<1x16xi32> -> tensor<1x2x8xi32> loc(#loc168) + %left_idx_252 = arith.muli %y_idx_251, %ileft_240 : tensor<1x2x8xi32> loc(#loc170) + %left_idx_253 = "tt.reduce"(%left_idx_252) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_371: i32 loc(callsite(#loc2 at #loc171)), %left_idx_372: i32 loc(callsite(#loc2 at #loc171))): + %left_idx_373 = arith.addi %left_idx_371, %left_idx_372 : i32 loc(#loc207) + tt.reduce.return %left_idx_373 : i32 loc(#loc200) + }) : (tensor<1x2x8xi32>) -> tensor<1x8xi32> loc(#loc200) + %left_idx_254 = tt.expand_dims %left_idx_253 {axis = 1 : i32} : tensor<1x8xi32> -> tensor<1x1x8xi32> loc(#loc172) + %left_idx_255 = tt.broadcast %left_idx_254 : tensor<1x1x8xi32> -> tensor<1x2x8xi32> 
loc(#loc173) + %right_idx_256 = arith.muli %y_idx_251, %flip_128 : tensor<1x2x8xi32> loc(#loc175) + %right_idx_257 = "tt.reduce"(%right_idx_256) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_371: i32 loc(callsite(#loc2 at #loc176)), %right_idx_372: i32 loc(callsite(#loc2 at #loc176))): + %right_idx_373 = arith.addi %right_idx_371, %right_idx_372 : i32 loc(#loc208) + tt.reduce.return %right_idx_373 : i32 loc(#loc203) + }) : (tensor<1x2x8xi32>) -> tensor<1x8xi32> loc(#loc203) + %right_idx_258 = tt.expand_dims %right_idx_257 {axis = 1 : i32} : tensor<1x8xi32> -> tensor<1x1x8xi32> loc(#loc177) + %right_idx_259 = tt.broadcast %right_idx_258 : tensor<1x1x8xi32> -> tensor<1x2x8xi32> loc(#loc178) + %left_idx_260 = tt.reshape %left_idx_255 : tensor<1x2x8xi32> -> tensor<1x16xi32> loc(#loc179) + %right_idx_261 = tt.reshape %right_idx_259 : tensor<1x2x8xi32> -> tensor<1x16xi32> loc(#loc180) + %cond_262 = arith.cmpi slt, %ileft_249, %iright_250 : tensor<1x16xi32> loc(#loc181) + %eq_263 = arith.cmpi eq, %ileft_249, %iright_250 : tensor<1x16xi32> loc(#loc182) + %cond_264 = arith.cmpi sgt, %left_idx_260, %right_idx_261 : tensor<1x16xi32> loc(#loc183) + %cond_265 = arith.andi %eq_263, %cond_264 : tensor<1x16xi1> loc(#loc184) + %cond_266 = arith.ori %cond_262, %cond_265 : tensor<1x16xi1> loc(#loc185) + %ret_267 = arith.xori %ileft_249, %iright_250 : tensor<1x16xi32> loc(#loc188) + %ret_268 = arith.select %cond_266, %ret_267, %cst_0 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc189) + %ret_269 = arith.xori %ret_235, %ret_268 : tensor<1x16xi32> loc(#loc190) + %new_idxs_270 = arith.xori %left_idx_260, %right_idx_261 : tensor<1x16xi32> loc(#loc191) + %new_idxs_271 = arith.select %cond_266, %new_idxs_270, %cst_0 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc192) + %new_idxs_272 = arith.xori %new_idxs_238, %new_idxs_271 : tensor<1x16xi32> loc(#loc193) + %y_273 = tt.reshape %ret_269 : tensor<1x16xi32> -> tensor<2x2x4xi32> loc(#loc156) + %ileft_274 = arith.muli %y_273, %ileft_131 : tensor<2x2x4xi32> 
loc(#loc158) + %ileft_275 = "tt.reduce"(%ileft_274) <{axis = 1 : i32}> ({ + ^bb0(%ileft_371: i32 loc(callsite(#loc2 at #loc159)), %ileft_372: i32 loc(callsite(#loc2 at #loc159))): + %ileft_373 = arith.addi %ileft_371, %ileft_372 : i32 loc(#loc205) + tt.reduce.return %ileft_373 : i32 loc(#loc195) + }) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc195) + %ileft_276 = tt.expand_dims %ileft_275 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc160) + %ileft_277 = tt.broadcast %ileft_276 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc161) + %iright_278 = arith.muli %y_273, %flip_53 : tensor<2x2x4xi32> loc(#loc162) + %iright_279 = "tt.reduce"(%iright_278) <{axis = 1 : i32}> ({ + ^bb0(%iright_371: i32 loc(callsite(#loc2 at #loc163)), %iright_372: i32 loc(callsite(#loc2 at #loc163))): + %iright_373 = arith.addi %iright_371, %iright_372 : i32 loc(#loc206) + tt.reduce.return %iright_373 : i32 loc(#loc197) + }) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc197) + %iright_280 = tt.expand_dims %iright_279 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc164) + %iright_281 = tt.broadcast %iright_280 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc165) + %ileft_282 = tt.reshape %ileft_277 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc166) + %iright_283 = tt.reshape %iright_281 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc167) + %y_idx_284 = tt.reshape %new_idxs_272 : tensor<1x16xi32> -> tensor<2x2x4xi32> loc(#loc168) + %left_idx_285 = arith.muli %y_idx_284, %ileft_131 : tensor<2x2x4xi32> loc(#loc170) + %left_idx_286 = "tt.reduce"(%left_idx_285) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_371: i32 loc(callsite(#loc2 at #loc171)), %left_idx_372: i32 loc(callsite(#loc2 at #loc171))): + %left_idx_373 = arith.addi %left_idx_371, %left_idx_372 : i32 loc(#loc207) + tt.reduce.return %left_idx_373 : i32 loc(#loc200) + }) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc200) + %left_idx_287 = tt.expand_dims %left_idx_286 {axis = 1 : i32} : 
tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc172) + %left_idx_288 = tt.broadcast %left_idx_287 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc173) + %right_idx_289 = arith.muli %y_idx_284, %flip_53 : tensor<2x2x4xi32> loc(#loc175) + %right_idx_290 = "tt.reduce"(%right_idx_289) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_371: i32 loc(callsite(#loc2 at #loc176)), %right_idx_372: i32 loc(callsite(#loc2 at #loc176))): + %right_idx_373 = arith.addi %right_idx_371, %right_idx_372 : i32 loc(#loc208) + tt.reduce.return %right_idx_373 : i32 loc(#loc203) + }) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc203) + %right_idx_291 = tt.expand_dims %right_idx_290 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc177) + %right_idx_292 = tt.broadcast %right_idx_291 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc178) + %left_idx_293 = tt.reshape %left_idx_288 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc179) + %right_idx_294 = tt.reshape %right_idx_292 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc180) + %cond_295 = arith.cmpi slt, %ileft_282, %iright_283 : tensor<1x16xi32> loc(#loc181) + %eq_296 = arith.cmpi eq, %ileft_282, %iright_283 : tensor<1x16xi32> loc(#loc182) + %cond_297 = arith.cmpi sgt, %left_idx_293, %right_idx_294 : tensor<1x16xi32> loc(#loc183) + %cond_298 = arith.andi %eq_296, %cond_297 : tensor<1x16xi1> loc(#loc184) + %cond_299 = arith.ori %cond_295, %cond_298 : tensor<1x16xi1> loc(#loc185) + %ret_300 = arith.xori %ileft_282, %iright_283 : tensor<1x16xi32> loc(#loc188) + %ret_301 = arith.select %cond_299, %ret_300, %cst_0 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc189) + %ret_302 = arith.xori %ret_269, %ret_301 : tensor<1x16xi32> loc(#loc190) + %new_idxs_303 = arith.xori %left_idx_293, %right_idx_294 : tensor<1x16xi32> loc(#loc191) + %new_idxs_304 = arith.select %cond_299, %new_idxs_303, %cst_0 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc192) + %new_idxs_305 = arith.xori %new_idxs_272, %new_idxs_304 : tensor<1x16xi32> loc(#loc193) + %y_306 = 
tt.reshape %ret_302 : tensor<1x16xi32> -> tensor<4x2x2xi32> loc(#loc156) + %ileft_307 = arith.muli %y_306, %ileft_56 : tensor<4x2x2xi32> loc(#loc158) + %ileft_308 = "tt.reduce"(%ileft_307) <{axis = 1 : i32}> ({ + ^bb0(%ileft_371: i32 loc(callsite(#loc2 at #loc159)), %ileft_372: i32 loc(callsite(#loc2 at #loc159))): + %ileft_373 = arith.addi %ileft_371, %ileft_372 : i32 loc(#loc205) + tt.reduce.return %ileft_373 : i32 loc(#loc195) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc195) + %ileft_309 = tt.expand_dims %ileft_308 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc160) + %ileft_310 = tt.broadcast %ileft_309 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc161) + %iright_311 = arith.muli %y_306, %flip_17 : tensor<4x2x2xi32> loc(#loc162) + %iright_312 = "tt.reduce"(%iright_311) <{axis = 1 : i32}> ({ + ^bb0(%iright_371: i32 loc(callsite(#loc2 at #loc163)), %iright_372: i32 loc(callsite(#loc2 at #loc163))): + %iright_373 = arith.addi %iright_371, %iright_372 : i32 loc(#loc206) + tt.reduce.return %iright_373 : i32 loc(#loc197) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc197) + %iright_313 = tt.expand_dims %iright_312 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc164) + %iright_314 = tt.broadcast %iright_313 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc165) + %ileft_315 = tt.reshape %ileft_310 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc166) + %iright_316 = tt.reshape %iright_314 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc167) + %y_idx_317 = tt.reshape %new_idxs_305 : tensor<1x16xi32> -> tensor<4x2x2xi32> loc(#loc168) + %left_idx_318 = arith.muli %y_idx_317, %ileft_56 : tensor<4x2x2xi32> loc(#loc170) + %left_idx_319 = "tt.reduce"(%left_idx_318) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_371: i32 loc(callsite(#loc2 at #loc171)), %left_idx_372: i32 loc(callsite(#loc2 at #loc171))): + %left_idx_373 = arith.addi %left_idx_371, %left_idx_372 : i32 loc(#loc207) + tt.reduce.return %left_idx_373 : i32 loc(#loc200) + 
}) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc200) + %left_idx_320 = tt.expand_dims %left_idx_319 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc172) + %left_idx_321 = tt.broadcast %left_idx_320 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc173) + %right_idx_322 = arith.muli %y_idx_317, %flip_17 : tensor<4x2x2xi32> loc(#loc175) + %right_idx_323 = "tt.reduce"(%right_idx_322) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_371: i32 loc(callsite(#loc2 at #loc176)), %right_idx_372: i32 loc(callsite(#loc2 at #loc176))): + %right_idx_373 = arith.addi %right_idx_371, %right_idx_372 : i32 loc(#loc208) + tt.reduce.return %right_idx_373 : i32 loc(#loc203) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc203) + %right_idx_324 = tt.expand_dims %right_idx_323 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc177) + %right_idx_325 = tt.broadcast %right_idx_324 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc178) + %left_idx_326 = tt.reshape %left_idx_321 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc179) + %right_idx_327 = tt.reshape %right_idx_325 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc180) + %cond_328 = arith.cmpi slt, %ileft_315, %iright_316 : tensor<1x16xi32> loc(#loc181) + %eq_329 = arith.cmpi eq, %ileft_315, %iright_316 : tensor<1x16xi32> loc(#loc182) + %cond_330 = arith.cmpi sgt, %left_idx_326, %right_idx_327 : tensor<1x16xi32> loc(#loc183) + %cond_331 = arith.andi %eq_329, %cond_330 : tensor<1x16xi1> loc(#loc184) + %cond_332 = arith.ori %cond_328, %cond_331 : tensor<1x16xi1> loc(#loc185) + %ret_333 = arith.xori %ileft_315, %iright_316 : tensor<1x16xi32> loc(#loc188) + %ret_334 = arith.select %cond_332, %ret_333, %cst_0 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc189) + %ret_335 = arith.xori %ret_302, %ret_334 : tensor<1x16xi32> loc(#loc190) + %new_idxs_336 = arith.xori %left_idx_326, %right_idx_327 : tensor<1x16xi32> loc(#loc191) + %new_idxs_337 = arith.select %cond_332, %new_idxs_336, %cst_0 : tensor<1x16xi1>, 
tensor<1x16xi32> loc(#loc192) + %new_idxs_338 = arith.xori %new_idxs_305, %new_idxs_337 : tensor<1x16xi32> loc(#loc193) + %y_339 = tt.reshape %ret_335 : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc156) + %ileft_340 = arith.muli %y_339, %ileft : tensor<8x2x1xi32> loc(#loc158) + %ileft_341 = "tt.reduce"(%ileft_340) <{axis = 1 : i32}> ({ + ^bb0(%ileft_371: i32 loc(callsite(#loc2 at #loc159)), %ileft_372: i32 loc(callsite(#loc2 at #loc159))): + %ileft_373 = arith.addi %ileft_371, %ileft_372 : i32 loc(#loc205) + tt.reduce.return %ileft_373 : i32 loc(#loc195) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc195) + %ileft_342 = tt.expand_dims %ileft_341 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc160) + %ileft_343 = tt.broadcast %ileft_342 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc161) + %iright_344 = arith.muli %y_339, %iright : tensor<8x2x1xi32> loc(#loc162) + %iright_345 = "tt.reduce"(%iright_344) <{axis = 1 : i32}> ({ + ^bb0(%iright_371: i32 loc(callsite(#loc2 at #loc163)), %iright_372: i32 loc(callsite(#loc2 at #loc163))): + %iright_373 = arith.addi %iright_371, %iright_372 : i32 loc(#loc206) + tt.reduce.return %iright_373 : i32 loc(#loc197) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc197) + %iright_346 = tt.expand_dims %iright_345 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc164) + %iright_347 = tt.broadcast %iright_346 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc165) + %ileft_348 = tt.reshape %ileft_343 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc166) + %iright_349 = tt.reshape %iright_347 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc167) + %y_idx_350 = tt.reshape %new_idxs_338 : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc168) + %left_idx_351 = arith.muli %y_idx_350, %ileft : tensor<8x2x1xi32> loc(#loc170) + %left_idx_352 = "tt.reduce"(%left_idx_351) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_371: i32 loc(callsite(#loc2 at #loc171)), %left_idx_372: i32 loc(callsite(#loc2 at #loc171))): + 
%left_idx_373 = arith.addi %left_idx_371, %left_idx_372 : i32 loc(#loc207) + tt.reduce.return %left_idx_373 : i32 loc(#loc200) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc200) + %left_idx_353 = tt.expand_dims %left_idx_352 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc172) + %left_idx_354 = tt.broadcast %left_idx_353 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc173) + %right_idx_355 = arith.muli %y_idx_350, %iright : tensor<8x2x1xi32> loc(#loc175) + %right_idx_356 = "tt.reduce"(%right_idx_355) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_371: i32 loc(callsite(#loc2 at #loc176)), %right_idx_372: i32 loc(callsite(#loc2 at #loc176))): + %right_idx_373 = arith.addi %right_idx_371, %right_idx_372 : i32 loc(#loc208) + tt.reduce.return %right_idx_373 : i32 loc(#loc203) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc203) + %right_idx_357 = tt.expand_dims %right_idx_356 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc177) + %right_idx_358 = tt.broadcast %right_idx_357 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc178) + %left_idx_359 = tt.reshape %left_idx_354 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc179) + %right_idx_360 = tt.reshape %right_idx_358 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc180) + %cond_361 = arith.cmpi slt, %ileft_348, %iright_349 : tensor<1x16xi32> loc(#loc181) + %eq_362 = arith.cmpi eq, %ileft_348, %iright_349 : tensor<1x16xi32> loc(#loc182) + %cond_363 = arith.cmpi sgt, %left_idx_359, %right_idx_360 : tensor<1x16xi32> loc(#loc183) + %cond_364 = arith.andi %eq_362, %cond_363 : tensor<1x16xi1> loc(#loc184) + %cond_365 = arith.ori %cond_361, %cond_364 : tensor<1x16xi1> loc(#loc185) + %new_idxs_366 = arith.xori %left_idx_359, %right_idx_360 : tensor<1x16xi32> loc(#loc191) + %new_idxs_367 = arith.select %cond_365, %new_idxs_366, %cst_0 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc192) + %new_idxs_368 = arith.xori %new_idxs_338, %new_idxs_367 : tensor<1x16xi32> loc(#loc193) + %tmp7 = arith.extsi 
%tmp0_14 : tensor<1x16xi32> to tensor<1x16xi64> loc(#loc139) + %tmp10_369 = arith.select %tmp0_13, %tmp7, %tmp10 : tensor<1x16xi1>, tensor<1x16xi64> loc(#loc83) + %tmp11 = "tt.reduce"(%tmp10_369) <{axis = 1 : i32}> ({ + ^bb0(%tmp11_371: i64 loc(callsite(#loc2 at #loc140)), %tmp11_372: i64 loc(callsite(#loc2 at #loc140))): + %tmp11_373 = arith.addi %tmp11_371, %tmp11_372 : i64 loc(#loc194) + tt.reduce.return %tmp11_373 : i64 loc(#loc154) + }) : (tensor<1x16xi64>) -> tensor<1xi64> loc(#loc154) + %tmp11_370 = tt.expand_dims %tmp11 {axis = 1 : i32} : tensor<1xi64> -> tensor<1x1xi64> loc(#loc141) + %tmp14 = arith.trunci %tmp11_370 : tensor<1x1xi64> to tensor<1x1xi32> loc(#loc142) + %0 = arith.muli %xoffset, %c16_i32 : i32 loc(#loc68) + %1 = tt.splat %0 : i32 -> tensor<1x16xi32> loc(#loc143) + %2 = arith.addi %r0_index_4, %1 : tensor<1x16xi32> loc(#loc69) + %3 = tt.splat %out_ptr2 : !tt.ptr -> tensor<1x16x!tt.ptr> loc(#loc70) + %4 = tt.addptr %3, %2 : tensor<1x16x!tt.ptr>, tensor<1x16xi32> loc(#loc70) + tt.store %4, %new_idxs_368, %tmp0_13 : tensor<1x16x!tt.ptr> loc(#loc71) + %5 = tt.addptr %out_ptr3, %xoffset : !tt.ptr, i32 loc(#loc72) + %6 = tt.splat %5 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc72) + tt.store %6, %tmp14, %xmask_3 : tensor<1x1x!tt.ptr> loc(#loc73) + tt.return loc(#loc74) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":36:49) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":26:21) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":44:34) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":36:38) +#loc7 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":24:28) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":27:28) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":27:38) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":33:19) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":34:19) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":36:35) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":36:45) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":36:30) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":36:54) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":38:19) +#loc17 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:41) +#loc19 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:44) +#loc20 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:60) +#loc21 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:68) +#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":533:22) +#loc24 = 
loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":537:21) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:40) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:65) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:78) +#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:41) +#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:67) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:80) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":540:30) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":541:32) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":546:29) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:36) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:23) +#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":290:25) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:53) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:66) +#loc44 = 
loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:37) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:23) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:54) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:67) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":553:36) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":554:38) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":574:22) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":591:21) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:40) +#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:29) +#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:23) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:19) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:28) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:38) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:46) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:15) +#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:48) +#loc62 = 
loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:59) +#loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:22) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":42:19) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":45:29) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":48:21) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":49:35) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":49:32) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":49:25) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":49:47) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":50:25) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":50:37) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":50:4) +#loc80 = loc("tmp0"(#loc1)) +#loc81 = loc("xmask"(#loc3)) +#loc82 = loc(callsite(#loc2 at #loc4)) +#loc83 = loc("tmp10"(#loc5)) +#loc84 = loc("tmp0"(#loc6)) +#loc85 = loc("xoffset"(#loc7)) +#loc86 = loc("r0_index"(#loc8)) +#loc87 = loc("r0_index"(#loc9)) +#loc88 = loc("x0"(#loc10)) +#loc89 = loc("x1"(#loc11)) +#loc90 = loc("tmp0"(#loc12)) +#loc91 = loc("tmp0"(#loc13)) +#loc92 = loc("tmp0"(#loc14)) 
+#loc93 = loc("tmp0"(#loc15)) +#loc94 = loc("tmp2"(#loc16)) +#loc95 = loc("flip"(#loc17)) +#loc97 = loc("flip"(#loc19)) +#loc98 = loc("flip"(#loc20)) +#loc99 = loc("flip"(#loc21)) +#loc100 = loc("y"(#loc22)) +#loc101 = loc("left_mask"(#loc24)) +#loc102 = loc("ileft"(#loc25)) +#loc104 = loc("ileft"(#loc29)) +#loc105 = loc("ileft"(#loc30)) +#loc106 = loc("iright"(#loc31)) +#loc108 = loc("iright"(#loc33)) +#loc109 = loc("iright"(#loc34)) +#loc110 = loc("ileft"(#loc35)) +#loc111 = loc("iright"(#loc36)) +#loc112 = loc("y_idx"(#loc37)) +#loc113 = loc("left_idx"(#loc38)) +#loc114 = loc("left_idx"(#loc39)) +#loc115 = loc("input"(#loc40)) +#loc117 = loc("left_idx"(#loc42)) +#loc118 = loc("left_idx"(#loc43)) +#loc119 = loc("right_idx"(#loc44)) +#loc120 = loc("right_idx"(#loc45)) +#loc122 = loc("right_idx"(#loc47)) +#loc123 = loc("right_idx"(#loc48)) +#loc124 = loc("left_idx"(#loc49)) +#loc125 = loc("right_idx"(#loc50)) +#loc126 = loc("cond"(#loc51)) +#loc127 = loc("eq"(#loc52)) +#loc128 = loc("cond"(#loc53)) +#loc129 = loc("cond"(#loc54)) +#loc130 = loc("cond"(#loc55)) +#loc131 = loc("cond"(#loc56)) +#loc132 = loc("cond"(#loc57)) +#loc133 = loc("ret"(#loc58)) +#loc134 = loc("ret"(#loc59)) +#loc135 = loc("ret"(#loc60)) +#loc136 = loc("new_idxs"(#loc61)) +#loc137 = loc("new_idxs"(#loc62)) +#loc138 = loc("new_idxs"(#loc63)) +#loc139 = loc("tmp7"(#loc64)) +#loc141 = loc("tmp11"(#loc66)) +#loc142 = loc("tmp14"(#loc67)) +#loc143 = loc(fused[#loc69, #loc68]) +#loc144 = loc(fused[#loc90, #loc88]) +#loc145 = loc(fused[#loc91, #loc80]) +#loc146 = loc(fused[#loc93, #loc81]) +#loc147 = loc(callsite(#loc95 at #loc96)) +#loc148 = loc(callsite(#loc97 at #loc96)) +#loc149 = loc(callsite(#loc98 at #loc96)) +#loc150 = loc(callsite(#loc99 at #loc96)) +#loc152 = loc("cond"(#loc126)) +#loc153 = loc("eq"(#loc127)) +#loc154 = loc(callsite(#loc26 at #loc140)) +#loc156 = loc(callsite(#loc100 at #loc151)) +#loc157 = loc(callsite(#loc101 at #loc151)) +#loc158 = loc(callsite(#loc102 at #loc151)) 
+#loc160 = loc(callsite(#loc104 at #loc151)) +#loc161 = loc(callsite(#loc105 at #loc151)) +#loc162 = loc(callsite(#loc106 at #loc151)) +#loc164 = loc(callsite(#loc108 at #loc151)) +#loc165 = loc(callsite(#loc109 at #loc151)) +#loc166 = loc(callsite(#loc110 at #loc151)) +#loc167 = loc(callsite(#loc111 at #loc151)) +#loc168 = loc(callsite(#loc112 at #loc151)) +#loc169 = loc(callsite(#loc113 at #loc151)) +#loc170 = loc(callsite(#loc114 at #loc151)) +#loc172 = loc(callsite(#loc117 at #loc151)) +#loc173 = loc(callsite(#loc118 at #loc151)) +#loc174 = loc(callsite(#loc119 at #loc151)) +#loc175 = loc(callsite(#loc120 at #loc151)) +#loc177 = loc(callsite(#loc122 at #loc151)) +#loc178 = loc(callsite(#loc123 at #loc151)) +#loc179 = loc(callsite(#loc124 at #loc151)) +#loc180 = loc(callsite(#loc125 at #loc151)) +#loc181 = loc(callsite(#loc152 at #loc151)) +#loc182 = loc(callsite(#loc153 at #loc151)) +#loc183 = loc(callsite(#loc128 at #loc151)) +#loc184 = loc(callsite(#loc129 at #loc151)) +#loc185 = loc(callsite(#loc130 at #loc151)) +#loc186 = loc(callsite(#loc131 at #loc151)) +#loc187 = loc(callsite(#loc132 at #loc151)) +#loc188 = loc(callsite(#loc133 at #loc151)) +#loc189 = loc(callsite(#loc134 at #loc151)) +#loc190 = loc(callsite(#loc135 at #loc151)) +#loc191 = loc(callsite(#loc136 at #loc151)) +#loc192 = loc(callsite(#loc137 at #loc151)) +#loc193 = loc(callsite(#loc138 at #loc151)) +#loc194 = loc(callsite(#loc28 at #loc154)) +#loc195 = loc(callsite(#loc26 at #loc159)) +#loc197 = loc(callsite(#loc26 at #loc163)) +#loc199 = loc(callsite(#loc115 at #loc171)) +#loc200 = loc(callsite(#loc26 at #loc171)) +#loc202 = loc(callsite(#loc115 at #loc176)) +#loc203 = loc(callsite(#loc26 at #loc176)) +#loc205 = loc(callsite(#loc28 at #loc195)) +#loc206 = loc(callsite(#loc28 at #loc197)) +#loc207 = loc(callsite(#loc28 at #loc200)) +#loc208 = loc(callsite(#loc28 at #loc203)) diff --git 
a/SpecForge-ext/cache/compiled_kernels/triton/7/HXTWITM43AJTYBYDCMJ3SBODS23DJOLLO4SHKJFN4YWQSGNDK4FQ/__grp__triton_red_fused__to_copy_sum_2.json b/SpecForge-ext/cache/compiled_kernels/triton/7/HXTWITM43AJTYBYDCMJ3SBODS23DJOLLO4SHKJFN4YWQSGNDK4FQ/__grp__triton_red_fused__to_copy_sum_2.json new file mode 100644 index 0000000000000000000000000000000000000000..f774d84e8ed0ecd2b5c7894fd48893e5b8d0a9c6 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/HXTWITM43AJTYBYDCMJ3SBODS23DJOLLO4SHKJFN4YWQSGNDK4FQ/__grp__triton_red_fused__to_copy_sum_2.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused__to_copy_sum_2.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/HXTWITM43AJTYBYDCMJ3SBODS23DJOLLO4SHKJFN4YWQSGNDK4FQ/triton_red_fused__to_copy_sum_2.source", "triton_red_fused__to_copy_sum_2.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/HXTWITM43AJTYBYDCMJ3SBODS23DJOLLO4SHKJFN4YWQSGNDK4FQ/triton_red_fused__to_copy_sum_2.ttir", "triton_red_fused__to_copy_sum_2.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/HXTWITM43AJTYBYDCMJ3SBODS23DJOLLO4SHKJFN4YWQSGNDK4FQ/triton_red_fused__to_copy_sum_2.ttgir", "triton_red_fused__to_copy_sum_2.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/HXTWITM43AJTYBYDCMJ3SBODS23DJOLLO4SHKJFN4YWQSGNDK4FQ/triton_red_fused__to_copy_sum_2.llir", "triton_red_fused__to_copy_sum_2.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/HXTWITM43AJTYBYDCMJ3SBODS23DJOLLO4SHKJFN4YWQSGNDK4FQ/triton_red_fused__to_copy_sum_2.ptx", "triton_red_fused__to_copy_sum_2.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/HXTWITM43AJTYBYDCMJ3SBODS23DJOLLO4SHKJFN4YWQSGNDK4FQ/triton_red_fused__to_copy_sum_2.cubin", "triton_red_fused__to_copy_sum_2.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/HXTWITM43AJTYBYDCMJ3SBODS23DJOLLO4SHKJFN4YWQSGNDK4FQ/triton_red_fused__to_copy_sum_2.json"}} \ No newline at 
end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/HXTWITM43AJTYBYDCMJ3SBODS23DJOLLO4SHKJFN4YWQSGNDK4FQ/triton_red_fused__to_copy_sum_2.cubin b/SpecForge-ext/cache/compiled_kernels/triton/7/HXTWITM43AJTYBYDCMJ3SBODS23DJOLLO4SHKJFN4YWQSGNDK4FQ/triton_red_fused__to_copy_sum_2.cubin new file mode 100644 index 0000000000000000000000000000000000000000..a36847395e00b0848183e6a997dfb0baf7aeb6e7 Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/7/HXTWITM43AJTYBYDCMJ3SBODS23DJOLLO4SHKJFN4YWQSGNDK4FQ/triton_red_fused__to_copy_sum_2.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/HXTWITM43AJTYBYDCMJ3SBODS23DJOLLO4SHKJFN4YWQSGNDK4FQ/triton_red_fused__to_copy_sum_2.json b/SpecForge-ext/cache/compiled_kernels/triton/7/HXTWITM43AJTYBYDCMJ3SBODS23DJOLLO4SHKJFN4YWQSGNDK4FQ/triton_red_fused__to_copy_sum_2.json new file mode 100644 index 0000000000000000000000000000000000000000..f737489312fbe2bdc5a3a555c46a66a359a2eef3 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/HXTWITM43AJTYBYDCMJ3SBODS23DJOLLO4SHKJFN4YWQSGNDK4FQ/triton_red_fused__to_copy_sum_2.json @@ -0,0 +1 @@ +{"hash": "3de7644d9cd8133c07031313b905c396b634b96b77247524ade62d0919a3570b", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 2, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": 
"sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused__to_copy_sum_2"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/HXTWITM43AJTYBYDCMJ3SBODS23DJOLLO4SHKJFN4YWQSGNDK4FQ/triton_red_fused__to_copy_sum_2.llir b/SpecForge-ext/cache/compiled_kernels/triton/7/HXTWITM43AJTYBYDCMJ3SBODS23DJOLLO4SHKJFN4YWQSGNDK4FQ/triton_red_fused__to_copy_sum_2.llir new file mode 100644 index 0000000000000000000000000000000000000000..43de74b744d620c1a039b35a4892a1bd0d9b0ce0 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/HXTWITM43AJTYBYDCMJ3SBODS23DJOLLO4SHKJFN4YWQSGNDK4FQ/triton_red_fused__to_copy_sum_2.llir @@ -0,0 +1,158 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused__to_copy_sum_2(ptr addrspace(1) %0, ptr addrspace(1) %1, i64 %2, i64 %3, i32 %4, i32 %5, ptr addrspace(1) readnone captures(none) %6, ptr addrspace(1) readnone captures(none) %7) local_unnamed_addr #0 !dbg !4 { + %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %.fr5 = freeze i32 %4, !dbg !8 + %10 = icmp slt i32 %9, %.fr5, !dbg !8 + %11 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %12 = and i32 %11, 15, !dbg !9 + %13 = zext nneg i32 %9 to i64, !dbg !10 + %14 = icmp sgt i32 %5, 0, !dbg !11 + br i1 %14, label %.lr.ph, label %._crit_edge, !dbg !11 + +.lr.ph: ; preds = %8 + %15 = mul i64 %2, %13, !dbg !10 + %16 = getelementptr i32, ptr addrspace(1) %0, i64 %15 + br i1 %10, label %.lr.ph.split, label %.lr.ph.split.us + +.lr.ph.split.us: ; preds = %.lr.ph, %.lr.ph.split.us + %17 = phi i32 [ %23, %.lr.ph.split.us ], [ 0, %.lr.ph ] + %18 = or 
disjoint i32 %17, %12, !dbg !12 + %19 = sext i32 %18 to i64, !dbg !13 + %20 = getelementptr i32, ptr addrspace(1) %16, i64 %19, !dbg !14 + %21 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #3, !dbg !15 + %22 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_first.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %20, i64 %21, i1 false) #3, !dbg !15 + %23 = add i32 %17, 16, !dbg !11 + %24 = icmp slt i32 %23, %5, !dbg !11 + br i1 %24, label %.lr.ph.split.us, label %._crit_edge, !dbg !11 + +.lr.ph.split: ; preds = %.lr.ph, %.lr.ph.split + %25 = phi i64 [ %33, %.lr.ph.split ], [ 0, %.lr.ph ] + %26 = phi i32 [ %34, %.lr.ph.split ], [ 0, %.lr.ph ] + %27 = or disjoint i32 %26, %12, !dbg !12 + %28 = icmp slt i32 %27, %5, !dbg !16 + %29 = sext i32 %27 to i64, !dbg !13 + %30 = getelementptr i32, ptr addrspace(1) %16, i64 %29, !dbg !14 + %31 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #3, !dbg !15 + %32 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_first.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %30, i64 %31, i1 %28) #3, !dbg !15 + %narrow = select i1 %28, i32 %32, i32 0, !dbg !17 + %spec.select = sext i32 %narrow to i64, !dbg !17 + %33 = add i64 %25, %spec.select, !dbg !17 + %34 = add i32 %26, 16, !dbg !11 + %35 = icmp slt i32 %34, %5, !dbg !11 + br i1 %35, label %.lr.ph.split, label %._crit_edge, !dbg !11 + +._crit_edge: ; preds = %.lr.ph.split.us, %.lr.ph.split, %8 + %.lcssa = phi i64 [ 0, %8 ], [ %33, %.lr.ph.split ], [ 0, %.lr.ph.split.us ], !dbg !18 + %extelt.offset = lshr i64 %.lcssa, 32, !dbg !19 + %36 = trunc nuw i64 %extelt.offset to i32, !dbg !19 + %37 = trunc i64 %.lcssa to i32, !dbg !19 + %38 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %37, i32 8, i32 31), !dbg !19 + %39 = tail call i32 
@llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %36, i32 8, i32 31), !dbg !19 + %40 = insertelement <2 x i32> poison, i32 %38, i64 0, !dbg !19 + %41 = insertelement <2 x i32> %40, i32 %39, i64 1, !dbg !19 + %42 = bitcast <2 x i32> %41 to i64, !dbg !19 + %43 = add i64 %.lcssa, %42, !dbg !23 + %extelt.offset2 = lshr i64 %43, 32, !dbg !19 + %44 = trunc nuw i64 %extelt.offset2 to i32, !dbg !19 + %45 = trunc i64 %43 to i32, !dbg !19 + %46 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %45, i32 4, i32 31), !dbg !19 + %47 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %44, i32 4, i32 31), !dbg !19 + %48 = insertelement <2 x i32> poison, i32 %46, i64 0, !dbg !19 + %49 = insertelement <2 x i32> %48, i32 %47, i64 1, !dbg !19 + %50 = bitcast <2 x i32> %49 to i64, !dbg !19 + %51 = add i64 %43, %50, !dbg !23 + %extelt.offset3 = lshr i64 %51, 32, !dbg !19 + %52 = trunc nuw i64 %extelt.offset3 to i32, !dbg !19 + %53 = trunc i64 %51 to i32, !dbg !19 + %54 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %53, i32 2, i32 31), !dbg !19 + %55 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %52, i32 2, i32 31), !dbg !19 + %56 = insertelement <2 x i32> poison, i32 %54, i64 0, !dbg !19 + %57 = insertelement <2 x i32> %56, i32 %55, i64 1, !dbg !19 + %58 = bitcast <2 x i32> %57 to i64, !dbg !19 + %59 = add i64 %51, %58, !dbg !23 + %extelt.offset4 = lshr i64 %59, 32, !dbg !19 + %60 = trunc nuw i64 %extelt.offset4 to i32, !dbg !19 + %61 = trunc i64 %59 to i32, !dbg !19 + %62 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %61, i32 1, i32 31), !dbg !19 + %63 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %60, i32 1, i32 31), !dbg !19 + %64 = insertelement <2 x i32> poison, i32 %62, i64 0, !dbg !19 + %65 = insertelement <2 x i32> %64, i32 %63, i64 1, !dbg !19 + %66 = bitcast <2 x i32> %65 to i64, !dbg !19 + %67 = add i64 %59, %66, !dbg !23 + %.frozen = freeze i64 %3, !dbg !24 + %68 = sdiv i64 %13, %.frozen, !dbg !24 + %69 = mul i64 %68, 
%.frozen, !dbg !25 + %.decomposed = sub i64 %13, %69, !dbg !25 + %70 = trunc i64 %67 to i32, !dbg !26 + %71 = icmp slt i64 %3, 2, !dbg !27 + %72 = icmp sgt i64 %3, 1, !dbg !28 + %73 = select i1 %72, i64 %3, i64 0, !dbg !29 + %74 = zext i1 %71 to i64, !dbg !30 + %75 = add i64 %73, %74, !dbg !31 + %76 = mul i64 %68, %75, !dbg !32 + %77 = getelementptr i32, ptr addrspace(1) %1, i64 %.decomposed, !dbg !33 + %78 = getelementptr i32, ptr addrspace(1) %77, i64 %76, !dbg !33 + %79 = and i32 %11, 63, !dbg !34 + %80 = icmp eq i32 %79, 0, !dbg !34 + %81 = and i1 %80, %10, !dbg !34 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %70, ptr addrspace(1) %78, i1 %81) #3, !dbg !34 + ret void, !dbg !35 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +attributes #0 = { nounwind "nvvm.reqntid"="64" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct 
!DISubprogram(name: "triton_red_fused__to_copy_sum_2", linkageName: "triton_red_fused__to_copy_sum_2", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 21, column: 28, scope: !4) +!8 = !DILocation(line: 23, column: 21, scope: !4) +!9 = !DILocation(line: 24, column: 37, scope: !4) +!10 = !DILocation(line: 34, column: 45, scope: !4) +!11 = !DILocation(line: 28, column: 40, scope: !4) +!12 = !DILocation(line: 29, column: 31, scope: !4) +!13 = !DILocation(line: 34, column: 41, scope: !4) +!14 = !DILocation(line: 34, column: 34, scope: !4) +!15 = !DILocation(line: 34, column: 50, scope: !4) +!16 = !DILocation(line: 30, column: 29, scope: !4) +!17 = !DILocation(line: 38, column: 48, scope: !4) +!18 = !DILocation(line: 27, column: 43, scope: !4) +!19 = !DILocation(line: 291, column: 36, scope: !20, inlinedAt: !22) +!20 = distinct !DILexicalBlockFile(scope: !4, file: !21, discriminator: 0) +!21 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language") +!22 = !DILocation(line: 39, column: 25, scope: !4) +!23 = !DILocation(line: 261, column: 15, scope: !20, inlinedAt: !22) +!24 = !DILocation(line: 41, column: 19, scope: !4) +!25 = !DILocation(line: 40, column: 19, scope: !4) +!26 = !DILocation(line: 42, column: 19, scope: !4) +!27 = !DILocation(line: 43, column: 49, scope: !4) +!28 = !DILocation(line: 43, column: 75, scope: !4) +!29 = !DILocation(line: 43, column: 66, scope: !4) +!30 = !DILocation(line: 43, scope: !4) +!31 = !DILocation(line: 43, column: 57, scope: !4) +!32 = !DILocation(line: 43, column: 34, scope: !4) +!33 = !DILocation(line: 43, column: 25, scope: !4) +!34 = !DILocation(line: 43, column: 88, scope: !4) +!35 = !DILocation(line: 43, column: 4, scope: !4) diff --git 
a/SpecForge-ext/cache/compiled_kernels/triton/7/HXTWITM43AJTYBYDCMJ3SBODS23DJOLLO4SHKJFN4YWQSGNDK4FQ/triton_red_fused__to_copy_sum_2.ptx b/SpecForge-ext/cache/compiled_kernels/triton/7/HXTWITM43AJTYBYDCMJ3SBODS23DJOLLO4SHKJFN4YWQSGNDK4FQ/triton_red_fused__to_copy_sum_2.ptx new file mode 100644 index 0000000000000000000000000000000000000000..d76ce5d538a8c290d72678f9a6febfd9f42b969b --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/HXTWITM43AJTYBYDCMJ3SBODS23DJOLLO4SHKJFN4YWQSGNDK4FQ/triton_red_fused__to_copy_sum_2.ptx @@ -0,0 +1,448 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_red_fused__to_copy_sum_2 // -- Begin function triton_red_fused__to_copy_sum_2 + // @triton_red_fused__to_copy_sum_2 +.visible .entry triton_red_fused__to_copy_sum_2( + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_sum_2_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_sum_2_param_1, + .param .u64 triton_red_fused__to_copy_sum_2_param_2, + .param .u64 triton_red_fused__to_copy_sum_2_param_3, + .param .u32 triton_red_fused__to_copy_sum_2_param_4, + .param .u32 triton_red_fused__to_copy_sum_2_param_5, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_sum_2_param_6, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_sum_2_param_7 +) +.reqntid 64 +{ + .reg .pred %p<13>; + .reg .b32 %r<45>; + .reg .b64 %rd<60>; + .loc 1 18 0 // cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py:18:0 + +// %bb.0: + ld.param.b32 %r8, [triton_red_fused__to_copy_sum_2_param_5]; + ld.param.b64 %rd14, [triton_red_fused__to_copy_sum_2_param_3]; +$L__tmp0: + .loc 1 21 28 // cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py:21:28 + mov.u32 %r9, %ctaid.x; + ld.param.b32 %r10, [triton_red_fused__to_copy_sum_2_param_4]; + .loc 1 24 37 // 
cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py:24:37 + mov.u32 %r2, %tid.x; + .loc 1 34 45 // cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py:34:45 + cvt.u64.u32 %rd1, %r9; + .loc 1 28 40 // cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py:28:40 + setp.lt.s32 %p1, %r8, 1; + mov.b64 %rd58, 0; + cvt.u32.u64 %r42, %rd1; + @%p1 bra $L__BB0_6; +// %bb.1: // %.lr.ph + .loc 1 0 40 // cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py:0:40 + ld.param.b64 %rd13, [triton_red_fused__to_copy_sum_2_param_2]; + ld.param.b64 %rd11, [triton_red_fused__to_copy_sum_2_param_0]; + and.b32 %r3, %r2, 15; + .loc 1 23 21 // cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py:23:21 + setp.lt.s32 %p2, %r42, %r10; + .loc 1 34 45 // cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py:34:45 + mul.lo.s64 %rd16, %rd13, %rd1; + shl.b64 %rd17, %rd16, 2; + add.s64 %rd2, %rd11, %rd17; + @%p2 bra $L__BB0_4; + bra.uni $L__BB0_2; +$L__BB0_4: // %.lr.ph.split.preheader + .loc 1 0 45 // cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py:0:45 + mov.b32 %r44, 0; + mov.b64 %rd58, 0; +$L__BB0_5: // %.lr.ph.split + // =>This Inner Loop Header: Depth=1 + .loc 1 30 29 // cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py:30:29 + add.s32 %r17, %r3, %r44; + setp.lt.s32 %p5, %r17, %r8; + .loc 1 34 34 // cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py:34:34 + mad.wide.s32 %rd24, %r17, 4, %rd2; + .loc 1 34 50 // cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py:34:50 + // begin inline asm + mov.u64 %rd23, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd23, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r16, 0x0; + @%p5 ld.global.L1::evict_first.L2::cache_hint.b32 { %r16 }, [ %rd24 + 0 ], %rd23; + // end inline asm + .loc 1 38 48 // cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py:38:48 + selp.b32 %r18, %r16, 0, %p5; + cvt.s64.s32 %rd26, %r18; + add.s64 %rd58, %rd58, %rd26; + .loc 1 28 40 // 
cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py:28:40 + add.s32 %r44, %r44, 16; + setp.lt.s32 %p6, %r44, %r8; + @%p6 bra $L__BB0_5; + bra.uni $L__BB0_6; +$L__BB0_2: // %.lr.ph.split.us.preheader + .loc 1 0 40 // cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py:0:40 + mov.b32 %r43, 0; +$L__BB0_3: // %.lr.ph.split.us + // =>This Inner Loop Header: Depth=1 + .loc 1 34 41 // cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py:34:41 + add.s32 %r14, %r3, %r43; + .loc 1 34 34 // cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py:34:34 + mad.wide.s32 %rd19, %r14, 4, %rd2; + .loc 1 34 50 // cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py:34:50 + // begin inline asm + mov.u64 %rd18, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd18, 1.0; + // end inline asm + mov.pred %p3, 0; + // begin inline asm + mov.u32 %r13, 0x0; + @%p3 ld.global.L1::evict_first.L2::cache_hint.b32 { %r13 }, [ %rd19 + 0 ], %rd18; + // end inline asm + .loc 1 28 40 // cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py:28:40 + add.s32 %r43, %r43, 16; + setp.lt.s32 %p4, %r43, %r8; + @%p4 bra $L__BB0_3; +$L__BB0_6: // %._crit_edge + .loc 1 0 40 // cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py:0:40 + ld.param.b64 %rd12, [triton_red_fused__to_copy_sum_2_param_1]; +$L__tmp1: + .loc 2 291 36 // standard.py:291:36 @[ cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py:39:25 ] + mov.b64 {_, %r19}, %rd58; + cvt.u32.u64 %r20, %rd58; + shfl.sync.bfly.b32 %r21, %r20, 8, 31, -1; + shfl.sync.bfly.b32 %r22, %r19, 8, 31, -1; + cvt.u64.u32 %rd27, %r21; + cvt.u64.u32 %rd28, %r22; + shl.b64 %rd29, %rd28, 32; + or.b64 %rd30, %rd27, %rd29; + .loc 2 261 15 // standard.py:261:15 @[ cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py:39:25 ] + add.s64 %rd31, %rd58, %rd30; + .loc 2 291 36 // standard.py:291:36 @[ cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py:39:25 ] + mov.b64 {_, %r23}, %rd31; + cvt.u32.u64 %r24, %rd31; + shfl.sync.bfly.b32 %r25, %r24, 4, 31, 
-1; + shfl.sync.bfly.b32 %r26, %r23, 4, 31, -1; + cvt.u64.u32 %rd32, %r25; + cvt.u64.u32 %rd33, %r26; + shl.b64 %rd34, %rd33, 32; + or.b64 %rd35, %rd32, %rd34; + .loc 2 261 15 // standard.py:261:15 @[ cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py:39:25 ] + add.s64 %rd36, %rd31, %rd35; + .loc 2 291 36 // standard.py:291:36 @[ cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py:39:25 ] + mov.b64 {_, %r27}, %rd36; + cvt.u32.u64 %r28, %rd36; + shfl.sync.bfly.b32 %r29, %r28, 2, 31, -1; + shfl.sync.bfly.b32 %r30, %r27, 2, 31, -1; + cvt.u64.u32 %rd37, %r29; + cvt.u64.u32 %rd38, %r30; + shl.b64 %rd39, %rd38, 32; + or.b64 %rd40, %rd37, %rd39; + .loc 2 261 15 // standard.py:261:15 @[ cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py:39:25 ] + add.s64 %rd41, %rd36, %rd40; + .loc 2 291 36 // standard.py:291:36 @[ cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py:39:25 ] + mov.b64 {_, %r31}, %rd41; + cvt.u32.u64 %r32, %rd41; + shfl.sync.bfly.b32 %r33, %r32, 1, 31, -1; + shfl.sync.bfly.b32 %r34, %r31, 1, 31, -1; + cvt.u64.u32 %rd42, %r33; + cvt.u64.u32 %rd43, %r34; + shl.b64 %rd44, %rd43, 32; + or.b64 %rd45, %rd42, %rd44; + .loc 2 261 15 // standard.py:261:15 @[ cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py:39:25 ] + add.s64 %rd6, %rd41, %rd45; +$L__tmp2: + .loc 1 41 19 // cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py:41:19 + and.b64 %rd46, %rd14, -4294967296; + setp.ne.b64 %p7, %rd46, 0; + @%p7 bra $L__BB0_8; + bra.uni $L__BB0_7; +$L__BB0_8: + div.s64 %rd59, %rd1, %rd14; + bra.uni $L__BB0_9; +$L__BB0_7: + cvt.u32.u64 %r35, %rd14; + div.u32 %r37, %r42, %r35; + cvt.u64.u32 %rd59, %r37; +$L__BB0_9: + .loc 1 23 21 // cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py:23:21 + setp.lt.s32 %p9, %r42, %r10; + .loc 1 40 19 // cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py:40:19 + mul.lo.s64 %rd48, %rd59, %rd14; + sub.s64 %rd49, %rd1, %rd48; + .loc 1 42 19 // cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py:42:19 + 
cvt.u32.u64 %r38, %rd6; + .loc 1 43 49 // cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py:43:49 + setp.lt.s64 %p10, %rd14, 2; + .loc 1 43 75 // cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py:43:75 + setp.gt.s64 %p11, %rd14, 1; + .loc 1 43 66 // cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py:43:66 + selp.b64 %rd50, %rd14, 0, %p11; + .loc 1 43 0 // cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py:43 + selp.b64 %rd51, 1, 0, %p10; + .loc 1 43 57 // cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py:43:57 + add.s64 %rd52, %rd50, %rd51; + .loc 1 43 34 // cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py:43:34 + mul.lo.s64 %rd53, %rd59, %rd52; + .loc 1 43 25 // cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py:43:25 + shl.b64 %rd54, %rd49, 2; + add.s64 %rd55, %rd12, %rd54; + shl.b64 %rd56, %rd53, 2; + add.s64 %rd47, %rd55, %rd56; + .loc 1 43 88 // cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py:43:88 + and.b32 %r40, %r2, 63; + setp.eq.b32 %p12, %r40, 0; + and.pred %p8, %p12, %p9; + // begin inline asm + @%p8 st.global.b32 [ %rd47 + 0 ], { %r38 }; + // end inline asm + .loc 1 43 4 // cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py:43:4 + ret; +$L__tmp3: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // 
DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 216 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0xd1 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 98 +.b8 51 +.b8 121 +.b8 120 +.b8 116 +.b8 121 +.b8 98 +.b8 102 +.b8 55 +.b8 52 +.b8 52 +.b8 115 +.b8 119 +.b8 109 +.b8 99 +.b8 112 +.b8 101 +.b8 50 +.b8 108 +.b8 118 +.b8 122 +.b8 55 +.b8 117 +.b8 120 +.b8 109 +.b8 102 +.b8 103 +.b8 108 +.b8 53 +.b8 97 +.b8 54 +.b8 107 +.b8 116 +.b8 52 +.b8 117 +.b8 112 +.b8 50 +.b8 99 +.b8 109 +.b8 120 +.b8 102 +.b8 51 +.b8 54 +.b8 121 +.b8 51 +.b8 114 +.b8 121 +.b8 97 +.b8 121 +.b8 97 +.b8 109 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 
97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 98 +.b8 51 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x22 DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 95 +.b8 116 +.b8 111 +.b8 95 +.b8 99 +.b8 111 +.b8 112 +.b8 121 +.b8 95 +.b8 115 +.b8 117 +.b8 109 +.b8 95 +.b8 50 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0xad:0x2e DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0xc2:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp2 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 39 // DW_AT_call_line +.b8 25 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/HXTWITM43AJTYBYDCMJ3SBODS23DJOLLO4SHKJFN4YWQSGNDK4FQ/triton_red_fused__to_copy_sum_2.source b/SpecForge-ext/cache/compiled_kernels/triton/7/HXTWITM43AJTYBYDCMJ3SBODS23DJOLLO4SHKJFN4YWQSGNDK4FQ/triton_red_fused__to_copy_sum_2.source new file mode 100644 index 0000000000000000000000000000000000000000..78285cf7df3d595970efd04cdc98c6f0abfa1a3a --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/HXTWITM43AJTYBYDCMJ3SBODS23DJOLLO4SHKJFN4YWQSGNDK4FQ/triton_red_fused__to_copy_sum_2.source @@ -0,0 +1,180 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":18:0) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0) +#loc40 = loc(unknown) +#loc43 = 
loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0) +#loc47 = loc("in_ptr0"(#loc)) +#loc48 = loc("out_ptr1"(#loc)) +#loc49 = loc("ks0"(#loc)) +#loc50 = loc("ks1"(#loc)) +#loc51 = loc("xnumel"(#loc)) +#loc52 = loc("r0_numel"(#loc)) +#loc79 = loc("input"(#loc38)) +#loc80 = loc("a"(#loc43)) +#loc81 = loc("b"(#loc43)) +module { + tt.func public @triton_red_fused__to_copy_sum_2(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 loc("r0_numel"(#loc))) attributes {noinline = false} { + %xoffset = tt.get_program_id x : i32 loc(#loc53) + %xoffset_0 = arith.constant 1 : i32 loc(#loc54) + %xoffset_1 = arith.constant 1 : i32 loc(#loc54) + %xoffset_2 = arith.muli %xoffset, %xoffset_1 : i32 loc(#loc54) + %xindex = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc55) + %xindex_3 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc56) + %xindex_4 = tt.splat %xoffset_2 : i32 -> tensor<1x1xi32> loc(#loc57) + %xindex_5 = arith.addi %xindex_4, %xindex_3 : tensor<1x1xi32> loc(#loc57) + %xmask = tt.splat %xnumel : i32 -> tensor<1x1xi32> loc(#loc58) + %xmask_6 = arith.cmpi slt, %xindex_5, %xmask : tensor<1x1xi32> loc(#loc58) + %r0_base = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32> loc(#loc59) + %r0_base_7 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<16xi32> -> tensor<1x16xi32> loc(#loc60) + %_tmp3 = arith.constant 0 : i64 loc(#loc61) + %_tmp3_8 = arith.constant dense<0> : tensor<1x16xi64> loc(#loc61) + %c0_i32 = arith.constant 0 : i32 loc(#loc10) + %c16_i32 = arith.constant 16 : i32 loc(#loc10) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc10) + %1 = arith.bitcast %r0_numel : i32 to i32 loc(#loc10) + %2 = arith.bitcast %c16_i32 : i32 to i32 loc(#loc10) + %3 = ub.poison : 
i32 loc(#loc10) + %_tmp3_9 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp3_18 = %_tmp3_8) -> (tensor<1x16xi64>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x16xi32> loc(#loc63) + %r0_index_19 = arith.addi %r0_index, %r0_base_7 : tensor<1x16xi32> loc(#loc63) + %r0_mask = tt.splat %r0_numel : i32 -> tensor<1x16xi32> loc(#loc64) + %r0_mask_20 = arith.cmpi slt, %r0_index_19, %r0_mask : tensor<1x16xi32> loc(#loc64) + %tmp0 = arith.extsi %xindex_5 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc65) + %tmp0_21 = tt.splat %ks0 : i64 -> tensor<1x1xi64> loc(#loc65) + %tmp0_22 = arith.muli %tmp0_21, %tmp0 : tensor<1x1xi64> loc(#loc65) + %tmp0_23 = arith.extsi %r0_index_19 : tensor<1x16xi32> to tensor<1x16xi64> loc(#loc66) + %tmp0_24 = tt.broadcast %tmp0_22 : tensor<1x1xi64> -> tensor<1x16xi64> loc(#loc66) + %tmp0_25 = arith.addi %tmp0_23, %tmp0_24 : tensor<1x16xi64> loc(#loc66) + %tmp0_26 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x16x!tt.ptr> loc(#loc67) + %tmp0_27 = tt.addptr %tmp0_26, %tmp0_25 : tensor<1x16x!tt.ptr>, tensor<1x16xi64> loc(#loc67) + %tmp0_28 = tt.broadcast %xmask_6 : tensor<1x1xi1> -> tensor<1x16xi1> loc(#loc68) + %tmp0_29 = arith.andi %r0_mask_20, %tmp0_28 : tensor<1x16xi1> loc(#loc68) + %tmp0_30 = arith.constant 0.000000e+00 : f32 loc(#loc69) + %tmp0_31 = arith.constant dense<0.000000e+00> : tensor<1x16xf32> loc(#loc69) + %tmp0_32 = arith.fptosi %tmp0_31 : tensor<1x16xf32> to tensor<1x16xi32> loc(#loc69) + %tmp0_33 = tt.load %tmp0_27, %tmp0_29, %tmp0_32 evictionPolicy = evict_first : tensor<1x16x!tt.ptr> loc(#loc69) + %tmp1 = arith.extsi %tmp0_33 : tensor<1x16xi32> to tensor<1x16xi64> loc(#loc70) + %tmp4 = arith.addi %_tmp3_18, %tmp1 : tensor<1x16xi64> loc(#loc71) + %_tmp3_34 = tt.broadcast %xmask_6 : tensor<1x1xi1> -> tensor<1x16xi1> loc(#loc72) + %_tmp3_35 = arith.andi %r0_mask_20, %_tmp3_34 : tensor<1x16xi1> loc(#loc72) + %_tmp3_36 = arith.select %_tmp3_35, %tmp4, %_tmp3_18 : tensor<1x16xi1>, tensor<1x16xi64> loc(#loc73) + scf.yield 
%_tmp3_36 : tensor<1x16xi64> loc(#loc22) + } loc(#loc62) + %tmp3 = tt.call @"triton.language.standard.sum__i64S1_16S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp3_9) : (tensor<1x16xi64>) -> tensor<1xi64> loc(#loc74) + %tmp3_10 = tt.expand_dims %tmp3 {axis = 1 : i32} : tensor<1xi64> -> tensor<1x1xi64> loc(#loc75) + %x2 = arith.extsi %xindex_5 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc76) + %x2_11 = tt.splat %ks1 : i64 -> tensor<1x1xi64> loc(#loc76) + %x2_12 = arith.remsi %x2, %x2_11 : tensor<1x1xi64> loc(#loc76) + %x3 = arith.extsi %xindex_5 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc77) + %x3_13 = tt.splat %ks1 : i64 -> tensor<1x1xi64> loc(#loc77) + %x3_14 = arith.divsi %x3, %x3_13 : tensor<1x1xi64> loc(#loc77) + %tmp5 = arith.trunci %tmp3_10 : tensor<1x1xi64> to tensor<1x1xi32> loc(#loc78) + %c1_i32 = arith.constant 1 : i32 loc(#loc28) + %4 = arith.extsi %c1_i32 : i32 to i64 loc(#loc28) + %5 = arith.cmpi sge, %4, %ks1 : i64 loc(#loc28) + %c1_i32_15 = arith.constant 1 : i32 loc(#loc29) + %c1_i32_16 = arith.constant 1 : i32 loc(#loc29) + %6 = arith.extui %5 : i1 to i32 loc(#loc29) + %7 = arith.muli %c1_i32_16, %6 : i32 loc(#loc29) + %c1_i32_17 = arith.constant 1 : i32 loc(#loc30) + %8 = arith.extsi %c1_i32_17 : i32 to i64 loc(#loc30) + %9 = arith.cmpi sgt, %ks1, %8 : i64 loc(#loc30) + %10 = arith.extui %9 : i1 to i64 loc(#loc31) + %11 = arith.muli %ks1, %10 : i64 loc(#loc31) + %12 = arith.extsi %7 : i32 to i64 loc(#loc32) + %13 = arith.addi %12, %11 : i64 loc(#loc32) + %14 = tt.splat %13 : i64 -> tensor<1x1xi64> loc(#loc33) + %15 = arith.muli %x3_14, %14 : tensor<1x1xi64> loc(#loc33) + %16 = arith.addi %x2_12, %15 : tensor<1x1xi64> loc(#loc34) + %17 = tt.splat %out_ptr1 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc35) + %18 = tt.addptr %17, %16 : tensor<1x1x!tt.ptr>, tensor<1x1xi64> loc(#loc35) + tt.store %18, %tmp5, %xmask_6 : tensor<1x1x!tt.ptr> loc(#loc36) + tt.return loc(#loc37) + } loc(#loc) + tt.func private 
@"triton.language.standard.sum__i64S1_16S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<1x16xi64> loc("input"(#loc38))) -> tensor<1xi64> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i64 loc(unknown), %arg2: i64 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i64_i64__(%arg1, %arg2) : (i64, i64) -> i64 loc(#loc39) + tt.reduce.return %2 : i64 loc(#loc39) + }) : (tensor<1x16xi64>) -> tensor<1xi64> loc(#loc39) + tt.return %0 : tensor<1xi64> loc(#loc41) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1xi64> loc(#loc42) + tt.return %1 : tensor<1xi64> loc(#loc42) + } loc(#loc38) + tt.func private @triton.language.standard._sum_combine__i64_i64__(%a: i64 loc("a"(#loc43)), %b: i64 loc("b"(#loc43))) -> i64 attributes {noinline = false} { + %0 = arith.addi %a, %b : i64 loc(#loc44) + tt.return %0 : i64 loc(#loc45) + ^bb1: // no predecessors + %1 = ub.poison : i64 loc(#loc46) + tt.return %1 : i64 loc(#loc46) + } loc(#loc43) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":21:28) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":21:33) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":22:36) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":22:44) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":22:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":23:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":24:27) +#loc8 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":24:37) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":27:43) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":28:40) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":29:31) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":30:29) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":34:45) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":34:41) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":34:34) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":34:60) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":34:50) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":35:23) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":37:23) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":38:35) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":38:48) +#loc22 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":38:8) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":39:25) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":39:28) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":40:19) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":41:19) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":42:19) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":43:49) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":43:41) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":43:75) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":43:66) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":43:57) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":43:34) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":43:30) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":43:25) +#loc36 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":43:88) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":43:4) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4) +#loc53 = loc("xoffset"(#loc1)) +#loc54 = loc("xoffset"(#loc2)) +#loc55 = loc("xindex"(#loc3)) +#loc56 = loc("xindex"(#loc4)) +#loc57 = loc("xindex"(#loc5)) +#loc58 = loc("xmask"(#loc6)) +#loc59 = loc("r0_base"(#loc7)) +#loc60 = loc("r0_base"(#loc8)) +#loc61 = loc("_tmp3"(#loc9)) +#loc62 = loc("_tmp3"(#loc10)) +#loc63 = loc("r0_index"(#loc11)) +#loc64 = loc("r0_mask"(#loc12)) +#loc65 = loc("tmp0"(#loc13)) +#loc66 = loc("tmp0"(#loc14)) +#loc67 = loc("tmp0"(#loc15)) +#loc68 = loc("tmp0"(#loc16)) +#loc69 = loc("tmp0"(#loc17)) +#loc70 = loc("tmp1"(#loc18)) +#loc71 = loc("tmp4"(#loc19)) +#loc72 = loc("_tmp3"(#loc20)) +#loc73 = loc("_tmp3"(#loc21)) +#loc74 = loc("tmp3"(#loc23)) +#loc75 = loc("tmp3"(#loc24)) +#loc76 = loc("x2"(#loc25)) +#loc77 = loc("x3"(#loc26)) +#loc78 = loc("tmp5"(#loc27)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/HXTWITM43AJTYBYDCMJ3SBODS23DJOLLO4SHKJFN4YWQSGNDK4FQ/triton_red_fused__to_copy_sum_2.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/7/HXTWITM43AJTYBYDCMJ3SBODS23DJOLLO4SHKJFN4YWQSGNDK4FQ/triton_red_fused__to_copy_sum_2.ttgir new file mode 100644 index 
0000000000000000000000000000000000000000..3b7bed80d09c14eb534d915eda9d0049baf3dc1d --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/HXTWITM43AJTYBYDCMJ3SBODS23DJOLLO4SHKJFN4YWQSGNDK4FQ/triton_red_fused__to_copy_sum_2.ttgir @@ -0,0 +1,123 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [2, 16], warpsPerCTA = [2, 1], order = [1, 0]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 2], order = [0, 1]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":18:0) +#loc1 = loc(unknown) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":39:25) +#loc34 = loc("in_ptr0"(#loc)) +#loc35 = loc("out_ptr1"(#loc)) +#loc36 = loc("ks0"(#loc)) +#loc37 = loc("ks1"(#loc)) +#loc38 = loc("xnumel"(#loc)) +#loc39 = loc("r0_numel"(#loc)) +#loc54 = loc("tmp3"(#loc18)) +#loc63 = loc(callsite(#loc1 at #loc54)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 2 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused__to_copy_sum_2(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0> : tensor<1x16xi64, #blocked> loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %c16_i32 = arith.constant 16 : i32 loc(#loc1) + %cst_0 = arith.constant dense<0> : tensor<1x16xi32, #blocked> loc(#loc1) + %c1_i64 = arith.constant 1 : i64 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc40) + %xmask = arith.cmpi slt, %xoffset, %xnumel : i32 loc(#loc41) + %r0_base = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, 
#ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc42) + %r0_base_1 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x16xi32, #blocked> loc(#loc42) + %r0_mask = tt.splat %r0_numel : i32 -> tensor<1x16xi32, #blocked> loc(#loc43) + %tmp0 = arith.extsi %xoffset : i32 to i64 loc(#loc44) + %tmp0_2 = arith.muli %ks0, %tmp0 : i64 loc(#loc44) + %tmp0_3 = tt.splat %tmp0_2 : i64 -> tensor<1x16xi64, #blocked> loc(#loc60) + %tmp0_4 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x16x!tt.ptr, #blocked> loc(#loc46) + %tmp0_5 = tt.splat %xmask : i1 -> tensor<1x16xi1, #blocked> loc(#loc61) + %_tmp3 = scf.for %r0_offset = %c0_i32 to %r0_numel step %c16_i32 iter_args(%_tmp3_7 = %cst) -> (tensor<1x16xi64, #blocked>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x16xi32, #blocked> loc(#loc49) + %r0_index_8 = arith.addi %r0_index, %r0_base_1 : tensor<1x16xi32, #blocked> loc(#loc49) + %r0_mask_9 = arith.cmpi slt, %r0_index_8, %r0_mask : tensor<1x16xi32, #blocked> loc(#loc43) + %tmp0_10 = arith.extsi %r0_index_8 : tensor<1x16xi32, #blocked> to tensor<1x16xi64, #blocked> loc(#loc45) + %tmp0_11 = arith.addi %tmp0_10, %tmp0_3 : tensor<1x16xi64, #blocked> loc(#loc45) + %tmp0_12 = tt.addptr %tmp0_4, %tmp0_11 : tensor<1x16x!tt.ptr, #blocked>, tensor<1x16xi64, #blocked> loc(#loc46) + %tmp0_13 = arith.andi %r0_mask_9, %tmp0_5 : tensor<1x16xi1, #blocked> loc(#loc47) + %tmp0_14 = tt.load %tmp0_12, %tmp0_13, %cst_0 evictionPolicy = evict_first : tensor<1x16x!tt.ptr, #blocked> loc(#loc50) + %tmp1 = arith.extsi %tmp0_14 : tensor<1x16xi32, #blocked> to tensor<1x16xi64, #blocked> loc(#loc51) + %tmp4 = arith.addi %_tmp3_7, %tmp1 : tensor<1x16xi64, #blocked> loc(#loc52) + %_tmp3_15 = arith.select %tmp0_13, %tmp4, %_tmp3_7 : tensor<1x16xi1, #blocked>, tensor<1x16xi64, #blocked> loc(#loc53) + scf.yield %_tmp3_15 : tensor<1x16xi64, #blocked> loc(#loc16) + } loc(#loc48) + %tmp3 = "tt.reduce"(%_tmp3) <{axis = 1 : i32}> ({ + ^bb0(%tmp3_7: i64 
loc(callsite(#loc1 at #loc54)), %tmp3_8: i64 loc(callsite(#loc1 at #loc54))): + %tmp3_9 = arith.addi %tmp3_7, %tmp3_8 : i64 loc(#loc64) + tt.reduce.return %tmp3_9 : i64 loc(#loc62) + }) : (tensor<1x16xi64, #blocked>) -> tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc62) + %0 = ttg.convert_layout %tmp3 : tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc20) + %tmp3_6 = tt.expand_dims %0 {axis = 1 : i32} : tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<1x1xi64, #blocked1> loc(#loc55) + %x2 = arith.remsi %tmp0, %ks1 : i64 loc(#loc56) + %x3 = arith.divsi %tmp0, %ks1 : i64 loc(#loc57) + %tmp5 = arith.trunci %tmp3_6 : tensor<1x1xi64, #blocked1> to tensor<1x1xi32, #blocked1> loc(#loc58) + %1 = arith.cmpi sle, %ks1, %c1_i64 : i64 loc(#loc25) + %2 = arith.cmpi sgt, %ks1, %c1_i64 : i64 loc(#loc26) + %3 = arith.extui %2 : i1 to i64 loc(#loc27) + %4 = arith.muli %ks1, %3 : i64 loc(#loc27) + %5 = arith.extui %1 : i1 to i64 loc(#loc59) + %6 = arith.addi %5, %4 : i64 loc(#loc28) + %7 = arith.muli %x3, %6 : i64 loc(#loc30) + %8 = arith.addi %x2, %7 : i64 loc(#loc31) + %9 = tt.addptr %out_ptr1, %8 : !tt.ptr, i64 loc(#loc32) + %10 = tt.splat %9 : !tt.ptr -> tensor<1x1x!tt.ptr, #blocked1> loc(#loc20) + %11 = tt.splat %xmask : i1 -> tensor<1x1xi1, #blocked1> loc(#loc20) + tt.store %10, %tmp5, %11 : tensor<1x1x!tt.ptr, #blocked1> loc(#loc20) + tt.return loc(#loc33) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":21:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":23:21) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":24:37) +#loc5 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":30:29) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":34:45) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":34:41) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":34:34) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":34:60) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":28:40) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":29:31) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":34:50) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":35:23) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":37:23) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":38:48) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":38:8) +#loc17 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc19 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":43:88) +#loc21 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":39:28) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":40:19) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":41:19) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":42:19) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":43:49) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":43:75) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":43:66) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":43:57) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":43:41) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":43:34) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":43:30) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":43:25) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":43:4) +#loc40 = loc("xoffset"(#loc2)) +#loc41 = loc("xmask"(#loc3)) +#loc42 = loc("r0_base"(#loc4)) +#loc43 = loc("r0_mask"(#loc5)) +#loc44 = loc("tmp0"(#loc6)) +#loc45 = loc("tmp0"(#loc7)) +#loc46 = loc("tmp0"(#loc8)) +#loc47 = 
loc("tmp0"(#loc9)) +#loc48 = loc("_tmp3"(#loc10)) +#loc49 = loc("r0_index"(#loc11)) +#loc50 = loc("tmp0"(#loc12)) +#loc51 = loc("tmp1"(#loc13)) +#loc52 = loc("tmp4"(#loc14)) +#loc53 = loc("_tmp3"(#loc15)) +#loc55 = loc("tmp3"(#loc21)) +#loc56 = loc("x2"(#loc22)) +#loc57 = loc("x3"(#loc23)) +#loc58 = loc("tmp5"(#loc24)) +#loc59 = loc(fused[#loc28, #loc29]) +#loc60 = loc(fused[#loc45, #loc44]) +#loc61 = loc(fused[#loc47, #loc41]) +#loc62 = loc(callsite(#loc17 at #loc54)) +#loc64 = loc(callsite(#loc19 at #loc62)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/HXTWITM43AJTYBYDCMJ3SBODS23DJOLLO4SHKJFN4YWQSGNDK4FQ/triton_red_fused__to_copy_sum_2.ttir b/SpecForge-ext/cache/compiled_kernels/triton/7/HXTWITM43AJTYBYDCMJ3SBODS23DJOLLO4SHKJFN4YWQSGNDK4FQ/triton_red_fused__to_copy_sum_2.ttir new file mode 100644 index 0000000000000000000000000000000000000000..f58031fdc6422a2866a707b10a8e46a46e1f2d27 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/HXTWITM43AJTYBYDCMJ3SBODS23DJOLLO4SHKJFN4YWQSGNDK4FQ/triton_red_fused__to_copy_sum_2.ttir @@ -0,0 +1,125 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":18:0) +#loc1 = loc(unknown) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":39:25) +#loc36 = loc("in_ptr0"(#loc)) +#loc37 = loc("out_ptr1"(#loc)) +#loc38 = loc("ks0"(#loc)) +#loc39 = loc("ks1"(#loc)) +#loc40 = loc("xnumel"(#loc)) +#loc41 = loc("r0_numel"(#loc)) +#loc58 = loc("tmp3"(#loc20)) +#loc67 = loc(callsite(#loc1 at #loc58)) +module { + tt.func public @triton_red_fused__to_copy_sum_2(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 loc("r0_numel"(#loc))) attributes {noinline = false} { + %c1_i64 
= arith.constant 1 : i64 loc(#loc1) + %cst = arith.constant dense<0> : tensor<1x16xi32> loc(#loc1) + %c16_i32 = arith.constant 16 : i32 loc(#loc2) + %c0_i32 = arith.constant 0 : i32 loc(#loc2) + %_tmp3 = arith.constant dense<0> : tensor<1x16xi64> loc(#loc42) + %xoffset = tt.get_program_id x : i32 loc(#loc43) + %xmask = arith.cmpi slt, %xoffset, %xnumel : i32 loc(#loc44) + %xmask_0 = tt.splat %xmask : i1 -> tensor<1x1xi1> loc(#loc44) + %r0_base = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32> loc(#loc45) + %r0_base_1 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<16xi32> -> tensor<1x16xi32> loc(#loc46) + %_tmp3_2 = scf.for %r0_offset = %c0_i32 to %r0_numel step %c16_i32 iter_args(%_tmp3_5 = %_tmp3) -> (tensor<1x16xi64>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x16xi32> loc(#loc48) + %r0_index_6 = arith.addi %r0_index, %r0_base_1 : tensor<1x16xi32> loc(#loc48) + %r0_mask = tt.splat %r0_numel : i32 -> tensor<1x16xi32> loc(#loc49) + %r0_mask_7 = arith.cmpi slt, %r0_index_6, %r0_mask : tensor<1x16xi32> loc(#loc49) + %tmp0 = arith.extsi %xoffset : i32 to i64 loc(#loc50) + %tmp0_8 = arith.muli %ks0, %tmp0 : i64 loc(#loc50) + %tmp0_9 = arith.extsi %r0_index_6 : tensor<1x16xi32> to tensor<1x16xi64> loc(#loc51) + %tmp0_10 = tt.splat %tmp0_8 : i64 -> tensor<1x16xi64> loc(#loc64) + %tmp0_11 = arith.addi %tmp0_9, %tmp0_10 : tensor<1x16xi64> loc(#loc51) + %tmp0_12 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x16x!tt.ptr> loc(#loc52) + %tmp0_13 = tt.addptr %tmp0_12, %tmp0_11 : tensor<1x16x!tt.ptr>, tensor<1x16xi64> loc(#loc52) + %tmp0_14 = tt.splat %xmask : i1 -> tensor<1x16xi1> loc(#loc65) + %tmp0_15 = arith.andi %r0_mask_7, %tmp0_14 : tensor<1x16xi1> loc(#loc53) + %tmp0_16 = tt.load %tmp0_13, %tmp0_15, %cst evictionPolicy = evict_first : tensor<1x16x!tt.ptr> loc(#loc54) + %tmp1 = arith.extsi %tmp0_16 : tensor<1x16xi32> to tensor<1x16xi64> loc(#loc55) + %tmp4 = arith.addi %_tmp3_5, %tmp1 : tensor<1x16xi64> loc(#loc56) + %_tmp3_17 = 
arith.select %tmp0_15, %tmp4, %_tmp3_5 : tensor<1x16xi1>, tensor<1x16xi64> loc(#loc57) + scf.yield %_tmp3_17 : tensor<1x16xi64> loc(#loc18) + } loc(#loc47) + %tmp3 = "tt.reduce"(%_tmp3_2) <{axis = 1 : i32}> ({ + ^bb0(%tmp3_5: i64 loc(callsite(#loc1 at #loc58)), %tmp3_6: i64 loc(callsite(#loc1 at #loc58))): + %tmp3_7 = arith.addi %tmp3_5, %tmp3_6 : i64 loc(#loc68) + tt.reduce.return %tmp3_7 : i64 loc(#loc66) + }) : (tensor<1x16xi64>) -> tensor<1xi64> loc(#loc66) + %tmp3_3 = tt.expand_dims %tmp3 {axis = 1 : i32} : tensor<1xi64> -> tensor<1x1xi64> loc(#loc59) + %x2 = arith.extsi %xoffset : i32 to i64 loc(#loc60) + %x2_4 = arith.remsi %x2, %ks1 : i64 loc(#loc60) + %x3 = arith.divsi %x2, %ks1 : i64 loc(#loc61) + %tmp5 = arith.trunci %tmp3_3 : tensor<1x1xi64> to tensor<1x1xi32> loc(#loc62) + %0 = arith.cmpi sle, %ks1, %c1_i64 : i64 loc(#loc26) + %1 = arith.cmpi sgt, %ks1, %c1_i64 : i64 loc(#loc27) + %2 = arith.extui %1 : i1 to i64 loc(#loc28) + %3 = arith.muli %ks1, %2 : i64 loc(#loc28) + %4 = arith.extui %0 : i1 to i64 loc(#loc63) + %5 = arith.addi %4, %3 : i64 loc(#loc29) + %6 = arith.muli %x3, %5 : i64 loc(#loc31) + %7 = arith.addi %x2_4, %6 : i64 loc(#loc32) + %8 = tt.addptr %out_ptr1, %7 : !tt.ptr, i64 loc(#loc33) + %9 = tt.splat %8 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc33) + tt.store %9, %tmp5, %xmask_0 : tensor<1x1x!tt.ptr> loc(#loc34) + tt.return loc(#loc35) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":28:40) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":27:43) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":21:28) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":23:21) +#loc6 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":24:27) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":24:37) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":29:31) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":30:29) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":34:45) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":34:41) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":34:34) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":34:60) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":34:50) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":35:23) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":37:23) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":38:48) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":38:8) +#loc19 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc21 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc22 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":39:28) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":40:19) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":41:19) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":42:19) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":43:49) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":43:75) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":43:66) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":43:57) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":43:41) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":43:34) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":43:30) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":43:25) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":43:88) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/b3/cb3yxtybf744swmcpe2lvz7uxmfgl5a6kt4up2cmxf36y3ryayam.py":43:4) +#loc42 = loc("_tmp3"(#loc3)) +#loc43 = loc("xoffset"(#loc4)) +#loc44 = loc("xmask"(#loc5)) 
+#loc45 = loc("r0_base"(#loc6)) +#loc46 = loc("r0_base"(#loc7)) +#loc47 = loc("_tmp3"(#loc2)) +#loc48 = loc("r0_index"(#loc8)) +#loc49 = loc("r0_mask"(#loc9)) +#loc50 = loc("tmp0"(#loc10)) +#loc51 = loc("tmp0"(#loc11)) +#loc52 = loc("tmp0"(#loc12)) +#loc53 = loc("tmp0"(#loc13)) +#loc54 = loc("tmp0"(#loc14)) +#loc55 = loc("tmp1"(#loc15)) +#loc56 = loc("tmp4"(#loc16)) +#loc57 = loc("_tmp3"(#loc17)) +#loc59 = loc("tmp3"(#loc22)) +#loc60 = loc("x2"(#loc23)) +#loc61 = loc("x3"(#loc24)) +#loc62 = loc("tmp5"(#loc25)) +#loc63 = loc(fused[#loc29, #loc30]) +#loc64 = loc(fused[#loc51, #loc50]) +#loc65 = loc(fused[#loc53, #loc44]) +#loc66 = loc(callsite(#loc19 at #loc58)) +#loc68 = loc(callsite(#loc21 at #loc66)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/IK5RT3JGLTF5PMMUH32NIWB2GXNU6R6CGIZSCRHU3I65YM226KDA/__grp__triton_poi_fused__to_copy_6.json b/SpecForge-ext/cache/compiled_kernels/triton/7/IK5RT3JGLTF5PMMUH32NIWB2GXNU6R6CGIZSCRHU3I65YM226KDA/__grp__triton_poi_fused__to_copy_6.json new file mode 100644 index 0000000000000000000000000000000000000000..91640192862189bbb39c8de4e1165ec71170caac --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/IK5RT3JGLTF5PMMUH32NIWB2GXNU6R6CGIZSCRHU3I65YM226KDA/__grp__triton_poi_fused__to_copy_6.json @@ -0,0 +1 @@ +{"child_paths": {"triton_poi_fused__to_copy_6.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/IK5RT3JGLTF5PMMUH32NIWB2GXNU6R6CGIZSCRHU3I65YM226KDA/triton_poi_fused__to_copy_6.source", "triton_poi_fused__to_copy_6.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/IK5RT3JGLTF5PMMUH32NIWB2GXNU6R6CGIZSCRHU3I65YM226KDA/triton_poi_fused__to_copy_6.ttir", "triton_poi_fused__to_copy_6.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/IK5RT3JGLTF5PMMUH32NIWB2GXNU6R6CGIZSCRHU3I65YM226KDA/triton_poi_fused__to_copy_6.ttgir", "triton_poi_fused__to_copy_6.llir": 
"/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/IK5RT3JGLTF5PMMUH32NIWB2GXNU6R6CGIZSCRHU3I65YM226KDA/triton_poi_fused__to_copy_6.llir", "triton_poi_fused__to_copy_6.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/IK5RT3JGLTF5PMMUH32NIWB2GXNU6R6CGIZSCRHU3I65YM226KDA/triton_poi_fused__to_copy_6.ptx", "triton_poi_fused__to_copy_6.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/IK5RT3JGLTF5PMMUH32NIWB2GXNU6R6CGIZSCRHU3I65YM226KDA/triton_poi_fused__to_copy_6.cubin", "triton_poi_fused__to_copy_6.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/IK5RT3JGLTF5PMMUH32NIWB2GXNU6R6CGIZSCRHU3I65YM226KDA/triton_poi_fused__to_copy_6.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/IK5RT3JGLTF5PMMUH32NIWB2GXNU6R6CGIZSCRHU3I65YM226KDA/triton_poi_fused__to_copy_6.cubin b/SpecForge-ext/cache/compiled_kernels/triton/7/IK5RT3JGLTF5PMMUH32NIWB2GXNU6R6CGIZSCRHU3I65YM226KDA/triton_poi_fused__to_copy_6.cubin new file mode 100644 index 0000000000000000000000000000000000000000..f1c4ba697fc3646d1551c00ca937717f3ac300cf Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/7/IK5RT3JGLTF5PMMUH32NIWB2GXNU6R6CGIZSCRHU3I65YM226KDA/triton_poi_fused__to_copy_6.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/IK5RT3JGLTF5PMMUH32NIWB2GXNU6R6CGIZSCRHU3I65YM226KDA/triton_poi_fused__to_copy_6.json b/SpecForge-ext/cache/compiled_kernels/triton/7/IK5RT3JGLTF5PMMUH32NIWB2GXNU6R6CGIZSCRHU3I65YM226KDA/triton_poi_fused__to_copy_6.json new file mode 100644 index 0000000000000000000000000000000000000000..29ce4d662ad54d9abe8962c008aade5f52f52936 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/IK5RT3JGLTF5PMMUH32NIWB2GXNU6R6CGIZSCRHU3I65YM226KDA/triton_poi_fused__to_copy_6.json @@ -0,0 +1 @@ +{"hash": "42bb19ed265ccbd7b1943ef4d4583a35db4f47c232332144f4da3ddc335af286", "target": {"backend": "cuda", "arch": 90, "warp_size": 
32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused__to_copy_6"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/IK5RT3JGLTF5PMMUH32NIWB2GXNU6R6CGIZSCRHU3I65YM226KDA/triton_poi_fused__to_copy_6.llir b/SpecForge-ext/cache/compiled_kernels/triton/7/IK5RT3JGLTF5PMMUH32NIWB2GXNU6R6CGIZSCRHU3I65YM226KDA/triton_poi_fused__to_copy_6.llir new file mode 100644 index 0000000000000000000000000000000000000000..d174c8bb3b49c5416f3d8186484e6e3adef7661b --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/IK5RT3JGLTF5PMMUH32NIWB2GXNU6R6CGIZSCRHU3I65YM226KDA/triton_poi_fused__to_copy_6.llir @@ -0,0 +1,89 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +; Function Attrs: nounwind +define ptx_kernel void @triton_poi_fused__to_copy_6(ptr addrspace(1) %0, ptr addrspace(1) %1, i64 %2, i64 %3, i64 %4, i32 %5, ptr addrspace(1) readnone captures(none) %6, ptr addrspace(1) readnone captures(none) %7) 
local_unnamed_addr #0 !dbg !4 { + %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %10 = shl i32 %9, 7, !dbg !8 + %11 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %12 = and i32 %11, 127, !dbg !9 + %13 = or disjoint i32 %10, %12, !dbg !10 + %14 = icmp slt i32 %13, %5, !dbg !11 + %15 = sext i32 %13 to i64, !dbg !12 + %.frozen = freeze i64 %2, !dbg !13 + %16 = sdiv i64 %15, %.frozen, !dbg !13 + %17 = mul i64 %16, %.frozen, !dbg !12 + %.decomposed = sub i64 %15, %17, !dbg !12 + %18 = srem i64 %16, %3, !dbg !14 + %19 = sdiv i64 %15, %4, !dbg !15 + %20 = insertelement <2 x i64> poison, i64 %3, i64 0, !dbg !16 + %21 = insertelement <2 x i64> %20, i64 %2, i64 1, !dbg !16 + %22 = icmp slt <2 x i64> %21, splat (i64 2), !dbg !16 + %23 = icmp sgt <2 x i64> %21, splat (i64 1), !dbg !17 + %24 = select <2 x i1> %23, <2 x i64> %21, <2 x i64> zeroinitializer, !dbg !18 + %25 = zext <2 x i1> %22 to <2 x i64>, !dbg !19 + %26 = add <2 x i64> %24, %25, !dbg !20 + %27 = extractelement <2 x i64> %26, i64 0, !dbg !21 + %28 = mul i64 %.decomposed, %27, !dbg !22 + %29 = extractelement <2 x i64> %26, i64 1, !dbg !21 + %30 = mul i64 %27, %29, !dbg !21 + %31 = mul i64 %30, %19, !dbg !23 + %32 = getelementptr i64, ptr addrspace(1) %0, i64 %18, !dbg !24 + %33 = getelementptr i64, ptr addrspace(1) %32, i64 %28, !dbg !24 + %34 = getelementptr i64, ptr addrspace(1) %33, i64 %31, !dbg !24 + %35 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !25 + %36 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %34, i64 %35, i1 %14) #2, !dbg !25 + %37 = trunc i64 %36 to i32, !dbg !26 + %38 = mul i64 %18, %29, !dbg !27 + %39 = getelementptr i32, ptr addrspace(1) %1, i64 %.decomposed, !dbg !28 + %40 = getelementptr i32, ptr addrspace(1) %39, i64 %38, !dbg !28 + %41 = getelementptr i32, ptr addrspace(1) 
%40, i64 %31, !dbg !28 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %37, ptr addrspace(1) %41, i1 %14) #2, !dbg !29 + ret void, !dbg !30 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +attributes #0 = { nounwind "nvvm.reqntid"="128" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_poi_fused__to_copy_6", linkageName: "triton_poi_fused__to_copy_6", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 19, column: 28, scope: !4) +!8 = !DILocation(line: 19, column: 33, scope: !4) +!9 = !DILocation(line: 20, column: 36, scope: !4) +!10 = !DILocation(line: 20, column: 23, scope: !4) +!11 = !DILocation(line: 21, column: 21, scope: !4) +!12 = !DILocation(line: 22, column: 19, scope: !4) +!13 = !DILocation(line: 23, column: 21, scope: !4) +!14 = !DILocation(line: 23, column: 28, scope: !4) +!15 = !DILocation(line: 24, column: 19, scope: !4) +!16 = !DILocation(line: 25, column: 54, scope: !4) +!17 = 
!DILocation(line: 25, column: 80, scope: !4) +!18 = !DILocation(line: 25, column: 71, scope: !4) +!19 = !DILocation(line: 25, scope: !4) +!20 = !DILocation(line: 25, column: 62, scope: !4) +!21 = !DILocation(line: 25, column: 91, scope: !4) +!22 = !DILocation(line: 25, column: 39, scope: !4) +!23 = !DILocation(line: 25, column: 138, scope: !4) +!24 = !DILocation(line: 25, column: 30, scope: !4) +!25 = !DILocation(line: 25, column: 186, scope: !4) +!26 = !DILocation(line: 26, column: 19, scope: !4) +!27 = !DILocation(line: 27, column: 34, scope: !4) +!28 = !DILocation(line: 27, column: 25, scope: !4) +!29 = !DILocation(line: 27, column: 187, scope: !4) +!30 = !DILocation(line: 27, column: 4, scope: !4) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/IK5RT3JGLTF5PMMUH32NIWB2GXNU6R6CGIZSCRHU3I65YM226KDA/triton_poi_fused__to_copy_6.ptx b/SpecForge-ext/cache/compiled_kernels/triton/7/IK5RT3JGLTF5PMMUH32NIWB2GXNU6R6CGIZSCRHU3I65YM226KDA/triton_poi_fused__to_copy_6.ptx new file mode 100644 index 0000000000000000000000000000000000000000..5fb222d41f3fdba7d14b83ef2107178b5e2f4b23 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/IK5RT3JGLTF5PMMUH32NIWB2GXNU6R6CGIZSCRHU3I65YM226KDA/triton_poi_fused__to_copy_6.ptx @@ -0,0 +1,311 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_poi_fused__to_copy_6 // -- Begin function triton_poi_fused__to_copy_6 + // @triton_poi_fused__to_copy_6 +.visible .entry triton_poi_fused__to_copy_6( + .param .u64 .ptr .global .align 1 triton_poi_fused__to_copy_6_param_0, + .param .u64 .ptr .global .align 1 triton_poi_fused__to_copy_6_param_1, + .param .u64 triton_poi_fused__to_copy_6_param_2, + .param .u64 triton_poi_fused__to_copy_6_param_3, + .param .u64 triton_poi_fused__to_copy_6_param_4, + .param .u32 triton_poi_fused__to_copy_6_param_5, + .param .u64 .ptr .global .align 1 triton_poi_fused__to_copy_6_param_6, + .param .u64 .ptr .global .align 1 
triton_poi_fused__to_copy_6_param_7 +) +.reqntid 128 +{ + .reg .pred %p<10>; + .reg .b32 %r<20>; + .reg .b64 %rd<54>; + .loc 1 18 0 // cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py:18:0 + +// %bb.0: + ld.param.b64 %rd16, [triton_poi_fused__to_copy_6_param_3]; + ld.param.b64 %rd15, [triton_poi_fused__to_copy_6_param_2]; +$L__tmp0: + .loc 1 19 28 // cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py:19:28 + mov.u32 %r2, %ctaid.x; + .loc 1 19 33 // cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py:19:33 + shl.b32 %r3, %r2, 7; + .loc 1 20 36 // cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py:20:36 + mov.u32 %r4, %tid.x; + and.b32 %r5, %r4, 127; + .loc 1 20 23 // cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py:20:23 + or.b32 %r6, %r3, %r5; + .loc 1 22 19 // cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py:22:19 + cvt.s64.s32 %rd1, %r6; + .loc 1 23 21 // cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py:23:21 + or.b64 %rd19, %rd1, %rd15; + and.b64 %rd20, %rd19, -4294967296; + setp.ne.b64 %p1, %rd20, 0; + cvt.u32.u64 %r19, %rd1; + @%p1 bra $L__BB0_2; + bra.uni $L__BB0_1; +$L__BB0_2: + div.s64 %rd51, %rd1, %rd15; + bra.uni $L__BB0_3; +$L__BB0_1: + cvt.u32.u64 %r7, %rd15; + div.u32 %r9, %r19, %r7; + cvt.u64.u32 %rd51, %r9; +$L__BB0_3: + .loc 1 0 21 // cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py:0:21 + ld.param.b64 %rd17, [triton_poi_fused__to_copy_6_param_4]; + .loc 1 22 19 // cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py:22:19 + mul.lo.s64 %rd21, %rd51, %rd15; + .loc 1 23 28 // cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py:23:28 + or.b64 %rd22, %rd51, %rd16; + and.b64 %rd23, %rd22, -4294967296; + setp.ne.b64 %p2, %rd23, 0; + @%p2 bra $L__BB0_5; + bra.uni $L__BB0_4; +$L__BB0_5: + rem.s64 %rd52, %rd51, %rd16; + bra.uni $L__BB0_6; +$L__BB0_4: + cvt.u32.u64 %r10, %rd16; + cvt.u32.u64 %r11, %rd51; + 
rem.u32 %r12, %r11, %r10; + cvt.u64.u32 %rd52, %r12; +$L__BB0_6: + .loc 1 0 28 // cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py:0:28 + ld.param.b32 %r1, [triton_poi_fused__to_copy_6_param_5]; + ld.param.b64 %rd14, [triton_poi_fused__to_copy_6_param_1]; + ld.param.b64 %rd13, [triton_poi_fused__to_copy_6_param_0]; + sub.s64 %rd6, %rd1, %rd21; + .loc 1 24 19 // cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py:24:19 + or.b64 %rd24, %rd1, %rd17; + and.b64 %rd25, %rd24, -4294967296; + setp.ne.b64 %p3, %rd25, 0; + @%p3 bra $L__BB0_8; + bra.uni $L__BB0_7; +$L__BB0_8: + div.s64 %rd53, %rd1, %rd17; + bra.uni $L__BB0_9; +$L__BB0_7: + cvt.u32.u64 %r13, %rd17; + div.u32 %r15, %r19, %r13; + cvt.u64.u32 %rd53, %r15; +$L__BB0_9: + .loc 1 21 21 // cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py:21:21 + setp.lt.s32 %p4, %r19, %r1; + .loc 1 25 54 // cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py:25:54 + setp.lt.s64 %p6, %rd15, 2; + setp.lt.s64 %p7, %rd16, 2; + .loc 1 25 80 // cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py:25:80 + setp.gt.s64 %p8, %rd15, 1; + setp.gt.s64 %p9, %rd16, 1; + .loc 1 25 71 // cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py:25:71 + selp.b64 %rd31, %rd16, 0, %p9; + selp.b64 %rd32, %rd15, 0, %p8; + .loc 1 25 0 // cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py:25 + selp.b64 %rd33, 1, 0, %p7; + selp.b64 %rd34, 1, 0, %p6; + .loc 1 25 62 // cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py:25:62 + add.s64 %rd35, %rd32, %rd34; + add.s64 %rd36, %rd31, %rd33; + .loc 1 25 39 // cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py:25:39 + mul.lo.s64 %rd37, %rd6, %rd36; + .loc 1 25 91 // cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py:25:91 + mul.lo.s64 %rd38, %rd36, %rd35; + .loc 1 25 138 // cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py:25:138 + mul.lo.s64 %rd39, %rd38, %rd53; + .loc 1 25 30 // cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py:25:30 + shl.b64 %rd40, %rd52, 3; + 
add.s64 %rd41, %rd13, %rd40; + shl.b64 %rd42, %rd37, 3; + add.s64 %rd43, %rd41, %rd42; + shl.b64 %rd44, %rd39, 3; + add.s64 %rd28, %rd43, %rd44; + .loc 1 25 186 // cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py:25:186 + // begin inline asm + mov.u64 %rd29, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd29, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd27, 0x0; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd27 }, [ %rd28 + 0 ], %rd29; + // end inline asm + .loc 1 26 19 // cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py:26:19 + cvt.u32.u64 %r16, %rd27; + .loc 1 27 34 // cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py:27:34 + mul.lo.s64 %rd45, %rd52, %rd35; + .loc 1 27 25 // cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py:27:25 + shl.b64 %rd46, %rd6, 2; + add.s64 %rd47, %rd14, %rd46; + shl.b64 %rd48, %rd45, 2; + add.s64 %rd49, %rd47, %rd48; + shl.b64 %rd50, %rd39, 2; + add.s64 %rd30, %rd49, %rd50; + .loc 1 27 187 // cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py:27:187 + // begin inline asm + @%p4 st.global.b32 [ %rd30 + 0 ], { %r16 }; + // end inline asm + .loc 1 27 4 // cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py:27:4 + ret; +$L__tmp1: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 0 // DW_CHILDREN_no +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 135 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. 
Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x80 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 101 +.b8 107 +.b8 119 +.b8 113 +.b8 100 +.b8 98 +.b8 122 +.b8 114 +.b8 111 +.b8 109 +.b8 116 +.b8 55 +.b8 108 +.b8 114 +.b8 105 +.b8 100 +.b8 51 +.b8 111 +.b8 53 +.b8 113 +.b8 106 +.b8 119 +.b8 115 +.b8 112 +.b8 101 +.b8 101 +.b8 110 +.b8 116 +.b8 97 +.b8 103 +.b8 106 +.b8 116 +.b8 102 +.b8 111 +.b8 106 +.b8 111 +.b8 51 +.b8 119 +.b8 115 +.b8 55 +.b8 106 +.b8 107 +.b8 105 +.b8 105 +.b8 105 +.b8 109 +.b8 122 +.b8 117 +.b8 108 +.b8 119 +.b8 55 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 101 +.b8 107 +.b8 0 + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/IK5RT3JGLTF5PMMUH32NIWB2GXNU6R6CGIZSCRHU3I65YM226KDA/triton_poi_fused__to_copy_6.source b/SpecForge-ext/cache/compiled_kernels/triton/7/IK5RT3JGLTF5PMMUH32NIWB2GXNU6R6CGIZSCRHU3I65YM226KDA/triton_poi_fused__to_copy_6.source new file mode 100644 index 0000000000000000000000000000000000000000..62b11bdf7866d4db214d06eefe9e9f2ff20899c6 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/IK5RT3JGLTF5PMMUH32NIWB2GXNU6R6CGIZSCRHU3I65YM226KDA/triton_poi_fused__to_copy_6.source @@ -0,0 +1,226 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":18:0) 
+#loc56 = loc("in_ptr0"(#loc)) +#loc57 = loc("out_ptr0"(#loc)) +#loc58 = loc("ks0"(#loc)) +#loc59 = loc("ks1"(#loc)) +#loc60 = loc("ks2"(#loc)) +#loc61 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused__to_copy_6(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %ks2: i64 loc("ks2"(#loc)), %xnumel: i32 loc("xnumel"(#loc))) attributes {noinline = false} { + %xoffset = tt.get_program_id x : i32 loc(#loc62) + %xoffset_0 = arith.constant 128 : i32 loc(#loc63) + %xoffset_1 = arith.constant 128 : i32 loc(#loc63) + %xoffset_2 = arith.muli %xoffset, %xoffset_1 : i32 loc(#loc63) + %xindex = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc64) + %xindex_3 = tt.splat %xoffset_2 : i32 -> tensor<128xi32> loc(#loc65) + %xindex_4 = arith.addi %xindex_3, %xindex : tensor<128xi32> loc(#loc65) + %xmask = tt.splat %xnumel : i32 -> tensor<128xi32> loc(#loc66) + %xmask_5 = arith.cmpi slt, %xindex_4, %xmask : tensor<128xi32> loc(#loc66) + %x0 = arith.extsi %xindex_4 : tensor<128xi32> to tensor<128xi64> loc(#loc67) + %x0_6 = tt.splat %ks0 : i64 -> tensor<128xi64> loc(#loc67) + %x0_7 = arith.remsi %x0, %x0_6 : tensor<128xi64> loc(#loc67) + %x1 = arith.extsi %xindex_4 : tensor<128xi32> to tensor<128xi64> loc(#loc68) + %x1_8 = tt.splat %ks0 : i64 -> tensor<128xi64> loc(#loc68) + %x1_9 = arith.divsi %x1, %x1_8 : tensor<128xi64> loc(#loc68) + %x1_10 = tt.splat %ks1 : i64 -> tensor<128xi64> loc(#loc69) + %x1_11 = arith.remsi %x1_9, %x1_10 : tensor<128xi64> loc(#loc69) + %x2 = arith.extsi %xindex_4 : tensor<128xi32> to tensor<128xi64> loc(#loc70) + %x2_12 = tt.splat %ks2 : i64 -> tensor<128xi64> loc(#loc70) + %x2_13 = arith.divsi %x2, %x2_12 : tensor<128xi64> loc(#loc70) + %tmp0 = arith.constant 1 : i32 loc(#loc71) + %tmp0_14 = arith.extsi %tmp0 : i32 to i64 loc(#loc71) + %tmp0_15 = arith.cmpi sge, %tmp0_14, 
%ks1 : i64 loc(#loc71) + %tmp0_16 = arith.constant 1 : i32 loc(#loc72) + %tmp0_17 = arith.constant 1 : i32 loc(#loc72) + %tmp0_18 = arith.extui %tmp0_15 : i1 to i32 loc(#loc72) + %tmp0_19 = arith.muli %tmp0_17, %tmp0_18 : i32 loc(#loc72) + %tmp0_20 = arith.constant 1 : i32 loc(#loc73) + %tmp0_21 = arith.extsi %tmp0_20 : i32 to i64 loc(#loc73) + %tmp0_22 = arith.cmpi sgt, %ks1, %tmp0_21 : i64 loc(#loc73) + %tmp0_23 = arith.extui %tmp0_22 : i1 to i64 loc(#loc74) + %tmp0_24 = arith.muli %ks1, %tmp0_23 : i64 loc(#loc74) + %tmp0_25 = arith.extsi %tmp0_19 : i32 to i64 loc(#loc75) + %tmp0_26 = arith.addi %tmp0_25, %tmp0_24 : i64 loc(#loc75) + %tmp0_27 = tt.splat %tmp0_26 : i64 -> tensor<128xi64> loc(#loc76) + %tmp0_28 = arith.muli %x0_7, %tmp0_27 : tensor<128xi64> loc(#loc76) + %tmp0_29 = arith.addi %x1_11, %tmp0_28 : tensor<128xi64> loc(#loc77) + %tmp0_30 = arith.constant 1 : i32 loc(#loc78) + %tmp0_31 = arith.extsi %tmp0_30 : i32 to i64 loc(#loc78) + %tmp0_32 = arith.cmpi sge, %tmp0_31, %ks0 : i64 loc(#loc78) + %tmp0_33 = arith.constant 1 : i32 loc(#loc79) + %tmp0_34 = arith.constant 1 : i32 loc(#loc79) + %tmp0_35 = arith.extui %tmp0_32 : i1 to i32 loc(#loc79) + %tmp0_36 = arith.muli %tmp0_34, %tmp0_35 : i32 loc(#loc79) + %tmp0_37 = arith.constant 1 : i32 loc(#loc80) + %tmp0_38 = arith.extsi %tmp0_37 : i32 to i64 loc(#loc80) + %tmp0_39 = arith.cmpi sgt, %ks0, %tmp0_38 : i64 loc(#loc80) + %tmp0_40 = arith.extui %tmp0_39 : i1 to i64 loc(#loc81) + %tmp0_41 = arith.muli %ks0, %tmp0_40 : i64 loc(#loc81) + %tmp0_42 = arith.extsi %tmp0_36 : i32 to i64 loc(#loc82) + %tmp0_43 = arith.addi %tmp0_42, %tmp0_41 : i64 loc(#loc82) + %tmp0_44 = tt.splat %tmp0_43 : i64 -> tensor<128xi64> loc(#loc83) + %tmp0_45 = arith.muli %x2_13, %tmp0_44 : tensor<128xi64> loc(#loc83) + %tmp0_46 = arith.constant 1 : i32 loc(#loc84) + %tmp0_47 = arith.extsi %tmp0_46 : i32 to i64 loc(#loc84) + %tmp0_48 = arith.cmpi sge, %tmp0_47, %ks1 : i64 loc(#loc84) + %tmp0_49 = arith.constant 1 : i32 loc(#loc85) + 
%tmp0_50 = arith.constant 1 : i32 loc(#loc85) + %tmp0_51 = arith.extui %tmp0_48 : i1 to i32 loc(#loc85) + %tmp0_52 = arith.muli %tmp0_50, %tmp0_51 : i32 loc(#loc85) + %tmp0_53 = arith.constant 1 : i32 loc(#loc86) + %tmp0_54 = arith.extsi %tmp0_53 : i32 to i64 loc(#loc86) + %tmp0_55 = arith.cmpi sgt, %ks1, %tmp0_54 : i64 loc(#loc86) + %tmp0_56 = arith.extui %tmp0_55 : i1 to i64 loc(#loc87) + %tmp0_57 = arith.muli %ks1, %tmp0_56 : i64 loc(#loc87) + %tmp0_58 = arith.extsi %tmp0_52 : i32 to i64 loc(#loc88) + %tmp0_59 = arith.addi %tmp0_58, %tmp0_57 : i64 loc(#loc88) + %tmp0_60 = tt.splat %tmp0_59 : i64 -> tensor<128xi64> loc(#loc89) + %tmp0_61 = arith.muli %tmp0_45, %tmp0_60 : tensor<128xi64> loc(#loc89) + %tmp0_62 = arith.addi %tmp0_29, %tmp0_61 : tensor<128xi64> loc(#loc90) + %tmp0_63 = tt.splat %in_ptr0 : !tt.ptr -> tensor<128x!tt.ptr> loc(#loc91) + %tmp0_64 = tt.addptr %tmp0_63, %tmp0_62 : tensor<128x!tt.ptr>, tensor<128xi64> loc(#loc91) + %tmp0_65 = tt.load %tmp0_64, %xmask_5 evictionPolicy = evict_last : tensor<128x!tt.ptr> loc(#loc92) + %tmp1 = arith.trunci %tmp0_65 : tensor<128xi64> to tensor<128xi32> loc(#loc93) + %c1_i32 = arith.constant 1 : i32 loc(#loc33) + %0 = arith.extsi %c1_i32 : i32 to i64 loc(#loc33) + %1 = arith.cmpi sge, %0, %ks0 : i64 loc(#loc33) + %c1_i32_66 = arith.constant 1 : i32 loc(#loc34) + %c1_i32_67 = arith.constant 1 : i32 loc(#loc34) + %2 = arith.extui %1 : i1 to i32 loc(#loc34) + %3 = arith.muli %c1_i32_67, %2 : i32 loc(#loc34) + %c1_i32_68 = arith.constant 1 : i32 loc(#loc35) + %4 = arith.extsi %c1_i32_68 : i32 to i64 loc(#loc35) + %5 = arith.cmpi sgt, %ks0, %4 : i64 loc(#loc35) + %6 = arith.extui %5 : i1 to i64 loc(#loc36) + %7 = arith.muli %ks0, %6 : i64 loc(#loc36) + %8 = arith.extsi %3 : i32 to i64 loc(#loc37) + %9 = arith.addi %8, %7 : i64 loc(#loc37) + %10 = tt.splat %9 : i64 -> tensor<128xi64> loc(#loc38) + %11 = arith.muli %x1_11, %10 : tensor<128xi64> loc(#loc38) + %12 = arith.addi %x0_7, %11 : tensor<128xi64> loc(#loc39) + 
%c1_i32_69 = arith.constant 1 : i32 loc(#loc40) + %13 = arith.extsi %c1_i32_69 : i32 to i64 loc(#loc40) + %14 = arith.cmpi sge, %13, %ks0 : i64 loc(#loc40) + %c1_i32_70 = arith.constant 1 : i32 loc(#loc41) + %c1_i32_71 = arith.constant 1 : i32 loc(#loc41) + %15 = arith.extui %14 : i1 to i32 loc(#loc41) + %16 = arith.muli %c1_i32_71, %15 : i32 loc(#loc41) + %c1_i32_72 = arith.constant 1 : i32 loc(#loc42) + %17 = arith.extsi %c1_i32_72 : i32 to i64 loc(#loc42) + %18 = arith.cmpi sgt, %ks0, %17 : i64 loc(#loc42) + %19 = arith.extui %18 : i1 to i64 loc(#loc43) + %20 = arith.muli %ks0, %19 : i64 loc(#loc43) + %21 = arith.extsi %16 : i32 to i64 loc(#loc44) + %22 = arith.addi %21, %20 : i64 loc(#loc44) + %23 = tt.splat %22 : i64 -> tensor<128xi64> loc(#loc45) + %24 = arith.muli %x2_13, %23 : tensor<128xi64> loc(#loc45) + %c1_i32_73 = arith.constant 1 : i32 loc(#loc46) + %25 = arith.extsi %c1_i32_73 : i32 to i64 loc(#loc46) + %26 = arith.cmpi sge, %25, %ks1 : i64 loc(#loc46) + %c1_i32_74 = arith.constant 1 : i32 loc(#loc47) + %c1_i32_75 = arith.constant 1 : i32 loc(#loc47) + %27 = arith.extui %26 : i1 to i32 loc(#loc47) + %28 = arith.muli %c1_i32_75, %27 : i32 loc(#loc47) + %c1_i32_76 = arith.constant 1 : i32 loc(#loc48) + %29 = arith.extsi %c1_i32_76 : i32 to i64 loc(#loc48) + %30 = arith.cmpi sgt, %ks1, %29 : i64 loc(#loc48) + %31 = arith.extui %30 : i1 to i64 loc(#loc49) + %32 = arith.muli %ks1, %31 : i64 loc(#loc49) + %33 = arith.extsi %28 : i32 to i64 loc(#loc50) + %34 = arith.addi %33, %32 : i64 loc(#loc50) + %35 = tt.splat %34 : i64 -> tensor<128xi64> loc(#loc51) + %36 = arith.muli %24, %35 : tensor<128xi64> loc(#loc51) + %37 = arith.addi %12, %36 : tensor<128xi64> loc(#loc52) + %38 = tt.splat %out_ptr0 : !tt.ptr -> tensor<128x!tt.ptr> loc(#loc53) + %39 = tt.addptr %38, %37 : tensor<128x!tt.ptr>, tensor<128xi64> loc(#loc53) + tt.store %39, %tmp1, %xmask_5 : tensor<128x!tt.ptr> loc(#loc54) + tt.return loc(#loc55) + } loc(#loc) +} loc(#loc) +#loc1 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":19:28) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":19:33) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":20:36) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":20:23) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":21:21) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":22:19) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":23:21) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":23:28) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":24:19) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":25:54) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":25:46) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":25:80) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":25:71) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":25:62) +#loc15 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":25:39) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":25:35) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":25:106) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":25:98) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":25:132) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":25:123) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":25:114) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":25:91) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":25:153) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":25:145) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":25:179) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":25:170) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":25:161) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":25:138) +#loc29 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":25:87) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":25:30) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":25:186) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":26:19) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":27:49) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":27:41) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":27:75) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":27:66) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":27:57) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":27:34) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":27:30) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":27:101) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":27:93) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":27:127) +#loc43 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":27:118) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":27:109) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":27:86) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":27:148) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":27:140) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":27:174) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":27:165) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":27:156) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":27:133) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":27:82) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":27:25) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":27:187) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":27:4) +#loc62 = loc("xoffset"(#loc1)) +#loc63 = loc("xoffset"(#loc2)) +#loc64 = loc("xindex"(#loc3)) +#loc65 = loc("xindex"(#loc4)) +#loc66 = loc("xmask"(#loc5)) +#loc67 = loc("x0"(#loc6)) +#loc68 = loc("x1"(#loc7)) +#loc69 = 
loc("x1"(#loc8)) +#loc70 = loc("x2"(#loc9)) +#loc71 = loc("tmp0"(#loc10)) +#loc72 = loc("tmp0"(#loc11)) +#loc73 = loc("tmp0"(#loc12)) +#loc74 = loc("tmp0"(#loc13)) +#loc75 = loc("tmp0"(#loc14)) +#loc76 = loc("tmp0"(#loc15)) +#loc77 = loc("tmp0"(#loc16)) +#loc78 = loc("tmp0"(#loc17)) +#loc79 = loc("tmp0"(#loc18)) +#loc80 = loc("tmp0"(#loc19)) +#loc81 = loc("tmp0"(#loc20)) +#loc82 = loc("tmp0"(#loc21)) +#loc83 = loc("tmp0"(#loc22)) +#loc84 = loc("tmp0"(#loc23)) +#loc85 = loc("tmp0"(#loc24)) +#loc86 = loc("tmp0"(#loc25)) +#loc87 = loc("tmp0"(#loc26)) +#loc88 = loc("tmp0"(#loc27)) +#loc89 = loc("tmp0"(#loc28)) +#loc90 = loc("tmp0"(#loc29)) +#loc91 = loc("tmp0"(#loc30)) +#loc92 = loc("tmp0"(#loc31)) +#loc93 = loc("tmp1"(#loc32)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/IK5RT3JGLTF5PMMUH32NIWB2GXNU6R6CGIZSCRHU3I65YM226KDA/triton_poi_fused__to_copy_6.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/7/IK5RT3JGLTF5PMMUH32NIWB2GXNU6R6CGIZSCRHU3I65YM226KDA/triton_poi_fused__to_copy_6.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..fdc07153c42611b4e47d5b6e43d4a30d64d9d121 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/IK5RT3JGLTF5PMMUH32NIWB2GXNU6R6CGIZSCRHU3I65YM226KDA/triton_poi_fused__to_copy_6.ttgir @@ -0,0 +1,122 @@ +#blocked = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":18:0) +#loc35 = loc("in_ptr0"(#loc)) +#loc36 = loc("out_ptr0"(#loc)) +#loc37 = loc("ks0"(#loc)) +#loc38 = loc("ks1"(#loc)) +#loc39 = loc("ks2"(#loc)) +#loc40 = loc("xnumel"(#loc)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_poi_fused__to_copy_6(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : 
i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %ks2: i64 loc("ks2"(#loc)), %xnumel: i32 loc("xnumel"(#loc))) attributes {noinline = false} { + %c128_i32 = arith.constant 128 : i32 loc(#loc1) + %c1_i64 = arith.constant 1 : i64 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc41) + %xoffset_0 = arith.muli %xoffset, %c128_i32 : i32 loc(#loc42) + %xindex = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #blocked> loc(#loc43) + %xindex_1 = tt.splat %xoffset_0 : i32 -> tensor<128xi32, #blocked> loc(#loc44) + %xindex_2 = arith.addi %xindex_1, %xindex : tensor<128xi32, #blocked> loc(#loc44) + %xmask = tt.splat %xnumel : i32 -> tensor<128xi32, #blocked> loc(#loc45) + %xmask_3 = arith.cmpi slt, %xindex_2, %xmask : tensor<128xi32, #blocked> loc(#loc45) + %x0 = arith.extsi %xindex_2 : tensor<128xi32, #blocked> to tensor<128xi64, #blocked> loc(#loc46) + %x0_4 = tt.splat %ks0 : i64 -> tensor<128xi64, #blocked> loc(#loc46) + %x0_5 = arith.remsi %x0, %x0_4 : tensor<128xi64, #blocked> loc(#loc46) + %x1 = arith.divsi %x0, %x0_4 : tensor<128xi64, #blocked> loc(#loc47) + %x1_6 = tt.splat %ks1 : i64 -> tensor<128xi64, #blocked> loc(#loc48) + %x1_7 = arith.remsi %x1, %x1_6 : tensor<128xi64, #blocked> loc(#loc48) + %x2 = tt.splat %ks2 : i64 -> tensor<128xi64, #blocked> loc(#loc49) + %x2_8 = arith.divsi %x0, %x2 : tensor<128xi64, #blocked> loc(#loc49) + %tmp0 = arith.cmpi sle, %ks1, %c1_i64 : i64 loc(#loc50) + %tmp0_9 = arith.cmpi sgt, %ks1, %c1_i64 : i64 loc(#loc51) + %tmp0_10 = arith.extui %tmp0_9 : i1 to i64 loc(#loc52) + %tmp0_11 = arith.muli %ks1, %tmp0_10 : i64 loc(#loc52) + %tmp0_12 = arith.extui %tmp0 : i1 to i64 loc(#loc68) + %tmp0_13 = arith.addi %tmp0_12, %tmp0_11 : i64 loc(#loc53) + %tmp0_14 = tt.splat %tmp0_13 : i64 -> tensor<128xi64, #blocked> loc(#loc55) + %tmp0_15 = arith.muli %x0_5, %tmp0_14 : tensor<128xi64, #blocked> loc(#loc55) + %tmp0_16 = arith.addi %x1_7, %tmp0_15 : tensor<128xi64, #blocked> 
loc(#loc56) + %tmp0_17 = arith.cmpi sle, %ks0, %c1_i64 : i64 loc(#loc57) + %tmp0_18 = arith.cmpi sgt, %ks0, %c1_i64 : i64 loc(#loc58) + %tmp0_19 = arith.extui %tmp0_18 : i1 to i64 loc(#loc59) + %tmp0_20 = arith.muli %ks0, %tmp0_19 : i64 loc(#loc59) + %tmp0_21 = arith.extui %tmp0_17 : i1 to i64 loc(#loc69) + %tmp0_22 = arith.addi %tmp0_21, %tmp0_20 : i64 loc(#loc60) + %tmp0_23 = tt.splat %tmp0_22 : i64 -> tensor<128xi64, #blocked> loc(#loc62) + %tmp0_24 = arith.muli %x2_8, %tmp0_23 : tensor<128xi64, #blocked> loc(#loc62) + %tmp0_25 = arith.muli %tmp0_24, %tmp0_14 : tensor<128xi64, #blocked> loc(#loc63) + %tmp0_26 = arith.addi %tmp0_16, %tmp0_25 : tensor<128xi64, #blocked> loc(#loc64) + %tmp0_27 = tt.splat %in_ptr0 : !tt.ptr -> tensor<128x!tt.ptr, #blocked> loc(#loc65) + %tmp0_28 = tt.addptr %tmp0_27, %tmp0_26 : tensor<128x!tt.ptr, #blocked>, tensor<128xi64, #blocked> loc(#loc65) + %tmp0_29 = tt.load %tmp0_28, %xmask_3 evictionPolicy = evict_last : tensor<128x!tt.ptr, #blocked> loc(#loc66) + %tmp1 = arith.trunci %tmp0_29 : tensor<128xi64, #blocked> to tensor<128xi32, #blocked> loc(#loc67) + %0 = arith.muli %x1_7, %tmp0_23 : tensor<128xi64, #blocked> loc(#loc29) + %1 = arith.addi %x0_5, %0 : tensor<128xi64, #blocked> loc(#loc30) + %2 = arith.addi %1, %tmp0_25 : tensor<128xi64, #blocked> loc(#loc31) + %3 = tt.splat %out_ptr0 : !tt.ptr -> tensor<128x!tt.ptr, #blocked> loc(#loc32) + %4 = tt.addptr %3, %2 : tensor<128x!tt.ptr, #blocked>, tensor<128xi64, #blocked> loc(#loc32) + tt.store %4, %tmp1, %xmask_3 : tensor<128x!tt.ptr, #blocked> loc(#loc33) + tt.return loc(#loc34) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":19:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":19:33) +#loc4 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":20:36) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":20:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":21:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":22:19) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":23:21) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":23:28) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":24:19) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":25:54) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":25:80) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":25:71) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":25:62) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":25:46) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":25:39) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":25:35) +#loc18 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":25:106) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":25:132) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":25:123) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":25:114) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":25:98) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":25:91) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":25:138) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":25:87) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":25:30) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":25:186) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":26:19) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":27:34) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":27:30) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":27:82) +#loc32 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":27:25) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":27:187) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":27:4) +#loc41 = loc("xoffset"(#loc2)) +#loc42 = loc("xoffset"(#loc3)) +#loc43 = loc("xindex"(#loc4)) +#loc44 = loc("xindex"(#loc5)) +#loc45 = loc("xmask"(#loc6)) +#loc46 = loc("x0"(#loc7)) +#loc47 = loc("x1"(#loc8)) +#loc48 = loc("x1"(#loc9)) +#loc49 = loc("x2"(#loc10)) +#loc50 = loc("tmp0"(#loc11)) +#loc51 = loc("tmp0"(#loc12)) +#loc52 = loc("tmp0"(#loc13)) +#loc53 = loc("tmp0"(#loc14)) +#loc54 = loc("tmp0"(#loc15)) +#loc55 = loc("tmp0"(#loc16)) +#loc56 = loc("tmp0"(#loc17)) +#loc57 = loc("tmp0"(#loc18)) +#loc58 = loc("tmp0"(#loc19)) +#loc59 = loc("tmp0"(#loc20)) +#loc60 = loc("tmp0"(#loc21)) +#loc61 = loc("tmp0"(#loc22)) +#loc62 = loc("tmp0"(#loc23)) +#loc63 = loc("tmp0"(#loc24)) +#loc64 = loc("tmp0"(#loc25)) +#loc65 = loc("tmp0"(#loc26)) +#loc66 = loc("tmp0"(#loc27)) +#loc67 = loc("tmp1"(#loc28)) +#loc68 = loc(fused[#loc53, #loc54]) +#loc69 = loc(fused[#loc60, #loc61]) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/IK5RT3JGLTF5PMMUH32NIWB2GXNU6R6CGIZSCRHU3I65YM226KDA/triton_poi_fused__to_copy_6.ttir b/SpecForge-ext/cache/compiled_kernels/triton/7/IK5RT3JGLTF5PMMUH32NIWB2GXNU6R6CGIZSCRHU3I65YM226KDA/triton_poi_fused__to_copy_6.ttir new file mode 100644 index 0000000000000000000000000000000000000000..ba9d01c5416668ef03061d43cb96d707bb128a6f --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/IK5RT3JGLTF5PMMUH32NIWB2GXNU6R6CGIZSCRHU3I65YM226KDA/triton_poi_fused__to_copy_6.ttir @@ -0,0 +1,121 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":18:0) +#loc35 = 
loc("in_ptr0"(#loc)) +#loc36 = loc("out_ptr0"(#loc)) +#loc37 = loc("ks0"(#loc)) +#loc38 = loc("ks1"(#loc)) +#loc39 = loc("ks2"(#loc)) +#loc40 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused__to_copy_6(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %ks2: i64 loc("ks2"(#loc)), %xnumel: i32 loc("xnumel"(#loc))) attributes {noinline = false} { + %c1_i64 = arith.constant 1 : i64 loc(#loc1) + %c128_i32 = arith.constant 128 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc41) + %xoffset_0 = arith.muli %xoffset, %c128_i32 : i32 loc(#loc42) + %xindex = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc43) + %xindex_1 = tt.splat %xoffset_0 : i32 -> tensor<128xi32> loc(#loc44) + %xindex_2 = arith.addi %xindex_1, %xindex : tensor<128xi32> loc(#loc44) + %xmask = tt.splat %xnumel : i32 -> tensor<128xi32> loc(#loc45) + %xmask_3 = arith.cmpi slt, %xindex_2, %xmask : tensor<128xi32> loc(#loc45) + %x0 = arith.extsi %xindex_2 : tensor<128xi32> to tensor<128xi64> loc(#loc46) + %x0_4 = tt.splat %ks0 : i64 -> tensor<128xi64> loc(#loc46) + %x0_5 = arith.remsi %x0, %x0_4 : tensor<128xi64> loc(#loc46) + %x1 = arith.divsi %x0, %x0_4 : tensor<128xi64> loc(#loc47) + %x1_6 = tt.splat %ks1 : i64 -> tensor<128xi64> loc(#loc48) + %x1_7 = arith.remsi %x1, %x1_6 : tensor<128xi64> loc(#loc48) + %x2 = tt.splat %ks2 : i64 -> tensor<128xi64> loc(#loc49) + %x2_8 = arith.divsi %x0, %x2 : tensor<128xi64> loc(#loc49) + %tmp0 = arith.cmpi sle, %ks1, %c1_i64 : i64 loc(#loc50) + %tmp0_9 = arith.cmpi sgt, %ks1, %c1_i64 : i64 loc(#loc51) + %tmp0_10 = arith.extui %tmp0_9 : i1 to i64 loc(#loc52) + %tmp0_11 = arith.muli %ks1, %tmp0_10 : i64 loc(#loc52) + %tmp0_12 = arith.extui %tmp0 : i1 to i64 loc(#loc68) + %tmp0_13 = arith.addi %tmp0_12, %tmp0_11 : i64 loc(#loc53) + %tmp0_14 = tt.splat %tmp0_13 : i64 -> 
tensor<128xi64> loc(#loc55) + %tmp0_15 = arith.muli %x0_5, %tmp0_14 : tensor<128xi64> loc(#loc55) + %tmp0_16 = arith.addi %x1_7, %tmp0_15 : tensor<128xi64> loc(#loc56) + %tmp0_17 = arith.cmpi sle, %ks0, %c1_i64 : i64 loc(#loc57) + %tmp0_18 = arith.cmpi sgt, %ks0, %c1_i64 : i64 loc(#loc58) + %tmp0_19 = arith.extui %tmp0_18 : i1 to i64 loc(#loc59) + %tmp0_20 = arith.muli %ks0, %tmp0_19 : i64 loc(#loc59) + %tmp0_21 = arith.extui %tmp0_17 : i1 to i64 loc(#loc69) + %tmp0_22 = arith.addi %tmp0_21, %tmp0_20 : i64 loc(#loc60) + %tmp0_23 = tt.splat %tmp0_22 : i64 -> tensor<128xi64> loc(#loc62) + %tmp0_24 = arith.muli %x2_8, %tmp0_23 : tensor<128xi64> loc(#loc62) + %tmp0_25 = arith.muli %tmp0_24, %tmp0_14 : tensor<128xi64> loc(#loc63) + %tmp0_26 = arith.addi %tmp0_16, %tmp0_25 : tensor<128xi64> loc(#loc64) + %tmp0_27 = tt.splat %in_ptr0 : !tt.ptr -> tensor<128x!tt.ptr> loc(#loc65) + %tmp0_28 = tt.addptr %tmp0_27, %tmp0_26 : tensor<128x!tt.ptr>, tensor<128xi64> loc(#loc65) + %tmp0_29 = tt.load %tmp0_28, %xmask_3 evictionPolicy = evict_last : tensor<128x!tt.ptr> loc(#loc66) + %tmp1 = arith.trunci %tmp0_29 : tensor<128xi64> to tensor<128xi32> loc(#loc67) + %0 = arith.muli %x1_7, %tmp0_23 : tensor<128xi64> loc(#loc29) + %1 = arith.addi %x0_5, %0 : tensor<128xi64> loc(#loc30) + %2 = arith.addi %1, %tmp0_25 : tensor<128xi64> loc(#loc31) + %3 = tt.splat %out_ptr0 : !tt.ptr -> tensor<128x!tt.ptr> loc(#loc32) + %4 = tt.addptr %3, %2 : tensor<128x!tt.ptr>, tensor<128xi64> loc(#loc32) + tt.store %4, %tmp1, %xmask_3 : tensor<128x!tt.ptr> loc(#loc33) + tt.return loc(#loc34) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":19:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":19:33) +#loc4 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":20:36) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":20:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":21:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":22:19) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":23:21) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":23:28) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":24:19) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":25:54) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":25:80) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":25:71) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":25:62) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":25:46) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":25:39) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":25:35) +#loc18 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":25:106) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":25:132) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":25:123) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":25:114) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":25:98) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":25:91) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":25:138) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":25:87) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":25:30) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":25:186) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":26:19) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":27:34) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":27:30) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":27:82) +#loc32 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":27:25) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":27:187) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ek/cekwqdbzromt7lrid3o5qjwspeentagjtfojo3ws7jkiiimzulw7.py":27:4) +#loc41 = loc("xoffset"(#loc2)) +#loc42 = loc("xoffset"(#loc3)) +#loc43 = loc("xindex"(#loc4)) +#loc44 = loc("xindex"(#loc5)) +#loc45 = loc("xmask"(#loc6)) +#loc46 = loc("x0"(#loc7)) +#loc47 = loc("x1"(#loc8)) +#loc48 = loc("x1"(#loc9)) +#loc49 = loc("x2"(#loc10)) +#loc50 = loc("tmp0"(#loc11)) +#loc51 = loc("tmp0"(#loc12)) +#loc52 = loc("tmp0"(#loc13)) +#loc53 = loc("tmp0"(#loc14)) +#loc54 = loc("tmp0"(#loc15)) +#loc55 = loc("tmp0"(#loc16)) +#loc56 = loc("tmp0"(#loc17)) +#loc57 = loc("tmp0"(#loc18)) +#loc58 = loc("tmp0"(#loc19)) +#loc59 = loc("tmp0"(#loc20)) +#loc60 = loc("tmp0"(#loc21)) +#loc61 = loc("tmp0"(#loc22)) +#loc62 = loc("tmp0"(#loc23)) +#loc63 = loc("tmp0"(#loc24)) +#loc64 = loc("tmp0"(#loc25)) +#loc65 = loc("tmp0"(#loc26)) +#loc66 = loc("tmp0"(#loc27)) +#loc67 = loc("tmp1"(#loc28)) +#loc68 = loc(fused[#loc53, #loc54]) +#loc69 = loc(fused[#loc60, #loc61]) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/ISV24HMA4J4LKKVO22GYN5K43WAJJLKWYPR3LOEREI3KLJOVDZ5Q/__grp__triton_tem_fused_zeros_1.json b/SpecForge-ext/cache/compiled_kernels/triton/7/ISV24HMA4J4LKKVO22GYN5K43WAJJLKWYPR3LOEREI3KLJOVDZ5Q/__grp__triton_tem_fused_zeros_1.json new file mode 100644 index 0000000000000000000000000000000000000000..c68be63efe2e539697c845611963c9302cd4084b --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/ISV24HMA4J4LKKVO22GYN5K43WAJJLKWYPR3LOEREI3KLJOVDZ5Q/__grp__triton_tem_fused_zeros_1.json @@ -0,0 +1 @@ +{"child_paths": {"triton_tem_fused_zeros_1.source": 
"/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/ISV24HMA4J4LKKVO22GYN5K43WAJJLKWYPR3LOEREI3KLJOVDZ5Q/triton_tem_fused_zeros_1.source", "triton_tem_fused_zeros_1.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/ISV24HMA4J4LKKVO22GYN5K43WAJJLKWYPR3LOEREI3KLJOVDZ5Q/triton_tem_fused_zeros_1.ttir", "triton_tem_fused_zeros_1.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/ISV24HMA4J4LKKVO22GYN5K43WAJJLKWYPR3LOEREI3KLJOVDZ5Q/triton_tem_fused_zeros_1.ttgir", "triton_tem_fused_zeros_1.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/ISV24HMA4J4LKKVO22GYN5K43WAJJLKWYPR3LOEREI3KLJOVDZ5Q/triton_tem_fused_zeros_1.llir", "triton_tem_fused_zeros_1.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/ISV24HMA4J4LKKVO22GYN5K43WAJJLKWYPR3LOEREI3KLJOVDZ5Q/triton_tem_fused_zeros_1.ptx", "triton_tem_fused_zeros_1.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/ISV24HMA4J4LKKVO22GYN5K43WAJJLKWYPR3LOEREI3KLJOVDZ5Q/triton_tem_fused_zeros_1.cubin", "triton_tem_fused_zeros_1.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/ISV24HMA4J4LKKVO22GYN5K43WAJJLKWYPR3LOEREI3KLJOVDZ5Q/triton_tem_fused_zeros_1.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/ISV24HMA4J4LKKVO22GYN5K43WAJJLKWYPR3LOEREI3KLJOVDZ5Q/triton_tem_fused_zeros_1.json b/SpecForge-ext/cache/compiled_kernels/triton/7/ISV24HMA4J4LKKVO22GYN5K43WAJJLKWYPR3LOEREI3KLJOVDZ5Q/triton_tem_fused_zeros_1.json new file mode 100644 index 0000000000000000000000000000000000000000..ba38aea63986dd935559cd6257ae5fd61fa30466 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/ISV24HMA4J4LKKVO22GYN5K43WAJJLKWYPR3LOEREI3KLJOVDZ5Q/triton_tem_fused_zeros_1.json @@ -0,0 +1 @@ +{"hash": "44abae1d80e278b52aaed68d86f55cdd8094ad56c3e3b5b8912236a5a5d51e7b", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, 
"num_stages": 3, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 164864, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_tem_fused_zeros_1"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/ISV24HMA4J4LKKVO22GYN5K43WAJJLKWYPR3LOEREI3KLJOVDZ5Q/triton_tem_fused_zeros_1.llir b/SpecForge-ext/cache/compiled_kernels/triton/7/ISV24HMA4J4LKKVO22GYN5K43WAJJLKWYPR3LOEREI3KLJOVDZ5Q/triton_tem_fused_zeros_1.llir new file mode 100644 index 0000000000000000000000000000000000000000..c76c0fd0442cde979da47b4cd5c24d396a51ecd2 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/ISV24HMA4J4LKKVO22GYN5K43WAJJLKWYPR3LOEREI3KLJOVDZ5Q/triton_tem_fused_zeros_1.llir @@ -0,0 +1,14439 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@global_smem = external addrspace(3) global [0 x i8], align 16 +@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1 + +; Function Attrs: nounwind +define ptx_kernel void @triton_tem_fused_zeros_1(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr 
addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, ptr addrspace(1) %7, ptr addrspace(1) %8, ptr addrspace(1) %9, ptr addrspace(1) %10, ptr addrspace(1) %11, ptr addrspace(1) %12, ptr addrspace(1) %13, ptr addrspace(1) %14, ptr addrspace(1) %15, ptr addrspace(1) %16, ptr addrspace(1) %17, i32 %18, i32 %19, i32 %20, i32 %21, i32 %22, i32 %23, i32 %24, i32 %25, i32 %26, ptr addrspace(1) readnone captures(none) %27, ptr addrspace(1) readnone captures(none) %28) local_unnamed_addr #0 !dbg !5 { + %30 = shl i32 %18, 12, !dbg !8 + %31 = shl i32 %19, 10, !dbg !9 + %32 = shl i32 %19, 7, !dbg !10 + %33 = icmp slt i32 %18, 2, !dbg !11 + %34 = zext i1 %33 to i32, !dbg !12 + %35 = icmp sgt i32 %18, 1, !dbg !13 + %36 = select i1 %35, i32 %18, i32 0, !dbg !14 + %37 = add i32 %36, %34, !dbg !15 + %38 = shl i32 %37, 12, !dbg !16 + %39 = shl i32 %37, 7, !dbg !17 + %40 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !18 + %41 = add i32 %19, 127, !dbg !19 + %42 = sdiv i32 %41, 128, !dbg !23 + %43 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y(), !dbg !24 + %44 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.z(), !dbg !25 + %45 = and i32 %43, 7, !dbg !26 + %46 = mul i32 %32, %44, !dbg !27 + %47 = mul i32 %31, %45, !dbg !28 + %48 = add i32 %46, %47, !dbg !29 + %49 = sext i32 %48 to i64, !dbg !30 + %50 = mul i32 %31, %43, !dbg !31 + %51 = add i32 %46, %50, !dbg !32 + %52 = sext i32 %51 to i64, !dbg !33 + %53 = getelementptr bfloat, ptr addrspace(1) %1, i64 %49, !dbg !34 + %54 = getelementptr bfloat, ptr addrspace(1) %2, i64 %49, !dbg !35 + %55 = getelementptr bfloat, ptr addrspace(1) %7, i64 %52, !dbg !36 + %56 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !37 + %57 = lshr i32 %56, 5, !dbg !37 + %58 = and i32 %56, 240, !dbg !37 + %59 = lshr exact i32 %58, 4, !dbg !37 + %60 = or disjoint i32 %59, 16, !dbg !37 + %61 = or disjoint i32 %59, 32, !dbg !37 + %62 = or disjoint i32 %59, 48, !dbg !37 + %63 = or disjoint i32 %59, 64, !dbg !37 + %64 = 
or disjoint i32 %59, 80, !dbg !37 + %65 = or disjoint i32 %59, 96, !dbg !37 + %66 = or disjoint i32 %59, 112, !dbg !37 + %67 = lshr i32 %56, 1, !dbg !37 + %68 = and i32 %67, 112, !dbg !37 + %69 = lshr i32 %56, 2, !dbg !37 + %70 = and i32 %69, 7, !dbg !37 + %71 = or disjoint i32 %68, %70, !dbg !37 + %72 = or disjoint i32 %71, 8, !dbg !37 + %.not = icmp slt i32 %40, %42, !dbg !38 + br i1 %.not, label %4875, label %73, !dbg !39 + +73: ; preds = %29 + %74 = add i32 %18, 127, !dbg !40 + %75 = sdiv i32 %74, 128, !dbg !42 + %76 = sub i32 %40, %42, !dbg !43 + %.frozen = freeze i32 %76, !dbg !44 + %.frozen4059 = freeze i32 %75, !dbg !44 + %77 = sdiv i32 %.frozen, %.frozen4059, !dbg !44 + %78 = shl nuw nsw i32 %44, 2, !dbg !45 + %79 = add i32 %77, %78, !dbg !46 + %80 = mul i32 %77, %.frozen4059, !dbg !47 + %.decomposed = sub i32 %.frozen, %80, !dbg !47 + %81 = mul i32 %20, %45, !dbg !48 + %82 = add i32 %.decomposed, %81, !dbg !49 + %83 = mul i32 %21, %45, !dbg !50 + %reass.add = add i32 %.decomposed, %83, !dbg !51 + %reass.mul = mul i32 %reass.add, %22, !dbg !51 + %84 = shl i32 %79, 7, !dbg !52 + %85 = mul i32 %30, %43, !dbg !53 + %86 = add i32 %84, %85, !dbg !54 + %87 = sext i32 %86 to i64, !dbg !55 + %88 = mul i32 %79, %39, !dbg !56 + %89 = mul i32 %38, %43, !dbg !57 + %90 = add i32 %88, %89, !dbg !58 + %91 = sext i32 %90 to i64, !dbg !59 + %92 = shl nuw nsw i32 %43, 5, !dbg !60 + %93 = add i32 %79, %92, !dbg !61 + %94 = mul i32 %93, %18, !dbg !62 + %95 = sext i32 %94 to i64, !dbg !63 + %96 = getelementptr bfloat, ptr addrspace(1) %0, i64 %87, !dbg !64 + %97 = getelementptr bfloat, ptr addrspace(1) %5, i64 %91, !dbg !65 + %98 = getelementptr bfloat, ptr addrspace(1) %6, i64 %87, !dbg !66 + %99 = getelementptr float, ptr addrspace(1) %3, i64 %95, !dbg !67 + %100 = getelementptr float, ptr addrspace(1) %4, i64 %95, !dbg !68 + %101 = shl nsw i32 %.decomposed, 7, !dbg !69 + %102 = or disjoint i32 %101, %59, !dbg !70 + %103 = or disjoint i32 %101, %60, !dbg !70 + %104 = or 
disjoint i32 %101, %61, !dbg !70 + %105 = or disjoint i32 %101, %62, !dbg !70 + %106 = or disjoint i32 %101, %63, !dbg !70 + %107 = or disjoint i32 %101, %64, !dbg !70 + %108 = or disjoint i32 %101, %65, !dbg !70 + %109 = or disjoint i32 %101, %66, !dbg !70 + %110 = or disjoint i32 %101, %71, !dbg !70 + %111 = or disjoint i32 %101, %72, !dbg !70 + %112 = shl i32 %102, 12, !dbg !71 + %113 = shl i32 %103, 12, !dbg !71 + %114 = shl i32 %104, 12, !dbg !71 + %115 = shl i32 %105, 12, !dbg !71 + %116 = shl i32 %106, 12, !dbg !71 + %117 = shl i32 %107, 12, !dbg !71 + %118 = shl i32 %108, 12, !dbg !71 + %119 = shl i32 %109, 12, !dbg !71 + %120 = sext i32 %112 to i64, !dbg !74 + %121 = getelementptr bfloat, ptr addrspace(1) %96, i64 %120, !dbg !74 + %122 = sext i32 %113 to i64, !dbg !74 + %123 = getelementptr bfloat, ptr addrspace(1) %96, i64 %122, !dbg !74 + %124 = sext i32 %114 to i64, !dbg !74 + %125 = getelementptr bfloat, ptr addrspace(1) %96, i64 %124, !dbg !74 + %126 = sext i32 %115 to i64, !dbg !74 + %127 = getelementptr bfloat, ptr addrspace(1) %96, i64 %126, !dbg !74 + %128 = sext i32 %116 to i64, !dbg !74 + %129 = getelementptr bfloat, ptr addrspace(1) %96, i64 %128, !dbg !74 + %130 = sext i32 %117 to i64, !dbg !74 + %131 = getelementptr bfloat, ptr addrspace(1) %96, i64 %130, !dbg !74 + %132 = sext i32 %118 to i64, !dbg !74 + %133 = getelementptr bfloat, ptr addrspace(1) %96, i64 %132, !dbg !74 + %134 = sext i32 %119 to i64, !dbg !74 + %135 = getelementptr bfloat, ptr addrspace(1) %96, i64 %134, !dbg !74 + %136 = shl nuw nsw i32 %56, 3, !dbg !75 + %137 = and i32 %136, 120, !dbg !75 + %138 = zext nneg i32 %137 to i64, !dbg !76 + %139 = getelementptr bfloat, ptr addrspace(1) %121, i64 %138, !dbg !76 + %140 = getelementptr bfloat, ptr addrspace(1) %123, i64 %138, !dbg !76 + %141 = getelementptr bfloat, ptr addrspace(1) %125, i64 %138, !dbg !76 + %142 = getelementptr bfloat, ptr addrspace(1) %127, i64 %138, !dbg !76 + %143 = getelementptr bfloat, ptr addrspace(1) 
%129, i64 %138, !dbg !76 + %144 = getelementptr bfloat, ptr addrspace(1) %131, i64 %138, !dbg !76 + %145 = getelementptr bfloat, ptr addrspace(1) %133, i64 %138, !dbg !76 + %146 = getelementptr bfloat, ptr addrspace(1) %135, i64 %138, !dbg !76 + %147 = icmp slt i32 %102, %18, !dbg !77 + %148 = icmp slt i32 %103, %18, !dbg !77 + %149 = icmp slt i32 %104, %18, !dbg !77 + %150 = icmp slt i32 %105, %18, !dbg !77 + %151 = icmp slt i32 %106, %18, !dbg !77 + %152 = icmp slt i32 %107, %18, !dbg !77 + %153 = icmp slt i32 %108, %18, !dbg !77 + %154 = icmp slt i32 %109, %18, !dbg !77 + %155 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %139, i1 %147) #3, !dbg !78 + %156 = extractvalue { i32, i32, i32, i32 } %155, 0, !dbg !78 + %157 = extractvalue { i32, i32, i32, i32 } %155, 1, !dbg !78 + %158 = extractvalue { i32, i32, i32, i32 } %155, 2, !dbg !78 + %159 = extractvalue { i32, i32, i32, i32 } %155, 3, !dbg !78 + %160 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %140, i1 %148) #3, !dbg !78 + %161 = extractvalue { i32, i32, i32, i32 } %160, 0, !dbg !78 + %162 = extractvalue { i32, i32, i32, i32 } %160, 1, !dbg !78 + %163 = extractvalue { i32, i32, i32, i32 } %160, 2, !dbg !78 + %164 = extractvalue { i32, i32, i32, i32 } %160, 3, !dbg !78 + %165 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %141, i1 %149) #3, !dbg !78 + %166 = extractvalue { 
i32, i32, i32, i32 } %165, 0, !dbg !78 + %167 = extractvalue { i32, i32, i32, i32 } %165, 1, !dbg !78 + %168 = extractvalue { i32, i32, i32, i32 } %165, 2, !dbg !78 + %169 = extractvalue { i32, i32, i32, i32 } %165, 3, !dbg !78 + %170 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %142, i1 %150) #3, !dbg !78 + %171 = extractvalue { i32, i32, i32, i32 } %170, 0, !dbg !78 + %172 = extractvalue { i32, i32, i32, i32 } %170, 1, !dbg !78 + %173 = extractvalue { i32, i32, i32, i32 } %170, 2, !dbg !78 + %174 = extractvalue { i32, i32, i32, i32 } %170, 3, !dbg !78 + %175 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %143, i1 %151) #3, !dbg !78 + %176 = extractvalue { i32, i32, i32, i32 } %175, 0, !dbg !78 + %177 = extractvalue { i32, i32, i32, i32 } %175, 1, !dbg !78 + %178 = extractvalue { i32, i32, i32, i32 } %175, 2, !dbg !78 + %179 = extractvalue { i32, i32, i32, i32 } %175, 3, !dbg !78 + %180 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %144, i1 %152) #3, !dbg !78 + %181 = extractvalue { i32, i32, i32, i32 } %180, 0, !dbg !78 + %182 = extractvalue { i32, i32, i32, i32 } %180, 1, !dbg !78 + %183 = extractvalue { i32, i32, i32, i32 } %180, 2, !dbg !78 + %184 = extractvalue { i32, i32, i32, i32 } %180, 3, !dbg !78 + %185 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, 
$6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %145, i1 %153) #3, !dbg !78 + %186 = extractvalue { i32, i32, i32, i32 } %185, 0, !dbg !78 + %187 = extractvalue { i32, i32, i32, i32 } %185, 1, !dbg !78 + %188 = extractvalue { i32, i32, i32, i32 } %185, 2, !dbg !78 + %189 = extractvalue { i32, i32, i32, i32 } %185, 3, !dbg !78 + %190 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %146, i1 %154) #3, !dbg !78 + %191 = extractvalue { i32, i32, i32, i32 } %190, 0, !dbg !78 + %192 = extractvalue { i32, i32, i32, i32 } %190, 1, !dbg !78 + %193 = extractvalue { i32, i32, i32, i32 } %190, 2, !dbg !78 + %194 = extractvalue { i32, i32, i32, i32 } %190, 3, !dbg !78 + %195 = shl nuw nsw i32 %56, 4, !dbg !78 + %196 = and i32 %195, 112, !dbg !78 + %197 = shl nuw nsw i32 %58, 3, !dbg !78 + %198 = and i32 %56, 112, !dbg !78 + %199 = and i32 %56, 8, !dbg !78 + %200 = shl nuw nsw i32 %199, 11, !dbg !78 + %201 = or disjoint i32 %196, %197, !dbg !78 + %202 = xor i32 %201, %198, !dbg !78 + %203 = or disjoint i32 %202, %200, !dbg !78 + %204 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %203, !dbg !78 + %205 = insertelement <4 x i32> poison, i32 %156, i64 0, !dbg !78 + %206 = insertelement <4 x i32> %205, i32 %157, i64 1, !dbg !78 + %207 = insertelement <4 x i32> %206, i32 %158, i64 2, !dbg !78 + %208 = insertelement <4 x i32> %207, i32 %159, i64 3, !dbg !78 + store <4 x i32> %208, ptr addrspace(3) %204, align 16, !dbg !78 + %209 = or disjoint i32 %203, 2048, !dbg !78 + %210 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %209, !dbg !78 
+ %211 = insertelement <4 x i32> poison, i32 %161, i64 0, !dbg !78 + %212 = insertelement <4 x i32> %211, i32 %162, i64 1, !dbg !78 + %213 = insertelement <4 x i32> %212, i32 %163, i64 2, !dbg !78 + %214 = insertelement <4 x i32> %213, i32 %164, i64 3, !dbg !78 + store <4 x i32> %214, ptr addrspace(3) %210, align 16, !dbg !78 + %215 = or disjoint i32 %203, 4096, !dbg !78 + %216 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %215, !dbg !78 + %217 = insertelement <4 x i32> poison, i32 %166, i64 0, !dbg !78 + %218 = insertelement <4 x i32> %217, i32 %167, i64 1, !dbg !78 + %219 = insertelement <4 x i32> %218, i32 %168, i64 2, !dbg !78 + %220 = insertelement <4 x i32> %219, i32 %169, i64 3, !dbg !78 + store <4 x i32> %220, ptr addrspace(3) %216, align 16, !dbg !78 + %221 = or disjoint i32 %203, 6144, !dbg !78 + %222 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %221, !dbg !78 + %223 = insertelement <4 x i32> poison, i32 %171, i64 0, !dbg !78 + %224 = insertelement <4 x i32> %223, i32 %172, i64 1, !dbg !78 + %225 = insertelement <4 x i32> %224, i32 %173, i64 2, !dbg !78 + %226 = insertelement <4 x i32> %225, i32 %174, i64 3, !dbg !78 + store <4 x i32> %226, ptr addrspace(3) %222, align 16, !dbg !78 + %227 = or disjoint i32 %203, 8192, !dbg !78 + %228 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %227, !dbg !78 + %229 = insertelement <4 x i32> poison, i32 %176, i64 0, !dbg !78 + %230 = insertelement <4 x i32> %229, i32 %177, i64 1, !dbg !78 + %231 = insertelement <4 x i32> %230, i32 %178, i64 2, !dbg !78 + %232 = insertelement <4 x i32> %231, i32 %179, i64 3, !dbg !78 + store <4 x i32> %232, ptr addrspace(3) %228, align 16, !dbg !78 + %233 = or disjoint i32 %203, 10240, !dbg !78 + %234 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr 
addrspace(3) @global_smem, i32 98304), i32 %233, !dbg !78 + %235 = insertelement <4 x i32> poison, i32 %181, i64 0, !dbg !78 + %236 = insertelement <4 x i32> %235, i32 %182, i64 1, !dbg !78 + %237 = insertelement <4 x i32> %236, i32 %183, i64 2, !dbg !78 + %238 = insertelement <4 x i32> %237, i32 %184, i64 3, !dbg !78 + store <4 x i32> %238, ptr addrspace(3) %234, align 16, !dbg !78 + %239 = or disjoint i32 %203, 12288, !dbg !78 + %240 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %239, !dbg !78 + %241 = insertelement <4 x i32> poison, i32 %186, i64 0, !dbg !78 + %242 = insertelement <4 x i32> %241, i32 %187, i64 1, !dbg !78 + %243 = insertelement <4 x i32> %242, i32 %188, i64 2, !dbg !78 + %244 = insertelement <4 x i32> %243, i32 %189, i64 3, !dbg !78 + store <4 x i32> %244, ptr addrspace(3) %240, align 16, !dbg !78 + %245 = or disjoint i32 %203, 14336, !dbg !78 + %246 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %245, !dbg !78 + %247 = insertelement <4 x i32> poison, i32 %191, i64 0, !dbg !78 + %248 = insertelement <4 x i32> %247, i32 %192, i64 1, !dbg !78 + %249 = insertelement <4 x i32> %248, i32 %193, i64 2, !dbg !78 + %250 = insertelement <4 x i32> %249, i32 %194, i64 3, !dbg !78 + store <4 x i32> %250, ptr addrspace(3) %246, align 16, !dbg !78 + %251 = shl i32 %102, 7, !dbg !79 + %252 = shl i32 %103, 7, !dbg !79 + %253 = shl i32 %104, 7, !dbg !79 + %254 = shl i32 %105, 7, !dbg !79 + %255 = shl i32 %106, 7, !dbg !79 + %256 = shl i32 %107, 7, !dbg !79 + %257 = shl i32 %108, 7, !dbg !79 + %258 = shl i32 %109, 7, !dbg !79 + %259 = sext i32 %251 to i64, !dbg !81 + %260 = getelementptr bfloat, ptr addrspace(1) %97, i64 %259, !dbg !81 + %261 = sext i32 %252 to i64, !dbg !81 + %262 = getelementptr bfloat, ptr addrspace(1) %97, i64 %261, !dbg !81 + %263 = sext i32 %253 to i64, !dbg !81 + %264 = getelementptr bfloat, ptr 
addrspace(1) %97, i64 %263, !dbg !81 + %265 = sext i32 %254 to i64, !dbg !81 + %266 = getelementptr bfloat, ptr addrspace(1) %97, i64 %265, !dbg !81 + %267 = sext i32 %255 to i64, !dbg !81 + %268 = getelementptr bfloat, ptr addrspace(1) %97, i64 %267, !dbg !81 + %269 = sext i32 %256 to i64, !dbg !81 + %270 = getelementptr bfloat, ptr addrspace(1) %97, i64 %269, !dbg !81 + %271 = sext i32 %257 to i64, !dbg !81 + %272 = getelementptr bfloat, ptr addrspace(1) %97, i64 %271, !dbg !81 + %273 = sext i32 %258 to i64, !dbg !81 + %274 = getelementptr bfloat, ptr addrspace(1) %97, i64 %273, !dbg !81 + %275 = getelementptr bfloat, ptr addrspace(1) %260, i64 %138, !dbg !82 + %276 = getelementptr bfloat, ptr addrspace(1) %262, i64 %138, !dbg !82 + %277 = getelementptr bfloat, ptr addrspace(1) %264, i64 %138, !dbg !82 + %278 = getelementptr bfloat, ptr addrspace(1) %266, i64 %138, !dbg !82 + %279 = getelementptr bfloat, ptr addrspace(1) %268, i64 %138, !dbg !82 + %280 = getelementptr bfloat, ptr addrspace(1) %270, i64 %138, !dbg !82 + %281 = getelementptr bfloat, ptr addrspace(1) %272, i64 %138, !dbg !82 + %282 = getelementptr bfloat, ptr addrspace(1) %274, i64 %138, !dbg !82 + %283 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %275, i1 %147) #3, !dbg !83 + %284 = extractvalue { i32, i32, i32, i32 } %283, 0, !dbg !83 + %285 = extractvalue { i32, i32, i32, i32 } %283, 1, !dbg !83 + %286 = extractvalue { i32, i32, i32, i32 } %283, 2, !dbg !83 + %287 = extractvalue { i32, i32, i32, i32 } %283, 3, !dbg !83 + %288 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) 
%276, i1 %148) #3, !dbg !83 + %289 = extractvalue { i32, i32, i32, i32 } %288, 0, !dbg !83 + %290 = extractvalue { i32, i32, i32, i32 } %288, 1, !dbg !83 + %291 = extractvalue { i32, i32, i32, i32 } %288, 2, !dbg !83 + %292 = extractvalue { i32, i32, i32, i32 } %288, 3, !dbg !83 + %293 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %277, i1 %149) #3, !dbg !83 + %294 = extractvalue { i32, i32, i32, i32 } %293, 0, !dbg !83 + %295 = extractvalue { i32, i32, i32, i32 } %293, 1, !dbg !83 + %296 = extractvalue { i32, i32, i32, i32 } %293, 2, !dbg !83 + %297 = extractvalue { i32, i32, i32, i32 } %293, 3, !dbg !83 + %298 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %278, i1 %150) #3, !dbg !83 + %299 = extractvalue { i32, i32, i32, i32 } %298, 0, !dbg !83 + %300 = extractvalue { i32, i32, i32, i32 } %298, 1, !dbg !83 + %301 = extractvalue { i32, i32, i32, i32 } %298, 2, !dbg !83 + %302 = extractvalue { i32, i32, i32, i32 } %298, 3, !dbg !83 + %303 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %279, i1 %151) #3, !dbg !83 + %304 = extractvalue { i32, i32, i32, i32 } %303, 0, !dbg !83 + %305 = extractvalue { i32, i32, i32, i32 } %303, 1, !dbg !83 + %306 = extractvalue { i32, i32, i32, i32 } %303, 2, !dbg !83 + %307 = extractvalue { i32, i32, i32, i32 } %303, 3, !dbg !83 + %308 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 
$4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %280, i1 %152) #3, !dbg !83 + %309 = extractvalue { i32, i32, i32, i32 } %308, 0, !dbg !83 + %310 = extractvalue { i32, i32, i32, i32 } %308, 1, !dbg !83 + %311 = extractvalue { i32, i32, i32, i32 } %308, 2, !dbg !83 + %312 = extractvalue { i32, i32, i32, i32 } %308, 3, !dbg !83 + %313 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %281, i1 %153) #3, !dbg !83 + %314 = extractvalue { i32, i32, i32, i32 } %313, 0, !dbg !83 + %315 = extractvalue { i32, i32, i32, i32 } %313, 1, !dbg !83 + %316 = extractvalue { i32, i32, i32, i32 } %313, 2, !dbg !83 + %317 = extractvalue { i32, i32, i32, i32 } %313, 3, !dbg !83 + %318 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %282, i1 %154) #3, !dbg !83 + %319 = extractvalue { i32, i32, i32, i32 } %318, 0, !dbg !83 + %320 = extractvalue { i32, i32, i32, i32 } %318, 1, !dbg !83 + %321 = extractvalue { i32, i32, i32, i32 } %318, 2, !dbg !83 + %322 = extractvalue { i32, i32, i32, i32 } %318, 3, !dbg !83 + %323 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072), i32 %203, !dbg !83 + %324 = insertelement <4 x i32> poison, i32 %284, i64 0, !dbg !83 + %325 = insertelement <4 x i32> %324, i32 %285, i64 1, !dbg !83 + %326 = insertelement <4 x i32> %325, i32 %286, i64 2, !dbg !83 + %327 = insertelement <4 x i32> %326, i32 %287, i64 3, !dbg !83 + store <4 x i32> %327, 
ptr addrspace(3) %323, align 16, !dbg !83 + %328 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072), i32 %209, !dbg !83 + %329 = insertelement <4 x i32> poison, i32 %289, i64 0, !dbg !83 + %330 = insertelement <4 x i32> %329, i32 %290, i64 1, !dbg !83 + %331 = insertelement <4 x i32> %330, i32 %291, i64 2, !dbg !83 + %332 = insertelement <4 x i32> %331, i32 %292, i64 3, !dbg !83 + store <4 x i32> %332, ptr addrspace(3) %328, align 16, !dbg !83 + %333 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072), i32 %215, !dbg !83 + %334 = insertelement <4 x i32> poison, i32 %294, i64 0, !dbg !83 + %335 = insertelement <4 x i32> %334, i32 %295, i64 1, !dbg !83 + %336 = insertelement <4 x i32> %335, i32 %296, i64 2, !dbg !83 + %337 = insertelement <4 x i32> %336, i32 %297, i64 3, !dbg !83 + store <4 x i32> %337, ptr addrspace(3) %333, align 16, !dbg !83 + %338 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072), i32 %221, !dbg !83 + %339 = insertelement <4 x i32> poison, i32 %299, i64 0, !dbg !83 + %340 = insertelement <4 x i32> %339, i32 %300, i64 1, !dbg !83 + %341 = insertelement <4 x i32> %340, i32 %301, i64 2, !dbg !83 + %342 = insertelement <4 x i32> %341, i32 %302, i64 3, !dbg !83 + store <4 x i32> %342, ptr addrspace(3) %338, align 16, !dbg !83 + %343 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072), i32 %227, !dbg !83 + %344 = insertelement <4 x i32> poison, i32 %304, i64 0, !dbg !83 + %345 = insertelement <4 x i32> %344, i32 %305, i64 1, !dbg !83 + %346 = insertelement <4 x i32> %345, i32 %306, i64 2, !dbg !83 + %347 = insertelement <4 x i32> %346, i32 %307, i64 3, !dbg !83 + store <4 x i32> %347, ptr addrspace(3) %343, align 16, !dbg !83 + %348 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr 
addrspace(3) @global_smem, i32 131072), i32 %233, !dbg !83 + %349 = insertelement <4 x i32> poison, i32 %309, i64 0, !dbg !83 + %350 = insertelement <4 x i32> %349, i32 %310, i64 1, !dbg !83 + %351 = insertelement <4 x i32> %350, i32 %311, i64 2, !dbg !83 + %352 = insertelement <4 x i32> %351, i32 %312, i64 3, !dbg !83 + store <4 x i32> %352, ptr addrspace(3) %348, align 16, !dbg !83 + %353 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072), i32 %239, !dbg !83 + %354 = insertelement <4 x i32> poison, i32 %314, i64 0, !dbg !83 + %355 = insertelement <4 x i32> %354, i32 %315, i64 1, !dbg !83 + %356 = insertelement <4 x i32> %355, i32 %316, i64 2, !dbg !83 + %357 = insertelement <4 x i32> %356, i32 %317, i64 3, !dbg !83 + store <4 x i32> %357, ptr addrspace(3) %353, align 16, !dbg !83 + %358 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072), i32 %245, !dbg !83 + %359 = insertelement <4 x i32> poison, i32 %319, i64 0, !dbg !83 + %360 = insertelement <4 x i32> %359, i32 %320, i64 1, !dbg !83 + %361 = insertelement <4 x i32> %360, i32 %321, i64 2, !dbg !83 + %362 = insertelement <4 x i32> %361, i32 %322, i64 3, !dbg !83 + store <4 x i32> %362, ptr addrspace(3) %358, align 16, !dbg !83 + %363 = icmp slt i32 %110, %18, !dbg !84 + %364 = icmp slt i32 %111, %18, !dbg !84 + %365 = sext i32 %110 to i64, !dbg !85 + %366 = getelementptr float, ptr addrspace(1) %100, i64 %365, !dbg !85 + %367 = sext i32 %111 to i64, !dbg !85 + %368 = getelementptr float, ptr addrspace(1) %100, i64 %367, !dbg !85 + %369 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %366, i1 %363) #3, !dbg !86 + %370 = bitcast i32 %369 to float, !dbg !86 + %371 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %368, i1 %364) #3, !dbg !86 + %372 = 
bitcast i32 %371 to float, !dbg !86 + %373 = getelementptr float, ptr addrspace(1) %99, i64 %365, !dbg !87 + %374 = getelementptr float, ptr addrspace(1) %99, i64 %367, !dbg !87 + %375 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %373, i1 %363) #3, !dbg !88 + %376 = bitcast i32 %375 to float, !dbg !88 + %377 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %374, i1 %364) #3, !dbg !88 + %378 = bitcast i32 %377 to float, !dbg !88 + %379 = fcmp oeq float %376, 0xFFF0000000000000, !dbg !89 + %380 = fcmp oeq float %378, 0xFFF0000000000000, !dbg !89 + %381 = select i1 %379, float 0.000000e+00, float %376, !dbg !90 + %382 = select i1 %380, float 0.000000e+00, float %378, !dbg !90 + %383 = sext i32 %reass.mul to i64, !dbg !91 + %384 = getelementptr i32, ptr addrspace(1) %9, i64 %383, !dbg !91 + %385 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l"(ptr addrspace(1) %384) #3, !dbg !92 + %386 = shl i32 %385, 7, !dbg !93 + %387 = sext i32 %82 to i64, !dbg !94 + %388 = getelementptr i32, ptr addrspace(1) %8, i64 %387, !dbg !94 + %389 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l"(ptr addrspace(1) %388) #3, !dbg !95 + %390 = and i32 %56, 3, !dbg !96 + %391 = shl nuw nsw i32 %390, 1, !dbg !96 + %392 = or disjoint i32 %391, 1, !dbg !96 + %393 = insertelement <2 x i32> poison, i32 %391, i64 0, !dbg !96 + %394 = shufflevector <2 x i32> %393, <2 x i32> poison, <2 x i32> zeroinitializer, !dbg !96 + %395 = or disjoint <2 x i32> %394, , !dbg !96 + %396 = insertelement <4 x i32> poison, i32 %391, i64 0, !dbg !96 + %397 = shufflevector <4 x i32> %396, <4 x i32> poison, <4 x i32> zeroinitializer, !dbg !96 + %398 = or disjoint <4 x i32> %397, , !dbg !96 + %399 = insertelement <8 x i32> poison, i32 %391, i64 0, !dbg !96 + %400 = shufflevector <8 x i32> 
%399, <8 x i32> poison, <8 x i32> zeroinitializer, !dbg !96 + %401 = or disjoint <8 x i32> %400, , !dbg !96 + %402 = or disjoint i32 %386, %59, !dbg !97 + %403 = or disjoint i32 %386, %60, !dbg !97 + %404 = or disjoint i32 %386, %61, !dbg !97 + %405 = or disjoint i32 %386, %62, !dbg !97 + %406 = shl i32 %402, 7, !dbg !98 + %407 = shl i32 %403, 7, !dbg !98 + %408 = shl i32 %404, 7, !dbg !98 + %409 = shl i32 %405, 7, !dbg !98 + %410 = sext i32 %406 to i64, !dbg !100 + %411 = getelementptr bfloat, ptr addrspace(1) %53, i64 %410, !dbg !100 + %412 = sext i32 %407 to i64, !dbg !100 + %413 = getelementptr bfloat, ptr addrspace(1) %53, i64 %412, !dbg !100 + %414 = sext i32 %408 to i64, !dbg !100 + %415 = getelementptr bfloat, ptr addrspace(1) %53, i64 %414, !dbg !100 + %416 = sext i32 %409 to i64, !dbg !100 + %417 = getelementptr bfloat, ptr addrspace(1) %53, i64 %416, !dbg !100 + %418 = getelementptr bfloat, ptr addrspace(1) %411, i64 %138, !dbg !101 + %419 = getelementptr bfloat, ptr addrspace(1) %413, i64 %138, !dbg !101 + %420 = getelementptr bfloat, ptr addrspace(1) %415, i64 %138, !dbg !101 + %421 = getelementptr bfloat, ptr addrspace(1) %417, i64 %138, !dbg !101 + %422 = getelementptr bfloat, ptr addrspace(1) %54, i64 %410, !dbg !102 + %423 = getelementptr bfloat, ptr addrspace(1) %54, i64 %412, !dbg !102 + %424 = getelementptr bfloat, ptr addrspace(1) %54, i64 %414, !dbg !102 + %425 = getelementptr bfloat, ptr addrspace(1) %54, i64 %416, !dbg !102 + %426 = getelementptr bfloat, ptr addrspace(1) %422, i64 %138, !dbg !103 + %427 = getelementptr bfloat, ptr addrspace(1) %423, i64 %138, !dbg !103 + %428 = getelementptr bfloat, ptr addrspace(1) %424, i64 %138, !dbg !103 + %429 = getelementptr bfloat, ptr addrspace(1) %425, i64 %138, !dbg !103 + %430 = shl i32 %389, 1, !dbg !104 + %431 = add i32 %19, 63, !dbg !105 + %432 = sdiv i32 %431, 64, !dbg !106 + %433 = tail call i32 @llvm.smax.i32(i32 %432, i32 1), !dbg !107 + %434 = tail call i32 @llvm.smin.i32(i32 %430, i32 
%433), !dbg !108 + %435 = zext nneg i32 %43 to i64, !dbg !109 + %436 = getelementptr i64, ptr addrspace(1) %16, i64 %435, !dbg !109 + %437 = icmp sgt i32 %430, 0, !dbg !110 + %438 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %436, i1 %437) #3, !dbg !111 + %439 = icmp slt i32 %402, %19, !dbg !112 + %440 = icmp slt i32 %403, %19, !dbg !112 + %441 = icmp slt i32 %404, %19, !dbg !112 + %442 = icmp slt i32 %405, %19, !dbg !112 + %443 = and i1 %437, %439, !dbg !110 + %444 = and i1 %437, %440, !dbg !110 + %445 = and i1 %437, %441, !dbg !110 + %446 = and i1 %437, %442, !dbg !110 + %447 = shl nuw nsw i32 %199, 10, !dbg !113 + %448 = or disjoint i32 %202, %447, !dbg !113 + %449 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %448, !dbg !113 + %450 = select i1 %443, i32 16, i32 0, !dbg !113 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %449, ptr addrspace(1) %418, i32 %450) #3, !dbg !113 + %451 = or disjoint i32 %448, 2048, !dbg !113 + %452 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %451, !dbg !113 + %453 = select i1 %444, i32 16, i32 0, !dbg !113 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %452, ptr addrspace(1) %419, i32 %453) #3, !dbg !113 + %454 = or disjoint i32 %448, 4096, !dbg !113 + %455 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %454, !dbg !113 + %456 = select i1 %445, i32 16, i32 0, !dbg !113 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %455, ptr addrspace(1) %420, i32 %456) #3, !dbg !113 + %457 = or disjoint i32 %448, 6144, !dbg !113 + %458 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %457, !dbg !113 + %459 = select i1 %446, i32 16, i32 0, !dbg !113 + tail call 
void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %458, ptr addrspace(1) %421, i32 %459) #3, !dbg !113 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !113 + %460 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %448, !dbg !113 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %460, ptr addrspace(1) %426, i32 %450) #3, !dbg !113 + %461 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %451, !dbg !113 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %461, ptr addrspace(1) %427, i32 %453) #3, !dbg !113 + %462 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %454, !dbg !113 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %462, ptr addrspace(1) %428, i32 %456) #3, !dbg !113 + %463 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %457, !dbg !113 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %463, ptr addrspace(1) %429, i32 %459) #3, !dbg !113 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !113 + %464 = icmp sgt i32 %434, 1, !dbg !110 + %465 = getelementptr i8, ptr addrspace(1) %418, i64 16384, !dbg !114 + %466 = getelementptr i8, ptr addrspace(1) %419, i64 16384, !dbg !114 + %467 = getelementptr i8, ptr addrspace(1) %420, i64 16384, !dbg !114 + %468 = getelementptr i8, ptr addrspace(1) %421, i64 16384, !dbg !114 + %469 = getelementptr i8, ptr addrspace(1) %426, i64 16384, !dbg !115 + %470 = getelementptr i8, ptr 
addrspace(1) %427, i64 16384, !dbg !115 + %471 = getelementptr i8, ptr addrspace(1) %428, i64 16384, !dbg !115 + %472 = getelementptr i8, ptr addrspace(1) %429, i64 16384, !dbg !115 + %473 = or disjoint i32 %402, 64, !dbg !116 + %474 = or disjoint i32 %403, 64, !dbg !116 + %475 = or disjoint i32 %404, 64, !dbg !116 + %476 = or disjoint i32 %405, 64, !dbg !116 + %477 = icmp slt i32 %473, %19, !dbg !112 + %478 = icmp slt i32 %474, %19, !dbg !112 + %479 = icmp slt i32 %475, %19, !dbg !112 + %480 = icmp slt i32 %476, %19, !dbg !112 + %481 = and i1 %464, %477, !dbg !110 + %482 = and i1 %464, %478, !dbg !110 + %483 = and i1 %464, %479, !dbg !110 + %484 = and i1 %464, %480, !dbg !110 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !113 + %485 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 16384), i32 %448, !dbg !113 + %486 = select i1 %481, i32 16, i32 0, !dbg !113 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %485, ptr addrspace(1) %465, i32 %486) #3, !dbg !113 + %487 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 16384), i32 %451, !dbg !113 + %488 = select i1 %482, i32 16, i32 0, !dbg !113 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %487, ptr addrspace(1) %466, i32 %488) #3, !dbg !113 + %489 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 16384), i32 %454, !dbg !113 + %490 = select i1 %483, i32 16, i32 0, !dbg !113 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %489, ptr addrspace(1) %467, i32 %490) #3, !dbg !113 + %491 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 16384), i32 %457, !dbg 
!113 + %492 = select i1 %484, i32 16, i32 0, !dbg !113 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %491, ptr addrspace(1) %468, i32 %492) #3, !dbg !113 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !113 + %493 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 65536), i32 %448, !dbg !113 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %493, ptr addrspace(1) %469, i32 %486) #3, !dbg !113 + %494 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 65536), i32 %451, !dbg !113 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %494, ptr addrspace(1) %470, i32 %488) #3, !dbg !113 + %495 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 65536), i32 %454, !dbg !113 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %495, ptr addrspace(1) %471, i32 %490) #3, !dbg !113 + %496 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 65536), i32 %457, !dbg !113 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %496, ptr addrspace(1) %472, i32 %492) #3, !dbg !113 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !113 + tail call void asm sideeffect "fence.proxy.async.shared::cta;", ""() #3, !dbg !117 + br i1 %437, label %.lr.ph, label %._crit_edge, !dbg !110 + +.lr.ph: ; preds = %73 + %497 = srem i32 %111, %18, !dbg !118 + %498 = sext i32 %497 to i64, !dbg !119 + %499 = icmp sgt i64 %438, %498, !dbg !120 + %500 = srem i32 %110, %18, !dbg !118 + %501 = sext i32 %500 to i64, 
!dbg !119 + %502 = icmp sgt i64 %438, %501, !dbg !120 + %503 = insertelement <2 x i32> poison, i32 %386, i64 0, !dbg !97 + %504 = shufflevector <2 x i32> %503, <2 x i32> poison, <2 x i32> zeroinitializer, !dbg !97 + %505 = shufflevector <8 x i32> %401, <8 x i32> poison, <2 x i32> , !dbg !97 + %506 = or disjoint <2 x i32> %504, %505, !dbg !97 + %507 = shufflevector <8 x i32> %401, <8 x i32> poison, <2 x i32> , !dbg !97 + %508 = or disjoint <2 x i32> %504, %507, !dbg !97 + %509 = shufflevector <8 x i32> %401, <8 x i32> poison, <2 x i32> , !dbg !97 + %510 = or disjoint <2 x i32> %504, %509, !dbg !97 + %511 = shufflevector <8 x i32> %401, <8 x i32> poison, <2 x i32> , !dbg !97 + %512 = or disjoint <2 x i32> %504, %511, !dbg !97 + %513 = shufflevector <4 x i32> %398, <4 x i32> poison, <2 x i32> , !dbg !97 + %514 = or disjoint <2 x i32> %504, %513, !dbg !97 + %515 = shufflevector <4 x i32> %398, <4 x i32> poison, <2 x i32> , !dbg !97 + %516 = or disjoint <2 x i32> %504, %515, !dbg !97 + %517 = or disjoint <2 x i32> %504, %395, !dbg !97 + %518 = insertelement <2 x i32> %393, i32 %392, i64 1, !dbg !97 + %519 = or disjoint <2 x i32> %504, %518, !dbg !97 + %520 = add nsw i32 %434, -2 + %521 = add nsw i32 %434, -1 + %smax = tail call i32 @llvm.smax.i32(i32 %434, i32 1), !dbg !110 + %522 = insertelement <2 x i1> poison, i1 %499, i64 0, !dbg !121 + %523 = shufflevector <2 x i1> %522, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !121 + %524 = insertelement <2 x i32> poison, i32 %497, i64 0, !dbg !122 + %525 = shufflevector <2 x i32> %524, <2 x i32> poison, <2 x i32> zeroinitializer, !dbg !122 + %526 = insertelement <2 x i32> poison, i32 %26, i64 0, !dbg !123 + %527 = shufflevector <2 x i32> %526, <2 x i32> poison, <2 x i32> zeroinitializer, !dbg !123 + %528 = insertelement <2 x i64> poison, i64 %438, i64 0, !dbg !124 + %529 = shufflevector <2 x i64> %528, <2 x i64> poison, <2 x i32> zeroinitializer, !dbg !124 + %530 = insertelement <2 x i32> poison, i32 %19, i64 0, !dbg !112 
+ %531 = shufflevector <2 x i32> %530, <2 x i32> poison, <2 x i32> zeroinitializer, !dbg !112 + %532 = insertelement <2 x float> poison, float %372, i64 0, !dbg !125 + %533 = shufflevector <2 x float> %532, <2 x float> poison, <2 x i32> zeroinitializer, !dbg !125 + %534 = insertelement <2 x i1> poison, i1 %502, i64 0, !dbg !121 + %535 = shufflevector <2 x i1> %534, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !121 + %536 = insertelement <2 x i32> poison, i32 %500, i64 0, !dbg !122 + %537 = shufflevector <2 x i32> %536, <2 x i32> poison, <2 x i32> zeroinitializer, !dbg !122 + %538 = insertelement <2 x float> poison, float %370, i64 0, !dbg !125 + %539 = shufflevector <2 x float> %538, <2 x float> poison, <2 x i32> zeroinitializer, !dbg !125 + br label %540, !dbg !110 + +540: ; preds = %.lr.ph, %__nv_exp2f.exit1527 + %541 = phi i32 [ 64, %.lr.ph ], [ %2525, %__nv_exp2f.exit1527 ] + %542 = phi i32 [ -1, %.lr.ph ], [ %621, %__nv_exp2f.exit1527 ] + %543 = phi i32 [ 1, %.lr.ph ], [ %2542, %__nv_exp2f.exit1527 ] + %.pn8981546 = phi ptr addrspace(1) [ %472, %.lr.ph ], [ %2535, %__nv_exp2f.exit1527 ] + %.pn9141545 = phi ptr addrspace(1) [ %471, %.lr.ph ], [ %2534, %__nv_exp2f.exit1527 ] + %.pn9301544 = phi ptr addrspace(1) [ %470, %.lr.ph ], [ %2533, %__nv_exp2f.exit1527 ] + %.pn9461543 = phi ptr addrspace(1) [ %469, %.lr.ph ], [ %2532, %__nv_exp2f.exit1527 ] + %.pn8761542 = phi i32 [ %476, %.lr.ph ], [ %2539, %__nv_exp2f.exit1527 ] + %.pn8781541 = phi i32 [ %475, %.lr.ph ], [ %2538, %__nv_exp2f.exit1527 ] + %.pn8801540 = phi i32 [ %474, %.lr.ph ], [ %2537, %__nv_exp2f.exit1527 ] + %.pn8821539 = phi i32 [ %473, %.lr.ph ], [ %2536, %__nv_exp2f.exit1527 ] + %.pn8261538 = phi ptr addrspace(1) [ %468, %.lr.ph ], [ %2531, %__nv_exp2f.exit1527 ] + %.pn8421537 = phi ptr addrspace(1) [ %467, %.lr.ph ], [ %2530, %__nv_exp2f.exit1527 ] + %.pn8581536 = phi ptr addrspace(1) [ %466, %.lr.ph ], [ %2529, %__nv_exp2f.exit1527 ] + %.pn8741535 = phi ptr addrspace(1) [ %465, %.lr.ph ], [ 
%2528, %__nv_exp2f.exit1527 ] + %544 = phi float [ 0.000000e+00, %.lr.ph ], [ %2432, %__nv_exp2f.exit1527 ] + %545 = phi float [ 0.000000e+00, %.lr.ph ], [ %2433, %__nv_exp2f.exit1527 ] + %546 = phi float [ 0.000000e+00, %.lr.ph ], [ %2434, %__nv_exp2f.exit1527 ] + %547 = phi float [ 0.000000e+00, %.lr.ph ], [ %2435, %__nv_exp2f.exit1527 ] + %548 = phi float [ 0.000000e+00, %.lr.ph ], [ %2436, %__nv_exp2f.exit1527 ] + %549 = phi float [ 0.000000e+00, %.lr.ph ], [ %2437, %__nv_exp2f.exit1527 ] + %550 = phi float [ 0.000000e+00, %.lr.ph ], [ %2438, %__nv_exp2f.exit1527 ] + %551 = phi float [ 0.000000e+00, %.lr.ph ], [ %2439, %__nv_exp2f.exit1527 ] + %552 = phi float [ 0.000000e+00, %.lr.ph ], [ %2440, %__nv_exp2f.exit1527 ] + %553 = phi float [ 0.000000e+00, %.lr.ph ], [ %2441, %__nv_exp2f.exit1527 ] + %554 = phi float [ 0.000000e+00, %.lr.ph ], [ %2442, %__nv_exp2f.exit1527 ] + %555 = phi float [ 0.000000e+00, %.lr.ph ], [ %2443, %__nv_exp2f.exit1527 ] + %556 = phi float [ 0.000000e+00, %.lr.ph ], [ %2444, %__nv_exp2f.exit1527 ] + %557 = phi float [ 0.000000e+00, %.lr.ph ], [ %2445, %__nv_exp2f.exit1527 ] + %558 = phi float [ 0.000000e+00, %.lr.ph ], [ %2446, %__nv_exp2f.exit1527 ] + %559 = phi float [ 0.000000e+00, %.lr.ph ], [ %2447, %__nv_exp2f.exit1527 ] + %560 = phi float [ 0.000000e+00, %.lr.ph ], [ %2448, %__nv_exp2f.exit1527 ] + %561 = phi float [ 0.000000e+00, %.lr.ph ], [ %2449, %__nv_exp2f.exit1527 ] + %562 = phi float [ 0.000000e+00, %.lr.ph ], [ %2450, %__nv_exp2f.exit1527 ] + %563 = phi float [ 0.000000e+00, %.lr.ph ], [ %2451, %__nv_exp2f.exit1527 ] + %564 = phi float [ 0.000000e+00, %.lr.ph ], [ %2452, %__nv_exp2f.exit1527 ] + %565 = phi float [ 0.000000e+00, %.lr.ph ], [ %2453, %__nv_exp2f.exit1527 ] + %566 = phi float [ 0.000000e+00, %.lr.ph ], [ %2454, %__nv_exp2f.exit1527 ] + %567 = phi float [ 0.000000e+00, %.lr.ph ], [ %2455, %__nv_exp2f.exit1527 ] + %568 = phi float [ 0.000000e+00, %.lr.ph ], [ %2456, %__nv_exp2f.exit1527 ] + %569 = phi float 
[ 0.000000e+00, %.lr.ph ], [ %2457, %__nv_exp2f.exit1527 ] + %570 = phi float [ 0.000000e+00, %.lr.ph ], [ %2458, %__nv_exp2f.exit1527 ] + %571 = phi float [ 0.000000e+00, %.lr.ph ], [ %2459, %__nv_exp2f.exit1527 ] + %572 = phi float [ 0.000000e+00, %.lr.ph ], [ %2460, %__nv_exp2f.exit1527 ] + %573 = phi float [ 0.000000e+00, %.lr.ph ], [ %2461, %__nv_exp2f.exit1527 ] + %574 = phi float [ 0.000000e+00, %.lr.ph ], [ %2462, %__nv_exp2f.exit1527 ] + %575 = phi float [ 0.000000e+00, %.lr.ph ], [ %2463, %__nv_exp2f.exit1527 ] + %576 = phi float [ 0.000000e+00, %.lr.ph ], [ %2464, %__nv_exp2f.exit1527 ] + %577 = phi float [ 0.000000e+00, %.lr.ph ], [ %2465, %__nv_exp2f.exit1527 ] + %578 = phi float [ 0.000000e+00, %.lr.ph ], [ %2466, %__nv_exp2f.exit1527 ] + %579 = phi float [ 0.000000e+00, %.lr.ph ], [ %2467, %__nv_exp2f.exit1527 ] + %580 = phi float [ 0.000000e+00, %.lr.ph ], [ %2468, %__nv_exp2f.exit1527 ] + %581 = phi float [ 0.000000e+00, %.lr.ph ], [ %2469, %__nv_exp2f.exit1527 ] + %582 = phi float [ 0.000000e+00, %.lr.ph ], [ %2470, %__nv_exp2f.exit1527 ] + %583 = phi float [ 0.000000e+00, %.lr.ph ], [ %2471, %__nv_exp2f.exit1527 ] + %584 = phi float [ 0.000000e+00, %.lr.ph ], [ %2472, %__nv_exp2f.exit1527 ] + %585 = phi float [ 0.000000e+00, %.lr.ph ], [ %2473, %__nv_exp2f.exit1527 ] + %586 = phi float [ 0.000000e+00, %.lr.ph ], [ %2474, %__nv_exp2f.exit1527 ] + %587 = phi float [ 0.000000e+00, %.lr.ph ], [ %2475, %__nv_exp2f.exit1527 ] + %588 = phi float [ 0.000000e+00, %.lr.ph ], [ %2476, %__nv_exp2f.exit1527 ] + %589 = phi float [ 0.000000e+00, %.lr.ph ], [ %2477, %__nv_exp2f.exit1527 ] + %590 = phi float [ 0.000000e+00, %.lr.ph ], [ %2478, %__nv_exp2f.exit1527 ] + %591 = phi float [ 0.000000e+00, %.lr.ph ], [ %2479, %__nv_exp2f.exit1527 ] + %592 = phi float [ 0.000000e+00, %.lr.ph ], [ %2480, %__nv_exp2f.exit1527 ] + %593 = phi float [ 0.000000e+00, %.lr.ph ], [ %2481, %__nv_exp2f.exit1527 ] + %594 = phi float [ 0.000000e+00, %.lr.ph ], [ %2482, 
%__nv_exp2f.exit1527 ] + %595 = phi float [ 0.000000e+00, %.lr.ph ], [ %2483, %__nv_exp2f.exit1527 ] + %596 = phi float [ 0.000000e+00, %.lr.ph ], [ %2484, %__nv_exp2f.exit1527 ] + %597 = phi float [ 0.000000e+00, %.lr.ph ], [ %2485, %__nv_exp2f.exit1527 ] + %598 = phi float [ 0.000000e+00, %.lr.ph ], [ %2486, %__nv_exp2f.exit1527 ] + %599 = phi float [ 0.000000e+00, %.lr.ph ], [ %2487, %__nv_exp2f.exit1527 ] + %600 = phi float [ 0.000000e+00, %.lr.ph ], [ %2488, %__nv_exp2f.exit1527 ] + %601 = phi float [ 0.000000e+00, %.lr.ph ], [ %2489, %__nv_exp2f.exit1527 ] + %602 = phi float [ 0.000000e+00, %.lr.ph ], [ %2490, %__nv_exp2f.exit1527 ] + %603 = phi float [ 0.000000e+00, %.lr.ph ], [ %2491, %__nv_exp2f.exit1527 ] + %604 = phi float [ 0.000000e+00, %.lr.ph ], [ %2492, %__nv_exp2f.exit1527 ] + %605 = phi float [ 0.000000e+00, %.lr.ph ], [ %2493, %__nv_exp2f.exit1527 ] + %606 = phi float [ 0.000000e+00, %.lr.ph ], [ %2494, %__nv_exp2f.exit1527 ] + %607 = phi float [ 0.000000e+00, %.lr.ph ], [ %2495, %__nv_exp2f.exit1527 ] + %608 = phi i32 [ 0, %.lr.ph ], [ %2506, %__nv_exp2f.exit1527 ] + %609 = phi <2 x i32> [ %506, %.lr.ph ], [ %2505, %__nv_exp2f.exit1527 ] + %610 = phi <2 x i32> [ %508, %.lr.ph ], [ %2504, %__nv_exp2f.exit1527 ] + %611 = phi <2 x i32> [ %510, %.lr.ph ], [ %2503, %__nv_exp2f.exit1527 ] + %612 = phi <2 x i32> [ %512, %.lr.ph ], [ %2502, %__nv_exp2f.exit1527 ] + %613 = phi <2 x i32> [ %514, %.lr.ph ], [ %2501, %__nv_exp2f.exit1527 ] + %614 = phi <2 x i32> [ %516, %.lr.ph ], [ %2500, %__nv_exp2f.exit1527 ] + %615 = phi <2 x i32> [ %517, %.lr.ph ], [ %2499, %__nv_exp2f.exit1527 ] + %616 = phi <2 x i32> [ %519, %.lr.ph ], [ %2498, %__nv_exp2f.exit1527 ] + %617 = icmp slt i32 %608, %520, !dbg !110 + %618 = icmp slt i32 %608, %521, !dbg !110 + %619 = add i32 %542, 1, !dbg !110 + %620 = icmp sgt i32 %619, 2, !dbg !110 + %621 = select i1 %620, i32 0, i32 %619, !dbg !110 + %622 = icmp slt <2 x i32> %616, %531, !dbg !112 + %623 = icmp slt <2 x i32> %615, 
%531, !dbg !112 + %624 = icmp slt <2 x i32> %614, %531, !dbg !112 + %625 = icmp slt <2 x i32> %613, %531, !dbg !112 + %626 = icmp slt <2 x i32> %612, %531, !dbg !112 + %627 = icmp slt <2 x i32> %611, %531, !dbg !112 + %628 = icmp slt <2 x i32> %610, %531, !dbg !112 + %629 = icmp slt <2 x i32> %609, %531, !dbg !112 + tail call void @llvm.nvvm.cp.async.wait.group(i32 2), !dbg !113 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !113 + %630 = shl i32 %621, 13, !dbg !113 + %631 = getelementptr bfloat, ptr addrspace(3) @global_smem, i32 %630, !dbg !113 + %632 = tail call i32 @llvm.nvvm.shfl.sync.idx.i32(i32 -1, i32 %57, i32 0, i32 31), !dbg !117 + tail call void @llvm.nvvm.wgmma.fence.sync.aligned(), !dbg !117 + %633 = shl i32 %632, 11, !dbg !117 + %634 = and i32 %633, 8192, !dbg !117 + %635 = add i32 %634, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), !dbg !117 + %636 = lshr exact i32 %635, 4, !dbg !117 + %637 = and i32 %636, 16383, !dbg !117 + %638 = zext nneg i32 %637 to i64, !dbg !117 + %639 = or disjoint i64 %638, 4611686293372403712, !dbg !117 + %640 = ptrtoint ptr addrspace(3) %631 to i32, !dbg !117 + %641 = lshr exact i32 %640, 4, !dbg !117 + %642 = and i32 %641, 16383, !dbg !117 + %643 = zext nneg i32 %642 to i64, !dbg !117 + %644 = or disjoint i64 %643, 4611686293338849280, !dbg !117 + %645 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $32, $33, 0, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,l,l"(i64 %639, i64 %644) #3, !dbg !117 + %646 = or 
disjoint i32 %634, 32, !dbg !117 + %647 = add i32 %646, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), !dbg !117 + %648 = lshr exact i32 %647, 4, !dbg !117 + %649 = and i32 %648, 16383, !dbg !117 + %650 = zext nneg i32 %649 to i64, !dbg !117 + %651 = or disjoint i64 %650, 4611686293372403712, !dbg !117 + %652 = add i32 %640, 32, !dbg !117 + %653 = lshr exact i32 %652, 4, !dbg !117 + %654 = and i32 %653, 16383, !dbg !117 + %655 = zext nneg i32 %654 to i64, !dbg !117 + %656 = or disjoint i64 %655, 4611686293338849280, !dbg !117 + %657 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %645, 0, !dbg !117 + %658 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %645, 1, !dbg !117 + %659 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %645, 2, !dbg !117 + %660 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %645, 3, !dbg !117 + %661 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %645, 4, !dbg !117 + %662 = extractvalue { float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %645, 5, !dbg !117 + %663 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %645, 6, !dbg !117 + %664 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %645, 7, !dbg !117 + %665 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %645, 8, !dbg !117 + %666 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %645, 9, !dbg !117 + %667 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %645, 10, !dbg !117 + %668 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %645, 11, !dbg !117 + %669 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float } %645, 12, !dbg !117 + %670 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %645, 13, !dbg !117 + %671 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %645, 14, !dbg !117 + %672 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %645, 15, !dbg !117 + %673 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %645, 16, !dbg !117 + %674 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %645, 17, !dbg !117 + %675 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %645, 18, !dbg !117 + %676 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %645, 19, !dbg !117 + %677 = extractvalue { float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %645, 20, !dbg !117 + %678 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %645, 21, !dbg !117 + %679 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %645, 22, !dbg !117 + %680 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %645, 23, !dbg !117 + %681 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %645, 24, !dbg !117 + %682 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %645, 25, !dbg !117 + %683 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %645, 26, !dbg !117 + %684 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float } %645, 27, !dbg !117 + %685 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %645, 28, !dbg !117 + %686 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %645, 29, !dbg !117 + %687 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %645, 30, !dbg !117 + %688 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %645, 31, !dbg !117 + %689 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %657, float %658, float %659, float %660, float %661, float %662, float %663, float %664, float %665, float %666, float %667, float %668, float 
%669, float %670, float %671, float %672, float %673, float %674, float %675, float %676, float %677, float %678, float %679, float %680, float %681, float %682, float %683, float %684, float %685, float %686, float %687, float %688, i64 %651, i64 %656, i1 true) #3, !dbg !117 + %690 = or disjoint i32 %634, 64, !dbg !117 + %691 = add i32 %690, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), !dbg !117 + %692 = lshr exact i32 %691, 4, !dbg !117 + %693 = and i32 %692, 16383, !dbg !117 + %694 = zext nneg i32 %693 to i64, !dbg !117 + %695 = or disjoint i64 %694, 4611686293372403712, !dbg !117 + %696 = add i32 %640, 64, !dbg !117 + %697 = lshr exact i32 %696, 4, !dbg !117 + %698 = and i32 %697, 16383, !dbg !117 + %699 = zext nneg i32 %698 to i64, !dbg !117 + %700 = or disjoint i64 %699, 4611686293338849280, !dbg !117 + %701 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %689, 0, !dbg !117 + %702 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %689, 1, !dbg !117 + %703 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %689, 2, !dbg !117 + %704 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %689, 3, !dbg !117 + %705 = extractvalue { float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %689, 4, !dbg !117 + %706 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %689, 5, !dbg !117 + %707 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %689, 6, !dbg !117 + %708 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %689, 7, !dbg !117 + %709 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %689, 8, !dbg !117 + %710 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %689, 9, !dbg !117 + %711 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %689, 10, !dbg !117 + %712 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float } %689, 11, !dbg !117 + %713 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %689, 12, !dbg !117 + %714 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %689, 13, !dbg !117 + %715 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %689, 14, !dbg !117 + %716 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %689, 15, !dbg !117 + %717 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %689, 16, !dbg !117 + %718 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %689, 17, !dbg !117 + %719 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %689, 18, !dbg !117 + %720 = 
extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %689, 19, !dbg !117 + %721 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %689, 20, !dbg !117 + %722 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %689, 21, !dbg !117 + %723 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %689, 22, !dbg !117 + %724 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %689, 23, !dbg !117 + %725 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %689, 24, !dbg !117 + %726 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %689, 25, !dbg !117 + %727 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %689, 26, !dbg !117 + %728 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %689, 27, !dbg !117 + %729 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %689, 28, !dbg !117 + %730 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %689, 29, !dbg !117 + %731 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %689, 30, !dbg !117 + %732 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %689, 31, !dbg !117 + %733 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", 
"=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %701, float %702, float %703, float %704, float %705, float %706, float %707, float %708, float %709, float %710, float %711, float %712, float %713, float %714, float %715, float %716, float %717, float %718, float %719, float %720, float %721, float %722, float %723, float %724, float %725, float %726, float %727, float %728, float %729, float %730, float %731, float %732, i64 %695, i64 %700, i1 true) #3, !dbg !117 + %734 = or disjoint i32 %634, 96, !dbg !117 + %735 = add i32 %734, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), !dbg !117 + %736 = lshr exact i32 %735, 4, !dbg !117 + %737 = and i32 %736, 16383, !dbg !117 + %738 = zext nneg i32 %737 to i64, !dbg !117 + %739 = or disjoint i64 %738, 4611686293372403712, !dbg !117 + %740 = add i32 %640, 96, !dbg !117 + %741 = lshr exact i32 %740, 4, !dbg !117 + %742 = and i32 %741, 16383, !dbg !117 + %743 = zext nneg i32 %742 to i64, !dbg !117 + %744 = or disjoint i64 %743, 4611686293338849280, !dbg !117 + %745 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %733, 0, !dbg !117 + %746 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %733, 1, !dbg !117 + %747 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } 
%733, 2, !dbg !117 + %748 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %733, 3, !dbg !117 + %749 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %733, 4, !dbg !117 + %750 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %733, 5, !dbg !117 + %751 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %733, 6, !dbg !117 + %752 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %733, 7, !dbg !117 + %753 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %733, 8, !dbg !117 + %754 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %733, 9, !dbg !117 + %755 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %733, 10, !dbg !117 + %756 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %733, 11, !dbg !117 + %757 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %733, 12, !dbg !117 + %758 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %733, 13, !dbg !117 + %759 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %733, 14, !dbg !117 + %760 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %733, 15, !dbg !117 + %761 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %733, 16, !dbg !117 + %762 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float } %733, 17, !dbg !117 + %763 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %733, 18, !dbg !117 + %764 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %733, 19, !dbg !117 + %765 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %733, 20, !dbg !117 + %766 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %733, 21, !dbg !117 + %767 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %733, 22, !dbg !117 + %768 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %733, 23, !dbg !117 + %769 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %733, 24, !dbg !117 + %770 = extractvalue { float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %733, 25, !dbg !117 + %771 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %733, 26, !dbg !117 + %772 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %733, 27, !dbg !117 + %773 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %733, 28, !dbg !117 + %774 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %733, 29, !dbg !117 + %775 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %733, 30, !dbg !117 + %776 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %733, 31, !dbg !117 + %777 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %745, float %746, float %747, float %748, float %749, float %750, float %751, float %752, float %753, float %754, float %755, float %756, float %757, float %758, float %759, float %760, float %761, float %762, float %763, float %764, float %765, float %766, float %767, float %768, float %769, float %770, float %771, float %772, float %773, float %774, float %775, float %776, i64 %739, i64 %744, i1 true) #3, !dbg !117 + %778 = or disjoint i32 %634, 16384, !dbg !117 + %779 = add i32 %778, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), !dbg !117 + %780 = lshr exact i32 %779, 4, !dbg !117 + %781 = and i32 %780, 16383, !dbg !117 + %782 = zext nneg i32 %781 to i64, !dbg !117 + %783 = or disjoint i64 %782, 4611686293372403712, !dbg !117 + %784 = add i32 %640, 8192, !dbg !117 + %785 = lshr exact i32 %784, 4, !dbg !117 + %786 = and i32 %785, 16383, !dbg !117 + %787 = zext nneg i32 %786 to i64, !dbg !117 + %788 = or disjoint i64 %787, 4611686293338849280, !dbg !117 + %789 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %777, 0, !dbg !117 + %790 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float } %777, 1, !dbg !117 + %791 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %777, 2, !dbg !117 + %792 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %777, 3, !dbg !117 + %793 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %777, 4, !dbg !117 + %794 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %777, 5, !dbg !117 + %795 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %777, 6, !dbg !117 + %796 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %777, 7, !dbg !117 + %797 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %777, 8, !dbg !117 + %798 = extractvalue { float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %777, 9, !dbg !117 + %799 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %777, 10, !dbg !117 + %800 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %777, 11, !dbg !117 + %801 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %777, 12, !dbg !117 + %802 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %777, 13, !dbg !117 + %803 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %777, 14, !dbg !117 + %804 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %777, 15, !dbg !117 + %805 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float } %777, 16, !dbg !117 + %806 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %777, 17, !dbg !117 + %807 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %777, 18, !dbg !117 + %808 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %777, 19, !dbg !117 + %809 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %777, 20, !dbg !117 + %810 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %777, 21, !dbg !117 + %811 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %777, 22, !dbg !117 + %812 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %777, 23, !dbg !117 + %813 = extractvalue { float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %777, 24, !dbg !117 + %814 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %777, 25, !dbg !117 + %815 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %777, 26, !dbg !117 + %816 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %777, 27, !dbg !117 + %817 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %777, 28, !dbg !117 + %818 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %777, 29, !dbg !117 + %819 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %777, 30, !dbg !117 + %820 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %777, 31, !dbg !117 + %821 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %789, float %790, float %791, float %792, float %793, float %794, float %795, float %796, float %797, float %798, float %799, float %800, float %801, float %802, float %803, float %804, float %805, float %806, float %807, float %808, float %809, float %810, float %811, float %812, float %813, float %814, float %815, float %816, float %817, float %818, float %819, float %820, i64 %783, i64 %788, i1 true) #3, !dbg !117 + %822 = or disjoint i32 %634, 16416, !dbg !117 + %823 = add i32 %822, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), !dbg !117 + %824 = lshr exact i32 %823, 4, !dbg !117 + %825 = and i32 %824, 16383, !dbg !117 + %826 = zext nneg i32 %825 to i64, !dbg !117 + %827 = or disjoint i64 %826, 4611686293372403712, !dbg !117 + %828 = add i32 %640, 8224, !dbg !117 + %829 = lshr exact i32 %828, 4, !dbg !117 + %830 = and i32 %829, 16383, !dbg !117 + %831 = zext nneg i32 %830 to i64, !dbg !117 + %832 = or disjoint i64 %831, 4611686293338849280, !dbg !117 + %833 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float } %821, 0, !dbg !117 + %834 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %821, 1, !dbg !117 + %835 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %821, 2, !dbg !117 + %836 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %821, 3, !dbg !117 + %837 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %821, 4, !dbg !117 + %838 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %821, 5, !dbg !117 + %839 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %821, 6, !dbg !117 + %840 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %821, 7, !dbg !117 + %841 = extractvalue { float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %821, 8, !dbg !117 + %842 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %821, 9, !dbg !117 + %843 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %821, 10, !dbg !117 + %844 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %821, 11, !dbg !117 + %845 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %821, 12, !dbg !117 + %846 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %821, 13, !dbg !117 + %847 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %821, 14, !dbg !117 + %848 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float } %821, 15, !dbg !117 + %849 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %821, 16, !dbg !117 + %850 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %821, 17, !dbg !117 + %851 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %821, 18, !dbg !117 + %852 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %821, 19, !dbg !117 + %853 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %821, 20, !dbg !117 + %854 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %821, 21, !dbg !117 + %855 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %821, 22, !dbg !117 + %856 = 
extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %821, 23, !dbg !117 + %857 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %821, 24, !dbg !117 + %858 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %821, 25, !dbg !117 + %859 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %821, 26, !dbg !117 + %860 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %821, 27, !dbg !117 + %861 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %821, 28, !dbg !117 + %862 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %821, 29, !dbg !117 + %863 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %821, 30, !dbg !117 + %864 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %821, 31, !dbg !117 + %865 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %833, float %834, float %835, float %836, float %837, float %838, float %839, float %840, float %841, float %842, float %843, float %844, float %845, float %846, float %847, float %848, float %849, float %850, float %851, float %852, float %853, float %854, float %855, float %856, float %857, float %858, float %859, float %860, float %861, float %862, float %863, float %864, i64 %827, i64 %832, i1 true) #3, !dbg !117 + %866 = or disjoint i32 %634, 16448, !dbg !117 + %867 = add i32 %866, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), !dbg !117 + %868 = lshr exact i32 %867, 4, !dbg !117 + %869 = and i32 %868, 16383, !dbg !117 + %870 = zext nneg i32 %869 to i64, !dbg !117 + %871 = or disjoint i64 %870, 4611686293372403712, !dbg !117 + %872 = add i32 %640, 8256, !dbg !117 + %873 = lshr exact i32 %872, 4, !dbg !117 + %874 = and i32 %873, 16383, !dbg !117 + %875 = 
zext nneg i32 %874 to i64, !dbg !117 + %876 = or disjoint i64 %875, 4611686293338849280, !dbg !117 + %877 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %865, 0, !dbg !117 + %878 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %865, 1, !dbg !117 + %879 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %865, 2, !dbg !117 + %880 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %865, 3, !dbg !117 + %881 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %865, 4, !dbg !117 + %882 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %865, 5, !dbg !117 + %883 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %865, 6, !dbg !117 + %884 = extractvalue { 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %865, 7, !dbg !117 + %885 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %865, 8, !dbg !117 + %886 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %865, 9, !dbg !117 + %887 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %865, 10, !dbg !117 + %888 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %865, 11, !dbg !117 + %889 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %865, 12, !dbg !117 + %890 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %865, 13, !dbg !117 + %891 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %865, 14, !dbg !117 + %892 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %865, 15, !dbg !117 + %893 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %865, 16, !dbg !117 + %894 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %865, 17, !dbg !117 + %895 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %865, 18, !dbg !117 + %896 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %865, 19, !dbg !117 + %897 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %865, 20, !dbg !117 + %898 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %865, 21, 
!dbg !117 + %899 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %865, 22, !dbg !117 + %900 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %865, 23, !dbg !117 + %901 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %865, 24, !dbg !117 + %902 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %865, 25, !dbg !117 + %903 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %865, 26, !dbg !117 + %904 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %865, 27, !dbg !117 + %905 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %865, 28, !dbg !117 + %906 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %865, 29, !dbg !117 + %907 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %865, 30, !dbg !117 + %908 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %865, 31, !dbg !117 + %909 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %877, float %878, float %879, float %880, float %881, float %882, float %883, float %884, float %885, float %886, float %887, float %888, float %889, float %890, float %891, float %892, float %893, float %894, float %895, float %896, float %897, float %898, float %899, float %900, float %901, float %902, float %903, float %904, float %905, float %906, float %907, float %908, i64 %871, i64 %876, i1 true) #3, !dbg !117 + %910 = or disjoint i32 %634, 16480, !dbg !117 + %911 = add i32 %910, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), !dbg !117 + %912 = lshr exact i32 %911, 4, 
!dbg !117 + %913 = and i32 %912, 16383, !dbg !117 + %914 = zext nneg i32 %913 to i64, !dbg !117 + %915 = or disjoint i64 %914, 4611686293372403712, !dbg !117 + %916 = add i32 %640, 8288, !dbg !117 + %917 = lshr exact i32 %916, 4, !dbg !117 + %918 = and i32 %917, 16383, !dbg !117 + %919 = zext nneg i32 %918 to i64, !dbg !117 + %920 = or disjoint i64 %919, 4611686293338849280, !dbg !117 + %921 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %909, 0, !dbg !117 + %922 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %909, 1, !dbg !117 + %923 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %909, 2, !dbg !117 + %924 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %909, 3, !dbg !117 + %925 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %909, 4, !dbg !117 + %926 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %909, 5, !dbg !117 + 
%927 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %909, 6, !dbg !117 + %928 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %909, 7, !dbg !117 + %929 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %909, 8, !dbg !117 + %930 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %909, 9, !dbg !117 + %931 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %909, 10, !dbg !117 + %932 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %909, 11, !dbg !117 + %933 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %909, 12, !dbg !117 + %934 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %909, 13, !dbg !117 + %935 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %909, 14, !dbg !117 + %936 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %909, 15, !dbg !117 + %937 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %909, 16, !dbg !117 + %938 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %909, 17, !dbg !117 + %939 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %909, 18, !dbg !117 + %940 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %909, 19, !dbg !117 + %941 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float } %909, 20, !dbg !117 + %942 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %909, 21, !dbg !117 + %943 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %909, 22, !dbg !117 + %944 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %909, 23, !dbg !117 + %945 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %909, 24, !dbg !117 + %946 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %909, 25, !dbg !117 + %947 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %909, 26, !dbg !117 + %948 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %909, 27, !dbg !117 + %949 = extractvalue { float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %909, 28, !dbg !117 + %950 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %909, 29, !dbg !117 + %951 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %909, 30, !dbg !117 + %952 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %909, 31, !dbg !117 + %953 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %921, float %922, float %923, float %924, float %925, float %926, float %927, float %928, float %929, float %930, float %931, float %932, float %933, float %934, float %935, float %936, float %937, float %938, float %939, float %940, float %941, float %942, float %943, float %944, float %945, float %946, float %947, float %948, float %949, 
float %950, float %951, float %952, i64 %915, i64 %920, i1 true) #3, !dbg !117 + %954 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %953, 0, !dbg !117 + %955 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %953, 1, !dbg !117 + %956 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %953, 2, !dbg !117 + %957 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %953, 3, !dbg !117 + %958 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %953, 4, !dbg !117 + %959 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %953, 5, !dbg !117 + %960 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %953, 6, !dbg !117 + %961 = extractvalue { float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %953, 7, !dbg !117 + %962 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %953, 8, !dbg !117 + %963 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %953, 9, !dbg !117 + %964 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %953, 10, !dbg !117 + %965 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %953, 11, !dbg !117 + %966 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %953, 12, !dbg !117 + %967 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %953, 13, !dbg !117 + %968 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float } %953, 14, !dbg !117 + %969 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %953, 15, !dbg !117 + %970 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %953, 16, !dbg !117 + %971 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %953, 17, !dbg !117 + %972 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %953, 18, !dbg !117 + %973 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %953, 19, !dbg !117 + %974 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %953, 20, !dbg !117 + %975 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %953, 21, !dbg !117 + %976 = 
extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %953, 22, !dbg !117 + %977 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %953, 23, !dbg !117 + %978 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %953, 24, !dbg !117 + %979 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %953, 25, !dbg !117 + %980 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %953, 26, !dbg !117 + %981 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %953, 27, !dbg !117 + %982 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %953, 28, !dbg !117 + %983 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %953, 29, !dbg !117 + %984 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %953, 30, !dbg !117 + %985 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %953, 31, !dbg !117 + tail call void @llvm.nvvm.wgmma.commit_group.sync.aligned(), !dbg !117 + %986 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } asm sideeffect "// wait for regs: $0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37\0A\09wgmma.wait_group.sync.aligned 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37"(float %954, float %955, float %956, float %957, float %958, float %959, float %960, float %961, float %962, float %963, float %964, float %965, float %966, float %967, float %968, float %969, float %970, float %971, float %972, float %973, float %974, float %975, float %976, float %977, float %978, float %979, float %980, float %981, float %982, float %983, float %984, float %985, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 0, i32 0, ptr addrspace(3) %631, i32 
0, i32 0) #3, !dbg !117 + %987 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %986, 0, !dbg !117 + %988 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %986, 1, !dbg !117 + %989 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %986, 2, !dbg !117 + %990 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %986, 3, !dbg !117 + %991 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %986, 4, !dbg !117 + %992 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %986, 5, !dbg !117 + %993 = extractvalue { float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %986, 6, !dbg !117 + %994 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %986, 7, !dbg !117 + %995 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %986, 8, !dbg !117 + %996 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %986, 9, !dbg !117 + %997 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %986, 10, !dbg !117 + %998 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %986, 11, !dbg !117 + %999 = extractvalue { float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %986, 12, !dbg !117 + %1000 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %986, 13, !dbg !117 + %1001 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %986, 14, !dbg !117 + %1002 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %986, 15, !dbg !117 + %1003 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %986, 16, !dbg !117 + %1004 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %986, 17, !dbg !117 + %1005 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %986, 18, !dbg !117 + %1006 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %986, 19, !dbg !117 + %1007 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %986, 20, !dbg !117 + %1008 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %986, 21, !dbg !117 + %1009 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %986, 22, !dbg !117 + %1010 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %986, 23, !dbg !117 + %1011 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %986, 24, !dbg !117 + %1012 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %986, 25, !dbg !117 + %1013 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %986, 26, !dbg !117 + %1014 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %986, 27, !dbg !117 + %1015 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %986, 28, !dbg !117 + %1016 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %986, 29, !dbg !117 + %1017 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %986, 30, !dbg !117 + %1018 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %986, 31, !dbg !117 + %1019 = fmul float %987, 0x3FB6A09E60000000, !dbg !126 + %1020 = fmul float %988, 0x3FB6A09E60000000, !dbg !126 + %1021 = fmul float %989, 0x3FB6A09E60000000, !dbg !126 + %1022 = fmul float %990, 0x3FB6A09E60000000, !dbg !126 + %1023 = fmul float %991, 0x3FB6A09E60000000, !dbg !126 + %1024 = fmul float %992, 0x3FB6A09E60000000, !dbg !126 + %1025 = fmul float %993, 0x3FB6A09E60000000, !dbg !126 + %1026 = fmul float %994, 0x3FB6A09E60000000, !dbg !126 + %1027 = fmul float %995, 0x3FB6A09E60000000, !dbg !126 + %1028 = fmul float %996, 0x3FB6A09E60000000, !dbg !126 + %1029 = fmul float %997, 0x3FB6A09E60000000, !dbg !126 + %1030 = fmul float %998, 0x3FB6A09E60000000, !dbg !126 + %1031 = fmul float %999, 0x3FB6A09E60000000, !dbg !126 + %1032 = fmul float %1000, 0x3FB6A09E60000000, !dbg !126 + %1033 = fmul float %1001, 0x3FB6A09E60000000, !dbg !126 + %1034 = fmul float %1002, 0x3FB6A09E60000000, !dbg !126 + %1035 = fmul float %1003, 0x3FB6A09E60000000, !dbg !126 + %1036 = fmul float %1004, 0x3FB6A09E60000000, !dbg !126 + %1037 = fmul float %1005, 0x3FB6A09E60000000, !dbg !126 + %1038 = fmul float %1006, 0x3FB6A09E60000000, !dbg !126 + %1039 = fmul float %1007, 0x3FB6A09E60000000, !dbg !126 + %1040 = fmul float %1008, 0x3FB6A09E60000000, !dbg !126 + %1041 = fmul float %1009, 0x3FB6A09E60000000, !dbg !126 + %1042 = fmul float %1010, 0x3FB6A09E60000000, !dbg !126 + %1043 = fmul float %1011, 0x3FB6A09E60000000, !dbg !126 + %1044 = fmul float %1012, 0x3FB6A09E60000000, !dbg !126 + %1045 = fmul float %1013, 0x3FB6A09E60000000, !dbg !126 + 
%1046 = fmul float %1014, 0x3FB6A09E60000000, !dbg !126 + %1047 = fmul float %1015, 0x3FB6A09E60000000, !dbg !126 + %1048 = fmul float %1016, 0x3FB6A09E60000000, !dbg !126 + %1049 = fmul float %1017, 0x3FB6A09E60000000, !dbg !126 + %1050 = fmul float %1018, 0x3FB6A09E60000000, !dbg !126 + %1051 = srem <2 x i32> %616, %531, !dbg !118 + %1052 = icmp sge <2 x i32> %525, %1051, !dbg !122 + %1053 = and <2 x i1> %523, %1052, !dbg !121 + %1054 = icmp sge <2 x i32> %1051, %527, !dbg !123 + %1055 = srem <2 x i32> %1051, %527, !dbg !127 + %1056 = icmp ne <2 x i32> %1055, zeroinitializer, !dbg !128 + %1057 = extractelement <2 x i32> %1055, i64 0, !dbg !129 + %1058 = xor i32 %1057, %26, !dbg !129 + %1059 = extractelement <2 x i32> %1055, i64 1, !dbg !129 + %1060 = xor i32 %1059, %26, !dbg !129 + %1061 = insertelement <2 x i32> poison, i32 %1058, i64 0, !dbg !129 + %1062 = insertelement <2 x i32> %1061, i32 %1060, i64 1, !dbg !129 + %1063 = icmp slt <2 x i32> %1062, zeroinitializer, !dbg !129 + %1064 = and <2 x i1> %1056, %1063, !dbg !130 + %1065 = select <2 x i1> %1064, <2 x i32> %527, <2 x i32> zeroinitializer, !dbg !131 + %1066 = add <2 x i32> %1065, %1055, !dbg !131 + %1067 = sext <2 x i32> %1066 to <2 x i64>, !dbg !124 + %1068 = icmp sgt <2 x i64> %529, %1067, !dbg !124 + %1069 = and <2 x i1> %1054, %1068, !dbg !132 + %1070 = sub <2 x i32> %1051, %525, !dbg !133 + %1071 = srem <2 x i32> %1070, %527, !dbg !134 + %1072 = icmp ne <2 x i32> %1071, zeroinitializer, !dbg !135 + %1073 = xor <2 x i32> %1071, %527, !dbg !136 + %1074 = icmp slt <2 x i32> %1073, zeroinitializer, !dbg !136 + %1075 = and <2 x i1> %1072, %1074, !dbg !137 + %1076 = select <2 x i1> %1075, <2 x i32> %527, <2 x i32> zeroinitializer, !dbg !138 + %1077 = sub <2 x i32> zeroinitializer, %1076, !dbg !139 + %1078 = icmp eq <2 x i32> %1071, %1077, !dbg !139 + %1079 = and <2 x i1> %1069, %1078, !dbg !140 + %1080 = or <2 x i1> %1053, %1079, !dbg !141 + %1081 = icmp sge <2 x i32> %537, %1051, !dbg !122 + %1082 = and 
<2 x i1> %535, %1081, !dbg !121 + %1083 = sub <2 x i32> %1051, %537, !dbg !133 + %1084 = srem <2 x i32> %1083, %527, !dbg !134 + %1085 = icmp ne <2 x i32> %1084, zeroinitializer, !dbg !135 + %1086 = xor <2 x i32> %1084, %527, !dbg !136 + %1087 = icmp slt <2 x i32> %1086, zeroinitializer, !dbg !136 + %1088 = and <2 x i1> %1085, %1087, !dbg !137 + %1089 = select <2 x i1> %1088, <2 x i32> %527, <2 x i32> zeroinitializer, !dbg !138 + %1090 = sub <2 x i32> zeroinitializer, %1089, !dbg !139 + %1091 = icmp eq <2 x i32> %1084, %1090, !dbg !139 + %1092 = and <2 x i1> %1069, %1091, !dbg !140 + %1093 = or <2 x i1> %1082, %1092, !dbg !141 + %1094 = select <2 x i1> %1093, <2 x i1> %622, <2 x i1> zeroinitializer, !dbg !142 + %1095 = select <2 x i1> %1080, <2 x i1> %622, <2 x i1> zeroinitializer, !dbg !142 + %1096 = srem <2 x i32> %615, %531, !dbg !118 + %1097 = icmp sge <2 x i32> %525, %1096, !dbg !122 + %1098 = and <2 x i1> %523, %1097, !dbg !121 + %1099 = icmp sge <2 x i32> %1096, %527, !dbg !123 + %1100 = srem <2 x i32> %1096, %527, !dbg !127 + %1101 = icmp ne <2 x i32> %1100, zeroinitializer, !dbg !128 + %1102 = extractelement <2 x i32> %1100, i64 0, !dbg !129 + %1103 = xor i32 %1102, %26, !dbg !129 + %1104 = extractelement <2 x i32> %1100, i64 1, !dbg !129 + %1105 = xor i32 %1104, %26, !dbg !129 + %1106 = insertelement <2 x i32> poison, i32 %1103, i64 0, !dbg !129 + %1107 = insertelement <2 x i32> %1106, i32 %1105, i64 1, !dbg !129 + %1108 = icmp slt <2 x i32> %1107, zeroinitializer, !dbg !129 + %1109 = and <2 x i1> %1101, %1108, !dbg !130 + %1110 = select <2 x i1> %1109, <2 x i32> %527, <2 x i32> zeroinitializer, !dbg !131 + %1111 = add <2 x i32> %1110, %1100, !dbg !131 + %1112 = sext <2 x i32> %1111 to <2 x i64>, !dbg !124 + %1113 = icmp sgt <2 x i64> %529, %1112, !dbg !124 + %1114 = and <2 x i1> %1099, %1113, !dbg !132 + %1115 = sub <2 x i32> %1096, %525, !dbg !133 + %1116 = srem <2 x i32> %1115, %527, !dbg !134 + %1117 = icmp ne <2 x i32> %1116, zeroinitializer, !dbg 
!135 + %1118 = xor <2 x i32> %1116, %527, !dbg !136 + %1119 = icmp slt <2 x i32> %1118, zeroinitializer, !dbg !136 + %1120 = and <2 x i1> %1117, %1119, !dbg !137 + %1121 = select <2 x i1> %1120, <2 x i32> %527, <2 x i32> zeroinitializer, !dbg !138 + %1122 = sub <2 x i32> zeroinitializer, %1121, !dbg !139 + %1123 = icmp eq <2 x i32> %1116, %1122, !dbg !139 + %1124 = and <2 x i1> %1114, %1123, !dbg !140 + %1125 = or <2 x i1> %1098, %1124, !dbg !141 + %1126 = icmp sge <2 x i32> %537, %1096, !dbg !122 + %1127 = and <2 x i1> %535, %1126, !dbg !121 + %1128 = sub <2 x i32> %1096, %537, !dbg !133 + %1129 = srem <2 x i32> %1128, %527, !dbg !134 + %1130 = icmp ne <2 x i32> %1129, zeroinitializer, !dbg !135 + %1131 = xor <2 x i32> %1129, %527, !dbg !136 + %1132 = icmp slt <2 x i32> %1131, zeroinitializer, !dbg !136 + %1133 = and <2 x i1> %1130, %1132, !dbg !137 + %1134 = select <2 x i1> %1133, <2 x i32> %527, <2 x i32> zeroinitializer, !dbg !138 + %1135 = sub <2 x i32> zeroinitializer, %1134, !dbg !139 + %1136 = icmp eq <2 x i32> %1129, %1135, !dbg !139 + %1137 = and <2 x i1> %1114, %1136, !dbg !140 + %1138 = or <2 x i1> %1127, %1137, !dbg !141 + %1139 = select <2 x i1> %1138, <2 x i1> %623, <2 x i1> zeroinitializer, !dbg !142 + %1140 = select <2 x i1> %1125, <2 x i1> %623, <2 x i1> zeroinitializer, !dbg !142 + %1141 = srem <2 x i32> %614, %531, !dbg !118 + %1142 = icmp sge <2 x i32> %525, %1141, !dbg !122 + %1143 = and <2 x i1> %523, %1142, !dbg !121 + %1144 = icmp sge <2 x i32> %1141, %527, !dbg !123 + %1145 = srem <2 x i32> %1141, %527, !dbg !127 + %1146 = icmp ne <2 x i32> %1145, zeroinitializer, !dbg !128 + %1147 = extractelement <2 x i32> %1145, i64 0, !dbg !129 + %1148 = xor i32 %1147, %26, !dbg !129 + %1149 = extractelement <2 x i32> %1145, i64 1, !dbg !129 + %1150 = xor i32 %1149, %26, !dbg !129 + %1151 = insertelement <2 x i32> poison, i32 %1148, i64 0, !dbg !129 + %1152 = insertelement <2 x i32> %1151, i32 %1150, i64 1, !dbg !129 + %1153 = icmp slt <2 x i32> %1152, 
zeroinitializer, !dbg !129 + %1154 = and <2 x i1> %1146, %1153, !dbg !130 + %1155 = select <2 x i1> %1154, <2 x i32> %527, <2 x i32> zeroinitializer, !dbg !131 + %1156 = add <2 x i32> %1155, %1145, !dbg !131 + %1157 = sext <2 x i32> %1156 to <2 x i64>, !dbg !124 + %1158 = icmp sgt <2 x i64> %529, %1157, !dbg !124 + %1159 = and <2 x i1> %1144, %1158, !dbg !132 + %1160 = sub <2 x i32> %1141, %525, !dbg !133 + %1161 = srem <2 x i32> %1160, %527, !dbg !134 + %1162 = icmp ne <2 x i32> %1161, zeroinitializer, !dbg !135 + %1163 = xor <2 x i32> %1161, %527, !dbg !136 + %1164 = icmp slt <2 x i32> %1163, zeroinitializer, !dbg !136 + %1165 = and <2 x i1> %1162, %1164, !dbg !137 + %1166 = select <2 x i1> %1165, <2 x i32> %527, <2 x i32> zeroinitializer, !dbg !138 + %1167 = sub <2 x i32> zeroinitializer, %1166, !dbg !139 + %1168 = icmp eq <2 x i32> %1161, %1167, !dbg !139 + %1169 = and <2 x i1> %1159, %1168, !dbg !140 + %1170 = or <2 x i1> %1143, %1169, !dbg !141 + %1171 = icmp sge <2 x i32> %537, %1141, !dbg !122 + %1172 = and <2 x i1> %535, %1171, !dbg !121 + %1173 = sub <2 x i32> %1141, %537, !dbg !133 + %1174 = srem <2 x i32> %1173, %527, !dbg !134 + %1175 = icmp ne <2 x i32> %1174, zeroinitializer, !dbg !135 + %1176 = xor <2 x i32> %1174, %527, !dbg !136 + %1177 = icmp slt <2 x i32> %1176, zeroinitializer, !dbg !136 + %1178 = and <2 x i1> %1175, %1177, !dbg !137 + %1179 = select <2 x i1> %1178, <2 x i32> %527, <2 x i32> zeroinitializer, !dbg !138 + %1180 = sub <2 x i32> zeroinitializer, %1179, !dbg !139 + %1181 = icmp eq <2 x i32> %1174, %1180, !dbg !139 + %1182 = and <2 x i1> %1159, %1181, !dbg !140 + %1183 = or <2 x i1> %1172, %1182, !dbg !141 + %1184 = select <2 x i1> %1183, <2 x i1> %624, <2 x i1> zeroinitializer, !dbg !142 + %1185 = select <2 x i1> %1170, <2 x i1> %624, <2 x i1> zeroinitializer, !dbg !142 + %1186 = srem <2 x i32> %613, %531, !dbg !118 + %1187 = icmp sge <2 x i32> %525, %1186, !dbg !122 + %1188 = and <2 x i1> %523, %1187, !dbg !121 + %1189 = icmp sge 
<2 x i32> %1186, %527, !dbg !123 + %1190 = srem <2 x i32> %1186, %527, !dbg !127 + %1191 = icmp ne <2 x i32> %1190, zeroinitializer, !dbg !128 + %1192 = extractelement <2 x i32> %1190, i64 0, !dbg !129 + %1193 = xor i32 %1192, %26, !dbg !129 + %1194 = extractelement <2 x i32> %1190, i64 1, !dbg !129 + %1195 = xor i32 %1194, %26, !dbg !129 + %1196 = insertelement <2 x i32> poison, i32 %1193, i64 0, !dbg !129 + %1197 = insertelement <2 x i32> %1196, i32 %1195, i64 1, !dbg !129 + %1198 = icmp slt <2 x i32> %1197, zeroinitializer, !dbg !129 + %1199 = and <2 x i1> %1191, %1198, !dbg !130 + %1200 = select <2 x i1> %1199, <2 x i32> %527, <2 x i32> zeroinitializer, !dbg !131 + %1201 = add <2 x i32> %1200, %1190, !dbg !131 + %1202 = sext <2 x i32> %1201 to <2 x i64>, !dbg !124 + %1203 = icmp sgt <2 x i64> %529, %1202, !dbg !124 + %1204 = and <2 x i1> %1189, %1203, !dbg !132 + %1205 = sub <2 x i32> %1186, %525, !dbg !133 + %1206 = srem <2 x i32> %1205, %527, !dbg !134 + %1207 = icmp ne <2 x i32> %1206, zeroinitializer, !dbg !135 + %1208 = xor <2 x i32> %1206, %527, !dbg !136 + %1209 = icmp slt <2 x i32> %1208, zeroinitializer, !dbg !136 + %1210 = and <2 x i1> %1207, %1209, !dbg !137 + %1211 = select <2 x i1> %1210, <2 x i32> %527, <2 x i32> zeroinitializer, !dbg !138 + %1212 = sub <2 x i32> zeroinitializer, %1211, !dbg !139 + %1213 = icmp eq <2 x i32> %1206, %1212, !dbg !139 + %1214 = and <2 x i1> %1204, %1213, !dbg !140 + %1215 = or <2 x i1> %1188, %1214, !dbg !141 + %1216 = icmp sge <2 x i32> %537, %1186, !dbg !122 + %1217 = and <2 x i1> %535, %1216, !dbg !121 + %1218 = sub <2 x i32> %1186, %537, !dbg !133 + %1219 = srem <2 x i32> %1218, %527, !dbg !134 + %1220 = icmp ne <2 x i32> %1219, zeroinitializer, !dbg !135 + %1221 = xor <2 x i32> %1219, %527, !dbg !136 + %1222 = icmp slt <2 x i32> %1221, zeroinitializer, !dbg !136 + %1223 = and <2 x i1> %1220, %1222, !dbg !137 + %1224 = select <2 x i1> %1223, <2 x i32> %527, <2 x i32> zeroinitializer, !dbg !138 + %1225 = sub <2 x 
i32> zeroinitializer, %1224, !dbg !139 + %1226 = icmp eq <2 x i32> %1219, %1225, !dbg !139 + %1227 = and <2 x i1> %1204, %1226, !dbg !140 + %1228 = or <2 x i1> %1217, %1227, !dbg !141 + %1229 = select <2 x i1> %1228, <2 x i1> %625, <2 x i1> zeroinitializer, !dbg !142 + %1230 = select <2 x i1> %1215, <2 x i1> %625, <2 x i1> zeroinitializer, !dbg !142 + %1231 = srem <2 x i32> %612, %531, !dbg !118 + %1232 = icmp sge <2 x i32> %525, %1231, !dbg !122 + %1233 = and <2 x i1> %523, %1232, !dbg !121 + %1234 = icmp sge <2 x i32> %1231, %527, !dbg !123 + %1235 = srem <2 x i32> %1231, %527, !dbg !127 + %1236 = icmp ne <2 x i32> %1235, zeroinitializer, !dbg !128 + %1237 = extractelement <2 x i32> %1235, i64 0, !dbg !129 + %1238 = xor i32 %1237, %26, !dbg !129 + %1239 = extractelement <2 x i32> %1235, i64 1, !dbg !129 + %1240 = xor i32 %1239, %26, !dbg !129 + %1241 = insertelement <2 x i32> poison, i32 %1238, i64 0, !dbg !129 + %1242 = insertelement <2 x i32> %1241, i32 %1240, i64 1, !dbg !129 + %1243 = icmp slt <2 x i32> %1242, zeroinitializer, !dbg !129 + %1244 = and <2 x i1> %1236, %1243, !dbg !130 + %1245 = select <2 x i1> %1244, <2 x i32> %527, <2 x i32> zeroinitializer, !dbg !131 + %1246 = add <2 x i32> %1245, %1235, !dbg !131 + %1247 = sext <2 x i32> %1246 to <2 x i64>, !dbg !124 + %1248 = icmp sgt <2 x i64> %529, %1247, !dbg !124 + %1249 = and <2 x i1> %1234, %1248, !dbg !132 + %1250 = sub <2 x i32> %1231, %525, !dbg !133 + %1251 = srem <2 x i32> %1250, %527, !dbg !134 + %1252 = icmp ne <2 x i32> %1251, zeroinitializer, !dbg !135 + %1253 = xor <2 x i32> %1251, %527, !dbg !136 + %1254 = icmp slt <2 x i32> %1253, zeroinitializer, !dbg !136 + %1255 = and <2 x i1> %1252, %1254, !dbg !137 + %1256 = select <2 x i1> %1255, <2 x i32> %527, <2 x i32> zeroinitializer, !dbg !138 + %1257 = sub <2 x i32> zeroinitializer, %1256, !dbg !139 + %1258 = icmp eq <2 x i32> %1251, %1257, !dbg !139 + %1259 = and <2 x i1> %1249, %1258, !dbg !140 + %1260 = or <2 x i1> %1233, %1259, !dbg !141 + 
%1261 = icmp sge <2 x i32> %537, %1231, !dbg !122 + %1262 = and <2 x i1> %535, %1261, !dbg !121 + %1263 = sub <2 x i32> %1231, %537, !dbg !133 + %1264 = srem <2 x i32> %1263, %527, !dbg !134 + %1265 = icmp ne <2 x i32> %1264, zeroinitializer, !dbg !135 + %1266 = xor <2 x i32> %1264, %527, !dbg !136 + %1267 = icmp slt <2 x i32> %1266, zeroinitializer, !dbg !136 + %1268 = and <2 x i1> %1265, %1267, !dbg !137 + %1269 = select <2 x i1> %1268, <2 x i32> %527, <2 x i32> zeroinitializer, !dbg !138 + %1270 = sub <2 x i32> zeroinitializer, %1269, !dbg !139 + %1271 = icmp eq <2 x i32> %1264, %1270, !dbg !139 + %1272 = and <2 x i1> %1249, %1271, !dbg !140 + %1273 = or <2 x i1> %1262, %1272, !dbg !141 + %1274 = select <2 x i1> %1273, <2 x i1> %626, <2 x i1> zeroinitializer, !dbg !142 + %1275 = select <2 x i1> %1260, <2 x i1> %626, <2 x i1> zeroinitializer, !dbg !142 + %1276 = srem <2 x i32> %611, %531, !dbg !118 + %1277 = icmp sge <2 x i32> %525, %1276, !dbg !122 + %1278 = and <2 x i1> %523, %1277, !dbg !121 + %1279 = icmp sge <2 x i32> %1276, %527, !dbg !123 + %1280 = srem <2 x i32> %1276, %527, !dbg !127 + %1281 = icmp ne <2 x i32> %1280, zeroinitializer, !dbg !128 + %1282 = extractelement <2 x i32> %1280, i64 0, !dbg !129 + %1283 = xor i32 %1282, %26, !dbg !129 + %1284 = extractelement <2 x i32> %1280, i64 1, !dbg !129 + %1285 = xor i32 %1284, %26, !dbg !129 + %1286 = insertelement <2 x i32> poison, i32 %1283, i64 0, !dbg !129 + %1287 = insertelement <2 x i32> %1286, i32 %1285, i64 1, !dbg !129 + %1288 = icmp slt <2 x i32> %1287, zeroinitializer, !dbg !129 + %1289 = and <2 x i1> %1281, %1288, !dbg !130 + %1290 = select <2 x i1> %1289, <2 x i32> %527, <2 x i32> zeroinitializer, !dbg !131 + %1291 = add <2 x i32> %1290, %1280, !dbg !131 + %1292 = sext <2 x i32> %1291 to <2 x i64>, !dbg !124 + %1293 = icmp sgt <2 x i64> %529, %1292, !dbg !124 + %1294 = and <2 x i1> %1279, %1293, !dbg !132 + %1295 = sub <2 x i32> %1276, %525, !dbg !133 + %1296 = srem <2 x i32> %1295, %527, !dbg 
!134 + %1297 = icmp ne <2 x i32> %1296, zeroinitializer, !dbg !135 + %1298 = xor <2 x i32> %1296, %527, !dbg !136 + %1299 = icmp slt <2 x i32> %1298, zeroinitializer, !dbg !136 + %1300 = and <2 x i1> %1297, %1299, !dbg !137 + %1301 = select <2 x i1> %1300, <2 x i32> %527, <2 x i32> zeroinitializer, !dbg !138 + %1302 = sub <2 x i32> zeroinitializer, %1301, !dbg !139 + %1303 = icmp eq <2 x i32> %1296, %1302, !dbg !139 + %1304 = and <2 x i1> %1294, %1303, !dbg !140 + %1305 = or <2 x i1> %1278, %1304, !dbg !141 + %1306 = icmp sge <2 x i32> %537, %1276, !dbg !122 + %1307 = and <2 x i1> %535, %1306, !dbg !121 + %1308 = sub <2 x i32> %1276, %537, !dbg !133 + %1309 = srem <2 x i32> %1308, %527, !dbg !134 + %1310 = icmp ne <2 x i32> %1309, zeroinitializer, !dbg !135 + %1311 = xor <2 x i32> %1309, %527, !dbg !136 + %1312 = icmp slt <2 x i32> %1311, zeroinitializer, !dbg !136 + %1313 = and <2 x i1> %1310, %1312, !dbg !137 + %1314 = select <2 x i1> %1313, <2 x i32> %527, <2 x i32> zeroinitializer, !dbg !138 + %1315 = sub <2 x i32> zeroinitializer, %1314, !dbg !139 + %1316 = icmp eq <2 x i32> %1309, %1315, !dbg !139 + %1317 = and <2 x i1> %1294, %1316, !dbg !140 + %1318 = or <2 x i1> %1307, %1317, !dbg !141 + %1319 = select <2 x i1> %1318, <2 x i1> %627, <2 x i1> zeroinitializer, !dbg !142 + %1320 = select <2 x i1> %1305, <2 x i1> %627, <2 x i1> zeroinitializer, !dbg !142 + %1321 = srem <2 x i32> %610, %531, !dbg !118 + %1322 = icmp sge <2 x i32> %525, %1321, !dbg !122 + %1323 = and <2 x i1> %523, %1322, !dbg !121 + %1324 = icmp sge <2 x i32> %1321, %527, !dbg !123 + %1325 = srem <2 x i32> %1321, %527, !dbg !127 + %1326 = icmp ne <2 x i32> %1325, zeroinitializer, !dbg !128 + %1327 = extractelement <2 x i32> %1325, i64 0, !dbg !129 + %1328 = xor i32 %1327, %26, !dbg !129 + %1329 = extractelement <2 x i32> %1325, i64 1, !dbg !129 + %1330 = xor i32 %1329, %26, !dbg !129 + %1331 = insertelement <2 x i32> poison, i32 %1328, i64 0, !dbg !129 + %1332 = insertelement <2 x i32> %1331, 
i32 %1330, i64 1, !dbg !129 + %1333 = icmp slt <2 x i32> %1332, zeroinitializer, !dbg !129 + %1334 = and <2 x i1> %1326, %1333, !dbg !130 + %1335 = select <2 x i1> %1334, <2 x i32> %527, <2 x i32> zeroinitializer, !dbg !131 + %1336 = add <2 x i32> %1335, %1325, !dbg !131 + %1337 = sext <2 x i32> %1336 to <2 x i64>, !dbg !124 + %1338 = icmp sgt <2 x i64> %529, %1337, !dbg !124 + %1339 = and <2 x i1> %1324, %1338, !dbg !132 + %1340 = sub <2 x i32> %1321, %525, !dbg !133 + %1341 = srem <2 x i32> %1340, %527, !dbg !134 + %1342 = icmp ne <2 x i32> %1341, zeroinitializer, !dbg !135 + %1343 = xor <2 x i32> %1341, %527, !dbg !136 + %1344 = icmp slt <2 x i32> %1343, zeroinitializer, !dbg !136 + %1345 = and <2 x i1> %1342, %1344, !dbg !137 + %1346 = select <2 x i1> %1345, <2 x i32> %527, <2 x i32> zeroinitializer, !dbg !138 + %1347 = sub <2 x i32> zeroinitializer, %1346, !dbg !139 + %1348 = icmp eq <2 x i32> %1341, %1347, !dbg !139 + %1349 = and <2 x i1> %1339, %1348, !dbg !140 + %1350 = or <2 x i1> %1323, %1349, !dbg !141 + %1351 = icmp sge <2 x i32> %537, %1321, !dbg !122 + %1352 = and <2 x i1> %535, %1351, !dbg !121 + %1353 = sub <2 x i32> %1321, %537, !dbg !133 + %1354 = srem <2 x i32> %1353, %527, !dbg !134 + %1355 = icmp ne <2 x i32> %1354, zeroinitializer, !dbg !135 + %1356 = xor <2 x i32> %1354, %527, !dbg !136 + %1357 = icmp slt <2 x i32> %1356, zeroinitializer, !dbg !136 + %1358 = and <2 x i1> %1355, %1357, !dbg !137 + %1359 = select <2 x i1> %1358, <2 x i32> %527, <2 x i32> zeroinitializer, !dbg !138 + %1360 = sub <2 x i32> zeroinitializer, %1359, !dbg !139 + %1361 = icmp eq <2 x i32> %1354, %1360, !dbg !139 + %1362 = and <2 x i1> %1339, %1361, !dbg !140 + %1363 = or <2 x i1> %1352, %1362, !dbg !141 + %1364 = select <2 x i1> %1363, <2 x i1> %628, <2 x i1> zeroinitializer, !dbg !142 + %1365 = select <2 x i1> %1350, <2 x i1> %628, <2 x i1> zeroinitializer, !dbg !142 + %1366 = srem <2 x i32> %609, %531, !dbg !118 + %1367 = icmp sge <2 x i32> %525, %1366, !dbg !122 + 
%1368 = and <2 x i1> %523, %1367, !dbg !121 + %1369 = icmp sge <2 x i32> %1366, %527, !dbg !123 + %1370 = srem <2 x i32> %1366, %527, !dbg !127 + %1371 = icmp ne <2 x i32> %1370, zeroinitializer, !dbg !128 + %1372 = extractelement <2 x i32> %1370, i64 0, !dbg !129 + %1373 = xor i32 %1372, %26, !dbg !129 + %1374 = extractelement <2 x i32> %1370, i64 1, !dbg !129 + %1375 = xor i32 %1374, %26, !dbg !129 + %1376 = insertelement <2 x i32> poison, i32 %1373, i64 0, !dbg !129 + %1377 = insertelement <2 x i32> %1376, i32 %1375, i64 1, !dbg !129 + %1378 = icmp slt <2 x i32> %1377, zeroinitializer, !dbg !129 + %1379 = and <2 x i1> %1371, %1378, !dbg !130 + %1380 = select <2 x i1> %1379, <2 x i32> %527, <2 x i32> zeroinitializer, !dbg !131 + %1381 = add <2 x i32> %1380, %1370, !dbg !131 + %1382 = sext <2 x i32> %1381 to <2 x i64>, !dbg !124 + %1383 = icmp sgt <2 x i64> %529, %1382, !dbg !124 + %1384 = and <2 x i1> %1369, %1383, !dbg !132 + %1385 = sub <2 x i32> %1366, %525, !dbg !133 + %1386 = srem <2 x i32> %1385, %527, !dbg !134 + %1387 = icmp ne <2 x i32> %1386, zeroinitializer, !dbg !135 + %1388 = xor <2 x i32> %1386, %527, !dbg !136 + %1389 = icmp slt <2 x i32> %1388, zeroinitializer, !dbg !136 + %1390 = and <2 x i1> %1387, %1389, !dbg !137 + %1391 = select <2 x i1> %1390, <2 x i32> %527, <2 x i32> zeroinitializer, !dbg !138 + %1392 = sub <2 x i32> zeroinitializer, %1391, !dbg !139 + %1393 = icmp eq <2 x i32> %1386, %1392, !dbg !139 + %1394 = and <2 x i1> %1384, %1393, !dbg !140 + %1395 = or <2 x i1> %1368, %1394, !dbg !141 + %1396 = icmp sge <2 x i32> %537, %1366, !dbg !122 + %1397 = and <2 x i1> %535, %1396, !dbg !121 + %1398 = sub <2 x i32> %1366, %537, !dbg !133 + %1399 = srem <2 x i32> %1398, %527, !dbg !134 + %1400 = icmp ne <2 x i32> %1399, zeroinitializer, !dbg !135 + %1401 = xor <2 x i32> %1399, %527, !dbg !136 + %1402 = icmp slt <2 x i32> %1401, zeroinitializer, !dbg !136 + %1403 = and <2 x i1> %1400, %1402, !dbg !137 + %1404 = select <2 x i1> %1403, <2 x i32> 
%527, <2 x i32> zeroinitializer, !dbg !138 + %1405 = sub <2 x i32> zeroinitializer, %1404, !dbg !139 + %1406 = icmp eq <2 x i32> %1399, %1405, !dbg !139 + %1407 = and <2 x i1> %1384, %1406, !dbg !140 + %1408 = or <2 x i1> %1397, %1407, !dbg !141 + %1409 = select <2 x i1> %1408, <2 x i1> %629, <2 x i1> zeroinitializer, !dbg !142 + %1410 = select <2 x i1> %1395, <2 x i1> %629, <2 x i1> zeroinitializer, !dbg !142 + %1411 = fmul float %1019, 0x3FF7154760000000, !dbg !143 + %1412 = extractelement <2 x i1> %1094, i64 0, !dbg !142 + %1413 = select i1 %1412, float %1411, float 0xFFF0000000000000, !dbg !142 + %1414 = fmul float %1020, 0x3FF7154760000000, !dbg !143 + %1415 = extractelement <2 x i1> %1094, i64 1, !dbg !142 + %1416 = select i1 %1415, float %1414, float 0xFFF0000000000000, !dbg !142 + %1417 = fmul float %1021, 0x3FF7154760000000, !dbg !143 + %1418 = extractelement <2 x i1> %1095, i64 0, !dbg !142 + %1419 = select i1 %1418, float %1417, float 0xFFF0000000000000, !dbg !142 + %1420 = fmul float %1022, 0x3FF7154760000000, !dbg !143 + %1421 = extractelement <2 x i1> %1095, i64 1, !dbg !142 + %1422 = select i1 %1421, float %1420, float 0xFFF0000000000000, !dbg !142 + %1423 = fmul float %1023, 0x3FF7154760000000, !dbg !143 + %1424 = extractelement <2 x i1> %1139, i64 0, !dbg !142 + %1425 = select i1 %1424, float %1423, float 0xFFF0000000000000, !dbg !142 + %1426 = fmul float %1024, 0x3FF7154760000000, !dbg !143 + %1427 = extractelement <2 x i1> %1139, i64 1, !dbg !142 + %1428 = select i1 %1427, float %1426, float 0xFFF0000000000000, !dbg !142 + %1429 = fmul float %1025, 0x3FF7154760000000, !dbg !143 + %1430 = extractelement <2 x i1> %1140, i64 0, !dbg !142 + %1431 = select i1 %1430, float %1429, float 0xFFF0000000000000, !dbg !142 + %1432 = fmul float %1026, 0x3FF7154760000000, !dbg !143 + %1433 = extractelement <2 x i1> %1140, i64 1, !dbg !142 + %1434 = select i1 %1433, float %1432, float 0xFFF0000000000000, !dbg !142 + %1435 = fmul float %1027, 0x3FF7154760000000, 
!dbg !143 + %1436 = extractelement <2 x i1> %1184, i64 0, !dbg !142 + %1437 = select i1 %1436, float %1435, float 0xFFF0000000000000, !dbg !142 + %1438 = fmul float %1028, 0x3FF7154760000000, !dbg !143 + %1439 = extractelement <2 x i1> %1184, i64 1, !dbg !142 + %1440 = select i1 %1439, float %1438, float 0xFFF0000000000000, !dbg !142 + %1441 = fmul float %1029, 0x3FF7154760000000, !dbg !143 + %1442 = extractelement <2 x i1> %1185, i64 0, !dbg !142 + %1443 = select i1 %1442, float %1441, float 0xFFF0000000000000, !dbg !142 + %1444 = fmul float %1030, 0x3FF7154760000000, !dbg !143 + %1445 = extractelement <2 x i1> %1185, i64 1, !dbg !142 + %1446 = select i1 %1445, float %1444, float 0xFFF0000000000000, !dbg !142 + %1447 = fmul float %1031, 0x3FF7154760000000, !dbg !143 + %1448 = extractelement <2 x i1> %1229, i64 0, !dbg !142 + %1449 = select i1 %1448, float %1447, float 0xFFF0000000000000, !dbg !142 + %1450 = fmul float %1032, 0x3FF7154760000000, !dbg !143 + %1451 = extractelement <2 x i1> %1229, i64 1, !dbg !142 + %1452 = select i1 %1451, float %1450, float 0xFFF0000000000000, !dbg !142 + %1453 = fmul float %1033, 0x3FF7154760000000, !dbg !143 + %1454 = extractelement <2 x i1> %1230, i64 0, !dbg !142 + %1455 = select i1 %1454, float %1453, float 0xFFF0000000000000, !dbg !142 + %1456 = fmul float %1034, 0x3FF7154760000000, !dbg !143 + %1457 = extractelement <2 x i1> %1230, i64 1, !dbg !142 + %1458 = select i1 %1457, float %1456, float 0xFFF0000000000000, !dbg !142 + %1459 = fmul float %1035, 0x3FF7154760000000, !dbg !143 + %1460 = extractelement <2 x i1> %1274, i64 0, !dbg !142 + %1461 = select i1 %1460, float %1459, float 0xFFF0000000000000, !dbg !142 + %1462 = fmul float %1036, 0x3FF7154760000000, !dbg !143 + %1463 = extractelement <2 x i1> %1274, i64 1, !dbg !142 + %1464 = select i1 %1463, float %1462, float 0xFFF0000000000000, !dbg !142 + %1465 = fmul float %1037, 0x3FF7154760000000, !dbg !143 + %1466 = extractelement <2 x i1> %1275, i64 0, !dbg !142 + %1467 = 
select i1 %1466, float %1465, float 0xFFF0000000000000, !dbg !142 + %1468 = fmul float %1038, 0x3FF7154760000000, !dbg !143 + %1469 = extractelement <2 x i1> %1275, i64 1, !dbg !142 + %1470 = select i1 %1469, float %1468, float 0xFFF0000000000000, !dbg !142 + %1471 = fmul float %1039, 0x3FF7154760000000, !dbg !143 + %1472 = extractelement <2 x i1> %1319, i64 0, !dbg !142 + %1473 = select i1 %1472, float %1471, float 0xFFF0000000000000, !dbg !142 + %1474 = fmul float %1040, 0x3FF7154760000000, !dbg !143 + %1475 = extractelement <2 x i1> %1319, i64 1, !dbg !142 + %1476 = select i1 %1475, float %1474, float 0xFFF0000000000000, !dbg !142 + %1477 = fmul float %1041, 0x3FF7154760000000, !dbg !143 + %1478 = extractelement <2 x i1> %1320, i64 0, !dbg !142 + %1479 = select i1 %1478, float %1477, float 0xFFF0000000000000, !dbg !142 + %1480 = fmul float %1042, 0x3FF7154760000000, !dbg !143 + %1481 = extractelement <2 x i1> %1320, i64 1, !dbg !142 + %1482 = select i1 %1481, float %1480, float 0xFFF0000000000000, !dbg !142 + %1483 = fmul float %1043, 0x3FF7154760000000, !dbg !143 + %1484 = extractelement <2 x i1> %1364, i64 0, !dbg !142 + %1485 = select i1 %1484, float %1483, float 0xFFF0000000000000, !dbg !142 + %1486 = fmul float %1044, 0x3FF7154760000000, !dbg !143 + %1487 = extractelement <2 x i1> %1364, i64 1, !dbg !142 + %1488 = select i1 %1487, float %1486, float 0xFFF0000000000000, !dbg !142 + %1489 = fmul float %1045, 0x3FF7154760000000, !dbg !143 + %1490 = extractelement <2 x i1> %1365, i64 0, !dbg !142 + %1491 = select i1 %1490, float %1489, float 0xFFF0000000000000, !dbg !142 + %1492 = fmul float %1046, 0x3FF7154760000000, !dbg !143 + %1493 = extractelement <2 x i1> %1365, i64 1, !dbg !142 + %1494 = select i1 %1493, float %1492, float 0xFFF0000000000000, !dbg !142 + %1495 = fmul float %1047, 0x3FF7154760000000, !dbg !143 + %1496 = extractelement <2 x i1> %1409, i64 0, !dbg !142 + %1497 = select i1 %1496, float %1495, float 0xFFF0000000000000, !dbg !142 + %1498 = 
fmul float %1048, 0x3FF7154760000000, !dbg !143 + %1499 = extractelement <2 x i1> %1409, i64 1, !dbg !142 + %1500 = select i1 %1499, float %1498, float 0xFFF0000000000000, !dbg !142 + %1501 = fmul float %1049, 0x3FF7154760000000, !dbg !143 + %1502 = extractelement <2 x i1> %1410, i64 0, !dbg !142 + %1503 = select i1 %1502, float %1501, float 0xFFF0000000000000, !dbg !142 + %1504 = fmul float %1050, 0x3FF7154760000000, !dbg !143 + %1505 = extractelement <2 x i1> %1410, i64 1, !dbg !142 + %1506 = select i1 %1505, float %1504, float 0xFFF0000000000000, !dbg !142 + %1507 = fsub float %1413, %381, !dbg !144 + %1508 = fsub float %1416, %381, !dbg !144 + %1509 = fsub float %1419, %382, !dbg !144 + %1510 = fsub float %1422, %382, !dbg !144 + %1511 = fsub float %1425, %381, !dbg !144 + %1512 = fsub float %1428, %381, !dbg !144 + %1513 = fsub float %1431, %382, !dbg !144 + %1514 = fsub float %1434, %382, !dbg !144 + %1515 = fsub float %1437, %381, !dbg !144 + %1516 = fsub float %1440, %381, !dbg !144 + %1517 = fsub float %1443, %382, !dbg !144 + %1518 = fsub float %1446, %382, !dbg !144 + %1519 = fsub float %1449, %381, !dbg !144 + %1520 = fsub float %1452, %381, !dbg !144 + %1521 = fsub float %1455, %382, !dbg !144 + %1522 = fsub float %1458, %382, !dbg !144 + %1523 = fsub float %1461, %381, !dbg !144 + %1524 = fsub float %1464, %381, !dbg !144 + %1525 = fsub float %1467, %382, !dbg !144 + %1526 = fsub float %1470, %382, !dbg !144 + %1527 = fsub float %1473, %381, !dbg !144 + %1528 = fsub float %1476, %381, !dbg !144 + %1529 = fsub float %1479, %382, !dbg !144 + %1530 = fsub float %1482, %382, !dbg !144 + %1531 = fsub float %1485, %381, !dbg !144 + %1532 = fsub float %1488, %381, !dbg !144 + %1533 = fsub float %1491, %382, !dbg !144 + %1534 = fsub float %1494, %382, !dbg !144 + %1535 = fsub float %1497, %381, !dbg !144 + %1536 = fsub float %1500, %381, !dbg !144 + %1537 = fsub float %1503, %382, !dbg !144 + %1538 = fsub float %1506, %382, !dbg !144 + %1539 = tail call i32 
@__nvvm_reflect(ptr nonnull @.str) #3, !dbg !145 + %.not.i1432 = icmp eq i32 %1539, 0, !dbg !145 + br i1 %.not.i1432, label %1542, label %1540, !dbg !145 + +1540: ; preds = %540 + %1541 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1507) #3, !dbg !145 + br label %__nv_exp2f.exit1434, !dbg !145 + +1542: ; preds = %540 + %1543 = tail call float @llvm.nvvm.ex2.approx.f(float %1507) #3, !dbg !145 + br label %__nv_exp2f.exit1434, !dbg !145 + +__nv_exp2f.exit1434: ; preds = %1540, %1542 + %.0.i1433 = phi float [ %1541, %1540 ], [ %1543, %1542 ], !dbg !145 + %1544 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !145 + %.not.i1435 = icmp eq i32 %1544, 0, !dbg !145 + br i1 %.not.i1435, label %1547, label %1545, !dbg !145 + +1545: ; preds = %__nv_exp2f.exit1434 + %1546 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1508) #3, !dbg !145 + br label %__nv_exp2f.exit1437, !dbg !145 + +1547: ; preds = %__nv_exp2f.exit1434 + %1548 = tail call float @llvm.nvvm.ex2.approx.f(float %1508) #3, !dbg !145 + br label %__nv_exp2f.exit1437, !dbg !145 + +__nv_exp2f.exit1437: ; preds = %1545, %1547 + %.0.i1436 = phi float [ %1546, %1545 ], [ %1548, %1547 ], !dbg !145 + %1549 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !145 + %.not.i1438 = icmp eq i32 %1549, 0, !dbg !145 + br i1 %.not.i1438, label %1552, label %1550, !dbg !145 + +1550: ; preds = %__nv_exp2f.exit1437 + %1551 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1509) #3, !dbg !145 + br label %__nv_exp2f.exit1440, !dbg !145 + +1552: ; preds = %__nv_exp2f.exit1437 + %1553 = tail call float @llvm.nvvm.ex2.approx.f(float %1509) #3, !dbg !145 + br label %__nv_exp2f.exit1440, !dbg !145 + +__nv_exp2f.exit1440: ; preds = %1550, %1552 + %.0.i1439 = phi float [ %1551, %1550 ], [ %1553, %1552 ], !dbg !145 + %1554 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !145 + %.not.i1441 = icmp eq i32 %1554, 0, !dbg !145 + br i1 %.not.i1441, label %1557, label %1555, !dbg !145 + +1555: ; preds = 
%__nv_exp2f.exit1440 + %1556 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1510) #3, !dbg !145 + br label %__nv_exp2f.exit1443, !dbg !145 + +1557: ; preds = %__nv_exp2f.exit1440 + %1558 = tail call float @llvm.nvvm.ex2.approx.f(float %1510) #3, !dbg !145 + br label %__nv_exp2f.exit1443, !dbg !145 + +__nv_exp2f.exit1443: ; preds = %1555, %1557 + %.0.i1442 = phi float [ %1556, %1555 ], [ %1558, %1557 ], !dbg !145 + %1559 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !145 + %.not.i1444 = icmp eq i32 %1559, 0, !dbg !145 + br i1 %.not.i1444, label %1562, label %1560, !dbg !145 + +1560: ; preds = %__nv_exp2f.exit1443 + %1561 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1511) #3, !dbg !145 + br label %__nv_exp2f.exit1446, !dbg !145 + +1562: ; preds = %__nv_exp2f.exit1443 + %1563 = tail call float @llvm.nvvm.ex2.approx.f(float %1511) #3, !dbg !145 + br label %__nv_exp2f.exit1446, !dbg !145 + +__nv_exp2f.exit1446: ; preds = %1560, %1562 + %.0.i1445 = phi float [ %1561, %1560 ], [ %1563, %1562 ], !dbg !145 + %1564 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !145 + %.not.i1447 = icmp eq i32 %1564, 0, !dbg !145 + br i1 %.not.i1447, label %1567, label %1565, !dbg !145 + +1565: ; preds = %__nv_exp2f.exit1446 + %1566 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1512) #3, !dbg !145 + br label %__nv_exp2f.exit1449, !dbg !145 + +1567: ; preds = %__nv_exp2f.exit1446 + %1568 = tail call float @llvm.nvvm.ex2.approx.f(float %1512) #3, !dbg !145 + br label %__nv_exp2f.exit1449, !dbg !145 + +__nv_exp2f.exit1449: ; preds = %1565, %1567 + %.0.i1448 = phi float [ %1566, %1565 ], [ %1568, %1567 ], !dbg !145 + %1569 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !145 + %.not.i1450 = icmp eq i32 %1569, 0, !dbg !145 + br i1 %.not.i1450, label %1572, label %1570, !dbg !145 + +1570: ; preds = %__nv_exp2f.exit1449 + %1571 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1513) #3, !dbg !145 + br label %__nv_exp2f.exit1452, !dbg 
!145 + +1572: ; preds = %__nv_exp2f.exit1449 + %1573 = tail call float @llvm.nvvm.ex2.approx.f(float %1513) #3, !dbg !145 + br label %__nv_exp2f.exit1452, !dbg !145 + +__nv_exp2f.exit1452: ; preds = %1570, %1572 + %.0.i1451 = phi float [ %1571, %1570 ], [ %1573, %1572 ], !dbg !145 + %1574 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !145 + %.not.i1453 = icmp eq i32 %1574, 0, !dbg !145 + br i1 %.not.i1453, label %1577, label %1575, !dbg !145 + +1575: ; preds = %__nv_exp2f.exit1452 + %1576 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1514) #3, !dbg !145 + br label %__nv_exp2f.exit1455, !dbg !145 + +1577: ; preds = %__nv_exp2f.exit1452 + %1578 = tail call float @llvm.nvvm.ex2.approx.f(float %1514) #3, !dbg !145 + br label %__nv_exp2f.exit1455, !dbg !145 + +__nv_exp2f.exit1455: ; preds = %1575, %1577 + %.0.i1454 = phi float [ %1576, %1575 ], [ %1578, %1577 ], !dbg !145 + %1579 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !145 + %.not.i1456 = icmp eq i32 %1579, 0, !dbg !145 + br i1 %.not.i1456, label %1582, label %1580, !dbg !145 + +1580: ; preds = %__nv_exp2f.exit1455 + %1581 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1515) #3, !dbg !145 + br label %__nv_exp2f.exit1458, !dbg !145 + +1582: ; preds = %__nv_exp2f.exit1455 + %1583 = tail call float @llvm.nvvm.ex2.approx.f(float %1515) #3, !dbg !145 + br label %__nv_exp2f.exit1458, !dbg !145 + +__nv_exp2f.exit1458: ; preds = %1580, %1582 + %.0.i1457 = phi float [ %1581, %1580 ], [ %1583, %1582 ], !dbg !145 + %1584 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !145 + %.not.i1459 = icmp eq i32 %1584, 0, !dbg !145 + br i1 %.not.i1459, label %1587, label %1585, !dbg !145 + +1585: ; preds = %__nv_exp2f.exit1458 + %1586 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1516) #3, !dbg !145 + br label %__nv_exp2f.exit1461, !dbg !145 + +1587: ; preds = %__nv_exp2f.exit1458 + %1588 = tail call float @llvm.nvvm.ex2.approx.f(float %1516) #3, !dbg !145 + br label 
%__nv_exp2f.exit1461, !dbg !145 + +__nv_exp2f.exit1461: ; preds = %1585, %1587 + %.0.i1460 = phi float [ %1586, %1585 ], [ %1588, %1587 ], !dbg !145 + %1589 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !145 + %.not.i1462 = icmp eq i32 %1589, 0, !dbg !145 + br i1 %.not.i1462, label %1592, label %1590, !dbg !145 + +1590: ; preds = %__nv_exp2f.exit1461 + %1591 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1517) #3, !dbg !145 + br label %__nv_exp2f.exit1464, !dbg !145 + +1592: ; preds = %__nv_exp2f.exit1461 + %1593 = tail call float @llvm.nvvm.ex2.approx.f(float %1517) #3, !dbg !145 + br label %__nv_exp2f.exit1464, !dbg !145 + +__nv_exp2f.exit1464: ; preds = %1590, %1592 + %.0.i1463 = phi float [ %1591, %1590 ], [ %1593, %1592 ], !dbg !145 + %1594 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !145 + %.not.i1465 = icmp eq i32 %1594, 0, !dbg !145 + br i1 %.not.i1465, label %1597, label %1595, !dbg !145 + +1595: ; preds = %__nv_exp2f.exit1464 + %1596 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1518) #3, !dbg !145 + br label %__nv_exp2f.exit1467, !dbg !145 + +1597: ; preds = %__nv_exp2f.exit1464 + %1598 = tail call float @llvm.nvvm.ex2.approx.f(float %1518) #3, !dbg !145 + br label %__nv_exp2f.exit1467, !dbg !145 + +__nv_exp2f.exit1467: ; preds = %1595, %1597 + %.0.i1466 = phi float [ %1596, %1595 ], [ %1598, %1597 ], !dbg !145 + %1599 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !145 + %.not.i1468 = icmp eq i32 %1599, 0, !dbg !145 + br i1 %.not.i1468, label %1602, label %1600, !dbg !145 + +1600: ; preds = %__nv_exp2f.exit1467 + %1601 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1519) #3, !dbg !145 + br label %__nv_exp2f.exit1470, !dbg !145 + +1602: ; preds = %__nv_exp2f.exit1467 + %1603 = tail call float @llvm.nvvm.ex2.approx.f(float %1519) #3, !dbg !145 + br label %__nv_exp2f.exit1470, !dbg !145 + +__nv_exp2f.exit1470: ; preds = %1600, %1602 + %.0.i1469 = phi float [ %1601, %1600 ], [ %1603, %1602 ], 
!dbg !145 + %1604 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !145 + %.not.i1471 = icmp eq i32 %1604, 0, !dbg !145 + br i1 %.not.i1471, label %1607, label %1605, !dbg !145 + +1605: ; preds = %__nv_exp2f.exit1470 + %1606 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1520) #3, !dbg !145 + br label %__nv_exp2f.exit1473, !dbg !145 + +1607: ; preds = %__nv_exp2f.exit1470 + %1608 = tail call float @llvm.nvvm.ex2.approx.f(float %1520) #3, !dbg !145 + br label %__nv_exp2f.exit1473, !dbg !145 + +__nv_exp2f.exit1473: ; preds = %1605, %1607 + %.0.i1472 = phi float [ %1606, %1605 ], [ %1608, %1607 ], !dbg !145 + %1609 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !145 + %.not.i1474 = icmp eq i32 %1609, 0, !dbg !145 + br i1 %.not.i1474, label %1612, label %1610, !dbg !145 + +1610: ; preds = %__nv_exp2f.exit1473 + %1611 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1521) #3, !dbg !145 + br label %__nv_exp2f.exit1476, !dbg !145 + +1612: ; preds = %__nv_exp2f.exit1473 + %1613 = tail call float @llvm.nvvm.ex2.approx.f(float %1521) #3, !dbg !145 + br label %__nv_exp2f.exit1476, !dbg !145 + +__nv_exp2f.exit1476: ; preds = %1610, %1612 + %.0.i1475 = phi float [ %1611, %1610 ], [ %1613, %1612 ], !dbg !145 + %1614 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !145 + %.not.i1477 = icmp eq i32 %1614, 0, !dbg !145 + br i1 %.not.i1477, label %1617, label %1615, !dbg !145 + +1615: ; preds = %__nv_exp2f.exit1476 + %1616 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1522) #3, !dbg !145 + br label %__nv_exp2f.exit1479, !dbg !145 + +1617: ; preds = %__nv_exp2f.exit1476 + %1618 = tail call float @llvm.nvvm.ex2.approx.f(float %1522) #3, !dbg !145 + br label %__nv_exp2f.exit1479, !dbg !145 + +__nv_exp2f.exit1479: ; preds = %1615, %1617 + %.0.i1478 = phi float [ %1616, %1615 ], [ %1618, %1617 ], !dbg !145 + %1619 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !145 + %.not.i1480 = icmp eq i32 %1619, 0, !dbg !145 + br i1 
%.not.i1480, label %1622, label %1620, !dbg !145 + +1620: ; preds = %__nv_exp2f.exit1479 + %1621 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1523) #3, !dbg !145 + br label %__nv_exp2f.exit1482, !dbg !145 + +1622: ; preds = %__nv_exp2f.exit1479 + %1623 = tail call float @llvm.nvvm.ex2.approx.f(float %1523) #3, !dbg !145 + br label %__nv_exp2f.exit1482, !dbg !145 + +__nv_exp2f.exit1482: ; preds = %1620, %1622 + %.0.i1481 = phi float [ %1621, %1620 ], [ %1623, %1622 ], !dbg !145 + %1624 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !145 + %.not.i1483 = icmp eq i32 %1624, 0, !dbg !145 + br i1 %.not.i1483, label %1627, label %1625, !dbg !145 + +1625: ; preds = %__nv_exp2f.exit1482 + %1626 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1524) #3, !dbg !145 + br label %__nv_exp2f.exit1485, !dbg !145 + +1627: ; preds = %__nv_exp2f.exit1482 + %1628 = tail call float @llvm.nvvm.ex2.approx.f(float %1524) #3, !dbg !145 + br label %__nv_exp2f.exit1485, !dbg !145 + +__nv_exp2f.exit1485: ; preds = %1625, %1627 + %.0.i1484 = phi float [ %1626, %1625 ], [ %1628, %1627 ], !dbg !145 + %1629 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !145 + %.not.i1486 = icmp eq i32 %1629, 0, !dbg !145 + br i1 %.not.i1486, label %1632, label %1630, !dbg !145 + +1630: ; preds = %__nv_exp2f.exit1485 + %1631 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1525) #3, !dbg !145 + br label %__nv_exp2f.exit1488, !dbg !145 + +1632: ; preds = %__nv_exp2f.exit1485 + %1633 = tail call float @llvm.nvvm.ex2.approx.f(float %1525) #3, !dbg !145 + br label %__nv_exp2f.exit1488, !dbg !145 + +__nv_exp2f.exit1488: ; preds = %1630, %1632 + %.0.i1487 = phi float [ %1631, %1630 ], [ %1633, %1632 ], !dbg !145 + %1634 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !145 + %.not.i1489 = icmp eq i32 %1634, 0, !dbg !145 + br i1 %.not.i1489, label %1637, label %1635, !dbg !145 + +1635: ; preds = %__nv_exp2f.exit1488 + %1636 = tail call float 
@llvm.nvvm.ex2.approx.ftz.f(float %1526) #3, !dbg !145 + br label %__nv_exp2f.exit1491, !dbg !145 + +1637: ; preds = %__nv_exp2f.exit1488 + %1638 = tail call float @llvm.nvvm.ex2.approx.f(float %1526) #3, !dbg !145 + br label %__nv_exp2f.exit1491, !dbg !145 + +__nv_exp2f.exit1491: ; preds = %1635, %1637 + %.0.i1490 = phi float [ %1636, %1635 ], [ %1638, %1637 ], !dbg !145 + %1639 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !145 + %.not.i1492 = icmp eq i32 %1639, 0, !dbg !145 + br i1 %.not.i1492, label %1642, label %1640, !dbg !145 + +1640: ; preds = %__nv_exp2f.exit1491 + %1641 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1527) #3, !dbg !145 + br label %__nv_exp2f.exit1494, !dbg !145 + +1642: ; preds = %__nv_exp2f.exit1491 + %1643 = tail call float @llvm.nvvm.ex2.approx.f(float %1527) #3, !dbg !145 + br label %__nv_exp2f.exit1494, !dbg !145 + +__nv_exp2f.exit1494: ; preds = %1640, %1642 + %.0.i1493 = phi float [ %1641, %1640 ], [ %1643, %1642 ], !dbg !145 + %1644 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !145 + %.not.i1495 = icmp eq i32 %1644, 0, !dbg !145 + br i1 %.not.i1495, label %1647, label %1645, !dbg !145 + +1645: ; preds = %__nv_exp2f.exit1494 + %1646 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1528) #3, !dbg !145 + br label %__nv_exp2f.exit1497, !dbg !145 + +1647: ; preds = %__nv_exp2f.exit1494 + %1648 = tail call float @llvm.nvvm.ex2.approx.f(float %1528) #3, !dbg !145 + br label %__nv_exp2f.exit1497, !dbg !145 + +__nv_exp2f.exit1497: ; preds = %1645, %1647 + %.0.i1496 = phi float [ %1646, %1645 ], [ %1648, %1647 ], !dbg !145 + %1649 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !145 + %.not.i1498 = icmp eq i32 %1649, 0, !dbg !145 + br i1 %.not.i1498, label %1652, label %1650, !dbg !145 + +1650: ; preds = %__nv_exp2f.exit1497 + %1651 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1529) #3, !dbg !145 + br label %__nv_exp2f.exit1500, !dbg !145 + +1652: ; preds = %__nv_exp2f.exit1497 + 
%1653 = tail call float @llvm.nvvm.ex2.approx.f(float %1529) #3, !dbg !145 + br label %__nv_exp2f.exit1500, !dbg !145 + +__nv_exp2f.exit1500: ; preds = %1650, %1652 + %.0.i1499 = phi float [ %1651, %1650 ], [ %1653, %1652 ], !dbg !145 + %1654 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !145 + %.not.i1501 = icmp eq i32 %1654, 0, !dbg !145 + br i1 %.not.i1501, label %1657, label %1655, !dbg !145 + +1655: ; preds = %__nv_exp2f.exit1500 + %1656 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1530) #3, !dbg !145 + br label %__nv_exp2f.exit1503, !dbg !145 + +1657: ; preds = %__nv_exp2f.exit1500 + %1658 = tail call float @llvm.nvvm.ex2.approx.f(float %1530) #3, !dbg !145 + br label %__nv_exp2f.exit1503, !dbg !145 + +__nv_exp2f.exit1503: ; preds = %1655, %1657 + %.0.i1502 = phi float [ %1656, %1655 ], [ %1658, %1657 ], !dbg !145 + %1659 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !145 + %.not.i1504 = icmp eq i32 %1659, 0, !dbg !145 + br i1 %.not.i1504, label %1662, label %1660, !dbg !145 + +1660: ; preds = %__nv_exp2f.exit1503 + %1661 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1531) #3, !dbg !145 + br label %__nv_exp2f.exit1506, !dbg !145 + +1662: ; preds = %__nv_exp2f.exit1503 + %1663 = tail call float @llvm.nvvm.ex2.approx.f(float %1531) #3, !dbg !145 + br label %__nv_exp2f.exit1506, !dbg !145 + +__nv_exp2f.exit1506: ; preds = %1660, %1662 + %.0.i1505 = phi float [ %1661, %1660 ], [ %1663, %1662 ], !dbg !145 + %1664 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !145 + %.not.i1507 = icmp eq i32 %1664, 0, !dbg !145 + br i1 %.not.i1507, label %1667, label %1665, !dbg !145 + +1665: ; preds = %__nv_exp2f.exit1506 + %1666 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1532) #3, !dbg !145 + br label %__nv_exp2f.exit1509, !dbg !145 + +1667: ; preds = %__nv_exp2f.exit1506 + %1668 = tail call float @llvm.nvvm.ex2.approx.f(float %1532) #3, !dbg !145 + br label %__nv_exp2f.exit1509, !dbg !145 + +__nv_exp2f.exit1509: 
; preds = %1665, %1667 + %.0.i1508 = phi float [ %1666, %1665 ], [ %1668, %1667 ], !dbg !145 + %1669 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !145 + %.not.i1510 = icmp eq i32 %1669, 0, !dbg !145 + br i1 %.not.i1510, label %1672, label %1670, !dbg !145 + +1670: ; preds = %__nv_exp2f.exit1509 + %1671 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1533) #3, !dbg !145 + br label %__nv_exp2f.exit1512, !dbg !145 + +1672: ; preds = %__nv_exp2f.exit1509 + %1673 = tail call float @llvm.nvvm.ex2.approx.f(float %1533) #3, !dbg !145 + br label %__nv_exp2f.exit1512, !dbg !145 + +__nv_exp2f.exit1512: ; preds = %1670, %1672 + %.0.i1511 = phi float [ %1671, %1670 ], [ %1673, %1672 ], !dbg !145 + %1674 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !145 + %.not.i1513 = icmp eq i32 %1674, 0, !dbg !145 + br i1 %.not.i1513, label %1677, label %1675, !dbg !145 + +1675: ; preds = %__nv_exp2f.exit1512 + %1676 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1534) #3, !dbg !145 + br label %__nv_exp2f.exit1515, !dbg !145 + +1677: ; preds = %__nv_exp2f.exit1512 + %1678 = tail call float @llvm.nvvm.ex2.approx.f(float %1534) #3, !dbg !145 + br label %__nv_exp2f.exit1515, !dbg !145 + +__nv_exp2f.exit1515: ; preds = %1675, %1677 + %.0.i1514 = phi float [ %1676, %1675 ], [ %1678, %1677 ], !dbg !145 + %1679 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !145 + %.not.i1516 = icmp eq i32 %1679, 0, !dbg !145 + br i1 %.not.i1516, label %1682, label %1680, !dbg !145 + +1680: ; preds = %__nv_exp2f.exit1515 + %1681 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1535) #3, !dbg !145 + br label %__nv_exp2f.exit1518, !dbg !145 + +1682: ; preds = %__nv_exp2f.exit1515 + %1683 = tail call float @llvm.nvvm.ex2.approx.f(float %1535) #3, !dbg !145 + br label %__nv_exp2f.exit1518, !dbg !145 + +__nv_exp2f.exit1518: ; preds = %1680, %1682 + %.0.i1517 = phi float [ %1681, %1680 ], [ %1683, %1682 ], !dbg !145 + %1684 = tail call i32 @__nvvm_reflect(ptr 
nonnull @.str) #3, !dbg !145 + %.not.i1519 = icmp eq i32 %1684, 0, !dbg !145 + br i1 %.not.i1519, label %1687, label %1685, !dbg !145 + +1685: ; preds = %__nv_exp2f.exit1518 + %1686 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1536) #3, !dbg !145 + br label %__nv_exp2f.exit1521, !dbg !145 + +1687: ; preds = %__nv_exp2f.exit1518 + %1688 = tail call float @llvm.nvvm.ex2.approx.f(float %1536) #3, !dbg !145 + br label %__nv_exp2f.exit1521, !dbg !145 + +__nv_exp2f.exit1521: ; preds = %1685, %1687 + %.0.i1520 = phi float [ %1686, %1685 ], [ %1688, %1687 ], !dbg !145 + %1689 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !145 + %.not.i1522 = icmp eq i32 %1689, 0, !dbg !145 + br i1 %.not.i1522, label %1692, label %1690, !dbg !145 + +1690: ; preds = %__nv_exp2f.exit1521 + %1691 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1537) #3, !dbg !145 + br label %__nv_exp2f.exit1524, !dbg !145 + +1692: ; preds = %__nv_exp2f.exit1521 + %1693 = tail call float @llvm.nvvm.ex2.approx.f(float %1537) #3, !dbg !145 + br label %__nv_exp2f.exit1524, !dbg !145 + +__nv_exp2f.exit1524: ; preds = %1690, %1692 + %.0.i1523 = phi float [ %1691, %1690 ], [ %1693, %1692 ], !dbg !145 + %1694 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !145 + %.not.i1525 = icmp eq i32 %1694, 0, !dbg !145 + br i1 %.not.i1525, label %1697, label %1695, !dbg !145 + +1695: ; preds = %__nv_exp2f.exit1524 + %1696 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1538) #3, !dbg !145 + br label %__nv_exp2f.exit1527, !dbg !145 + +1697: ; preds = %__nv_exp2f.exit1524 + %1698 = tail call float @llvm.nvvm.ex2.approx.f(float %1538) #3, !dbg !145 + br label %__nv_exp2f.exit1527, !dbg !145 + +__nv_exp2f.exit1527: ; preds = %1695, %1697 + %.0.i1526 = phi float [ %1696, %1695 ], [ %1698, %1697 ], !dbg !145 + %1699 = getelementptr bfloat, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %630, !dbg !113 + tail call void @llvm.nvvm.wgmma.fence.sync.aligned(), 
!dbg !146 + %1700 = add i32 %634, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072) to i32), !dbg !146 + %1701 = lshr exact i32 %1700, 4, !dbg !146 + %1702 = and i32 %1701, 16383, !dbg !146 + %1703 = zext nneg i32 %1702 to i64, !dbg !146 + %1704 = or disjoint i64 %1703, 4611686293372403712, !dbg !146 + %1705 = ptrtoint ptr addrspace(3) %1699 to i32, !dbg !146 + %1706 = lshr exact i32 %1705, 4, !dbg !146 + %1707 = and i32 %1706, 16383, !dbg !146 + %1708 = zext nneg i32 %1707 to i64, !dbg !146 + %1709 = or disjoint i64 %1708, 4611686293338849280, !dbg !146 + %1710 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $32, $33, 0, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,l,l"(i64 %1704, i64 %1709) #3, !dbg !146 + %1711 = add i32 %646, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072) to i32), !dbg !146 + %1712 = lshr exact i32 %1711, 4, !dbg !146 + %1713 = and i32 %1712, 16383, !dbg !146 + %1714 = zext nneg i32 %1713 to i64, !dbg !146 + %1715 = or disjoint i64 %1714, 4611686293372403712, !dbg !146 + %1716 = add i32 %1705, 32, !dbg !146 + %1717 = lshr exact i32 %1716, 4, !dbg !146 + %1718 = and i32 %1717, 16383, !dbg !146 + %1719 = zext nneg i32 %1718 to i64, !dbg !146 + %1720 = or disjoint i64 %1719, 4611686293338849280, !dbg !146 + %1721 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float } %1710, 0, !dbg !146 + %1722 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1710, 1, !dbg !146 + %1723 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1710, 2, !dbg !146 + %1724 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1710, 3, !dbg !146 + %1725 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1710, 4, !dbg !146 + %1726 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1710, 5, !dbg !146 + %1727 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1710, 6, !dbg !146 + %1728 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1710, 7, !dbg !146 + %1729 = extractvalue { float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1710, 8, !dbg !146 + %1730 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1710, 9, !dbg !146 + %1731 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1710, 10, !dbg !146 + %1732 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1710, 11, !dbg !146 + %1733 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1710, 12, !dbg !146 + %1734 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1710, 13, !dbg !146 + %1735 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1710, 14, !dbg !146 + %1736 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float } %1710, 15, !dbg !146 + %1737 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1710, 16, !dbg !146 + %1738 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1710, 17, !dbg !146 + %1739 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1710, 18, !dbg !146 + %1740 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1710, 19, !dbg !146 + %1741 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1710, 20, !dbg !146 + %1742 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1710, 21, !dbg !146 + %1743 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1710, 22, !dbg !146 + 
%1744 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1710, 23, !dbg !146 + %1745 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1710, 24, !dbg !146 + %1746 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1710, 25, !dbg !146 + %1747 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1710, 26, !dbg !146 + %1748 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1710, 27, !dbg !146 + %1749 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1710, 28, !dbg !146 + %1750 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1710, 29, !dbg !146 + %1751 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1710, 30, !dbg !146 + %1752 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1710, 31, !dbg !146 + %1753 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %1721, float %1722, float %1723, float %1724, float %1725, float %1726, float %1727, float %1728, float %1729, float %1730, float %1731, float %1732, float %1733, float %1734, float %1735, float %1736, float %1737, float %1738, float %1739, float %1740, float %1741, float %1742, float %1743, float %1744, float %1745, float %1746, float %1747, float %1748, float %1749, float %1750, float %1751, float %1752, i64 %1715, i64 %1720, i1 true) #3, !dbg !146 + %1754 = add i32 %690, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072) to i32), !dbg !146 + %1755 = lshr exact i32 %1754, 4, !dbg !146 + %1756 = and i32 %1755, 16383, !dbg !146 + %1757 = zext nneg i32 %1756 to i64, !dbg !146 + %1758 = or disjoint i64 %1757, 4611686293372403712, !dbg !146 + %1759 = add i32 %1705, 64, !dbg !146 + %1760 = lshr exact i32 %1759, 4, !dbg !146 + %1761 = and i32 %1760, 
16383, !dbg !146 + %1762 = zext nneg i32 %1761 to i64, !dbg !146 + %1763 = or disjoint i64 %1762, 4611686293338849280, !dbg !146 + %1764 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1753, 0, !dbg !146 + %1765 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1753, 1, !dbg !146 + %1766 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1753, 2, !dbg !146 + %1767 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1753, 3, !dbg !146 + %1768 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1753, 4, !dbg !146 + %1769 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1753, 5, !dbg !146 + %1770 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } 
%1753, 6, !dbg !146 + %1771 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1753, 7, !dbg !146 + %1772 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1753, 8, !dbg !146 + %1773 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1753, 9, !dbg !146 + %1774 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1753, 10, !dbg !146 + %1775 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1753, 11, !dbg !146 + %1776 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1753, 12, !dbg !146 + %1777 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1753, 13, !dbg !146 + %1778 = extractvalue { float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1753, 14, !dbg !146 + %1779 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1753, 15, !dbg !146 + %1780 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1753, 16, !dbg !146 + %1781 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1753, 17, !dbg !146 + %1782 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1753, 18, !dbg !146 + %1783 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1753, 19, !dbg !146 + %1784 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1753, 20, !dbg !146 + %1785 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float } %1753, 21, !dbg !146 + %1786 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1753, 22, !dbg !146 + %1787 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1753, 23, !dbg !146 + %1788 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1753, 24, !dbg !146 + %1789 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1753, 25, !dbg !146 + %1790 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1753, 26, !dbg !146 + %1791 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1753, 27, !dbg !146 + %1792 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1753, 28, !dbg !146 + %1793 = 
extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1753, 29, !dbg !146 + %1794 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1753, 30, !dbg !146 + %1795 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1753, 31, !dbg !146 + %1796 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %1764, float %1765, float %1766, float %1767, float %1768, float %1769, float %1770, float %1771, float %1772, float %1773, float %1774, float %1775, float %1776, float %1777, float %1778, float %1779, float %1780, float %1781, float %1782, float %1783, float %1784, float %1785, float %1786, float %1787, float %1788, float %1789, float %1790, float %1791, float %1792, float %1793, float %1794, float %1795, i64 %1758, i64 %1763, i1 true) #3, !dbg !146 + %1797 = add i32 %734, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr 
addrspace(3) @global_smem, i32 131072) to i32), !dbg !146 + %1798 = lshr exact i32 %1797, 4, !dbg !146 + %1799 = and i32 %1798, 16383, !dbg !146 + %1800 = zext nneg i32 %1799 to i64, !dbg !146 + %1801 = or disjoint i64 %1800, 4611686293372403712, !dbg !146 + %1802 = add i32 %1705, 96, !dbg !146 + %1803 = lshr exact i32 %1802, 4, !dbg !146 + %1804 = and i32 %1803, 16383, !dbg !146 + %1805 = zext nneg i32 %1804 to i64, !dbg !146 + %1806 = or disjoint i64 %1805, 4611686293338849280, !dbg !146 + %1807 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1796, 0, !dbg !146 + %1808 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1796, 1, !dbg !146 + %1809 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1796, 2, !dbg !146 + %1810 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1796, 3, !dbg !146 + %1811 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1796, 4, !dbg !146 + %1812 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1796, 5, !dbg !146 + %1813 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1796, 6, !dbg !146 + %1814 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1796, 7, !dbg !146 + %1815 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1796, 8, !dbg !146 + %1816 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1796, 9, !dbg !146 + %1817 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1796, 10, !dbg !146 + %1818 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1796, 11, !dbg !146 + %1819 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1796, 
12, !dbg !146 + %1820 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1796, 13, !dbg !146 + %1821 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1796, 14, !dbg !146 + %1822 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1796, 15, !dbg !146 + %1823 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1796, 16, !dbg !146 + %1824 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1796, 17, !dbg !146 + %1825 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1796, 18, !dbg !146 + %1826 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1796, 19, !dbg !146 + %1827 = extractvalue { float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1796, 20, !dbg !146 + %1828 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1796, 21, !dbg !146 + %1829 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1796, 22, !dbg !146 + %1830 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1796, 23, !dbg !146 + %1831 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1796, 24, !dbg !146 + %1832 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1796, 25, !dbg !146 + %1833 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1796, 26, !dbg !146 + %1834 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float } %1796, 27, !dbg !146 + %1835 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1796, 28, !dbg !146 + %1836 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1796, 29, !dbg !146 + %1837 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1796, 30, !dbg !146 + %1838 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1796, 31, !dbg !146 + %1839 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %1807, float %1808, float %1809, float %1810, float %1811, float %1812, float %1813, float %1814, float %1815, float %1816, float %1817, float %1818, float %1819, float 
%1820, float %1821, float %1822, float %1823, float %1824, float %1825, float %1826, float %1827, float %1828, float %1829, float %1830, float %1831, float %1832, float %1833, float %1834, float %1835, float %1836, float %1837, float %1838, i64 %1801, i64 %1806, i1 true) #3, !dbg !146 + %1840 = add i32 %778, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072) to i32), !dbg !146 + %1841 = lshr exact i32 %1840, 4, !dbg !146 + %1842 = and i32 %1841, 16383, !dbg !146 + %1843 = zext nneg i32 %1842 to i64, !dbg !146 + %1844 = or disjoint i64 %1843, 4611686293372403712, !dbg !146 + %1845 = add i32 %1705, 8192, !dbg !146 + %1846 = lshr exact i32 %1845, 4, !dbg !146 + %1847 = and i32 %1846, 16383, !dbg !146 + %1848 = zext nneg i32 %1847 to i64, !dbg !146 + %1849 = or disjoint i64 %1848, 4611686293338849280, !dbg !146 + %1850 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1839, 0, !dbg !146 + %1851 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1839, 1, !dbg !146 + %1852 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1839, 2, !dbg !146 + %1853 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1839, 3, !dbg !146 + %1854 = extractvalue { float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1839, 4, !dbg !146 + %1855 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1839, 5, !dbg !146 + %1856 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1839, 6, !dbg !146 + %1857 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1839, 7, !dbg !146 + %1858 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1839, 8, !dbg !146 + %1859 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1839, 9, !dbg !146 + %1860 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1839, 10, !dbg !146 + %1861 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float } %1839, 11, !dbg !146 + %1862 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1839, 12, !dbg !146 + %1863 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1839, 13, !dbg !146 + %1864 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1839, 14, !dbg !146 + %1865 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1839, 15, !dbg !146 + %1866 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1839, 16, !dbg !146 + %1867 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1839, 17, !dbg !146 + %1868 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1839, 18, !dbg 
!146 + %1869 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1839, 19, !dbg !146 + %1870 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1839, 20, !dbg !146 + %1871 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1839, 21, !dbg !146 + %1872 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1839, 22, !dbg !146 + %1873 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1839, 23, !dbg !146 + %1874 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1839, 24, !dbg !146 + %1875 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1839, 25, !dbg !146 + %1876 = extractvalue { float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1839, 26, !dbg !146 + %1877 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1839, 27, !dbg !146 + %1878 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1839, 28, !dbg !146 + %1879 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1839, 29, !dbg !146 + %1880 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1839, 30, !dbg !146 + %1881 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1839, 31, !dbg !146 + %1882 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", 
"=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %1850, float %1851, float %1852, float %1853, float %1854, float %1855, float %1856, float %1857, float %1858, float %1859, float %1860, float %1861, float %1862, float %1863, float %1864, float %1865, float %1866, float %1867, float %1868, float %1869, float %1870, float %1871, float %1872, float %1873, float %1874, float %1875, float %1876, float %1877, float %1878, float %1879, float %1880, float %1881, i64 %1844, i64 %1849, i1 true) #3, !dbg !146 + %1883 = add i32 %822, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072) to i32), !dbg !146 + %1884 = lshr exact i32 %1883, 4, !dbg !146 + %1885 = and i32 %1884, 16383, !dbg !146 + %1886 = zext nneg i32 %1885 to i64, !dbg !146 + %1887 = or disjoint i64 %1886, 4611686293372403712, !dbg !146 + %1888 = add i32 %1705, 8224, !dbg !146 + %1889 = lshr exact i32 %1888, 4, !dbg !146 + %1890 = and i32 %1889, 16383, !dbg !146 + %1891 = zext nneg i32 %1890 to i64, !dbg !146 + %1892 = or disjoint i64 %1891, 4611686293338849280, !dbg !146 + %1893 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1882, 0, !dbg !146 + %1894 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1882, 1, !dbg !146 + %1895 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float } %1882, 2, !dbg !146 + %1896 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1882, 3, !dbg !146 + %1897 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1882, 4, !dbg !146 + %1898 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1882, 5, !dbg !146 + %1899 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1882, 6, !dbg !146 + %1900 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1882, 7, !dbg !146 + %1901 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1882, 8, !dbg !146 + %1902 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1882, 9, !dbg !146 + %1903 = extractvalue { float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1882, 10, !dbg !146 + %1904 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1882, 11, !dbg !146 + %1905 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1882, 12, !dbg !146 + %1906 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1882, 13, !dbg !146 + %1907 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1882, 14, !dbg !146 + %1908 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1882, 15, !dbg !146 + %1909 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1882, 16, !dbg !146 + %1910 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float } %1882, 17, !dbg !146 + %1911 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1882, 18, !dbg !146 + %1912 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1882, 19, !dbg !146 + %1913 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1882, 20, !dbg !146 + %1914 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1882, 21, !dbg !146 + %1915 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1882, 22, !dbg !146 + %1916 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1882, 23, !dbg !146 + %1917 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1882, 24, !dbg !146 + 
%1918 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1882, 25, !dbg !146 + %1919 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1882, 26, !dbg !146 + %1920 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1882, 27, !dbg !146 + %1921 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1882, 28, !dbg !146 + %1922 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1882, 29, !dbg !146 + %1923 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1882, 30, !dbg !146 + %1924 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1882, 31, !dbg !146 + %1925 = tail call { float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %1893, float %1894, float %1895, float %1896, float %1897, float %1898, float %1899, float %1900, float %1901, float %1902, float %1903, float %1904, float %1905, float %1906, float %1907, float %1908, float %1909, float %1910, float %1911, float %1912, float %1913, float %1914, float %1915, float %1916, float %1917, float %1918, float %1919, float %1920, float %1921, float %1922, float %1923, float %1924, i64 %1887, i64 %1892, i1 true) #3, !dbg !146 + %1926 = add i32 %866, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072) to i32), !dbg !146 + %1927 = lshr exact i32 %1926, 4, !dbg !146 + %1928 = and i32 %1927, 16383, !dbg !146 + %1929 = zext nneg i32 %1928 to i64, !dbg !146 + %1930 = or disjoint i64 %1929, 4611686293372403712, !dbg !146 + %1931 = add i32 %1705, 8256, !dbg !146 + %1932 = lshr exact i32 %1931, 4, !dbg !146 + %1933 = and i32 %1932, 16383, !dbg !146 + %1934 = zext nneg i32 %1933 to i64, !dbg !146 + %1935 = or disjoint i64 %1934, 4611686293338849280, !dbg !146 + %1936 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1925, 0, !dbg !146 + %1937 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1925, 1, !dbg !146 + %1938 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1925, 2, !dbg !146 + %1939 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1925, 3, !dbg !146 + %1940 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1925, 4, !dbg !146 + %1941 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1925, 5, !dbg !146 + %1942 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1925, 6, !dbg !146 + %1943 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1925, 7, !dbg !146 + %1944 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float } %1925, 8, !dbg !146 + %1945 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1925, 9, !dbg !146 + %1946 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1925, 10, !dbg !146 + %1947 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1925, 11, !dbg !146 + %1948 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1925, 12, !dbg !146 + %1949 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1925, 13, !dbg !146 + %1950 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1925, 14, !dbg !146 + %1951 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1925, 15, !dbg !146 + %1952 = extractvalue { float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1925, 16, !dbg !146 + %1953 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1925, 17, !dbg !146 + %1954 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1925, 18, !dbg !146 + %1955 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1925, 19, !dbg !146 + %1956 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1925, 20, !dbg !146 + %1957 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1925, 21, !dbg !146 + %1958 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1925, 22, !dbg !146 + %1959 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float } %1925, 23, !dbg !146 + %1960 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1925, 24, !dbg !146 + %1961 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1925, 25, !dbg !146 + %1962 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1925, 26, !dbg !146 + %1963 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1925, 27, !dbg !146 + %1964 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1925, 28, !dbg !146 + %1965 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1925, 29, !dbg !146 + %1966 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1925, 30, !dbg !146 + 
%1967 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1925, 31, !dbg !146 + %1968 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %1936, float %1937, float %1938, float %1939, float %1940, float %1941, float %1942, float %1943, float %1944, float %1945, float %1946, float %1947, float %1948, float %1949, float %1950, float %1951, float %1952, float %1953, float %1954, float %1955, float %1956, float %1957, float %1958, float %1959, float %1960, float %1961, float %1962, float %1963, float %1964, float %1965, float %1966, float %1967, i64 %1930, i64 %1935, i1 true) #3, !dbg !146 + %1969 = add i32 %910, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072) to i32), !dbg !146 + %1970 = lshr exact i32 %1969, 4, !dbg !146 + %1971 = and i32 %1970, 16383, !dbg !146 + %1972 = zext nneg i32 %1971 to i64, !dbg !146 + %1973 = or disjoint i64 %1972, 4611686293372403712, !dbg !146 + %1974 = add i32 %1705, 8288, !dbg !146 + %1975 = lshr exact i32 %1974, 4, !dbg !146 + %1976 = and i32 %1975, 16383, !dbg !146 + %1977 = zext nneg i32 %1976 to i64, !dbg !146 + %1978 = or disjoint i64 %1977, 4611686293338849280, !dbg !146 + %1979 = extractvalue { float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1968, 0, !dbg !146 + %1980 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1968, 1, !dbg !146 + %1981 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1968, 2, !dbg !146 + %1982 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1968, 3, !dbg !146 + %1983 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1968, 4, !dbg !146 + %1984 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1968, 5, !dbg !146 + %1985 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1968, 6, !dbg !146 + %1986 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1968, 7, !dbg !146 + %1987 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1968, 8, !dbg !146 + %1988 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1968, 9, !dbg !146 + %1989 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1968, 10, !dbg !146 + %1990 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1968, 11, !dbg !146 + %1991 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1968, 12, !dbg !146 + %1992 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1968, 13, !dbg !146 + %1993 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } 
%1968, 14, !dbg !146 + %1994 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1968, 15, !dbg !146 + %1995 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1968, 16, !dbg !146 + %1996 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1968, 17, !dbg !146 + %1997 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1968, 18, !dbg !146 + %1998 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1968, 19, !dbg !146 + %1999 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1968, 20, !dbg !146 + %2000 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1968, 21, !dbg !146 + %2001 = extractvalue { float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1968, 22, !dbg !146 + %2002 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1968, 23, !dbg !146 + %2003 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1968, 24, !dbg !146 + %2004 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1968, 25, !dbg !146 + %2005 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1968, 26, !dbg !146 + %2006 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1968, 27, !dbg !146 + %2007 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1968, 28, !dbg !146 + %2008 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float } %1968, 29, !dbg !146 + %2009 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1968, 30, !dbg !146 + %2010 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1968, 31, !dbg !146 + %2011 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %1979, float %1980, float %1981, float %1982, float %1983, float %1984, float %1985, float %1986, float %1987, float %1988, float %1989, float %1990, float %1991, float %1992, float %1993, float %1994, float %1995, float %1996, float %1997, float %1998, float %1999, float %2000, float %2001, float %2002, float %2003, float %2004, float %2005, float %2006, float %2007, float %2008, float %2009, float %2010, i64 %1973, i64 %1978, i1 true) #3, !dbg !146 + %2012 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float 
} %2011, 0, !dbg !146 + %2013 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2011, 1, !dbg !146 + %2014 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2011, 2, !dbg !146 + %2015 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2011, 3, !dbg !146 + %2016 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2011, 4, !dbg !146 + %2017 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2011, 5, !dbg !146 + %2018 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2011, 6, !dbg !146 + %2019 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2011, 7, !dbg !146 + %2020 = extractvalue { float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2011, 8, !dbg !146 + %2021 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2011, 9, !dbg !146 + %2022 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2011, 10, !dbg !146 + %2023 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2011, 11, !dbg !146 + %2024 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2011, 12, !dbg !146 + %2025 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2011, 13, !dbg !146 + %2026 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2011, 14, !dbg !146 + %2027 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float } %2011, 15, !dbg !146 + %2028 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2011, 16, !dbg !146 + %2029 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2011, 17, !dbg !146 + %2030 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2011, 18, !dbg !146 + %2031 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2011, 19, !dbg !146 + %2032 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2011, 20, !dbg !146 + %2033 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2011, 21, !dbg !146 + %2034 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2011, 22, !dbg !146 + %2035 = extractvalue { 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2011, 23, !dbg !146 + %2036 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2011, 24, !dbg !146 + %2037 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2011, 25, !dbg !146 + %2038 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2011, 26, !dbg !146 + %2039 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2011, 27, !dbg !146 + %2040 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2011, 28, !dbg !146 + %2041 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2011, 29, !dbg !146 + %2042 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2011, 30, !dbg !146 + %2043 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2011, 31, !dbg !146 + tail call void @llvm.nvvm.wgmma.commit_group.sync.aligned(), !dbg !146 + %2044 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } asm sideeffect "// wait for regs: $0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37\0A\09wgmma.wait_group.sync.aligned 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37"(float %2012, float %2013, float %2014, float %2015, float %2016, float %2017, float %2018, float %2019, float %2020, float %2021, float %2022, float %2023, float %2024, float %2025, float %2026, float %2027, float %2028, float %2029, float %2030, float %2031, float %2032, float %2033, float %2034, float %2035, float %2036, float %2037, float %2038, float %2039, float %2040, float %2041, float %2042, float %2043, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072), i32 0, i32 0, ptr addrspace(3) %1699, i32 0, i32 0) #3, !dbg !146 + %2045 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2044, 0, !dbg !146 + %2046 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2044, 1, !dbg !146 + %2047 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2044, 2, !dbg !146 + %2048 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2044, 3, !dbg !146 + %2049 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2044, 4, !dbg !146 + %2050 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2044, 5, !dbg !146 + %2051 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2044, 6, !dbg !146 + %2052 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2044, 7, !dbg !146 + %2053 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2044, 8, !dbg !146 + %2054 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2044, 9, !dbg !146 + %2055 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2044, 10, !dbg !146 + %2056 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2044, 11, !dbg !146 + %2057 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr 
addrspace(3), i32, i32 } %2044, 12, !dbg !146 + %2058 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2044, 13, !dbg !146 + %2059 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2044, 14, !dbg !146 + %2060 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2044, 15, !dbg !146 + %2061 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2044, 16, !dbg !146 + %2062 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2044, 17, !dbg !146 + %2063 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2044, 18, 
!dbg !146 + %2064 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2044, 19, !dbg !146 + %2065 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2044, 20, !dbg !146 + %2066 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2044, 21, !dbg !146 + %2067 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2044, 22, !dbg !146 + %2068 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2044, 23, !dbg !146 + %2069 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2044, 24, !dbg !146 + %2070 = extractvalue { 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2044, 25, !dbg !146 + %2071 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2044, 26, !dbg !146 + %2072 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2044, 27, !dbg !146 + %2073 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2044, 28, !dbg !146 + %2074 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2044, 29, !dbg !146 + %2075 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2044, 30, !dbg !146 + %2076 = extractvalue { float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2044, 31, !dbg !146 + %2077 = insertelement <2 x float> poison, float %2045, i64 0, !dbg !125 + %2078 = insertelement <2 x float> %2077, float %2046, i64 1, !dbg !125 + %2079 = fsub <2 x float> %2078, %539, !dbg !125 + %2080 = insertelement <2 x float> poison, float %.0.i1433, i64 0, !dbg !147 + %2081 = insertelement <2 x float> %2080, float %.0.i1436, i64 1, !dbg !147 + %2082 = fmul <2 x float> %2081, %2079, !dbg !147 + %2083 = fptrunc <2 x float> %2082 to <2 x bfloat>, !dbg !148 + %2084 = select <2 x i1> %1094, <2 x bfloat> %2083, <2 x bfloat> zeroinitializer, !dbg !149 + %2085 = insertelement <2 x float> poison, float %2047, i64 0, !dbg !125 + %2086 = insertelement <2 x float> %2085, float %2048, i64 1, !dbg !125 + %2087 = fsub <2 x float> %2086, %533, !dbg !125 + %2088 = insertelement <2 x float> poison, float %.0.i1439, i64 0, !dbg !147 + %2089 = insertelement <2 x float> %2088, float %.0.i1442, i64 1, !dbg !147 + %2090 = fmul <2 x float> %2089, %2087, !dbg !147 + %2091 = fptrunc <2 x float> %2090 to <2 x bfloat>, !dbg !148 + %2092 = select <2 x i1> %1095, <2 x bfloat> %2091, <2 x bfloat> zeroinitializer, !dbg !149 + %2093 = insertelement <2 x float> poison, float %2049, i64 0, !dbg !125 + %2094 = insertelement <2 x float> %2093, float %2050, i64 1, !dbg !125 + %2095 = fsub <2 x float> %2094, %539, !dbg !125 + %2096 = insertelement <2 x float> poison, float %.0.i1445, i64 0, !dbg !147 + %2097 = insertelement <2 x float> %2096, float %.0.i1448, i64 1, !dbg !147 + %2098 = fmul <2 x float> %2097, %2095, !dbg !147 + %2099 = fptrunc <2 x float> %2098 to <2 x bfloat>, !dbg !148 + %2100 = select <2 x i1> %1139, <2 x bfloat> %2099, <2 x bfloat> zeroinitializer, !dbg !149 + %2101 = insertelement <2 x float> poison, float 
%2051, i64 0, !dbg !125 + %2102 = insertelement <2 x float> %2101, float %2052, i64 1, !dbg !125 + %2103 = fsub <2 x float> %2102, %533, !dbg !125 + %2104 = insertelement <2 x float> poison, float %.0.i1451, i64 0, !dbg !147 + %2105 = insertelement <2 x float> %2104, float %.0.i1454, i64 1, !dbg !147 + %2106 = fmul <2 x float> %2105, %2103, !dbg !147 + %2107 = fptrunc <2 x float> %2106 to <2 x bfloat>, !dbg !148 + %2108 = select <2 x i1> %1140, <2 x bfloat> %2107, <2 x bfloat> zeroinitializer, !dbg !149 + %2109 = insertelement <2 x float> poison, float %2053, i64 0, !dbg !125 + %2110 = insertelement <2 x float> %2109, float %2054, i64 1, !dbg !125 + %2111 = fsub <2 x float> %2110, %539, !dbg !125 + %2112 = insertelement <2 x float> poison, float %.0.i1457, i64 0, !dbg !147 + %2113 = insertelement <2 x float> %2112, float %.0.i1460, i64 1, !dbg !147 + %2114 = fmul <2 x float> %2113, %2111, !dbg !147 + %2115 = fptrunc <2 x float> %2114 to <2 x bfloat>, !dbg !148 + %2116 = select <2 x i1> %1184, <2 x bfloat> %2115, <2 x bfloat> zeroinitializer, !dbg !149 + %2117 = insertelement <2 x float> poison, float %2055, i64 0, !dbg !125 + %2118 = insertelement <2 x float> %2117, float %2056, i64 1, !dbg !125 + %2119 = fsub <2 x float> %2118, %533, !dbg !125 + %2120 = insertelement <2 x float> poison, float %.0.i1463, i64 0, !dbg !147 + %2121 = insertelement <2 x float> %2120, float %.0.i1466, i64 1, !dbg !147 + %2122 = fmul <2 x float> %2121, %2119, !dbg !147 + %2123 = fptrunc <2 x float> %2122 to <2 x bfloat>, !dbg !148 + %2124 = select <2 x i1> %1185, <2 x bfloat> %2123, <2 x bfloat> zeroinitializer, !dbg !149 + %2125 = insertelement <2 x float> poison, float %2057, i64 0, !dbg !125 + %2126 = insertelement <2 x float> %2125, float %2058, i64 1, !dbg !125 + %2127 = fsub <2 x float> %2126, %539, !dbg !125 + %2128 = insertelement <2 x float> poison, float %.0.i1469, i64 0, !dbg !147 + %2129 = insertelement <2 x float> %2128, float %.0.i1472, i64 1, !dbg !147 + %2130 = fmul <2 x 
float> %2129, %2127, !dbg !147 + %2131 = fptrunc <2 x float> %2130 to <2 x bfloat>, !dbg !148 + %2132 = select <2 x i1> %1229, <2 x bfloat> %2131, <2 x bfloat> zeroinitializer, !dbg !149 + %2133 = insertelement <2 x float> poison, float %2059, i64 0, !dbg !125 + %2134 = insertelement <2 x float> %2133, float %2060, i64 1, !dbg !125 + %2135 = fsub <2 x float> %2134, %533, !dbg !125 + %2136 = insertelement <2 x float> poison, float %.0.i1475, i64 0, !dbg !147 + %2137 = insertelement <2 x float> %2136, float %.0.i1478, i64 1, !dbg !147 + %2138 = fmul <2 x float> %2137, %2135, !dbg !147 + %2139 = fptrunc <2 x float> %2138 to <2 x bfloat>, !dbg !148 + %2140 = select <2 x i1> %1230, <2 x bfloat> %2139, <2 x bfloat> zeroinitializer, !dbg !149 + %2141 = insertelement <2 x float> poison, float %2061, i64 0, !dbg !125 + %2142 = insertelement <2 x float> %2141, float %2062, i64 1, !dbg !125 + %2143 = fsub <2 x float> %2142, %539, !dbg !125 + %2144 = insertelement <2 x float> poison, float %.0.i1481, i64 0, !dbg !147 + %2145 = insertelement <2 x float> %2144, float %.0.i1484, i64 1, !dbg !147 + %2146 = fmul <2 x float> %2145, %2143, !dbg !147 + %2147 = fptrunc <2 x float> %2146 to <2 x bfloat>, !dbg !148 + %2148 = select <2 x i1> %1274, <2 x bfloat> %2147, <2 x bfloat> zeroinitializer, !dbg !149 + %2149 = insertelement <2 x float> poison, float %2063, i64 0, !dbg !125 + %2150 = insertelement <2 x float> %2149, float %2064, i64 1, !dbg !125 + %2151 = fsub <2 x float> %2150, %533, !dbg !125 + %2152 = insertelement <2 x float> poison, float %.0.i1487, i64 0, !dbg !147 + %2153 = insertelement <2 x float> %2152, float %.0.i1490, i64 1, !dbg !147 + %2154 = fmul <2 x float> %2153, %2151, !dbg !147 + %2155 = fptrunc <2 x float> %2154 to <2 x bfloat>, !dbg !148 + %2156 = select <2 x i1> %1275, <2 x bfloat> %2155, <2 x bfloat> zeroinitializer, !dbg !149 + %2157 = insertelement <2 x float> poison, float %2065, i64 0, !dbg !125 + %2158 = insertelement <2 x float> %2157, float %2066, i64 
1, !dbg !125 + %2159 = fsub <2 x float> %2158, %539, !dbg !125 + %2160 = insertelement <2 x float> poison, float %.0.i1493, i64 0, !dbg !147 + %2161 = insertelement <2 x float> %2160, float %.0.i1496, i64 1, !dbg !147 + %2162 = fmul <2 x float> %2161, %2159, !dbg !147 + %2163 = fptrunc <2 x float> %2162 to <2 x bfloat>, !dbg !148 + %2164 = select <2 x i1> %1319, <2 x bfloat> %2163, <2 x bfloat> zeroinitializer, !dbg !149 + %2165 = insertelement <2 x float> poison, float %2067, i64 0, !dbg !125 + %2166 = insertelement <2 x float> %2165, float %2068, i64 1, !dbg !125 + %2167 = fsub <2 x float> %2166, %533, !dbg !125 + %2168 = insertelement <2 x float> poison, float %.0.i1499, i64 0, !dbg !147 + %2169 = insertelement <2 x float> %2168, float %.0.i1502, i64 1, !dbg !147 + %2170 = fmul <2 x float> %2169, %2167, !dbg !147 + %2171 = fptrunc <2 x float> %2170 to <2 x bfloat>, !dbg !148 + %2172 = select <2 x i1> %1320, <2 x bfloat> %2171, <2 x bfloat> zeroinitializer, !dbg !149 + %2173 = insertelement <2 x float> poison, float %2069, i64 0, !dbg !125 + %2174 = insertelement <2 x float> %2173, float %2070, i64 1, !dbg !125 + %2175 = fsub <2 x float> %2174, %539, !dbg !125 + %2176 = insertelement <2 x float> poison, float %.0.i1505, i64 0, !dbg !147 + %2177 = insertelement <2 x float> %2176, float %.0.i1508, i64 1, !dbg !147 + %2178 = fmul <2 x float> %2177, %2175, !dbg !147 + %2179 = fptrunc <2 x float> %2178 to <2 x bfloat>, !dbg !148 + %2180 = select <2 x i1> %1364, <2 x bfloat> %2179, <2 x bfloat> zeroinitializer, !dbg !149 + %2181 = insertelement <2 x float> poison, float %2071, i64 0, !dbg !125 + %2182 = insertelement <2 x float> %2181, float %2072, i64 1, !dbg !125 + %2183 = fsub <2 x float> %2182, %533, !dbg !125 + %2184 = insertelement <2 x float> poison, float %.0.i1511, i64 0, !dbg !147 + %2185 = insertelement <2 x float> %2184, float %.0.i1514, i64 1, !dbg !147 + %2186 = fmul <2 x float> %2185, %2183, !dbg !147 + %2187 = fptrunc <2 x float> %2186 to <2 x bfloat>, 
!dbg !148 + %2188 = select <2 x i1> %1365, <2 x bfloat> %2187, <2 x bfloat> zeroinitializer, !dbg !149 + %2189 = insertelement <2 x float> poison, float %2073, i64 0, !dbg !125 + %2190 = insertelement <2 x float> %2189, float %2074, i64 1, !dbg !125 + %2191 = fsub <2 x float> %2190, %539, !dbg !125 + %2192 = insertelement <2 x float> poison, float %.0.i1517, i64 0, !dbg !147 + %2193 = insertelement <2 x float> %2192, float %.0.i1520, i64 1, !dbg !147 + %2194 = fmul <2 x float> %2193, %2191, !dbg !147 + %2195 = fptrunc <2 x float> %2194 to <2 x bfloat>, !dbg !148 + %2196 = select <2 x i1> %1409, <2 x bfloat> %2195, <2 x bfloat> zeroinitializer, !dbg !149 + %2197 = insertelement <2 x float> poison, float %2075, i64 0, !dbg !125 + %2198 = insertelement <2 x float> %2197, float %2076, i64 1, !dbg !125 + %2199 = fsub <2 x float> %2198, %533, !dbg !125 + %2200 = insertelement <2 x float> poison, float %.0.i1523, i64 0, !dbg !147 + %2201 = insertelement <2 x float> %2200, float %.0.i1526, i64 1, !dbg !147 + %2202 = fmul <2 x float> %2201, %2199, !dbg !147 + %2203 = fptrunc <2 x float> %2202 to <2 x bfloat>, !dbg !148 + %2204 = select <2 x i1> %1410, <2 x bfloat> %2203, <2 x bfloat> zeroinitializer, !dbg !149 + %2205 = bitcast <2 x bfloat> %2084 to i32, !dbg !150 + %2206 = bitcast <2 x bfloat> %2092 to i32, !dbg !150 + %2207 = bitcast <2 x bfloat> %2100 to i32, !dbg !150 + %2208 = bitcast <2 x bfloat> %2108 to i32, !dbg !150 + %2209 = bitcast <2 x bfloat> %2116 to i32, !dbg !150 + %2210 = bitcast <2 x bfloat> %2124 to i32, !dbg !150 + %2211 = bitcast <2 x bfloat> %2132 to i32, !dbg !150 + %2212 = bitcast <2 x bfloat> %2140 to i32, !dbg !150 + %2213 = bitcast <2 x bfloat> %2148 to i32, !dbg !150 + %2214 = bitcast <2 x bfloat> %2156 to i32, !dbg !150 + %2215 = bitcast <2 x bfloat> %2164 to i32, !dbg !150 + %2216 = bitcast <2 x bfloat> %2172 to i32, !dbg !150 + %2217 = bitcast <2 x bfloat> %2180 to i32, !dbg !150 + %2218 = bitcast <2 x bfloat> %2188 to i32, !dbg !150 + %2219 
= bitcast <2 x bfloat> %2196 to i32, !dbg !150 + %2220 = bitcast <2 x bfloat> %2204 to i32, !dbg !150 + tail call void @llvm.nvvm.wgmma.fence.sync.aligned(), !dbg !150 + %2221 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %544, float %545, float %546, float %547, float %548, float %549, float %550, float %551, float %552, float %553, float %554, float %555, float %556, float %557, float %558, float %559, float %560, float %561, float %562, float %563, float %564, float %565, float %566, float %567, float %568, float %569, float %570, float %571, float %572, float %573, float %574, float %575, float %576, float %577, float %578, float %579, float %580, float %581, float %582, float %583, float %584, float %585, float %586, float %587, float %588, float %589, float %590, float %591, float %592, float %593, float 
%594, float %595, float %596, float %597, float %598, float %599, float %600, float %601, float %602, float %603, float %604, float %605, float %606, float %607, i32 %2205, i32 %2206, i32 %2207, i32 %2208, i64 %644, i1 true) #3, !dbg !150 + %2222 = add i32 %640, 2048, !dbg !150 + %2223 = lshr exact i32 %2222, 4, !dbg !150 + %2224 = and i32 %2223, 16383, !dbg !150 + %2225 = zext nneg i32 %2224 to i64, !dbg !150 + %2226 = or disjoint i64 %2225, 4611686293338849280, !dbg !150 + %2227 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 0, !dbg !150 + %2228 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 1, !dbg !150 + %2229 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 2, !dbg !150 + %2230 = extractvalue { float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 3, !dbg !150 + %2231 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 4, !dbg !150 + %2232 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 5, !dbg !150 + %2233 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 6, !dbg !150 + %2234 = extractvalue { float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 7, !dbg !150 + %2235 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 8, !dbg !150 + %2236 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 9, !dbg !150 + %2237 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 10, !dbg !150 + %2238 = extractvalue { float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 11, !dbg !150 + %2239 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 12, !dbg !150 + %2240 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 13, !dbg !150 + %2241 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 14, !dbg !150 + %2242 = extractvalue { float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 15, !dbg !150 + %2243 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 16, !dbg !150 + %2244 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 17, !dbg !150 + %2245 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 18, !dbg !150 + %2246 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 19, !dbg !150 + %2247 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 20, !dbg !150 + %2248 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 21, !dbg !150 + %2249 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 22, !dbg !150 + %2250 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 23, !dbg !150 + %2251 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 24, !dbg !150 + %2252 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 25, !dbg !150 + %2253 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 26, !dbg !150 + %2254 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 27, !dbg !150 + %2255 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 28, !dbg !150 + %2256 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 29, !dbg !150 + %2257 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 30, !dbg !150 + %2258 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 31, !dbg !150 + %2259 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 32, !dbg !150 + %2260 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 33, !dbg !150 + %2261 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 34, !dbg !150 + %2262 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 35, !dbg !150 + %2263 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 36, !dbg !150 + %2264 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 37, !dbg !150 + %2265 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 38, !dbg !150 + %2266 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 39, !dbg !150 + %2267 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 40, !dbg !150 + %2268 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 41, !dbg !150 + %2269 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 42, !dbg !150 + %2270 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 43, !dbg !150 + %2271 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 44, !dbg !150 + %2272 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 45, !dbg !150 + %2273 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 46, !dbg !150 + %2274 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 47, !dbg !150 + %2275 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 48, !dbg !150 + %2276 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 49, !dbg !150 + %2277 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 50, !dbg !150 + %2278 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 51, !dbg !150 + %2279 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 52, !dbg !150 + %2280 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 53, !dbg !150 + %2281 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 54, !dbg !150 + %2282 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 55, !dbg !150 + %2283 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 56, !dbg !150 + %2284 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 57, !dbg !150 + %2285 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 58, !dbg !150 + %2286 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 59, !dbg !150 + %2287 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 60, !dbg !150 + %2288 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 61, !dbg !150 + %2289 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 62, !dbg !150 + %2290 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 63, !dbg !150 + %2291 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %2227, float %2228, float %2229, float %2230, float %2231, float %2232, float %2233, float %2234, float %2235, float %2236, float %2237, float %2238, float %2239, float %2240, float %2241, float %2242, float %2243, float %2244, float %2245, float %2246, float %2247, float %2248, float %2249, float %2250, float %2251, float %2252, float %2253, float %2254, float %2255, float %2256, float %2257, float %2258, float %2259, float %2260, float %2261, float %2262, float %2263, float %2264, float %2265, float %2266, float %2267, float 
%2268, float %2269, float %2270, float %2271, float %2272, float %2273, float %2274, float %2275, float %2276, float %2277, float %2278, float %2279, float %2280, float %2281, float %2282, float %2283, float %2284, float %2285, float %2286, float %2287, float %2288, float %2289, float %2290, i32 %2209, i32 %2210, i32 %2211, i32 %2212, i64 %2226, i1 true) #3, !dbg !150 + %2292 = add i32 %640, 4096, !dbg !150 + %2293 = lshr exact i32 %2292, 4, !dbg !150 + %2294 = and i32 %2293, 16383, !dbg !150 + %2295 = zext nneg i32 %2294 to i64, !dbg !150 + %2296 = or disjoint i64 %2295, 4611686293338849280, !dbg !150 + %2297 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 0, !dbg !150 + %2298 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 1, !dbg !150 + %2299 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float } %2291, 2, !dbg !150 + %2300 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 3, !dbg !150 + %2301 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 4, !dbg !150 + %2302 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 5, !dbg !150 + %2303 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float } %2291, 6, !dbg !150 + %2304 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 7, !dbg !150 + %2305 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 8, !dbg !150 + %2306 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 9, !dbg !150 + %2307 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float } %2291, 10, !dbg !150 + %2308 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 11, !dbg !150 + %2309 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 12, !dbg !150 + %2310 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 13, !dbg !150 + %2311 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float } %2291, 14, !dbg !150 + %2312 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 15, !dbg !150 + %2313 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 16, !dbg !150 + %2314 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 17, !dbg !150 + %2315 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float 
} %2291, 18, !dbg !150 + %2316 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 19, !dbg !150 + %2317 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 20, !dbg !150 + %2318 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 21, !dbg !150 + %2319 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 22, !dbg 
!150 + %2320 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 23, !dbg !150 + %2321 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 24, !dbg !150 + %2322 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 25, !dbg !150 + %2323 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 26, !dbg !150 + %2324 = 
extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 27, !dbg !150 + %2325 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 28, !dbg !150 + %2326 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 29, !dbg !150 + %2327 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 30, !dbg !150 + %2328 = extractvalue { 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 31, !dbg !150 + %2329 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 32, !dbg !150 + %2330 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 33, !dbg !150 + %2331 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 34, !dbg !150 + %2332 = extractvalue { float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 35, !dbg !150 + %2333 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 36, !dbg !150 + %2334 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 37, !dbg !150 + %2335 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 38, !dbg !150 + %2336 = extractvalue { float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 39, !dbg !150 + %2337 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 40, !dbg !150 + %2338 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 41, !dbg !150 + %2339 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 42, !dbg !150 + %2340 = extractvalue { float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 43, !dbg !150 + %2341 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 44, !dbg !150 + %2342 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 45, !dbg !150 + %2343 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 46, !dbg !150 + %2344 = extractvalue { float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 47, !dbg !150 + %2345 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 48, !dbg !150 + %2346 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 49, !dbg !150 + %2347 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 50, !dbg !150 + %2348 = extractvalue { float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 51, !dbg !150 + %2349 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 52, !dbg !150 + %2350 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 53, !dbg !150 + %2351 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 54, !dbg !150 + %2352 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 55, !dbg !150 + %2353 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 56, !dbg !150 + %2354 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 57, !dbg !150 + %2355 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 58, !dbg !150 + %2356 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 59, !dbg !150 + %2357 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 60, !dbg !150 + %2358 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 61, !dbg !150 + %2359 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 62, !dbg !150 + %2360 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 63, !dbg !150 + %2361 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %2297, float %2298, float %2299, float %2300, float %2301, float %2302, float %2303, float %2304, float %2305, float %2306, float %2307, float %2308, float %2309, float %2310, float %2311, float %2312, float %2313, float %2314, float %2315, float %2316, float %2317, float %2318, float %2319, float %2320, float %2321, float %2322, float %2323, float %2324, float %2325, float %2326, float %2327, float %2328, 
float %2329, float %2330, float %2331, float %2332, float %2333, float %2334, float %2335, float %2336, float %2337, float %2338, float %2339, float %2340, float %2341, float %2342, float %2343, float %2344, float %2345, float %2346, float %2347, float %2348, float %2349, float %2350, float %2351, float %2352, float %2353, float %2354, float %2355, float %2356, float %2357, float %2358, float %2359, float %2360, i32 %2213, i32 %2214, i32 %2215, i32 %2216, i64 %2296, i1 true) #3, !dbg !150 + %2362 = add i32 %640, 6144, !dbg !150 + %2363 = lshr exact i32 %2362, 4, !dbg !150 + %2364 = and i32 %2363, 16383, !dbg !150 + %2365 = zext nneg i32 %2364 to i64, !dbg !150 + %2366 = or disjoint i64 %2365, 4611686293338849280, !dbg !150 + %2367 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 0, !dbg !150 + %2368 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 1, !dbg !150 + %2369 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 2, !dbg !150 + %2370 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 3, !dbg !150 + %2371 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 4, !dbg !150 + %2372 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 5, !dbg !150 + %2373 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 6, !dbg !150 + %2374 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 7, !dbg !150 + %2375 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 8, !dbg !150 + %2376 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 9, !dbg !150 + %2377 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 10, !dbg !150 + %2378 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 11, !dbg !150 + %2379 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 12, !dbg !150 + %2380 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 13, !dbg !150 + %2381 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 14, !dbg !150 + %2382 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 15, !dbg !150 + %2383 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 16, !dbg !150 + %2384 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 17, !dbg !150 + %2385 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 18, !dbg !150 + %2386 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 19, !dbg !150 + %2387 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 20, !dbg !150 + %2388 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 21, !dbg !150 + %2389 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 22, !dbg !150 + %2390 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 23, !dbg !150 + %2391 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 24, !dbg !150 + %2392 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 25, !dbg !150 + %2393 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 26, !dbg !150 + %2394 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 27, !dbg !150 + %2395 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 28, !dbg !150 + %2396 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 29, !dbg !150 + %2397 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float } %2361, 30, !dbg !150 + %2398 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 31, !dbg !150 + %2399 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 32, !dbg !150 + %2400 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 33, !dbg !150 + %2401 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float } %2361, 34, !dbg !150 + %2402 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 35, !dbg !150 + %2403 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 36, !dbg !150 + %2404 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 37, !dbg !150 + %2405 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float } %2361, 38, !dbg !150 + %2406 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 39, !dbg !150 + %2407 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 40, !dbg !150 + %2408 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 41, !dbg !150 + %2409 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float } %2361, 42, !dbg !150 + %2410 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 43, !dbg !150 + %2411 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 44, !dbg !150 + %2412 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 45, !dbg !150 + %2413 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float } %2361, 46, !dbg !150 + %2414 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 47, !dbg !150 + %2415 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 48, !dbg !150 + %2416 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 49, !dbg !150 + %2417 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float } %2361, 50, !dbg !150 + %2418 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 51, !dbg !150 + %2419 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 52, !dbg !150 + %2420 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 53, !dbg !150 + %2421 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } 
%2361, 54, !dbg !150 + %2422 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 55, !dbg !150 + %2423 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 56, !dbg !150 + %2424 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 57, !dbg !150 + %2425 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 58, !dbg 
!150 + %2426 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 59, !dbg !150 + %2427 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 60, !dbg !150 + %2428 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 61, !dbg !150 + %2429 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 62, !dbg !150 + %2430 = 
extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 63, !dbg !150 + %2431 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %2367, float %2368, float %2369, float %2370, float %2371, float %2372, float %2373, float %2374, float %2375, float %2376, float %2377, float %2378, float %2379, float %2380, float %2381, float %2382, float %2383, float %2384, float %2385, float %2386, float %2387, float %2388, 
float %2389, float %2390, float %2391, float %2392, float %2393, float %2394, float %2395, float %2396, float %2397, float %2398, float %2399, float %2400, float %2401, float %2402, float %2403, float %2404, float %2405, float %2406, float %2407, float %2408, float %2409, float %2410, float %2411, float %2412, float %2413, float %2414, float %2415, float %2416, float %2417, float %2418, float %2419, float %2420, float %2421, float %2422, float %2423, float %2424, float %2425, float %2426, float %2427, float %2428, float %2429, float %2430, i32 %2217, i32 %2218, i32 %2219, i32 %2220, i64 %2366, i1 true) #3, !dbg !150 + %2432 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 0, !dbg !150 + %2433 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 1, !dbg !150 + %2434 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 2, !dbg !150 + %2435 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 3, !dbg !150 + %2436 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 4, !dbg !150 + %2437 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 5, !dbg !150 + %2438 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float } %2431, 6, !dbg !150 + %2439 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 7, !dbg !150 + %2440 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 8, !dbg !150 + %2441 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 9, !dbg !150 + %2442 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float } %2431, 10, !dbg !150 + %2443 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 11, !dbg !150 + %2444 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 12, !dbg !150 + %2445 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 13, !dbg !150 + %2446 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float } %2431, 14, !dbg !150 + %2447 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 15, !dbg !150 + %2448 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 16, !dbg !150 + %2449 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 17, !dbg !150 + %2450 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float } %2431, 18, !dbg !150 + %2451 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 19, !dbg !150 + %2452 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 20, !dbg !150 + %2453 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 21, !dbg !150 + %2454 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float 
} %2431, 22, !dbg !150 + %2455 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 23, !dbg !150 + %2456 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 24, !dbg !150 + %2457 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 25, !dbg !150 + %2458 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 26, !dbg 
!150 + %2459 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 27, !dbg !150 + %2460 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 28, !dbg !150 + %2461 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 29, !dbg !150 + %2462 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 30, !dbg !150 + %2463 = 
extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 31, !dbg !150 + %2464 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 32, !dbg !150 + %2465 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 33, !dbg !150 + %2466 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 34, !dbg !150 + %2467 = extractvalue { 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 35, !dbg !150 + %2468 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 36, !dbg !150 + %2469 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 37, !dbg !150 + %2470 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 38, !dbg !150 + %2471 = extractvalue { float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 39, !dbg !150 + %2472 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 40, !dbg !150 + %2473 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 41, !dbg !150 + %2474 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 42, !dbg !150 + %2475 = extractvalue { float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 43, !dbg !150 + %2476 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 44, !dbg !150 + %2477 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 45, !dbg !150 + %2478 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 46, !dbg !150 + %2479 = extractvalue { float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 47, !dbg !150 + %2480 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 48, !dbg !150 + %2481 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 49, !dbg !150 + %2482 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 50, !dbg !150 + %2483 = extractvalue { float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 51, !dbg !150 + %2484 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 52, !dbg !150 + %2485 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 53, !dbg !150 + %2486 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 54, !dbg !150 + %2487 = extractvalue { float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 55, !dbg !150 + %2488 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 56, !dbg !150 + %2489 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 57, !dbg !150 + %2490 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 58, !dbg !150 + %2491 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 59, !dbg !150 + %2492 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 60, !dbg !150 + %2493 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 61, !dbg !150 + %2494 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 62, !dbg !150 + %2495 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 63, !dbg !150 + tail call void @llvm.nvvm.wgmma.commit_group.sync.aligned(), !dbg !150 + %2496 = insertelement <2 x i32> poison, i32 %541, i64 0, !dbg !116 + %2497 = shufflevector <2 x i32> %2496, <2 x i32> poison, <2 x i32> zeroinitializer, !dbg !116 + %2498 = add <2 x i32> %2497, %616, !dbg !116 + %2499 = add <2 x i32> %2497, %615, !dbg !116 + %2500 = add <2 x i32> %2497, %614, !dbg !116 + %2501 = add <2 x i32> %2497, %613, !dbg !116 + %2502 = add <2 x i32> %2497, %612, !dbg !116 + %2503 = add <2 x i32> %2497, %611, !dbg !116 + %2504 = add <2 x i32> %2497, %610, !dbg !116 + %2505 = add <2 x i32> %2497, %609, !dbg !116 + %2506 = add nuw nsw i32 %608, 1, !dbg !110 + %2507 = lshr i32 %2506, 1, !dbg !151 + %2508 = zext nneg i32 %2507 to i64, !dbg !152 + %2509 = getelementptr i32, ptr addrspace(1) %384, i64 %2508, !dbg !152 + %2510 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #3, !dbg !153 + %2511 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %2509, i64 %2510, i1 %618) #3, !dbg !153 + %2512 = add nuw nsw i32 %2507, 1, !dbg !154 + %2513 = icmp slt i32 %2512, %389, !dbg !155 + %2514 = getelementptr i8, ptr addrspace(1) %2509, i64 4, !dbg !156 + %2515 = and i1 %618, %2513, !dbg !110 + %2516 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #3, !dbg !157 + %2517 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 
], $2;", "=r,l,l,b"(ptr addrspace(1) %2514, i64 %2516, i1 %2515) #3, !dbg !157 + %2518 = and i32 %608, 1, !dbg !158 + %2519 = sub i32 %2517, %2511, !dbg !159 + %2520 = shl i32 %2519, 7, !dbg !160 + %2521 = add i32 %2520, -64, !dbg !161 + %2522 = xor i32 %2518, 1, !dbg !162 + %2523 = mul nuw nsw i32 %2521, %2522, !dbg !162 + %2524 = shl nuw nsw i32 %2518, 6, !dbg !163 + %2525 = add i32 %2523, %2524, !dbg !164 + %2526 = shl i32 %2525, 7, !dbg !165 + %2527 = sext i32 %2526 to i64, !dbg !114 + %2528 = getelementptr bfloat, ptr addrspace(1) %.pn8741535, i64 %2527, !dbg !114 + %2529 = getelementptr bfloat, ptr addrspace(1) %.pn8581536, i64 %2527, !dbg !114 + %2530 = getelementptr bfloat, ptr addrspace(1) %.pn8421537, i64 %2527, !dbg !114 + %2531 = getelementptr bfloat, ptr addrspace(1) %.pn8261538, i64 %2527, !dbg !114 + %2532 = getelementptr bfloat, ptr addrspace(1) %.pn9461543, i64 %2527, !dbg !115 + %2533 = getelementptr bfloat, ptr addrspace(1) %.pn9301544, i64 %2527, !dbg !115 + %2534 = getelementptr bfloat, ptr addrspace(1) %.pn9141545, i64 %2527, !dbg !115 + %2535 = getelementptr bfloat, ptr addrspace(1) %.pn8981546, i64 %2527, !dbg !115 + %2536 = add i32 %2525, %.pn8821539, !dbg !116 + %2537 = add i32 %2525, %.pn8801540, !dbg !116 + %2538 = add i32 %2525, %.pn8781541, !dbg !116 + %2539 = add i32 %2525, %.pn8761542, !dbg !116 + %2540 = add i32 %543, 1, !dbg !110 + %2541 = icmp sgt i32 %2540, 2, !dbg !110 + %2542 = select i1 %2541, i32 0, i32 %2540, !dbg !110 + %2543 = icmp slt i32 %2536, %19, !dbg !112 + %2544 = icmp slt i32 %2537, %19, !dbg !112 + %2545 = icmp slt i32 %2538, %19, !dbg !112 + %2546 = icmp slt i32 %2539, %19, !dbg !112 + %2547 = shl i32 %2542, 13, !dbg !113 + %2548 = getelementptr bfloat, ptr addrspace(3) @global_smem, i32 %2547, !dbg !113 + %2549 = and i1 %617, %2543, !dbg !110 + %2550 = and i1 %617, %2544, !dbg !110 + %2551 = and i1 %617, %2545, !dbg !110 + %2552 = and i1 %617, %2546, !dbg !110 + tail call void 
@llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !113 + %2553 = getelementptr inbounds nuw i8, ptr addrspace(3) %2548, i32 %448, !dbg !113 + %2554 = select i1 %2549, i32 16, i32 0, !dbg !113 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %2553, ptr addrspace(1) %2528, i32 %2554) #3, !dbg !113 + %2555 = getelementptr inbounds nuw i8, ptr addrspace(3) %2548, i32 %451, !dbg !113 + %2556 = select i1 %2550, i32 16, i32 0, !dbg !113 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %2555, ptr addrspace(1) %2529, i32 %2556) #3, !dbg !113 + %2557 = getelementptr inbounds nuw i8, ptr addrspace(3) %2548, i32 %454, !dbg !113 + %2558 = select i1 %2551, i32 16, i32 0, !dbg !113 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %2557, ptr addrspace(1) %2530, i32 %2558) #3, !dbg !113 + %2559 = getelementptr inbounds nuw i8, ptr addrspace(3) %2548, i32 %457, !dbg !113 + %2560 = select i1 %2552, i32 16, i32 0, !dbg !113 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %2559, ptr addrspace(1) %2531, i32 %2560) #3, !dbg !113 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !113 + %2561 = getelementptr bfloat, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %2547, !dbg !113 + %2562 = getelementptr inbounds nuw i8, ptr addrspace(3) %2561, i32 %448, !dbg !113 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %2562, ptr addrspace(1) %2532, i32 %2554) #3, !dbg !113 + %2563 = getelementptr inbounds nuw i8, ptr addrspace(3) %2561, i32 %451, !dbg !113 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) 
nonnull %2563, ptr addrspace(1) %2533, i32 %2556) #3, !dbg !113 + %2564 = getelementptr inbounds nuw i8, ptr addrspace(3) %2561, i32 %454, !dbg !113 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %2564, ptr addrspace(1) %2534, i32 %2558) #3, !dbg !113 + %2565 = getelementptr inbounds nuw i8, ptr addrspace(3) %2561, i32 %457, !dbg !113 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %2565, ptr addrspace(1) %2535, i32 %2560) #3, !dbg !113 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !113 + %exitcond.not = icmp eq i32 %2506, %smax, !dbg !110 + br i1 %exitcond.not, label %._crit_edge, label %540, !dbg !110 + +._crit_edge: ; preds = %__nv_exp2f.exit1527, %73 + %2566 = phi float [ 0.000000e+00, %73 ], [ %2432, %__nv_exp2f.exit1527 ] + %2567 = phi float [ 0.000000e+00, %73 ], [ %2433, %__nv_exp2f.exit1527 ] + %2568 = phi float [ 0.000000e+00, %73 ], [ %2434, %__nv_exp2f.exit1527 ] + %2569 = phi float [ 0.000000e+00, %73 ], [ %2435, %__nv_exp2f.exit1527 ] + %2570 = phi float [ 0.000000e+00, %73 ], [ %2436, %__nv_exp2f.exit1527 ] + %2571 = phi float [ 0.000000e+00, %73 ], [ %2437, %__nv_exp2f.exit1527 ] + %2572 = phi float [ 0.000000e+00, %73 ], [ %2438, %__nv_exp2f.exit1527 ] + %2573 = phi float [ 0.000000e+00, %73 ], [ %2439, %__nv_exp2f.exit1527 ] + %2574 = phi float [ 0.000000e+00, %73 ], [ %2440, %__nv_exp2f.exit1527 ] + %2575 = phi float [ 0.000000e+00, %73 ], [ %2441, %__nv_exp2f.exit1527 ] + %2576 = phi float [ 0.000000e+00, %73 ], [ %2442, %__nv_exp2f.exit1527 ] + %2577 = phi float [ 0.000000e+00, %73 ], [ %2443, %__nv_exp2f.exit1527 ] + %2578 = phi float [ 0.000000e+00, %73 ], [ %2444, %__nv_exp2f.exit1527 ] + %2579 = phi float [ 0.000000e+00, %73 ], [ %2445, %__nv_exp2f.exit1527 ] + %2580 = phi float [ 0.000000e+00, %73 ], [ %2446, %__nv_exp2f.exit1527 ] + %2581 = phi float [ 0.000000e+00, 
%73 ], [ %2447, %__nv_exp2f.exit1527 ] + %2582 = phi float [ 0.000000e+00, %73 ], [ %2448, %__nv_exp2f.exit1527 ] + %2583 = phi float [ 0.000000e+00, %73 ], [ %2449, %__nv_exp2f.exit1527 ] + %2584 = phi float [ 0.000000e+00, %73 ], [ %2450, %__nv_exp2f.exit1527 ] + %2585 = phi float [ 0.000000e+00, %73 ], [ %2451, %__nv_exp2f.exit1527 ] + %2586 = phi float [ 0.000000e+00, %73 ], [ %2452, %__nv_exp2f.exit1527 ] + %2587 = phi float [ 0.000000e+00, %73 ], [ %2453, %__nv_exp2f.exit1527 ] + %2588 = phi float [ 0.000000e+00, %73 ], [ %2454, %__nv_exp2f.exit1527 ] + %2589 = phi float [ 0.000000e+00, %73 ], [ %2455, %__nv_exp2f.exit1527 ] + %2590 = phi float [ 0.000000e+00, %73 ], [ %2456, %__nv_exp2f.exit1527 ] + %2591 = phi float [ 0.000000e+00, %73 ], [ %2457, %__nv_exp2f.exit1527 ] + %2592 = phi float [ 0.000000e+00, %73 ], [ %2458, %__nv_exp2f.exit1527 ] + %2593 = phi float [ 0.000000e+00, %73 ], [ %2459, %__nv_exp2f.exit1527 ] + %2594 = phi float [ 0.000000e+00, %73 ], [ %2460, %__nv_exp2f.exit1527 ] + %2595 = phi float [ 0.000000e+00, %73 ], [ %2461, %__nv_exp2f.exit1527 ] + %2596 = phi float [ 0.000000e+00, %73 ], [ %2462, %__nv_exp2f.exit1527 ] + %2597 = phi float [ 0.000000e+00, %73 ], [ %2463, %__nv_exp2f.exit1527 ] + %2598 = phi float [ 0.000000e+00, %73 ], [ %2464, %__nv_exp2f.exit1527 ] + %2599 = phi float [ 0.000000e+00, %73 ], [ %2465, %__nv_exp2f.exit1527 ] + %2600 = phi float [ 0.000000e+00, %73 ], [ %2466, %__nv_exp2f.exit1527 ] + %2601 = phi float [ 0.000000e+00, %73 ], [ %2467, %__nv_exp2f.exit1527 ] + %2602 = phi float [ 0.000000e+00, %73 ], [ %2468, %__nv_exp2f.exit1527 ] + %2603 = phi float [ 0.000000e+00, %73 ], [ %2469, %__nv_exp2f.exit1527 ] + %2604 = phi float [ 0.000000e+00, %73 ], [ %2470, %__nv_exp2f.exit1527 ] + %2605 = phi float [ 0.000000e+00, %73 ], [ %2471, %__nv_exp2f.exit1527 ] + %2606 = phi float [ 0.000000e+00, %73 ], [ %2472, %__nv_exp2f.exit1527 ] + %2607 = phi float [ 0.000000e+00, %73 ], [ %2473, %__nv_exp2f.exit1527 ] + %2608 = 
phi float [ 0.000000e+00, %73 ], [ %2474, %__nv_exp2f.exit1527 ] + %2609 = phi float [ 0.000000e+00, %73 ], [ %2475, %__nv_exp2f.exit1527 ] + %2610 = phi float [ 0.000000e+00, %73 ], [ %2476, %__nv_exp2f.exit1527 ] + %2611 = phi float [ 0.000000e+00, %73 ], [ %2477, %__nv_exp2f.exit1527 ] + %2612 = phi float [ 0.000000e+00, %73 ], [ %2478, %__nv_exp2f.exit1527 ] + %2613 = phi float [ 0.000000e+00, %73 ], [ %2479, %__nv_exp2f.exit1527 ] + %2614 = phi float [ 0.000000e+00, %73 ], [ %2480, %__nv_exp2f.exit1527 ] + %2615 = phi float [ 0.000000e+00, %73 ], [ %2481, %__nv_exp2f.exit1527 ] + %2616 = phi float [ 0.000000e+00, %73 ], [ %2482, %__nv_exp2f.exit1527 ] + %2617 = phi float [ 0.000000e+00, %73 ], [ %2483, %__nv_exp2f.exit1527 ] + %2618 = phi float [ 0.000000e+00, %73 ], [ %2484, %__nv_exp2f.exit1527 ] + %2619 = phi float [ 0.000000e+00, %73 ], [ %2485, %__nv_exp2f.exit1527 ] + %2620 = phi float [ 0.000000e+00, %73 ], [ %2486, %__nv_exp2f.exit1527 ] + %2621 = phi float [ 0.000000e+00, %73 ], [ %2487, %__nv_exp2f.exit1527 ] + %2622 = phi float [ 0.000000e+00, %73 ], [ %2488, %__nv_exp2f.exit1527 ] + %2623 = phi float [ 0.000000e+00, %73 ], [ %2489, %__nv_exp2f.exit1527 ] + %2624 = phi float [ 0.000000e+00, %73 ], [ %2490, %__nv_exp2f.exit1527 ] + %2625 = phi float [ 0.000000e+00, %73 ], [ %2491, %__nv_exp2f.exit1527 ] + %2626 = phi float [ 0.000000e+00, %73 ], [ %2492, %__nv_exp2f.exit1527 ] + %2627 = phi float [ 0.000000e+00, %73 ], [ %2493, %__nv_exp2f.exit1527 ] + %2628 = phi float [ 0.000000e+00, %73 ], [ %2494, %__nv_exp2f.exit1527 ] + %2629 = phi float [ 0.000000e+00, %73 ], [ %2495, %__nv_exp2f.exit1527 ] + %2630 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "// wait for regs: $0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63\0A\09wgmma.wait_group.sync.aligned 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63"(float %2566, float %2567, float %2568, float %2569, float %2570, float %2571, float %2572, float %2573, float %2574, float %2575, float %2576, float %2577, float %2578, float %2579, float %2580, float %2581, float %2582, float %2583, float %2584, float %2585, float %2586, float %2587, float %2588, float %2589, float %2590, float %2591, float %2592, float %2593, float %2594, float %2595, float %2596, float %2597, float %2598, float %2599, float %2600, float %2601, float %2602, float %2603, float %2604, float %2605, float %2606, float %2607, float %2608, float %2609, float %2610, float %2611, float %2612, float %2613, float %2614, float %2615, float %2616, float %2617, float %2618, float %2619, float %2620, float %2621, float %2622, float %2623, float %2624, float %2625, float %2626, float %2627, float %2628, float %2629) #3, !dbg !110 + tail call void @llvm.nvvm.cp.async.wait.group(i32 0), !dbg !110 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !110 + %2631 = getelementptr i32, ptr addrspace(1) %13, i64 %383, !dbg !166 + %2632 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l"(ptr addrspace(1) %2631) #3, 
!dbg !167 + %2633 = shl i32 %2632, 7, !dbg !168 + %2634 = getelementptr i32, ptr addrspace(1) %12, i64 %387, !dbg !169 + %2635 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l"(ptr addrspace(1) %2634) #3, !dbg !170 + %2636 = or disjoint i32 %2633, %59, !dbg !171 + %2637 = or disjoint i32 %2633, %60, !dbg !171 + %2638 = or disjoint i32 %2633, %61, !dbg !171 + %2639 = or disjoint i32 %2633, %62, !dbg !171 + %2640 = shl i32 %2636, 7, !dbg !172 + %2641 = shl i32 %2637, 7, !dbg !172 + %2642 = shl i32 %2638, 7, !dbg !172 + %2643 = shl i32 %2639, 7, !dbg !172 + %2644 = sext i32 %2640 to i64, !dbg !174 + %2645 = getelementptr bfloat, ptr addrspace(1) %53, i64 %2644, !dbg !174 + %2646 = sext i32 %2641 to i64, !dbg !174 + %2647 = getelementptr bfloat, ptr addrspace(1) %53, i64 %2646, !dbg !174 + %2648 = sext i32 %2642 to i64, !dbg !174 + %2649 = getelementptr bfloat, ptr addrspace(1) %53, i64 %2648, !dbg !174 + %2650 = sext i32 %2643 to i64, !dbg !174 + %2651 = getelementptr bfloat, ptr addrspace(1) %53, i64 %2650, !dbg !174 + %2652 = getelementptr bfloat, ptr addrspace(1) %2645, i64 %138, !dbg !175 + %2653 = getelementptr bfloat, ptr addrspace(1) %2647, i64 %138, !dbg !175 + %2654 = getelementptr bfloat, ptr addrspace(1) %2649, i64 %138, !dbg !175 + %2655 = getelementptr bfloat, ptr addrspace(1) %2651, i64 %138, !dbg !175 + %2656 = getelementptr bfloat, ptr addrspace(1) %54, i64 %2644, !dbg !176 + %2657 = getelementptr bfloat, ptr addrspace(1) %54, i64 %2646, !dbg !176 + %2658 = getelementptr bfloat, ptr addrspace(1) %54, i64 %2648, !dbg !176 + %2659 = getelementptr bfloat, ptr addrspace(1) %54, i64 %2650, !dbg !176 + %2660 = getelementptr bfloat, ptr addrspace(1) %2656, i64 %138, !dbg !177 + %2661 = getelementptr bfloat, ptr addrspace(1) %2657, i64 %138, !dbg !177 + %2662 = getelementptr bfloat, ptr addrspace(1) %2658, i64 %138, !dbg !177 + %2663 = getelementptr bfloat, ptr addrspace(1) %2659, i64 %138, !dbg !177 + %2664 = shl 
i32 %2635, 1, !dbg !178 + %2665 = tail call i32 @llvm.smin.i32(i32 %2664, i32 %433), !dbg !179 + %2666 = icmp sgt i32 %2664, 0, !dbg !180 + %2667 = icmp slt i32 %2636, %19, !dbg !181 + %2668 = icmp slt i32 %2637, %19, !dbg !181 + %2669 = icmp slt i32 %2638, %19, !dbg !181 + %2670 = icmp slt i32 %2639, %19, !dbg !181 + %2671 = and i1 %2666, %2667, !dbg !180 + %2672 = and i1 %2666, %2668, !dbg !180 + %2673 = and i1 %2666, %2669, !dbg !180 + %2674 = and i1 %2666, %2670, !dbg !180 + %2675 = select i1 %2671, i32 16, i32 0, !dbg !182 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %449, ptr addrspace(1) %2652, i32 %2675) #3, !dbg !182 + %2676 = select i1 %2672, i32 16, i32 0, !dbg !182 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %452, ptr addrspace(1) %2653, i32 %2676) #3, !dbg !182 + %2677 = select i1 %2673, i32 16, i32 0, !dbg !182 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %455, ptr addrspace(1) %2654, i32 %2677) #3, !dbg !182 + %2678 = select i1 %2674, i32 16, i32 0, !dbg !182 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %458, ptr addrspace(1) %2655, i32 %2678) #3, !dbg !182 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !182 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %460, ptr addrspace(1) %2660, i32 %2675) #3, !dbg !182 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %461, ptr addrspace(1) %2661, i32 %2676) #3, !dbg !182 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %462, ptr addrspace(1) %2662, i32 
%2677) #3, !dbg !182 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %463, ptr addrspace(1) %2663, i32 %2678) #3, !dbg !182 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !182 + %2679 = icmp sgt i32 %2665, 1, !dbg !180 + %2680 = getelementptr i8, ptr addrspace(1) %2652, i64 16384, !dbg !183 + %2681 = getelementptr i8, ptr addrspace(1) %2653, i64 16384, !dbg !183 + %2682 = getelementptr i8, ptr addrspace(1) %2654, i64 16384, !dbg !183 + %2683 = getelementptr i8, ptr addrspace(1) %2655, i64 16384, !dbg !183 + %2684 = getelementptr i8, ptr addrspace(1) %2660, i64 16384, !dbg !184 + %2685 = getelementptr i8, ptr addrspace(1) %2661, i64 16384, !dbg !184 + %2686 = getelementptr i8, ptr addrspace(1) %2662, i64 16384, !dbg !184 + %2687 = getelementptr i8, ptr addrspace(1) %2663, i64 16384, !dbg !184 + %2688 = or disjoint i32 %2636, 64, !dbg !185 + %2689 = or disjoint i32 %2637, 64, !dbg !185 + %2690 = or disjoint i32 %2638, 64, !dbg !185 + %2691 = or disjoint i32 %2639, 64, !dbg !185 + %2692 = icmp slt i32 %2688, %19, !dbg !181 + %2693 = icmp slt i32 %2689, %19, !dbg !181 + %2694 = icmp slt i32 %2690, %19, !dbg !181 + %2695 = icmp slt i32 %2691, %19, !dbg !181 + %2696 = and i1 %2679, %2692, !dbg !180 + %2697 = and i1 %2679, %2693, !dbg !180 + %2698 = and i1 %2679, %2694, !dbg !180 + %2699 = and i1 %2679, %2695, !dbg !180 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !182 + %2700 = select i1 %2696, i32 16, i32 0, !dbg !182 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %485, ptr addrspace(1) %2680, i32 %2700) #3, !dbg !182 + %2701 = select i1 %2697, i32 16, i32 0, !dbg !182 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %487, ptr addrspace(1) %2681, i32 %2701) #3, !dbg !182 + %2702 = select i1 %2698, i32 16, 
i32 0, !dbg !182 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %489, ptr addrspace(1) %2682, i32 %2702) #3, !dbg !182 + %2703 = select i1 %2699, i32 16, i32 0, !dbg !182 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %491, ptr addrspace(1) %2683, i32 %2703) #3, !dbg !182 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !182 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %493, ptr addrspace(1) %2684, i32 %2700) #3, !dbg !182 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %494, ptr addrspace(1) %2685, i32 %2701) #3, !dbg !182 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %495, ptr addrspace(1) %2686, i32 %2702) #3, !dbg !182 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %496, ptr addrspace(1) %2687, i32 %2703) #3, !dbg !182 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !182 + tail call void asm sideeffect "fence.proxy.async.shared::cta;", ""() #3, !dbg !186 + %2704 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 0, !dbg !180 + %2705 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 1, !dbg !180 + %2706 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 2, !dbg !180 + %2707 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 3, !dbg !180 + %2708 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 4, !dbg !180 + %2709 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 5, !dbg !180 + %2710 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 6, !dbg !180 + %2711 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 7, !dbg !180 + %2712 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 8, !dbg !180 + %2713 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 9, !dbg !180 + %2714 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 10, !dbg !180 + %2715 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 11, !dbg !180 + %2716 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 12, !dbg !180 + %2717 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 13, !dbg !180 + %2718 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 14, !dbg !180 + %2719 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 15, !dbg !180 + %2720 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 16, !dbg !180 + %2721 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 17, !dbg !180 + %2722 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 18, !dbg !180 + %2723 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 19, !dbg !180 + %2724 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 20, !dbg !180 + %2725 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 21, !dbg !180 + %2726 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 22, !dbg !180 + %2727 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 23, !dbg !180 + %2728 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 24, !dbg !180 + %2729 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 25, !dbg !180 + %2730 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 26, !dbg !180 + %2731 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 27, !dbg !180 + %2732 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 28, !dbg !180 + %2733 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 29, !dbg !180 + %2734 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 30, !dbg !180 + %2735 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 31, !dbg !180 + %2736 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 32, !dbg !180 + %2737 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 33, !dbg !180 + %2738 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 34, !dbg !180 + %2739 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 35, !dbg !180 + %2740 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 36, !dbg !180 + %2741 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 37, !dbg !180 + %2742 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 38, !dbg !180 + %2743 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 39, !dbg !180 + %2744 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 40, !dbg !180 + %2745 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 41, !dbg !180 + %2746 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 42, !dbg !180 + %2747 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 43, !dbg !180 + %2748 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 44, !dbg !180 + %2749 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 45, !dbg !180 + %2750 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 46, !dbg !180 + %2751 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 47, !dbg !180 + %2752 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 48, !dbg !180 + %2753 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 49, !dbg !180 + %2754 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 50, !dbg !180 + %2755 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 51, !dbg !180 + %2756 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 52, !dbg !180 + %2757 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 53, !dbg !180 + %2758 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 54, !dbg !180 + %2759 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 55, !dbg !180 + %2760 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 56, !dbg !180 + %2761 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 57, !dbg !180 + %2762 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 58, !dbg !180 + %2763 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 59, !dbg !180 + %2764 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 60, !dbg !180 + %2765 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 61, !dbg !180 + %2766 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 62, !dbg !180 + %2767 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 63, !dbg !180 + br i1 %2666, label %.lr.ph1592, label %._crit_edge1593, !dbg !180 + +.lr.ph1592: ; preds = %._crit_edge + %2768 = insertelement <16 x i32> poison, i32 %2633, i64 0, !dbg !171 + %2769 = shufflevector <16 x i32> %2768, <16 x i32> poison, <16 x i32> zeroinitializer, !dbg !171 + %2770 = shufflevector <2 x i32> %395, <2 x i32> poison, <16 x i32> , !dbg !171 + %2771 = insertelement <16 x i32> %2770, i32 %392, i64 14, !dbg !171 + %2772 = insertelement <16 x i32> %2771, i32 %391, i64 15, !dbg !171 + %2773 = shufflevector <8 x i32> %401, <8 x i32> poison, <16 x i32> , !dbg !171 + %2774 = shufflevector <16 x i32> %2773, <16 x i32> %2772, <16 x i32> , !dbg !171 + %2775 = shufflevector <4 x i32> %398, <4 x i32> poison, <16 x i32> , !dbg !171 + %2776 = shufflevector <16 x i32> %2774, <16 x i32> %2775, <16 x i32> , !dbg !171 + %2777 = or disjoint <16 x i32> %2769, %2776, 
!dbg !171 + %2778 = add nsw i32 %2665, -2 + %2779 = add nsw i32 %2665, -1 + %smax2183 = tail call i32 @llvm.smax.i32(i32 %2665, i32 1), !dbg !180 + %2780 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 0, !dbg !180 + %2781 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 1, !dbg !180 + %2782 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 2, !dbg !180 + %2783 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 3, !dbg !180 + %2784 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 4, !dbg !180 + %2785 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 5, !dbg !180 + %2786 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 6, !dbg !180 + %2787 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float } %2630, 7, !dbg !180 + %2788 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 8, !dbg !180 + %2789 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 9, !dbg !180 + %2790 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 10, !dbg !180 + %2791 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float } %2630, 11, !dbg !180 + %2792 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 12, !dbg !180 + %2793 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 13, !dbg !180 + %2794 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 14, !dbg !180 + %2795 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float } %2630, 15, !dbg !180 + %2796 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 16, !dbg !180 + %2797 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 17, !dbg !180 + %2798 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 18, !dbg !180 + %2799 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float } %2630, 19, !dbg !180 + %2800 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 20, !dbg !180 + %2801 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 21, !dbg !180 + %2802 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 22, !dbg !180 + %2803 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float } %2630, 23, !dbg !180 + %2804 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 24, !dbg !180 + %2805 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 25, !dbg !180 + %2806 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 26, !dbg !180 + %2807 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float 
} %2630, 27, !dbg !180 + %2808 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 28, !dbg !180 + %2809 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 29, !dbg !180 + %2810 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 30, !dbg !180 + %2811 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 31, !dbg 
!180 + %2812 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 32, !dbg !180 + %2813 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 33, !dbg !180 + %2814 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 34, !dbg !180 + %2815 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 35, !dbg !180 + %2816 = 
extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 36, !dbg !180 + %2817 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 37, !dbg !180 + %2818 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 38, !dbg !180 + %2819 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 39, !dbg !180 + %2820 = extractvalue { 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 40, !dbg !180 + %2821 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 41, !dbg !180 + %2822 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 42, !dbg !180 + %2823 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 43, !dbg !180 + %2824 = extractvalue { float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 44, !dbg !180 + %2825 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 45, !dbg !180 + %2826 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 46, !dbg !180 + %2827 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 47, !dbg !180 + %2828 = extractvalue { float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 48, !dbg !180 + %2829 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 49, !dbg !180 + %2830 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 50, !dbg !180 + %2831 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 51, !dbg !180 + %2832 = extractvalue { float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 52, !dbg !180 + %2833 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 53, !dbg !180 + %2834 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 54, !dbg !180 + %2835 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 55, !dbg !180 + %2836 = extractvalue { float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 56, !dbg !180 + %2837 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 57, !dbg !180 + %2838 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 58, !dbg !180 + %2839 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 59, !dbg !180 + %2840 = extractvalue { float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 60, !dbg !180 + %2841 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 61, !dbg !180 + %2842 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 62, !dbg !180 + %2843 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 63, !dbg !180 + br label %2844, !dbg !180 + +2844: ; preds = %.lr.ph1592, %__nv_exp2f.exit1431 + %2845 = phi i32 [ 64, 
%.lr.ph1592 ], [ %4415, %__nv_exp2f.exit1431 ] + %2846 = phi i32 [ -1, %.lr.ph1592 ], [ %2854, %__nv_exp2f.exit1431 ] + %2847 = phi i32 [ 1, %.lr.ph1592 ], [ %4432, %__nv_exp2f.exit1431 ] + %.pn10661574 = phi ptr addrspace(1) [ %2687, %.lr.ph1592 ], [ %4425, %__nv_exp2f.exit1431 ] + %.pn10821573 = phi ptr addrspace(1) [ %2686, %.lr.ph1592 ], [ %4424, %__nv_exp2f.exit1431 ] + %.pn10981572 = phi ptr addrspace(1) [ %2685, %.lr.ph1592 ], [ %4423, %__nv_exp2f.exit1431 ] + %.pn11141571 = phi ptr addrspace(1) [ %2684, %.lr.ph1592 ], [ %4422, %__nv_exp2f.exit1431 ] + %.pn10441570 = phi i32 [ %2691, %.lr.ph1592 ], [ %4429, %__nv_exp2f.exit1431 ] + %.pn10461569 = phi i32 [ %2690, %.lr.ph1592 ], [ %4428, %__nv_exp2f.exit1431 ] + %.pn10481568 = phi i32 [ %2689, %.lr.ph1592 ], [ %4427, %__nv_exp2f.exit1431 ] + %.pn10501567 = phi i32 [ %2688, %.lr.ph1592 ], [ %4426, %__nv_exp2f.exit1431 ] + %.pn9941566 = phi ptr addrspace(1) [ %2683, %.lr.ph1592 ], [ %4421, %__nv_exp2f.exit1431 ] + %.pn10101565 = phi ptr addrspace(1) [ %2682, %.lr.ph1592 ], [ %4420, %__nv_exp2f.exit1431 ] + %.pn10261564 = phi ptr addrspace(1) [ %2681, %.lr.ph1592 ], [ %4419, %__nv_exp2f.exit1431 ] + %.pn10421563 = phi ptr addrspace(1) [ %2680, %.lr.ph1592 ], [ %4418, %__nv_exp2f.exit1431 ] + %.pn = phi float [ %2780, %.lr.ph1592 ], [ %4329, %__nv_exp2f.exit1431 ] + %.pn2450 = phi float [ %2781, %.lr.ph1592 ], [ %4330, %__nv_exp2f.exit1431 ] + %.pn2451 = phi float [ %2782, %.lr.ph1592 ], [ %4331, %__nv_exp2f.exit1431 ] + %.pn2452 = phi float [ %2783, %.lr.ph1592 ], [ %4332, %__nv_exp2f.exit1431 ] + %.pn2453 = phi float [ %2784, %.lr.ph1592 ], [ %4333, %__nv_exp2f.exit1431 ] + %.pn2454 = phi float [ %2785, %.lr.ph1592 ], [ %4334, %__nv_exp2f.exit1431 ] + %.pn2455 = phi float [ %2786, %.lr.ph1592 ], [ %4335, %__nv_exp2f.exit1431 ] + %.pn2456 = phi float [ %2787, %.lr.ph1592 ], [ %4336, %__nv_exp2f.exit1431 ] + %.pn2457 = phi float [ %2788, %.lr.ph1592 ], [ %4337, %__nv_exp2f.exit1431 ] + %.pn2458 = phi float [ 
%2789, %.lr.ph1592 ], [ %4338, %__nv_exp2f.exit1431 ] + %.pn2459 = phi float [ %2790, %.lr.ph1592 ], [ %4339, %__nv_exp2f.exit1431 ] + %.pn2460 = phi float [ %2791, %.lr.ph1592 ], [ %4340, %__nv_exp2f.exit1431 ] + %.pn2461 = phi float [ %2792, %.lr.ph1592 ], [ %4341, %__nv_exp2f.exit1431 ] + %.pn2462 = phi float [ %2793, %.lr.ph1592 ], [ %4342, %__nv_exp2f.exit1431 ] + %.pn2463 = phi float [ %2794, %.lr.ph1592 ], [ %4343, %__nv_exp2f.exit1431 ] + %.pn2464 = phi float [ %2795, %.lr.ph1592 ], [ %4344, %__nv_exp2f.exit1431 ] + %.pn2465 = phi float [ %2796, %.lr.ph1592 ], [ %4345, %__nv_exp2f.exit1431 ] + %.pn2466 = phi float [ %2797, %.lr.ph1592 ], [ %4346, %__nv_exp2f.exit1431 ] + %.pn2467 = phi float [ %2798, %.lr.ph1592 ], [ %4347, %__nv_exp2f.exit1431 ] + %.pn2468 = phi float [ %2799, %.lr.ph1592 ], [ %4348, %__nv_exp2f.exit1431 ] + %.pn2469 = phi float [ %2800, %.lr.ph1592 ], [ %4349, %__nv_exp2f.exit1431 ] + %.pn2470 = phi float [ %2801, %.lr.ph1592 ], [ %4350, %__nv_exp2f.exit1431 ] + %.pn2471 = phi float [ %2802, %.lr.ph1592 ], [ %4351, %__nv_exp2f.exit1431 ] + %.pn2472 = phi float [ %2803, %.lr.ph1592 ], [ %4352, %__nv_exp2f.exit1431 ] + %.pn2473 = phi float [ %2804, %.lr.ph1592 ], [ %4353, %__nv_exp2f.exit1431 ] + %.pn2474 = phi float [ %2805, %.lr.ph1592 ], [ %4354, %__nv_exp2f.exit1431 ] + %.pn2475 = phi float [ %2806, %.lr.ph1592 ], [ %4355, %__nv_exp2f.exit1431 ] + %.pn2476 = phi float [ %2807, %.lr.ph1592 ], [ %4356, %__nv_exp2f.exit1431 ] + %.pn2477 = phi float [ %2808, %.lr.ph1592 ], [ %4357, %__nv_exp2f.exit1431 ] + %.pn2478 = phi float [ %2809, %.lr.ph1592 ], [ %4358, %__nv_exp2f.exit1431 ] + %.pn2479 = phi float [ %2810, %.lr.ph1592 ], [ %4359, %__nv_exp2f.exit1431 ] + %.pn2480 = phi float [ %2811, %.lr.ph1592 ], [ %4360, %__nv_exp2f.exit1431 ] + %.pn2481 = phi float [ %2812, %.lr.ph1592 ], [ %4361, %__nv_exp2f.exit1431 ] + %.pn2482 = phi float [ %2813, %.lr.ph1592 ], [ %4362, %__nv_exp2f.exit1431 ] + %.pn2483 = phi float [ %2814, %.lr.ph1592 ], [ 
%4363, %__nv_exp2f.exit1431 ] + %.pn2484 = phi float [ %2815, %.lr.ph1592 ], [ %4364, %__nv_exp2f.exit1431 ] + %.pn2485 = phi float [ %2816, %.lr.ph1592 ], [ %4365, %__nv_exp2f.exit1431 ] + %.pn2486 = phi float [ %2817, %.lr.ph1592 ], [ %4366, %__nv_exp2f.exit1431 ] + %.pn2487 = phi float [ %2818, %.lr.ph1592 ], [ %4367, %__nv_exp2f.exit1431 ] + %.pn2488 = phi float [ %2819, %.lr.ph1592 ], [ %4368, %__nv_exp2f.exit1431 ] + %.pn2489 = phi float [ %2820, %.lr.ph1592 ], [ %4369, %__nv_exp2f.exit1431 ] + %.pn2490 = phi float [ %2821, %.lr.ph1592 ], [ %4370, %__nv_exp2f.exit1431 ] + %.pn2491 = phi float [ %2822, %.lr.ph1592 ], [ %4371, %__nv_exp2f.exit1431 ] + %.pn2492 = phi float [ %2823, %.lr.ph1592 ], [ %4372, %__nv_exp2f.exit1431 ] + %.pn2493 = phi float [ %2824, %.lr.ph1592 ], [ %4373, %__nv_exp2f.exit1431 ] + %.pn2494 = phi float [ %2825, %.lr.ph1592 ], [ %4374, %__nv_exp2f.exit1431 ] + %.pn2495 = phi float [ %2826, %.lr.ph1592 ], [ %4375, %__nv_exp2f.exit1431 ] + %.pn2496 = phi float [ %2827, %.lr.ph1592 ], [ %4376, %__nv_exp2f.exit1431 ] + %.pn2497 = phi float [ %2828, %.lr.ph1592 ], [ %4377, %__nv_exp2f.exit1431 ] + %.pn2498 = phi float [ %2829, %.lr.ph1592 ], [ %4378, %__nv_exp2f.exit1431 ] + %.pn2499 = phi float [ %2830, %.lr.ph1592 ], [ %4379, %__nv_exp2f.exit1431 ] + %.pn2500 = phi float [ %2831, %.lr.ph1592 ], [ %4380, %__nv_exp2f.exit1431 ] + %.pn2501 = phi float [ %2832, %.lr.ph1592 ], [ %4381, %__nv_exp2f.exit1431 ] + %.pn2502 = phi float [ %2833, %.lr.ph1592 ], [ %4382, %__nv_exp2f.exit1431 ] + %.pn2503 = phi float [ %2834, %.lr.ph1592 ], [ %4383, %__nv_exp2f.exit1431 ] + %.pn2504 = phi float [ %2835, %.lr.ph1592 ], [ %4384, %__nv_exp2f.exit1431 ] + %.pn2505 = phi float [ %2836, %.lr.ph1592 ], [ %4385, %__nv_exp2f.exit1431 ] + %.pn2506 = phi float [ %2837, %.lr.ph1592 ], [ %4386, %__nv_exp2f.exit1431 ] + %.pn2507 = phi float [ %2838, %.lr.ph1592 ], [ %4387, %__nv_exp2f.exit1431 ] + %.pn2508 = phi float [ %2839, %.lr.ph1592 ], [ %4388, 
%__nv_exp2f.exit1431 ] + %.pn2509 = phi float [ %2840, %.lr.ph1592 ], [ %4389, %__nv_exp2f.exit1431 ] + %.pn2510 = phi float [ %2841, %.lr.ph1592 ], [ %4390, %__nv_exp2f.exit1431 ] + %.pn2511 = phi float [ %2842, %.lr.ph1592 ], [ %4391, %__nv_exp2f.exit1431 ] + %.pn2512 = phi float [ %2843, %.lr.ph1592 ], [ %4392, %__nv_exp2f.exit1431 ] + %2848 = phi i32 [ 0, %.lr.ph1592 ], [ %4396, %__nv_exp2f.exit1431 ] + %2849 = phi <16 x i32> [ %2777, %.lr.ph1592 ], [ %4395, %__nv_exp2f.exit1431 ] + %2850 = icmp slt i32 %2848, %2778, !dbg !180 + %2851 = icmp slt i32 %2848, %2779, !dbg !180 + %2852 = add i32 %2846, 1, !dbg !180 + %2853 = icmp sgt i32 %2852, 2, !dbg !180 + %2854 = select i1 %2853, i32 0, i32 %2852, !dbg !180 + %2855 = extractelement <16 x i32> %2849, i64 15, !dbg !181 + %2856 = icmp slt i32 %2855, %19, !dbg !181 + %2857 = extractelement <16 x i32> %2849, i64 14, !dbg !181 + %2858 = icmp slt i32 %2857, %19, !dbg !181 + %2859 = extractelement <16 x i32> %2849, i64 13, !dbg !181 + %2860 = icmp slt i32 %2859, %19, !dbg !181 + %2861 = extractelement <16 x i32> %2849, i64 12, !dbg !181 + %2862 = icmp slt i32 %2861, %19, !dbg !181 + %2863 = extractelement <16 x i32> %2849, i64 11, !dbg !181 + %2864 = icmp slt i32 %2863, %19, !dbg !181 + %2865 = extractelement <16 x i32> %2849, i64 10, !dbg !181 + %2866 = icmp slt i32 %2865, %19, !dbg !181 + %2867 = extractelement <16 x i32> %2849, i64 9, !dbg !181 + %2868 = icmp slt i32 %2867, %19, !dbg !181 + %2869 = extractelement <16 x i32> %2849, i64 8, !dbg !181 + %2870 = icmp slt i32 %2869, %19, !dbg !181 + %2871 = extractelement <16 x i32> %2849, i64 7, !dbg !181 + %2872 = icmp slt i32 %2871, %19, !dbg !181 + %2873 = extractelement <16 x i32> %2849, i64 6, !dbg !181 + %2874 = icmp slt i32 %2873, %19, !dbg !181 + %2875 = extractelement <16 x i32> %2849, i64 5, !dbg !181 + %2876 = icmp slt i32 %2875, %19, !dbg !181 + %2877 = extractelement <16 x i32> %2849, i64 4, !dbg !181 + %2878 = icmp slt i32 %2877, %19, !dbg !181 + %2879 = 
extractelement <16 x i32> %2849, i64 3, !dbg !181 + %2880 = icmp slt i32 %2879, %19, !dbg !181 + %2881 = extractelement <16 x i32> %2849, i64 2, !dbg !181 + %2882 = icmp slt i32 %2881, %19, !dbg !181 + %2883 = extractelement <16 x i32> %2849, i64 1, !dbg !181 + %2884 = icmp slt i32 %2883, %19, !dbg !181 + %2885 = extractelement <16 x i32> %2849, i64 0, !dbg !181 + %2886 = icmp slt i32 %2885, %19, !dbg !181 + tail call void @llvm.nvvm.cp.async.wait.group(i32 2), !dbg !182 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !182 + %2887 = shl i32 %2854, 13, !dbg !182 + %2888 = getelementptr bfloat, ptr addrspace(3) @global_smem, i32 %2887, !dbg !182 + %2889 = tail call i32 @llvm.nvvm.shfl.sync.idx.i32(i32 -1, i32 %57, i32 0, i32 31), !dbg !186 + tail call void @llvm.nvvm.wgmma.fence.sync.aligned(), !dbg !186 + %2890 = shl i32 %2889, 11, !dbg !186 + %2891 = and i32 %2890, 8192, !dbg !186 + %2892 = add i32 %2891, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), !dbg !186 + %2893 = lshr exact i32 %2892, 4, !dbg !186 + %2894 = and i32 %2893, 16383, !dbg !186 + %2895 = zext nneg i32 %2894 to i64, !dbg !186 + %2896 = or disjoint i64 %2895, 4611686293372403712, !dbg !186 + %2897 = ptrtoint ptr addrspace(3) %2888 to i32, !dbg !186 + %2898 = lshr exact i32 %2897, 4, !dbg !186 + %2899 = and i32 %2898, 16383, !dbg !186 + %2900 = zext nneg i32 %2899 to i64, !dbg !186 + %2901 = or disjoint i64 %2900, 4611686293338849280, !dbg !186 + %2902 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $32, $33, 0, 1, 1, 0, 0;", 
"=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,l,l"(i64 %2896, i64 %2901) #3, !dbg !186 + %2903 = or disjoint i32 %2891, 32, !dbg !186 + %2904 = add i32 %2903, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), !dbg !186 + %2905 = lshr exact i32 %2904, 4, !dbg !186 + %2906 = and i32 %2905, 16383, !dbg !186 + %2907 = zext nneg i32 %2906 to i64, !dbg !186 + %2908 = or disjoint i64 %2907, 4611686293372403712, !dbg !186 + %2909 = add i32 %2897, 32, !dbg !186 + %2910 = lshr exact i32 %2909, 4, !dbg !186 + %2911 = and i32 %2910, 16383, !dbg !186 + %2912 = zext nneg i32 %2911 to i64, !dbg !186 + %2913 = or disjoint i64 %2912, 4611686293338849280, !dbg !186 + %2914 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2902, 0, !dbg !186 + %2915 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2902, 1, !dbg !186 + %2916 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2902, 2, !dbg !186 + %2917 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2902, 3, !dbg !186 + %2918 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float } %2902, 4, !dbg !186 + %2919 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2902, 5, !dbg !186 + %2920 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2902, 6, !dbg !186 + %2921 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2902, 7, !dbg !186 + %2922 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2902, 8, !dbg !186 + %2923 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2902, 9, !dbg !186 + %2924 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2902, 10, !dbg !186 + %2925 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2902, 11, !dbg !186 + 
%2926 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2902, 12, !dbg !186 + %2927 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2902, 13, !dbg !186 + %2928 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2902, 14, !dbg !186 + %2929 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2902, 15, !dbg !186 + %2930 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2902, 16, !dbg !186 + %2931 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2902, 17, !dbg !186 + %2932 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2902, 18, !dbg !186 + %2933 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2902, 19, !dbg !186 + %2934 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2902, 20, !dbg !186 + %2935 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2902, 21, !dbg !186 + %2936 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2902, 22, !dbg !186 + %2937 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2902, 23, !dbg !186 + %2938 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2902, 24, !dbg !186 + %2939 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2902, 25, !dbg !186 + %2940 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float } %2902, 26, !dbg !186 + %2941 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2902, 27, !dbg !186 + %2942 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2902, 28, !dbg !186 + %2943 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2902, 29, !dbg !186 + %2944 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2902, 30, !dbg !186 + %2945 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2902, 31, !dbg !186 + %2946 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", 
"=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %2914, float %2915, float %2916, float %2917, float %2918, float %2919, float %2920, float %2921, float %2922, float %2923, float %2924, float %2925, float %2926, float %2927, float %2928, float %2929, float %2930, float %2931, float %2932, float %2933, float %2934, float %2935, float %2936, float %2937, float %2938, float %2939, float %2940, float %2941, float %2942, float %2943, float %2944, float %2945, i64 %2908, i64 %2913, i1 true) #3, !dbg !186 + %2947 = or disjoint i32 %2891, 64, !dbg !186 + %2948 = add i32 %2947, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), !dbg !186 + %2949 = lshr exact i32 %2948, 4, !dbg !186 + %2950 = and i32 %2949, 16383, !dbg !186 + %2951 = zext nneg i32 %2950 to i64, !dbg !186 + %2952 = or disjoint i64 %2951, 4611686293372403712, !dbg !186 + %2953 = add i32 %2897, 64, !dbg !186 + %2954 = lshr exact i32 %2953, 4, !dbg !186 + %2955 = and i32 %2954, 16383, !dbg !186 + %2956 = zext nneg i32 %2955 to i64, !dbg !186 + %2957 = or disjoint i64 %2956, 4611686293338849280, !dbg !186 + %2958 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2946, 0, !dbg !186 + %2959 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2946, 1, !dbg !186 + %2960 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float } %2946, 2, !dbg !186 + %2961 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2946, 3, !dbg !186 + %2962 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2946, 4, !dbg !186 + %2963 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2946, 5, !dbg !186 + %2964 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2946, 6, !dbg !186 + %2965 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2946, 7, !dbg !186 + %2966 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2946, 8, !dbg !186 + %2967 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2946, 9, !dbg !186 + %2968 = extractvalue { float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2946, 10, !dbg !186 + %2969 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2946, 11, !dbg !186 + %2970 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2946, 12, !dbg !186 + %2971 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2946, 13, !dbg !186 + %2972 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2946, 14, !dbg !186 + %2973 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2946, 15, !dbg !186 + %2974 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2946, 16, !dbg !186 + %2975 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2946, 17, !dbg !186 + %2976 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2946, 18, !dbg !186 + %2977 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2946, 19, !dbg !186 + %2978 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2946, 20, !dbg !186 + %2979 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2946, 21, !dbg !186 + %2980 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2946, 22, !dbg !186 + %2981 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2946, 23, !dbg !186 + %2982 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float } %2946, 24, !dbg !186 + %2983 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2946, 25, !dbg !186 + %2984 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2946, 26, !dbg !186 + %2985 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2946, 27, !dbg !186 + %2986 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2946, 28, !dbg !186 + %2987 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2946, 29, !dbg !186 + %2988 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2946, 30, !dbg !186 + %2989 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2946, 31, !dbg !186 + %2990 = tail call { float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %2958, float %2959, float %2960, float %2961, float %2962, float %2963, float %2964, float %2965, float %2966, float %2967, float %2968, float %2969, float %2970, float %2971, float %2972, float %2973, float %2974, float %2975, float %2976, float %2977, float %2978, float %2979, float %2980, float %2981, float %2982, float %2983, float %2984, float %2985, float %2986, float %2987, float %2988, float %2989, i64 %2952, i64 %2957, i1 true) #3, !dbg !186 + %2991 = or disjoint i32 %2891, 96, !dbg !186 + %2992 = add i32 %2991, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), !dbg !186 + %2993 = lshr exact i32 %2992, 4, !dbg !186 + %2994 = and i32 %2993, 16383, !dbg !186 + %2995 = zext nneg i32 %2994 to i64, !dbg !186 + %2996 = or disjoint i64 %2995, 4611686293372403712, !dbg !186 + %2997 = add i32 %2897, 96, !dbg !186 + %2998 = lshr exact i32 %2997, 4, !dbg !186 + %2999 = and i32 %2998, 16383, !dbg !186 + %3000 = zext nneg i32 %2999 to i64, !dbg !186 + %3001 = or disjoint i64 %3000, 4611686293338849280, !dbg !186 + %3002 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2990, 0, !dbg !186 + %3003 = extractvalue { float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2990, 1, !dbg !186 + %3004 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2990, 2, !dbg !186 + %3005 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2990, 3, !dbg !186 + %3006 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2990, 4, !dbg !186 + %3007 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2990, 5, !dbg !186 + %3008 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2990, 6, !dbg !186 + %3009 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2990, 7, !dbg !186 + %3010 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float } %2990, 8, !dbg !186 + %3011 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2990, 9, !dbg !186 + %3012 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2990, 10, !dbg !186 + %3013 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2990, 11, !dbg !186 + %3014 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2990, 12, !dbg !186 + %3015 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2990, 13, !dbg !186 + %3016 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2990, 14, !dbg !186 + %3017 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2990, 15, 
!dbg !186 + %3018 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2990, 16, !dbg !186 + %3019 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2990, 17, !dbg !186 + %3020 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2990, 18, !dbg !186 + %3021 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2990, 19, !dbg !186 + %3022 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2990, 20, !dbg !186 + %3023 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2990, 21, !dbg !186 + %3024 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2990, 22, !dbg !186 + %3025 = extractvalue { float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2990, 23, !dbg !186 + %3026 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2990, 24, !dbg !186 + %3027 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2990, 25, !dbg !186 + %3028 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2990, 26, !dbg !186 + %3029 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2990, 27, !dbg !186 + %3030 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2990, 28, !dbg !186 + %3031 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2990, 29, !dbg !186 + %3032 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float } %2990, 30, !dbg !186 + %3033 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2990, 31, !dbg !186 + %3034 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %3002, float %3003, float %3004, float %3005, float %3006, float %3007, float %3008, float %3009, float %3010, float %3011, float %3012, float %3013, float %3014, float %3015, float %3016, float %3017, float %3018, float %3019, float %3020, float %3021, float %3022, float %3023, float %3024, float %3025, float %3026, float %3027, float %3028, float %3029, float %3030, float %3031, float %3032, float %3033, i64 %2996, i64 %3001, i1 true) #3, !dbg !186 + %3035 = or disjoint i32 %2891, 16384, !dbg !186 + %3036 = add i32 %3035, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), !dbg !186 + %3037 = lshr exact i32 %3036, 4, !dbg !186 + %3038 = and i32 %3037, 16383, !dbg !186 + %3039 = zext nneg i32 %3038 to i64, !dbg !186 + %3040 = or disjoint i64 %3039, 4611686293372403712, !dbg !186 + %3041 = add i32 %2897, 8192, !dbg !186 + %3042 = lshr exact i32 %3041, 4, !dbg !186 + %3043 = and i32 %3042, 16383, !dbg !186 + %3044 = zext 
nneg i32 %3043 to i64, !dbg !186 + %3045 = or disjoint i64 %3044, 4611686293338849280, !dbg !186 + %3046 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3034, 0, !dbg !186 + %3047 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3034, 1, !dbg !186 + %3048 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3034, 2, !dbg !186 + %3049 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3034, 3, !dbg !186 + %3050 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3034, 4, !dbg !186 + %3051 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3034, 5, !dbg !186 + %3052 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3034, 6, !dbg !186 + %3053 = 
extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3034, 7, !dbg !186 + %3054 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3034, 8, !dbg !186 + %3055 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3034, 9, !dbg !186 + %3056 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3034, 10, !dbg !186 + %3057 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3034, 11, !dbg !186 + %3058 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3034, 12, !dbg !186 + %3059 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3034, 13, !dbg !186 + %3060 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3034, 14, !dbg !186 + %3061 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3034, 15, !dbg !186 + %3062 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3034, 16, !dbg !186 + %3063 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3034, 17, !dbg !186 + %3064 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3034, 18, !dbg !186 + %3065 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3034, 19, !dbg !186 + %3066 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3034, 20, !dbg !186 + %3067 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float } %3034, 21, !dbg !186 + %3068 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3034, 22, !dbg !186 + %3069 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3034, 23, !dbg !186 + %3070 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3034, 24, !dbg !186 + %3071 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3034, 25, !dbg !186 + %3072 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3034, 26, !dbg !186 + %3073 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3034, 27, !dbg !186 + %3074 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3034, 28, !dbg !186 + %3075 = extractvalue { float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3034, 29, !dbg !186 + %3076 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3034, 30, !dbg !186 + %3077 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3034, 31, !dbg !186 + %3078 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %3046, float %3047, float %3048, float %3049, float %3050, float %3051, float %3052, float %3053, float %3054, float %3055, float %3056, float %3057, float %3058, float %3059, float %3060, float %3061, float %3062, float %3063, float %3064, float %3065, float %3066, float %3067, float %3068, float %3069, float %3070, float %3071, float %3072, float %3073, float %3074, float %3075, float %3076, float %3077, i64 %3040, i64 %3045, i1 true) #3, !dbg !186 + %3079 = or disjoint i32 %2891, 16416, !dbg !186 + %3080 = add i32 %3079, ptrtoint (ptr addrspace(3) getelementptr 
(i8, ptr addrspace(3) @global_smem, i32 98304) to i32), !dbg !186 + %3081 = lshr exact i32 %3080, 4, !dbg !186 + %3082 = and i32 %3081, 16383, !dbg !186 + %3083 = zext nneg i32 %3082 to i64, !dbg !186 + %3084 = or disjoint i64 %3083, 4611686293372403712, !dbg !186 + %3085 = add i32 %2897, 8224, !dbg !186 + %3086 = lshr exact i32 %3085, 4, !dbg !186 + %3087 = and i32 %3086, 16383, !dbg !186 + %3088 = zext nneg i32 %3087 to i64, !dbg !186 + %3089 = or disjoint i64 %3088, 4611686293338849280, !dbg !186 + %3090 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3078, 0, !dbg !186 + %3091 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3078, 1, !dbg !186 + %3092 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3078, 2, !dbg !186 + %3093 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3078, 3, !dbg !186 + %3094 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3078, 4, !dbg !186 + %3095 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3078, 5, !dbg !186 + %3096 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3078, 6, !dbg !186 + %3097 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3078, 7, !dbg !186 + %3098 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3078, 8, !dbg !186 + %3099 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3078, 9, !dbg !186 + %3100 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3078, 10, !dbg !186 + %3101 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3078, 11, !dbg !186 + %3102 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } 
%3078, 12, !dbg !186 + %3103 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3078, 13, !dbg !186 + %3104 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3078, 14, !dbg !186 + %3105 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3078, 15, !dbg !186 + %3106 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3078, 16, !dbg !186 + %3107 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3078, 17, !dbg !186 + %3108 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3078, 18, !dbg !186 + %3109 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3078, 19, !dbg !186 + %3110 = extractvalue { float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3078, 20, !dbg !186 + %3111 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3078, 21, !dbg !186 + %3112 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3078, 22, !dbg !186 + %3113 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3078, 23, !dbg !186 + %3114 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3078, 24, !dbg !186 + %3115 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3078, 25, !dbg !186 + %3116 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3078, 26, !dbg !186 + %3117 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float } %3078, 27, !dbg !186 + %3118 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3078, 28, !dbg !186 + %3119 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3078, 29, !dbg !186 + %3120 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3078, 30, !dbg !186 + %3121 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3078, 31, !dbg !186 + %3122 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %3090, float %3091, float %3092, float %3093, float %3094, float %3095, float %3096, float %3097, float %3098, float %3099, float %3100, float %3101, float 
%3102, float %3103, float %3104, float %3105, float %3106, float %3107, float %3108, float %3109, float %3110, float %3111, float %3112, float %3113, float %3114, float %3115, float %3116, float %3117, float %3118, float %3119, float %3120, float %3121, i64 %3084, i64 %3089, i1 true) #3, !dbg !186 + %3123 = or disjoint i32 %2891, 16448, !dbg !186 + %3124 = add i32 %3123, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), !dbg !186 + %3125 = lshr exact i32 %3124, 4, !dbg !186 + %3126 = and i32 %3125, 16383, !dbg !186 + %3127 = zext nneg i32 %3126 to i64, !dbg !186 + %3128 = or disjoint i64 %3127, 4611686293372403712, !dbg !186 + %3129 = add i32 %2897, 8256, !dbg !186 + %3130 = lshr exact i32 %3129, 4, !dbg !186 + %3131 = and i32 %3130, 16383, !dbg !186 + %3132 = zext nneg i32 %3131 to i64, !dbg !186 + %3133 = or disjoint i64 %3132, 4611686293338849280, !dbg !186 + %3134 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3122, 0, !dbg !186 + %3135 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3122, 1, !dbg !186 + %3136 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3122, 2, !dbg !186 + %3137 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3122, 3, !dbg !186 
+ %3138 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3122, 4, !dbg !186 + %3139 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3122, 5, !dbg !186 + %3140 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3122, 6, !dbg !186 + %3141 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3122, 7, !dbg !186 + %3142 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3122, 8, !dbg !186 + %3143 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3122, 9, !dbg !186 + %3144 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3122, 10, !dbg !186 + %3145 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3122, 11, !dbg !186 + %3146 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3122, 12, !dbg !186 + %3147 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3122, 13, !dbg !186 + %3148 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3122, 14, !dbg !186 + %3149 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3122, 15, !dbg !186 + %3150 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3122, 16, !dbg !186 + %3151 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3122, 17, !dbg !186 + %3152 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float } %3122, 18, !dbg !186 + %3153 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3122, 19, !dbg !186 + %3154 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3122, 20, !dbg !186 + %3155 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3122, 21, !dbg !186 + %3156 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3122, 22, !dbg !186 + %3157 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3122, 23, !dbg !186 + %3158 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3122, 24, !dbg !186 + %3159 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3122, 25, !dbg !186 + %3160 = extractvalue { float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3122, 26, !dbg !186 + %3161 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3122, 27, !dbg !186 + %3162 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3122, 28, !dbg !186 + %3163 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3122, 29, !dbg !186 + %3164 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3122, 30, !dbg !186 + %3165 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3122, 31, !dbg !186 + %3166 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 
{$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %3134, float %3135, float %3136, float %3137, float %3138, float %3139, float %3140, float %3141, float %3142, float %3143, float %3144, float %3145, float %3146, float %3147, float %3148, float %3149, float %3150, float %3151, float %3152, float %3153, float %3154, float %3155, float %3156, float %3157, float %3158, float %3159, float %3160, float %3161, float %3162, float %3163, float %3164, float %3165, i64 %3128, i64 %3133, i1 true) #3, !dbg !186 + %3167 = or disjoint i32 %2891, 16480, !dbg !186 + %3168 = add i32 %3167, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), !dbg !186 + %3169 = lshr exact i32 %3168, 4, !dbg !186 + %3170 = and i32 %3169, 16383, !dbg !186 + %3171 = zext nneg i32 %3170 to i64, !dbg !186 + %3172 = or disjoint i64 %3171, 4611686293372403712, !dbg !186 + %3173 = add i32 %2897, 8288, !dbg !186 + %3174 = lshr exact i32 %3173, 4, !dbg !186 + %3175 = and i32 %3174, 16383, !dbg !186 + %3176 = zext nneg i32 %3175 to i64, !dbg !186 + %3177 = or disjoint i64 %3176, 4611686293338849280, !dbg !186 + %3178 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3166, 0, !dbg !186 + %3179 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3166, 1, !dbg !186 + %3180 = extractvalue { float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3166, 2, !dbg !186 + %3181 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3166, 3, !dbg !186 + %3182 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3166, 4, !dbg !186 + %3183 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3166, 5, !dbg !186 + %3184 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3166, 6, !dbg !186 + %3185 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3166, 7, !dbg !186 + %3186 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3166, 8, !dbg !186 + %3187 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3166, 9, !dbg !186 + %3188 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3166, 10, !dbg !186 + %3189 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3166, 11, !dbg !186 + %3190 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3166, 12, !dbg !186 + %3191 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3166, 13, !dbg !186 + %3192 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3166, 14, !dbg !186 + %3193 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3166, 15, !dbg !186 + %3194 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float } %3166, 16, !dbg !186 + %3195 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3166, 17, !dbg !186 + %3196 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3166, 18, !dbg !186 + %3197 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3166, 19, !dbg !186 + %3198 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3166, 20, !dbg !186 + %3199 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3166, 21, !dbg !186 + %3200 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3166, 22, !dbg !186 + %3201 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3166, 23, !dbg !186 + %3202 = extractvalue { float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3166, 24, !dbg !186 + %3203 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3166, 25, !dbg !186 + %3204 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3166, 26, !dbg !186 + %3205 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3166, 27, !dbg !186 + %3206 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3166, 28, !dbg !186 + %3207 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3166, 29, !dbg !186 + %3208 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3166, 30, !dbg !186 + %3209 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float } %3166, 31, !dbg !186 + %3210 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %3178, float %3179, float %3180, float %3181, float %3182, float %3183, float %3184, float %3185, float %3186, float %3187, float %3188, float %3189, float %3190, float %3191, float %3192, float %3193, float %3194, float %3195, float %3196, float %3197, float %3198, float %3199, float %3200, float %3201, float %3202, float %3203, float %3204, float %3205, float %3206, float %3207, float %3208, float %3209, i64 %3172, i64 %3177, i1 true) #3, !dbg !186 + %3211 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3210, 0, !dbg !186 + %3212 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3210, 1, !dbg !186 + %3213 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float } %3210, 2, !dbg !186 + %3214 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3210, 3, !dbg !186 + %3215 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3210, 4, !dbg !186 + %3216 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3210, 5, !dbg !186 + %3217 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3210, 6, !dbg !186 + %3218 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3210, 7, !dbg !186 + %3219 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3210, 8, !dbg !186 + %3220 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3210, 9, !dbg !186 + %3221 = extractvalue { float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3210, 10, !dbg !186 + %3222 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3210, 11, !dbg !186 + %3223 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3210, 12, !dbg !186 + %3224 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3210, 13, !dbg !186 + %3225 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3210, 14, !dbg !186 + %3226 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3210, 15, !dbg !186 + %3227 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3210, 16, !dbg !186 + %3228 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float } %3210, 17, !dbg !186 + %3229 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3210, 18, !dbg !186 + %3230 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3210, 19, !dbg !186 + %3231 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3210, 20, !dbg !186 + %3232 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3210, 21, !dbg !186 + %3233 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3210, 22, !dbg !186 + %3234 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3210, 23, !dbg !186 + %3235 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3210, 24, !dbg !186 + 
%3236 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3210, 25, !dbg !186 + %3237 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3210, 26, !dbg !186 + %3238 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3210, 27, !dbg !186 + %3239 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3210, 28, !dbg !186 + %3240 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3210, 29, !dbg !186 + %3241 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3210, 30, !dbg !186 + %3242 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3210, 31, !dbg !186 + tail call void @llvm.nvvm.wgmma.commit_group.sync.aligned(), !dbg !186 + %3243 = tail call { float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } asm sideeffect "// wait for regs: $0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37\0A\09wgmma.wait_group.sync.aligned 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37"(float %3211, float %3212, float %3213, float %3214, float %3215, float %3216, float %3217, float %3218, float %3219, float %3220, float %3221, float %3222, float %3223, float %3224, float %3225, float %3226, float %3227, float %3228, float %3229, float %3230, float %3231, float %3232, float %3233, float %3234, float %3235, float %3236, float %3237, float %3238, float %3239, float %3240, float %3241, float %3242, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 0, i32 0, ptr addrspace(3) %2888, i32 0, i32 0) #3, !dbg !186 + %3244 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3243, 0, !dbg !186 + %3245 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3243, 1, !dbg !186 + %3246 = extractvalue { float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3243, 2, !dbg !186 + %3247 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3243, 3, !dbg !186 + %3248 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3243, 4, !dbg !186 + %3249 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3243, 5, !dbg !186 + %3250 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3243, 6, !dbg !186 + %3251 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3243, 7, !dbg !186 + %3252 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3243, 8, !dbg !186 + %3253 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3243, 9, !dbg !186 + %3254 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3243, 10, !dbg !186 + %3255 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3243, 11, !dbg !186 + %3256 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3243, 12, !dbg !186 + %3257 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3243, 13, !dbg !186 + %3258 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3243, 14, !dbg !186 + %3259 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3243, 15, !dbg !186 + %3260 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3243, 16, !dbg !186 + %3261 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3243, 17, !dbg !186 + %3262 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3243, 18, !dbg !186 + %3263 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3243, 19, !dbg !186 + %3264 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3243, 20, !dbg !186 + %3265 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3243, 21, !dbg !186 + %3266 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3243, 22, !dbg !186 + %3267 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3243, 23, !dbg !186 + %3268 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3243, 24, !dbg !186 + %3269 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3243, 25, !dbg !186 + %3270 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3243, 26, !dbg !186 + %3271 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3243, 27, !dbg !186 + %3272 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3243, 28, !dbg !186 + %3273 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3243, 29, !dbg !186 + %3274 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3243, 30, !dbg !186 + %3275 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3243, 31, !dbg !186 + %3276 = fmul float %3244, 0x3FB6A09E60000000, !dbg !187 + %3277 = fmul float %3245, 0x3FB6A09E60000000, !dbg !187 + %3278 = fmul float %3246, 0x3FB6A09E60000000, !dbg !187 + %3279 = fmul float %3247, 0x3FB6A09E60000000, !dbg !187 + %3280 = fmul float 
%3248, 0x3FB6A09E60000000, !dbg !187 + %3281 = fmul float %3249, 0x3FB6A09E60000000, !dbg !187 + %3282 = fmul float %3250, 0x3FB6A09E60000000, !dbg !187 + %3283 = fmul float %3251, 0x3FB6A09E60000000, !dbg !187 + %3284 = fmul float %3252, 0x3FB6A09E60000000, !dbg !187 + %3285 = fmul float %3253, 0x3FB6A09E60000000, !dbg !187 + %3286 = fmul float %3254, 0x3FB6A09E60000000, !dbg !187 + %3287 = fmul float %3255, 0x3FB6A09E60000000, !dbg !187 + %3288 = fmul float %3256, 0x3FB6A09E60000000, !dbg !187 + %3289 = fmul float %3257, 0x3FB6A09E60000000, !dbg !187 + %3290 = fmul float %3258, 0x3FB6A09E60000000, !dbg !187 + %3291 = fmul float %3259, 0x3FB6A09E60000000, !dbg !187 + %3292 = fmul float %3260, 0x3FB6A09E60000000, !dbg !187 + %3293 = fmul float %3261, 0x3FB6A09E60000000, !dbg !187 + %3294 = fmul float %3262, 0x3FB6A09E60000000, !dbg !187 + %3295 = fmul float %3263, 0x3FB6A09E60000000, !dbg !187 + %3296 = fmul float %3264, 0x3FB6A09E60000000, !dbg !187 + %3297 = fmul float %3265, 0x3FB6A09E60000000, !dbg !187 + %3298 = fmul float %3266, 0x3FB6A09E60000000, !dbg !187 + %3299 = fmul float %3267, 0x3FB6A09E60000000, !dbg !187 + %3300 = fmul float %3268, 0x3FB6A09E60000000, !dbg !187 + %3301 = fmul float %3269, 0x3FB6A09E60000000, !dbg !187 + %3302 = fmul float %3270, 0x3FB6A09E60000000, !dbg !187 + %3303 = fmul float %3271, 0x3FB6A09E60000000, !dbg !187 + %3304 = fmul float %3272, 0x3FB6A09E60000000, !dbg !187 + %3305 = fmul float %3273, 0x3FB6A09E60000000, !dbg !187 + %3306 = fmul float %3274, 0x3FB6A09E60000000, !dbg !187 + %3307 = fmul float %3275, 0x3FB6A09E60000000, !dbg !187 + %3308 = fmul float %3276, 0x3FF7154760000000, !dbg !188 + %3309 = select i1 %2856, float %3308, float 0xFFF0000000000000, !dbg !189 + %3310 = fmul float %3277, 0x3FF7154760000000, !dbg !188 + %3311 = select i1 %2858, float %3310, float 0xFFF0000000000000, !dbg !189 + %3312 = fmul float %3278, 0x3FF7154760000000, !dbg !188 + %3313 = select i1 %2856, float %3312, float 0xFFF0000000000000, !dbg 
!189 + %3314 = fmul float %3279, 0x3FF7154760000000, !dbg !188 + %3315 = select i1 %2858, float %3314, float 0xFFF0000000000000, !dbg !189 + %3316 = fmul float %3280, 0x3FF7154760000000, !dbg !188 + %3317 = select i1 %2860, float %3316, float 0xFFF0000000000000, !dbg !189 + %3318 = fmul float %3281, 0x3FF7154760000000, !dbg !188 + %3319 = select i1 %2862, float %3318, float 0xFFF0000000000000, !dbg !189 + %3320 = fmul float %3282, 0x3FF7154760000000, !dbg !188 + %3321 = select i1 %2860, float %3320, float 0xFFF0000000000000, !dbg !189 + %3322 = fmul float %3283, 0x3FF7154760000000, !dbg !188 + %3323 = select i1 %2862, float %3322, float 0xFFF0000000000000, !dbg !189 + %3324 = fmul float %3284, 0x3FF7154760000000, !dbg !188 + %3325 = select i1 %2864, float %3324, float 0xFFF0000000000000, !dbg !189 + %3326 = fmul float %3285, 0x3FF7154760000000, !dbg !188 + %3327 = select i1 %2866, float %3326, float 0xFFF0000000000000, !dbg !189 + %3328 = fmul float %3286, 0x3FF7154760000000, !dbg !188 + %3329 = select i1 %2864, float %3328, float 0xFFF0000000000000, !dbg !189 + %3330 = fmul float %3287, 0x3FF7154760000000, !dbg !188 + %3331 = select i1 %2866, float %3330, float 0xFFF0000000000000, !dbg !189 + %3332 = fmul float %3288, 0x3FF7154760000000, !dbg !188 + %3333 = select i1 %2868, float %3332, float 0xFFF0000000000000, !dbg !189 + %3334 = fmul float %3289, 0x3FF7154760000000, !dbg !188 + %3335 = select i1 %2870, float %3334, float 0xFFF0000000000000, !dbg !189 + %3336 = fmul float %3290, 0x3FF7154760000000, !dbg !188 + %3337 = select i1 %2868, float %3336, float 0xFFF0000000000000, !dbg !189 + %3338 = fmul float %3291, 0x3FF7154760000000, !dbg !188 + %3339 = select i1 %2870, float %3338, float 0xFFF0000000000000, !dbg !189 + %3340 = fmul float %3292, 0x3FF7154760000000, !dbg !188 + %3341 = select i1 %2872, float %3340, float 0xFFF0000000000000, !dbg !189 + %3342 = fmul float %3293, 0x3FF7154760000000, !dbg !188 + %3343 = select i1 %2874, float %3342, float 
0xFFF0000000000000, !dbg !189 + %3344 = fmul float %3294, 0x3FF7154760000000, !dbg !188 + %3345 = select i1 %2872, float %3344, float 0xFFF0000000000000, !dbg !189 + %3346 = fmul float %3295, 0x3FF7154760000000, !dbg !188 + %3347 = select i1 %2874, float %3346, float 0xFFF0000000000000, !dbg !189 + %3348 = fmul float %3296, 0x3FF7154760000000, !dbg !188 + %3349 = select i1 %2876, float %3348, float 0xFFF0000000000000, !dbg !189 + %3350 = fmul float %3297, 0x3FF7154760000000, !dbg !188 + %3351 = select i1 %2878, float %3350, float 0xFFF0000000000000, !dbg !189 + %3352 = fmul float %3298, 0x3FF7154760000000, !dbg !188 + %3353 = select i1 %2876, float %3352, float 0xFFF0000000000000, !dbg !189 + %3354 = fmul float %3299, 0x3FF7154760000000, !dbg !188 + %3355 = select i1 %2878, float %3354, float 0xFFF0000000000000, !dbg !189 + %3356 = fmul float %3300, 0x3FF7154760000000, !dbg !188 + %3357 = select i1 %2880, float %3356, float 0xFFF0000000000000, !dbg !189 + %3358 = fmul float %3301, 0x3FF7154760000000, !dbg !188 + %3359 = select i1 %2882, float %3358, float 0xFFF0000000000000, !dbg !189 + %3360 = fmul float %3302, 0x3FF7154760000000, !dbg !188 + %3361 = select i1 %2880, float %3360, float 0xFFF0000000000000, !dbg !189 + %3362 = fmul float %3303, 0x3FF7154760000000, !dbg !188 + %3363 = select i1 %2882, float %3362, float 0xFFF0000000000000, !dbg !189 + %3364 = fmul float %3304, 0x3FF7154760000000, !dbg !188 + %3365 = select i1 %2884, float %3364, float 0xFFF0000000000000, !dbg !189 + %3366 = fmul float %3305, 0x3FF7154760000000, !dbg !188 + %3367 = select i1 %2886, float %3366, float 0xFFF0000000000000, !dbg !189 + %3368 = fmul float %3306, 0x3FF7154760000000, !dbg !188 + %3369 = select i1 %2884, float %3368, float 0xFFF0000000000000, !dbg !189 + %3370 = fmul float %3307, 0x3FF7154760000000, !dbg !188 + %3371 = select i1 %2886, float %3370, float 0xFFF0000000000000, !dbg !189 + %3372 = fsub float %3309, %381, !dbg !190 + %3373 = fsub float %3311, %381, !dbg !190 + 
%3374 = fsub float %3313, %382, !dbg !190 + %3375 = fsub float %3315, %382, !dbg !190 + %3376 = fsub float %3317, %381, !dbg !190 + %3377 = fsub float %3319, %381, !dbg !190 + %3378 = fsub float %3321, %382, !dbg !190 + %3379 = fsub float %3323, %382, !dbg !190 + %3380 = fsub float %3325, %381, !dbg !190 + %3381 = fsub float %3327, %381, !dbg !190 + %3382 = fsub float %3329, %382, !dbg !190 + %3383 = fsub float %3331, %382, !dbg !190 + %3384 = fsub float %3333, %381, !dbg !190 + %3385 = fsub float %3335, %381, !dbg !190 + %3386 = fsub float %3337, %382, !dbg !190 + %3387 = fsub float %3339, %382, !dbg !190 + %3388 = fsub float %3341, %381, !dbg !190 + %3389 = fsub float %3343, %381, !dbg !190 + %3390 = fsub float %3345, %382, !dbg !190 + %3391 = fsub float %3347, %382, !dbg !190 + %3392 = fsub float %3349, %381, !dbg !190 + %3393 = fsub float %3351, %381, !dbg !190 + %3394 = fsub float %3353, %382, !dbg !190 + %3395 = fsub float %3355, %382, !dbg !190 + %3396 = fsub float %3357, %381, !dbg !190 + %3397 = fsub float %3359, %381, !dbg !190 + %3398 = fsub float %3361, %382, !dbg !190 + %3399 = fsub float %3363, %382, !dbg !190 + %3400 = fsub float %3365, %381, !dbg !190 + %3401 = fsub float %3367, %381, !dbg !190 + %3402 = fsub float %3369, %382, !dbg !190 + %3403 = fsub float %3371, %382, !dbg !190 + %3404 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !191 + %.not.i1336 = icmp eq i32 %3404, 0, !dbg !191 + br i1 %.not.i1336, label %3407, label %3405, !dbg !191 + +3405: ; preds = %2844 + %3406 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3372) #3, !dbg !191 + br label %__nv_exp2f.exit1338, !dbg !191 + +3407: ; preds = %2844 + %3408 = tail call float @llvm.nvvm.ex2.approx.f(float %3372) #3, !dbg !191 + br label %__nv_exp2f.exit1338, !dbg !191 + +__nv_exp2f.exit1338: ; preds = %3405, %3407 + %.0.i1337 = phi float [ %3406, %3405 ], [ %3408, %3407 ], !dbg !191 + %3409 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !191 + %.not.i1339 = 
icmp eq i32 %3409, 0, !dbg !191 + br i1 %.not.i1339, label %3412, label %3410, !dbg !191 + +3410: ; preds = %__nv_exp2f.exit1338 + %3411 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3373) #3, !dbg !191 + br label %__nv_exp2f.exit1341, !dbg !191 + +3412: ; preds = %__nv_exp2f.exit1338 + %3413 = tail call float @llvm.nvvm.ex2.approx.f(float %3373) #3, !dbg !191 + br label %__nv_exp2f.exit1341, !dbg !191 + +__nv_exp2f.exit1341: ; preds = %3410, %3412 + %.0.i1340 = phi float [ %3411, %3410 ], [ %3413, %3412 ], !dbg !191 + %3414 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !191 + %.not.i1342 = icmp eq i32 %3414, 0, !dbg !191 + br i1 %.not.i1342, label %3417, label %3415, !dbg !191 + +3415: ; preds = %__nv_exp2f.exit1341 + %3416 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3374) #3, !dbg !191 + br label %__nv_exp2f.exit1344, !dbg !191 + +3417: ; preds = %__nv_exp2f.exit1341 + %3418 = tail call float @llvm.nvvm.ex2.approx.f(float %3374) #3, !dbg !191 + br label %__nv_exp2f.exit1344, !dbg !191 + +__nv_exp2f.exit1344: ; preds = %3415, %3417 + %.0.i1343 = phi float [ %3416, %3415 ], [ %3418, %3417 ], !dbg !191 + %3419 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !191 + %.not.i1345 = icmp eq i32 %3419, 0, !dbg !191 + br i1 %.not.i1345, label %3422, label %3420, !dbg !191 + +3420: ; preds = %__nv_exp2f.exit1344 + %3421 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3375) #3, !dbg !191 + br label %__nv_exp2f.exit1347, !dbg !191 + +3422: ; preds = %__nv_exp2f.exit1344 + %3423 = tail call float @llvm.nvvm.ex2.approx.f(float %3375) #3, !dbg !191 + br label %__nv_exp2f.exit1347, !dbg !191 + +__nv_exp2f.exit1347: ; preds = %3420, %3422 + %.0.i1346 = phi float [ %3421, %3420 ], [ %3423, %3422 ], !dbg !191 + %3424 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !191 + %.not.i1348 = icmp eq i32 %3424, 0, !dbg !191 + br i1 %.not.i1348, label %3427, label %3425, !dbg !191 + +3425: ; preds = %__nv_exp2f.exit1347 + %3426 = 
tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3376) #3, !dbg !191 + br label %__nv_exp2f.exit1350, !dbg !191 + +3427: ; preds = %__nv_exp2f.exit1347 + %3428 = tail call float @llvm.nvvm.ex2.approx.f(float %3376) #3, !dbg !191 + br label %__nv_exp2f.exit1350, !dbg !191 + +__nv_exp2f.exit1350: ; preds = %3425, %3427 + %.0.i1349 = phi float [ %3426, %3425 ], [ %3428, %3427 ], !dbg !191 + %3429 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !191 + %.not.i1351 = icmp eq i32 %3429, 0, !dbg !191 + br i1 %.not.i1351, label %3432, label %3430, !dbg !191 + +3430: ; preds = %__nv_exp2f.exit1350 + %3431 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3377) #3, !dbg !191 + br label %__nv_exp2f.exit1353, !dbg !191 + +3432: ; preds = %__nv_exp2f.exit1350 + %3433 = tail call float @llvm.nvvm.ex2.approx.f(float %3377) #3, !dbg !191 + br label %__nv_exp2f.exit1353, !dbg !191 + +__nv_exp2f.exit1353: ; preds = %3430, %3432 + %.0.i1352 = phi float [ %3431, %3430 ], [ %3433, %3432 ], !dbg !191 + %3434 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !191 + %.not.i1354 = icmp eq i32 %3434, 0, !dbg !191 + br i1 %.not.i1354, label %3437, label %3435, !dbg !191 + +3435: ; preds = %__nv_exp2f.exit1353 + %3436 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3378) #3, !dbg !191 + br label %__nv_exp2f.exit1356, !dbg !191 + +3437: ; preds = %__nv_exp2f.exit1353 + %3438 = tail call float @llvm.nvvm.ex2.approx.f(float %3378) #3, !dbg !191 + br label %__nv_exp2f.exit1356, !dbg !191 + +__nv_exp2f.exit1356: ; preds = %3435, %3437 + %.0.i1355 = phi float [ %3436, %3435 ], [ %3438, %3437 ], !dbg !191 + %3439 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !191 + %.not.i1357 = icmp eq i32 %3439, 0, !dbg !191 + br i1 %.not.i1357, label %3442, label %3440, !dbg !191 + +3440: ; preds = %__nv_exp2f.exit1356 + %3441 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3379) #3, !dbg !191 + br label %__nv_exp2f.exit1359, !dbg !191 + +3442: ; preds = 
%__nv_exp2f.exit1356 + %3443 = tail call float @llvm.nvvm.ex2.approx.f(float %3379) #3, !dbg !191 + br label %__nv_exp2f.exit1359, !dbg !191 + +__nv_exp2f.exit1359: ; preds = %3440, %3442 + %.0.i1358 = phi float [ %3441, %3440 ], [ %3443, %3442 ], !dbg !191 + %3444 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !191 + %.not.i1360 = icmp eq i32 %3444, 0, !dbg !191 + br i1 %.not.i1360, label %3447, label %3445, !dbg !191 + +3445: ; preds = %__nv_exp2f.exit1359 + %3446 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3380) #3, !dbg !191 + br label %__nv_exp2f.exit1362, !dbg !191 + +3447: ; preds = %__nv_exp2f.exit1359 + %3448 = tail call float @llvm.nvvm.ex2.approx.f(float %3380) #3, !dbg !191 + br label %__nv_exp2f.exit1362, !dbg !191 + +__nv_exp2f.exit1362: ; preds = %3445, %3447 + %.0.i1361 = phi float [ %3446, %3445 ], [ %3448, %3447 ], !dbg !191 + %3449 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !191 + %.not.i1363 = icmp eq i32 %3449, 0, !dbg !191 + br i1 %.not.i1363, label %3452, label %3450, !dbg !191 + +3450: ; preds = %__nv_exp2f.exit1362 + %3451 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3381) #3, !dbg !191 + br label %__nv_exp2f.exit1365, !dbg !191 + +3452: ; preds = %__nv_exp2f.exit1362 + %3453 = tail call float @llvm.nvvm.ex2.approx.f(float %3381) #3, !dbg !191 + br label %__nv_exp2f.exit1365, !dbg !191 + +__nv_exp2f.exit1365: ; preds = %3450, %3452 + %.0.i1364 = phi float [ %3451, %3450 ], [ %3453, %3452 ], !dbg !191 + %3454 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !191 + %.not.i1366 = icmp eq i32 %3454, 0, !dbg !191 + br i1 %.not.i1366, label %3457, label %3455, !dbg !191 + +3455: ; preds = %__nv_exp2f.exit1365 + %3456 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3382) #3, !dbg !191 + br label %__nv_exp2f.exit1368, !dbg !191 + +3457: ; preds = %__nv_exp2f.exit1365 + %3458 = tail call float @llvm.nvvm.ex2.approx.f(float %3382) #3, !dbg !191 + br label %__nv_exp2f.exit1368, !dbg !191 + 
+__nv_exp2f.exit1368: ; preds = %3455, %3457 + %.0.i1367 = phi float [ %3456, %3455 ], [ %3458, %3457 ], !dbg !191 + %3459 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !191 + %.not.i1369 = icmp eq i32 %3459, 0, !dbg !191 + br i1 %.not.i1369, label %3462, label %3460, !dbg !191 + +3460: ; preds = %__nv_exp2f.exit1368 + %3461 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3383) #3, !dbg !191 + br label %__nv_exp2f.exit1371, !dbg !191 + +3462: ; preds = %__nv_exp2f.exit1368 + %3463 = tail call float @llvm.nvvm.ex2.approx.f(float %3383) #3, !dbg !191 + br label %__nv_exp2f.exit1371, !dbg !191 + +__nv_exp2f.exit1371: ; preds = %3460, %3462 + %.0.i1370 = phi float [ %3461, %3460 ], [ %3463, %3462 ], !dbg !191 + %3464 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !191 + %.not.i1372 = icmp eq i32 %3464, 0, !dbg !191 + br i1 %.not.i1372, label %3467, label %3465, !dbg !191 + +3465: ; preds = %__nv_exp2f.exit1371 + %3466 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3384) #3, !dbg !191 + br label %__nv_exp2f.exit1374, !dbg !191 + +3467: ; preds = %__nv_exp2f.exit1371 + %3468 = tail call float @llvm.nvvm.ex2.approx.f(float %3384) #3, !dbg !191 + br label %__nv_exp2f.exit1374, !dbg !191 + +__nv_exp2f.exit1374: ; preds = %3465, %3467 + %.0.i1373 = phi float [ %3466, %3465 ], [ %3468, %3467 ], !dbg !191 + %3469 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !191 + %.not.i1375 = icmp eq i32 %3469, 0, !dbg !191 + br i1 %.not.i1375, label %3472, label %3470, !dbg !191 + +3470: ; preds = %__nv_exp2f.exit1374 + %3471 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3385) #3, !dbg !191 + br label %__nv_exp2f.exit1377, !dbg !191 + +3472: ; preds = %__nv_exp2f.exit1374 + %3473 = tail call float @llvm.nvvm.ex2.approx.f(float %3385) #3, !dbg !191 + br label %__nv_exp2f.exit1377, !dbg !191 + +__nv_exp2f.exit1377: ; preds = %3470, %3472 + %.0.i1376 = phi float [ %3471, %3470 ], [ %3473, %3472 ], !dbg !191 + %3474 = tail call i32 
@__nvvm_reflect(ptr nonnull @.str) #3, !dbg !191 + %.not.i1378 = icmp eq i32 %3474, 0, !dbg !191 + br i1 %.not.i1378, label %3477, label %3475, !dbg !191 + +3475: ; preds = %__nv_exp2f.exit1377 + %3476 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3386) #3, !dbg !191 + br label %__nv_exp2f.exit1380, !dbg !191 + +3477: ; preds = %__nv_exp2f.exit1377 + %3478 = tail call float @llvm.nvvm.ex2.approx.f(float %3386) #3, !dbg !191 + br label %__nv_exp2f.exit1380, !dbg !191 + +__nv_exp2f.exit1380: ; preds = %3475, %3477 + %.0.i1379 = phi float [ %3476, %3475 ], [ %3478, %3477 ], !dbg !191 + %3479 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !191 + %.not.i1381 = icmp eq i32 %3479, 0, !dbg !191 + br i1 %.not.i1381, label %3482, label %3480, !dbg !191 + +3480: ; preds = %__nv_exp2f.exit1380 + %3481 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3387) #3, !dbg !191 + br label %__nv_exp2f.exit1383, !dbg !191 + +3482: ; preds = %__nv_exp2f.exit1380 + %3483 = tail call float @llvm.nvvm.ex2.approx.f(float %3387) #3, !dbg !191 + br label %__nv_exp2f.exit1383, !dbg !191 + +__nv_exp2f.exit1383: ; preds = %3480, %3482 + %.0.i1382 = phi float [ %3481, %3480 ], [ %3483, %3482 ], !dbg !191 + %3484 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !191 + %.not.i1384 = icmp eq i32 %3484, 0, !dbg !191 + br i1 %.not.i1384, label %3487, label %3485, !dbg !191 + +3485: ; preds = %__nv_exp2f.exit1383 + %3486 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3388) #3, !dbg !191 + br label %__nv_exp2f.exit1386, !dbg !191 + +3487: ; preds = %__nv_exp2f.exit1383 + %3488 = tail call float @llvm.nvvm.ex2.approx.f(float %3388) #3, !dbg !191 + br label %__nv_exp2f.exit1386, !dbg !191 + +__nv_exp2f.exit1386: ; preds = %3485, %3487 + %.0.i1385 = phi float [ %3486, %3485 ], [ %3488, %3487 ], !dbg !191 + %3489 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !191 + %.not.i1387 = icmp eq i32 %3489, 0, !dbg !191 + br i1 %.not.i1387, label %3492, label 
%3490, !dbg !191 + +3490: ; preds = %__nv_exp2f.exit1386 + %3491 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3389) #3, !dbg !191 + br label %__nv_exp2f.exit1389, !dbg !191 + +3492: ; preds = %__nv_exp2f.exit1386 + %3493 = tail call float @llvm.nvvm.ex2.approx.f(float %3389) #3, !dbg !191 + br label %__nv_exp2f.exit1389, !dbg !191 + +__nv_exp2f.exit1389: ; preds = %3490, %3492 + %.0.i1388 = phi float [ %3491, %3490 ], [ %3493, %3492 ], !dbg !191 + %3494 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !191 + %.not.i1390 = icmp eq i32 %3494, 0, !dbg !191 + br i1 %.not.i1390, label %3497, label %3495, !dbg !191 + +3495: ; preds = %__nv_exp2f.exit1389 + %3496 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3390) #3, !dbg !191 + br label %__nv_exp2f.exit1392, !dbg !191 + +3497: ; preds = %__nv_exp2f.exit1389 + %3498 = tail call float @llvm.nvvm.ex2.approx.f(float %3390) #3, !dbg !191 + br label %__nv_exp2f.exit1392, !dbg !191 + +__nv_exp2f.exit1392: ; preds = %3495, %3497 + %.0.i1391 = phi float [ %3496, %3495 ], [ %3498, %3497 ], !dbg !191 + %3499 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !191 + %.not.i1393 = icmp eq i32 %3499, 0, !dbg !191 + br i1 %.not.i1393, label %3502, label %3500, !dbg !191 + +3500: ; preds = %__nv_exp2f.exit1392 + %3501 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3391) #3, !dbg !191 + br label %__nv_exp2f.exit1395, !dbg !191 + +3502: ; preds = %__nv_exp2f.exit1392 + %3503 = tail call float @llvm.nvvm.ex2.approx.f(float %3391) #3, !dbg !191 + br label %__nv_exp2f.exit1395, !dbg !191 + +__nv_exp2f.exit1395: ; preds = %3500, %3502 + %.0.i1394 = phi float [ %3501, %3500 ], [ %3503, %3502 ], !dbg !191 + %3504 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !191 + %.not.i1396 = icmp eq i32 %3504, 0, !dbg !191 + br i1 %.not.i1396, label %3507, label %3505, !dbg !191 + +3505: ; preds = %__nv_exp2f.exit1395 + %3506 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3392) #3, !dbg !191 + br 
label %__nv_exp2f.exit1398, !dbg !191 + +3507: ; preds = %__nv_exp2f.exit1395 + %3508 = tail call float @llvm.nvvm.ex2.approx.f(float %3392) #3, !dbg !191 + br label %__nv_exp2f.exit1398, !dbg !191 + +__nv_exp2f.exit1398: ; preds = %3505, %3507 + %.0.i1397 = phi float [ %3506, %3505 ], [ %3508, %3507 ], !dbg !191 + %3509 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !191 + %.not.i1399 = icmp eq i32 %3509, 0, !dbg !191 + br i1 %.not.i1399, label %3512, label %3510, !dbg !191 + +3510: ; preds = %__nv_exp2f.exit1398 + %3511 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3393) #3, !dbg !191 + br label %__nv_exp2f.exit1401, !dbg !191 + +3512: ; preds = %__nv_exp2f.exit1398 + %3513 = tail call float @llvm.nvvm.ex2.approx.f(float %3393) #3, !dbg !191 + br label %__nv_exp2f.exit1401, !dbg !191 + +__nv_exp2f.exit1401: ; preds = %3510, %3512 + %.0.i1400 = phi float [ %3511, %3510 ], [ %3513, %3512 ], !dbg !191 + %3514 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !191 + %.not.i1402 = icmp eq i32 %3514, 0, !dbg !191 + br i1 %.not.i1402, label %3517, label %3515, !dbg !191 + +3515: ; preds = %__nv_exp2f.exit1401 + %3516 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3394) #3, !dbg !191 + br label %__nv_exp2f.exit1404, !dbg !191 + +3517: ; preds = %__nv_exp2f.exit1401 + %3518 = tail call float @llvm.nvvm.ex2.approx.f(float %3394) #3, !dbg !191 + br label %__nv_exp2f.exit1404, !dbg !191 + +__nv_exp2f.exit1404: ; preds = %3515, %3517 + %.0.i1403 = phi float [ %3516, %3515 ], [ %3518, %3517 ], !dbg !191 + %3519 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !191 + %.not.i1405 = icmp eq i32 %3519, 0, !dbg !191 + br i1 %.not.i1405, label %3522, label %3520, !dbg !191 + +3520: ; preds = %__nv_exp2f.exit1404 + %3521 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3395) #3, !dbg !191 + br label %__nv_exp2f.exit1407, !dbg !191 + +3522: ; preds = %__nv_exp2f.exit1404 + %3523 = tail call float @llvm.nvvm.ex2.approx.f(float %3395) 
#3, !dbg !191 + br label %__nv_exp2f.exit1407, !dbg !191 + +__nv_exp2f.exit1407: ; preds = %3520, %3522 + %.0.i1406 = phi float [ %3521, %3520 ], [ %3523, %3522 ], !dbg !191 + %3524 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !191 + %.not.i1408 = icmp eq i32 %3524, 0, !dbg !191 + br i1 %.not.i1408, label %3527, label %3525, !dbg !191 + +3525: ; preds = %__nv_exp2f.exit1407 + %3526 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3396) #3, !dbg !191 + br label %__nv_exp2f.exit1410, !dbg !191 + +3527: ; preds = %__nv_exp2f.exit1407 + %3528 = tail call float @llvm.nvvm.ex2.approx.f(float %3396) #3, !dbg !191 + br label %__nv_exp2f.exit1410, !dbg !191 + +__nv_exp2f.exit1410: ; preds = %3525, %3527 + %.0.i1409 = phi float [ %3526, %3525 ], [ %3528, %3527 ], !dbg !191 + %3529 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !191 + %.not.i1411 = icmp eq i32 %3529, 0, !dbg !191 + br i1 %.not.i1411, label %3532, label %3530, !dbg !191 + +3530: ; preds = %__nv_exp2f.exit1410 + %3531 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3397) #3, !dbg !191 + br label %__nv_exp2f.exit1413, !dbg !191 + +3532: ; preds = %__nv_exp2f.exit1410 + %3533 = tail call float @llvm.nvvm.ex2.approx.f(float %3397) #3, !dbg !191 + br label %__nv_exp2f.exit1413, !dbg !191 + +__nv_exp2f.exit1413: ; preds = %3530, %3532 + %.0.i1412 = phi float [ %3531, %3530 ], [ %3533, %3532 ], !dbg !191 + %3534 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !191 + %.not.i1414 = icmp eq i32 %3534, 0, !dbg !191 + br i1 %.not.i1414, label %3537, label %3535, !dbg !191 + +3535: ; preds = %__nv_exp2f.exit1413 + %3536 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3398) #3, !dbg !191 + br label %__nv_exp2f.exit1416, !dbg !191 + +3537: ; preds = %__nv_exp2f.exit1413 + %3538 = tail call float @llvm.nvvm.ex2.approx.f(float %3398) #3, !dbg !191 + br label %__nv_exp2f.exit1416, !dbg !191 + +__nv_exp2f.exit1416: ; preds = %3535, %3537 + %.0.i1415 = phi float [ %3536, %3535 
], [ %3538, %3537 ], !dbg !191 + %3539 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !191 + %.not.i1417 = icmp eq i32 %3539, 0, !dbg !191 + br i1 %.not.i1417, label %3542, label %3540, !dbg !191 + +3540: ; preds = %__nv_exp2f.exit1416 + %3541 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3399) #3, !dbg !191 + br label %__nv_exp2f.exit1419, !dbg !191 + +3542: ; preds = %__nv_exp2f.exit1416 + %3543 = tail call float @llvm.nvvm.ex2.approx.f(float %3399) #3, !dbg !191 + br label %__nv_exp2f.exit1419, !dbg !191 + +__nv_exp2f.exit1419: ; preds = %3540, %3542 + %.0.i1418 = phi float [ %3541, %3540 ], [ %3543, %3542 ], !dbg !191 + %3544 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !191 + %.not.i1420 = icmp eq i32 %3544, 0, !dbg !191 + br i1 %.not.i1420, label %3547, label %3545, !dbg !191 + +3545: ; preds = %__nv_exp2f.exit1419 + %3546 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3400) #3, !dbg !191 + br label %__nv_exp2f.exit1422, !dbg !191 + +3547: ; preds = %__nv_exp2f.exit1419 + %3548 = tail call float @llvm.nvvm.ex2.approx.f(float %3400) #3, !dbg !191 + br label %__nv_exp2f.exit1422, !dbg !191 + +__nv_exp2f.exit1422: ; preds = %3545, %3547 + %.0.i1421 = phi float [ %3546, %3545 ], [ %3548, %3547 ], !dbg !191 + %3549 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !191 + %.not.i1423 = icmp eq i32 %3549, 0, !dbg !191 + br i1 %.not.i1423, label %3552, label %3550, !dbg !191 + +3550: ; preds = %__nv_exp2f.exit1422 + %3551 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3401) #3, !dbg !191 + br label %__nv_exp2f.exit1425, !dbg !191 + +3552: ; preds = %__nv_exp2f.exit1422 + %3553 = tail call float @llvm.nvvm.ex2.approx.f(float %3401) #3, !dbg !191 + br label %__nv_exp2f.exit1425, !dbg !191 + +__nv_exp2f.exit1425: ; preds = %3550, %3552 + %.0.i1424 = phi float [ %3551, %3550 ], [ %3553, %3552 ], !dbg !191 + %3554 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !191 + %.not.i1426 = icmp eq i32 %3554, 0, 
!dbg !191 + br i1 %.not.i1426, label %3557, label %3555, !dbg !191 + +3555: ; preds = %__nv_exp2f.exit1425 + %3556 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3402) #3, !dbg !191 + br label %__nv_exp2f.exit1428, !dbg !191 + +3557: ; preds = %__nv_exp2f.exit1425 + %3558 = tail call float @llvm.nvvm.ex2.approx.f(float %3402) #3, !dbg !191 + br label %__nv_exp2f.exit1428, !dbg !191 + +__nv_exp2f.exit1428: ; preds = %3555, %3557 + %.0.i1427 = phi float [ %3556, %3555 ], [ %3558, %3557 ], !dbg !191 + %3559 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !191 + %.not.i1429 = icmp eq i32 %3559, 0, !dbg !191 + br i1 %.not.i1429, label %3562, label %3560, !dbg !191 + +3560: ; preds = %__nv_exp2f.exit1428 + %3561 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3403) #3, !dbg !191 + br label %__nv_exp2f.exit1431, !dbg !191 + +3562: ; preds = %__nv_exp2f.exit1428 + %3563 = tail call float @llvm.nvvm.ex2.approx.f(float %3403) #3, !dbg !191 + br label %__nv_exp2f.exit1431, !dbg !191 + +__nv_exp2f.exit1431: ; preds = %3560, %3562 + %.0.i1430 = phi float [ %3561, %3560 ], [ %3563, %3562 ], !dbg !191 + %3564 = getelementptr bfloat, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %2887, !dbg !182 + tail call void @llvm.nvvm.wgmma.fence.sync.aligned(), !dbg !192 + %3565 = add i32 %2891, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072) to i32), !dbg !192 + %3566 = lshr exact i32 %3565, 4, !dbg !192 + %3567 = and i32 %3566, 16383, !dbg !192 + %3568 = zext nneg i32 %3567 to i64, !dbg !192 + %3569 = or disjoint i64 %3568, 4611686293372403712, !dbg !192 + %3570 = ptrtoint ptr addrspace(3) %3564 to i32, !dbg !192 + %3571 = lshr exact i32 %3570, 4, !dbg !192 + %3572 = and i32 %3571, 16383, !dbg !192 + %3573 = zext nneg i32 %3572 to i64, !dbg !192 + %3574 = or disjoint i64 %3573, 4611686293338849280, !dbg !192 + %3575 = tail call { float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $32, $33, 0, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,l,l"(i64 %3569, i64 %3574) #3, !dbg !192 + %3576 = add i32 %2903, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072) to i32), !dbg !192 + %3577 = lshr exact i32 %3576, 4, !dbg !192 + %3578 = and i32 %3577, 16383, !dbg !192 + %3579 = zext nneg i32 %3578 to i64, !dbg !192 + %3580 = or disjoint i64 %3579, 4611686293372403712, !dbg !192 + %3581 = add i32 %3570, 32, !dbg !192 + %3582 = lshr exact i32 %3581, 4, !dbg !192 + %3583 = and i32 %3582, 16383, !dbg !192 + %3584 = zext nneg i32 %3583 to i64, !dbg !192 + %3585 = or disjoint i64 %3584, 4611686293338849280, !dbg !192 + %3586 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3575, 0, !dbg !192 + %3587 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3575, 1, !dbg !192 + %3588 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3575, 2, !dbg !192 + %3589 = extractvalue { float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3575, 3, !dbg !192 + %3590 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3575, 4, !dbg !192 + %3591 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3575, 5, !dbg !192 + %3592 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3575, 6, !dbg !192 + %3593 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3575, 7, !dbg !192 + %3594 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3575, 8, !dbg !192 + %3595 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3575, 9, !dbg !192 + %3596 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float } %3575, 10, !dbg !192 + %3597 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3575, 11, !dbg !192 + %3598 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3575, 12, !dbg !192 + %3599 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3575, 13, !dbg !192 + %3600 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3575, 14, !dbg !192 + %3601 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3575, 15, !dbg !192 + %3602 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3575, 16, !dbg !192 + %3603 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3575, 17, !dbg !192 + %3604 = extractvalue { float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3575, 18, !dbg !192 + %3605 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3575, 19, !dbg !192 + %3606 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3575, 20, !dbg !192 + %3607 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3575, 21, !dbg !192 + %3608 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3575, 22, !dbg !192 + %3609 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3575, 23, !dbg !192 + %3610 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3575, 24, !dbg !192 + %3611 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3575, 25, !dbg !192 + %3612 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3575, 26, !dbg !192 + %3613 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3575, 27, !dbg !192 + %3614 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3575, 28, !dbg !192 + %3615 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3575, 29, !dbg !192 + %3616 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3575, 30, !dbg !192 + %3617 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3575, 31, !dbg !192 + %3618 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %3586, float %3587, float %3588, float %3589, float %3590, float %3591, float %3592, float %3593, float %3594, float %3595, float %3596, float %3597, float %3598, float %3599, float %3600, float %3601, float %3602, float %3603, float %3604, float %3605, float %3606, float %3607, float %3608, float %3609, float %3610, float %3611, float %3612, float %3613, float %3614, float %3615, float %3616, float %3617, i64 %3580, i64 %3585, i1 true) #3, !dbg !192 + %3619 = add i32 %2947, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072) to i32), !dbg !192 + %3620 = lshr exact i32 %3619, 4, !dbg !192 + %3621 = and i32 %3620, 16383, !dbg !192 + %3622 = zext nneg i32 %3621 to i64, !dbg !192 + %3623 = or disjoint i64 %3622, 4611686293372403712, !dbg !192 + %3624 = add i32 %3570, 64, !dbg !192 + %3625 = lshr exact i32 %3624, 4, !dbg !192 + %3626 = and i32 %3625, 16383, !dbg !192 + %3627 = zext nneg i32 %3626 to i64, !dbg !192 + %3628 = or disjoint i64 %3627, 4611686293338849280, !dbg !192 + %3629 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3618, 0, !dbg !192 + %3630 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3618, 1, !dbg !192 + 
%3631 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3618, 2, !dbg !192 + %3632 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3618, 3, !dbg !192 + %3633 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3618, 4, !dbg !192 + %3634 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3618, 5, !dbg !192 + %3635 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3618, 6, !dbg !192 + %3636 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3618, 7, !dbg !192 + %3637 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3618, 8, !dbg !192 + %3638 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3618, 9, !dbg !192 + %3639 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3618, 10, !dbg !192 + %3640 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3618, 11, !dbg !192 + %3641 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3618, 12, !dbg !192 + %3642 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3618, 13, !dbg !192 + %3643 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3618, 14, !dbg !192 + %3644 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3618, 15, !dbg !192 + %3645 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float } %3618, 16, !dbg !192 + %3646 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3618, 17, !dbg !192 + %3647 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3618, 18, !dbg !192 + %3648 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3618, 19, !dbg !192 + %3649 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3618, 20, !dbg !192 + %3650 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3618, 21, !dbg !192 + %3651 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3618, 22, !dbg !192 + %3652 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3618, 23, !dbg !192 + %3653 = extractvalue { float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3618, 24, !dbg !192 + %3654 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3618, 25, !dbg !192 + %3655 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3618, 26, !dbg !192 + %3656 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3618, 27, !dbg !192 + %3657 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3618, 28, !dbg !192 + %3658 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3618, 29, !dbg !192 + %3659 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3618, 30, !dbg !192 + %3660 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3618, 31, !dbg !192 + %3661 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %3629, float %3630, float %3631, float %3632, float %3633, float %3634, float %3635, float %3636, float %3637, float %3638, float %3639, float %3640, float %3641, float %3642, float %3643, float %3644, float %3645, float %3646, float %3647, float %3648, float %3649, float %3650, float %3651, float %3652, float %3653, float %3654, float %3655, float %3656, float %3657, float %3658, float %3659, float %3660, i64 %3623, i64 %3628, i1 true) #3, !dbg !192 + %3662 = add i32 %2991, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072) to i32), !dbg !192 + %3663 = lshr exact i32 %3662, 4, !dbg !192 + %3664 = and i32 %3663, 16383, !dbg !192 + %3665 = zext nneg i32 %3664 to i64, !dbg !192 + %3666 = or disjoint i64 %3665, 4611686293372403712, !dbg !192 + %3667 = add i32 %3570, 96, !dbg !192 + %3668 = lshr exact i32 %3667, 4, !dbg !192 + %3669 = and i32 %3668, 16383, !dbg !192 + %3670 = zext nneg i32 %3669 to i64, !dbg !192 + %3671 = or disjoint i64 %3670, 4611686293338849280, !dbg !192 + %3672 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float } %3661, 0, !dbg !192 + %3673 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3661, 1, !dbg !192 + %3674 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3661, 2, !dbg !192 + %3675 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3661, 3, !dbg !192 + %3676 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3661, 4, !dbg !192 + %3677 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3661, 5, !dbg !192 + %3678 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3661, 6, !dbg !192 + %3679 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3661, 7, !dbg !192 + %3680 = 
extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3661, 8, !dbg !192 + %3681 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3661, 9, !dbg !192 + %3682 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3661, 10, !dbg !192 + %3683 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3661, 11, !dbg !192 + %3684 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3661, 12, !dbg !192 + %3685 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3661, 13, !dbg !192 + %3686 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3661, 14, !dbg !192 + %3687 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3661, 15, !dbg !192 + %3688 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3661, 16, !dbg !192 + %3689 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3661, 17, !dbg !192 + %3690 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3661, 18, !dbg !192 + %3691 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3661, 19, !dbg !192 + %3692 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3661, 20, !dbg !192 + %3693 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3661, 21, !dbg !192 + %3694 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float } %3661, 22, !dbg !192 + %3695 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3661, 23, !dbg !192 + %3696 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3661, 24, !dbg !192 + %3697 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3661, 25, !dbg !192 + %3698 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3661, 26, !dbg !192 + %3699 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3661, 27, !dbg !192 + %3700 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3661, 28, !dbg !192 + %3701 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3661, 29, !dbg !192 + %3702 = extractvalue { float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3661, 30, !dbg !192 + %3703 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3661, 31, !dbg !192 + %3704 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %3672, float %3673, float %3674, float %3675, float %3676, float %3677, float %3678, float %3679, float %3680, float %3681, float %3682, float %3683, float %3684, float %3685, float %3686, float %3687, float %3688, float %3689, float %3690, float %3691, float %3692, float %3693, float %3694, float %3695, float %3696, float %3697, float %3698, float %3699, float %3700, float %3701, float %3702, float %3703, i64 %3666, i64 %3671, i1 true) #3, !dbg !192 + %3705 = add i32 %3035, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072) to i32), !dbg !192 + %3706 = lshr exact i32 %3705, 4, !dbg !192 + %3707 = and i32 %3706, 16383, !dbg !192 + %3708 = zext nneg i32 %3707 to i64, !dbg !192 + %3709 = or disjoint i64 %3708, 4611686293372403712, !dbg !192 + %3710 = add i32 %3570, 8192, !dbg !192 + 
%3711 = lshr exact i32 %3710, 4, !dbg !192 + %3712 = and i32 %3711, 16383, !dbg !192 + %3713 = zext nneg i32 %3712 to i64, !dbg !192 + %3714 = or disjoint i64 %3713, 4611686293338849280, !dbg !192 + %3715 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3704, 0, !dbg !192 + %3716 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3704, 1, !dbg !192 + %3717 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3704, 2, !dbg !192 + %3718 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3704, 3, !dbg !192 + %3719 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3704, 4, !dbg !192 + %3720 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3704, 5, !dbg !192 + %3721 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float } %3704, 6, !dbg !192 + %3722 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3704, 7, !dbg !192 + %3723 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3704, 8, !dbg !192 + %3724 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3704, 9, !dbg !192 + %3725 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3704, 10, !dbg !192 + %3726 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3704, 11, !dbg !192 + %3727 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3704, 12, !dbg !192 + %3728 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3704, 13, !dbg !192 + %3729 = 
extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3704, 14, !dbg !192 + %3730 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3704, 15, !dbg !192 + %3731 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3704, 16, !dbg !192 + %3732 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3704, 17, !dbg !192 + %3733 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3704, 18, !dbg !192 + %3734 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3704, 19, !dbg !192 + %3735 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3704, 20, !dbg !192 + %3736 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3704, 21, !dbg !192 + %3737 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3704, 22, !dbg !192 + %3738 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3704, 23, !dbg !192 + %3739 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3704, 24, !dbg !192 + %3740 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3704, 25, !dbg !192 + %3741 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3704, 26, !dbg !192 + %3742 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3704, 27, !dbg !192 + %3743 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float } %3704, 28, !dbg !192 + %3744 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3704, 29, !dbg !192 + %3745 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3704, 30, !dbg !192 + %3746 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3704, 31, !dbg !192 + %3747 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %3715, float %3716, float %3717, float %3718, float %3719, float %3720, float %3721, float %3722, float %3723, float %3724, float %3725, float %3726, float %3727, float %3728, float %3729, float %3730, float %3731, float %3732, float %3733, float %3734, float %3735, float %3736, float %3737, float %3738, float %3739, float %3740, float %3741, float %3742, float %3743, float %3744, float %3745, float %3746, i64 %3709, i64 %3714, i1 true) #3, !dbg !192 + 
%3748 = add i32 %3079, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072) to i32), !dbg !192 + %3749 = lshr exact i32 %3748, 4, !dbg !192 + %3750 = and i32 %3749, 16383, !dbg !192 + %3751 = zext nneg i32 %3750 to i64, !dbg !192 + %3752 = or disjoint i64 %3751, 4611686293372403712, !dbg !192 + %3753 = add i32 %3570, 8224, !dbg !192 + %3754 = lshr exact i32 %3753, 4, !dbg !192 + %3755 = and i32 %3754, 16383, !dbg !192 + %3756 = zext nneg i32 %3755 to i64, !dbg !192 + %3757 = or disjoint i64 %3756, 4611686293338849280, !dbg !192 + %3758 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3747, 0, !dbg !192 + %3759 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3747, 1, !dbg !192 + %3760 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3747, 2, !dbg !192 + %3761 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3747, 3, !dbg !192 + %3762 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3747, 4, !dbg !192 + %3763 = extractvalue { float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3747, 5, !dbg !192 + %3764 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3747, 6, !dbg !192 + %3765 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3747, 7, !dbg !192 + %3766 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3747, 8, !dbg !192 + %3767 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3747, 9, !dbg !192 + %3768 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3747, 10, !dbg !192 + %3769 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3747, 11, !dbg !192 + %3770 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float } %3747, 12, !dbg !192 + %3771 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3747, 13, !dbg !192 + %3772 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3747, 14, !dbg !192 + %3773 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3747, 15, !dbg !192 + %3774 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3747, 16, !dbg !192 + %3775 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3747, 17, !dbg !192 + %3776 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3747, 18, !dbg !192 + %3777 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3747, 19, !dbg !192 + %3778 = 
extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3747, 20, !dbg !192 + %3779 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3747, 21, !dbg !192 + %3780 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3747, 22, !dbg !192 + %3781 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3747, 23, !dbg !192 + %3782 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3747, 24, !dbg !192 + %3783 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3747, 25, !dbg !192 + %3784 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3747, 26, !dbg !192 + %3785 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3747, 27, !dbg !192 + %3786 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3747, 28, !dbg !192 + %3787 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3747, 29, !dbg !192 + %3788 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3747, 30, !dbg !192 + %3789 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3747, 31, !dbg !192 + %3790 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %3758, float %3759, float %3760, float %3761, float %3762, float %3763, float %3764, float 
%3765, float %3766, float %3767, float %3768, float %3769, float %3770, float %3771, float %3772, float %3773, float %3774, float %3775, float %3776, float %3777, float %3778, float %3779, float %3780, float %3781, float %3782, float %3783, float %3784, float %3785, float %3786, float %3787, float %3788, float %3789, i64 %3752, i64 %3757, i1 true) #3, !dbg !192 + %3791 = add i32 %3123, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072) to i32), !dbg !192 + %3792 = lshr exact i32 %3791, 4, !dbg !192 + %3793 = and i32 %3792, 16383, !dbg !192 + %3794 = zext nneg i32 %3793 to i64, !dbg !192 + %3795 = or disjoint i64 %3794, 4611686293372403712, !dbg !192 + %3796 = add i32 %3570, 8256, !dbg !192 + %3797 = lshr exact i32 %3796, 4, !dbg !192 + %3798 = and i32 %3797, 16383, !dbg !192 + %3799 = zext nneg i32 %3798 to i64, !dbg !192 + %3800 = or disjoint i64 %3799, 4611686293338849280, !dbg !192 + %3801 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3790, 0, !dbg !192 + %3802 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3790, 1, !dbg !192 + %3803 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3790, 2, !dbg !192 + %3804 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } 
%3790, 3, !dbg !192 + %3805 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3790, 4, !dbg !192 + %3806 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3790, 5, !dbg !192 + %3807 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3790, 6, !dbg !192 + %3808 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3790, 7, !dbg !192 + %3809 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3790, 8, !dbg !192 + %3810 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3790, 9, !dbg !192 + %3811 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3790, 10, !dbg !192 + %3812 = extractvalue { float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3790, 11, !dbg !192 + %3813 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3790, 12, !dbg !192 + %3814 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3790, 13, !dbg !192 + %3815 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3790, 14, !dbg !192 + %3816 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3790, 15, !dbg !192 + %3817 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3790, 16, !dbg !192 + %3818 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3790, 17, !dbg !192 + %3819 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float } %3790, 18, !dbg !192 + %3820 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3790, 19, !dbg !192 + %3821 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3790, 20, !dbg !192 + %3822 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3790, 21, !dbg !192 + %3823 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3790, 22, !dbg !192 + %3824 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3790, 23, !dbg !192 + %3825 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3790, 24, !dbg !192 + %3826 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3790, 25, !dbg !192 + %3827 = 
extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3790, 26, !dbg !192 + %3828 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3790, 27, !dbg !192 + %3829 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3790, 28, !dbg !192 + %3830 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3790, 29, !dbg !192 + %3831 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3790, 30, !dbg !192 + %3832 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3790, 31, !dbg !192 + %3833 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 
{$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %3801, float %3802, float %3803, float %3804, float %3805, float %3806, float %3807, float %3808, float %3809, float %3810, float %3811, float %3812, float %3813, float %3814, float %3815, float %3816, float %3817, float %3818, float %3819, float %3820, float %3821, float %3822, float %3823, float %3824, float %3825, float %3826, float %3827, float %3828, float %3829, float %3830, float %3831, float %3832, i64 %3795, i64 %3800, i1 true) #3, !dbg !192 + %3834 = add i32 %3167, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072) to i32), !dbg !192 + %3835 = lshr exact i32 %3834, 4, !dbg !192 + %3836 = and i32 %3835, 16383, !dbg !192 + %3837 = zext nneg i32 %3836 to i64, !dbg !192 + %3838 = or disjoint i64 %3837, 4611686293372403712, !dbg !192 + %3839 = add i32 %3570, 8288, !dbg !192 + %3840 = lshr exact i32 %3839, 4, !dbg !192 + %3841 = and i32 %3840, 16383, !dbg !192 + %3842 = zext nneg i32 %3841 to i64, !dbg !192 + %3843 = or disjoint i64 %3842, 4611686293338849280, !dbg !192 + %3844 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3833, 0, !dbg !192 + %3845 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3833, 1, !dbg !192 + %3846 = extractvalue { float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3833, 2, !dbg !192 + %3847 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3833, 3, !dbg !192 + %3848 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3833, 4, !dbg !192 + %3849 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3833, 5, !dbg !192 + %3850 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3833, 6, !dbg !192 + %3851 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3833, 7, !dbg !192 + %3852 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3833, 8, !dbg !192 + %3853 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float } %3833, 9, !dbg !192 + %3854 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3833, 10, !dbg !192 + %3855 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3833, 11, !dbg !192 + %3856 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3833, 12, !dbg !192 + %3857 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3833, 13, !dbg !192 + %3858 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3833, 14, !dbg !192 + %3859 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3833, 15, !dbg !192 + %3860 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3833, 16, !dbg !192 + %3861 = 
extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3833, 17, !dbg !192 + %3862 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3833, 18, !dbg !192 + %3863 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3833, 19, !dbg !192 + %3864 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3833, 20, !dbg !192 + %3865 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3833, 21, !dbg !192 + %3866 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3833, 22, !dbg !192 + %3867 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3833, 23, !dbg !192 + %3868 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3833, 24, !dbg !192 + %3869 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3833, 25, !dbg !192 + %3870 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3833, 26, !dbg !192 + %3871 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3833, 27, !dbg !192 + %3872 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3833, 28, !dbg !192 + %3873 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3833, 29, !dbg !192 + %3874 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3833, 30, !dbg !192 + %3875 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float } %3833, 31, !dbg !192 + %3876 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %3844, float %3845, float %3846, float %3847, float %3848, float %3849, float %3850, float %3851, float %3852, float %3853, float %3854, float %3855, float %3856, float %3857, float %3858, float %3859, float %3860, float %3861, float %3862, float %3863, float %3864, float %3865, float %3866, float %3867, float %3868, float %3869, float %3870, float %3871, float %3872, float %3873, float %3874, float %3875, i64 %3838, i64 %3843, i1 true) #3, !dbg !192 + %3877 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3876, 0, !dbg !192 + %3878 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3876, 1, !dbg !192 + %3879 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3876, 2, !dbg !192 + %3880 
= extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3876, 3, !dbg !192 + %3881 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3876, 4, !dbg !192 + %3882 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3876, 5, !dbg !192 + %3883 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3876, 6, !dbg !192 + %3884 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3876, 7, !dbg !192 + %3885 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3876, 8, !dbg !192 + %3886 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3876, 9, !dbg !192 + %3887 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3876, 10, !dbg !192 + %3888 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3876, 11, !dbg !192 + %3889 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3876, 12, !dbg !192 + %3890 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3876, 13, !dbg !192 + %3891 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3876, 14, !dbg !192 + %3892 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3876, 15, !dbg !192 + %3893 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3876, 16, !dbg !192 + %3894 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float } %3876, 17, !dbg !192 + %3895 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3876, 18, !dbg !192 + %3896 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3876, 19, !dbg !192 + %3897 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3876, 20, !dbg !192 + %3898 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3876, 21, !dbg !192 + %3899 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3876, 22, !dbg !192 + %3900 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3876, 23, !dbg !192 + %3901 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3876, 24, !dbg !192 + %3902 = extractvalue { float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3876, 25, !dbg !192 + %3903 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3876, 26, !dbg !192 + %3904 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3876, 27, !dbg !192 + %3905 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3876, 28, !dbg !192 + %3906 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3876, 29, !dbg !192 + %3907 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3876, 30, !dbg !192 + %3908 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3876, 31, !dbg !192 + tail call void @llvm.nvvm.wgmma.commit_group.sync.aligned(), !dbg !192 + %3909 = tail call { float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } asm sideeffect "// wait for regs: $0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37\0A\09wgmma.wait_group.sync.aligned 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37"(float %3877, float %3878, float %3879, float %3880, float %3881, float %3882, float %3883, float %3884, float %3885, float %3886, float %3887, float %3888, float %3889, float %3890, float %3891, float %3892, float %3893, float %3894, float %3895, float %3896, float %3897, float %3898, float %3899, float %3900, float %3901, float %3902, float %3903, float %3904, float %3905, float %3906, float %3907, float %3908, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072), i32 0, i32 0, ptr addrspace(3) %3564, i32 0, i32 0) #3, !dbg !192 + %3910 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3909, 0, !dbg !192 + %3911 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3909, 1, !dbg !192 + %3912 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3909, 2, !dbg !192 + %3913 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3909, 3, !dbg !192 + %3914 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3909, 4, !dbg !192 + %3915 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3909, 5, !dbg !192 + %3916 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3909, 6, !dbg !192 + %3917 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3909, 7, !dbg !192 + %3918 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3909, 8, !dbg !192 + %3919 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3909, 9, !dbg !192 + %3920 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3909, 10, !dbg !192 + %3921 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3909, 11, !dbg !192 + %3922 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3909, 12, !dbg !192 + %3923 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3909, 13, !dbg !192 + %3924 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3909, 14, !dbg !192 + %3925 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3909, 15, !dbg !192 + %3926 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3909, 16, !dbg !192 + %3927 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3909, 17, !dbg !192 + %3928 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3909, 18, !dbg !192 + %3929 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3909, 19, !dbg !192 + %3930 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3909, 20, !dbg !192 + %3931 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3909, 21, !dbg !192 + %3932 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3909, 22, !dbg !192 + %3933 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3909, 23, !dbg !192 + %3934 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3909, 24, !dbg !192 + %3935 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3909, 25, !dbg !192 + %3936 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr 
addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3909, 26, !dbg !192 + %3937 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3909, 27, !dbg !192 + %3938 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3909, 28, !dbg !192 + %3939 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3909, 29, !dbg !192 + %3940 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3909, 30, !dbg !192 + %3941 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3909, 31, !dbg !192 + %3942 = fsub float %3910, %370, !dbg !193 + %3943 = fsub float %3911, %370, !dbg !193 + %3944 = fsub float %3912, %372, !dbg !193 + %3945 = fsub float %3913, %372, !dbg !193 + %3946 = fsub float %3914, %370, !dbg !193 + %3947 = fsub float %3915, %370, !dbg !193 + %3948 = fsub float 
%3916, %372, !dbg !193 + %3949 = fsub float %3917, %372, !dbg !193 + %3950 = fsub float %3918, %370, !dbg !193 + %3951 = fsub float %3919, %370, !dbg !193 + %3952 = fsub float %3920, %372, !dbg !193 + %3953 = fsub float %3921, %372, !dbg !193 + %3954 = fsub float %3922, %370, !dbg !193 + %3955 = fsub float %3923, %370, !dbg !193 + %3956 = fsub float %3924, %372, !dbg !193 + %3957 = fsub float %3925, %372, !dbg !193 + %3958 = fsub float %3926, %370, !dbg !193 + %3959 = fsub float %3927, %370, !dbg !193 + %3960 = fsub float %3928, %372, !dbg !193 + %3961 = fsub float %3929, %372, !dbg !193 + %3962 = fsub float %3930, %370, !dbg !193 + %3963 = fsub float %3931, %370, !dbg !193 + %3964 = fsub float %3932, %372, !dbg !193 + %3965 = fsub float %3933, %372, !dbg !193 + %3966 = fsub float %3934, %370, !dbg !193 + %3967 = fsub float %3935, %370, !dbg !193 + %3968 = fsub float %3936, %372, !dbg !193 + %3969 = fsub float %3937, %372, !dbg !193 + %3970 = fsub float %3938, %370, !dbg !193 + %3971 = fsub float %3939, %370, !dbg !193 + %3972 = fsub float %3940, %372, !dbg !193 + %3973 = fsub float %3941, %372, !dbg !193 + %3974 = fmul float %.0.i1337, %3942, !dbg !194 + %3975 = fmul float %.0.i1340, %3943, !dbg !194 + %3976 = fmul float %.0.i1343, %3944, !dbg !194 + %3977 = fmul float %.0.i1346, %3945, !dbg !194 + %3978 = fmul float %.0.i1349, %3946, !dbg !194 + %3979 = fmul float %.0.i1352, %3947, !dbg !194 + %3980 = fmul float %.0.i1355, %3948, !dbg !194 + %3981 = fmul float %.0.i1358, %3949, !dbg !194 + %3982 = fmul float %.0.i1361, %3950, !dbg !194 + %3983 = fmul float %.0.i1364, %3951, !dbg !194 + %3984 = fmul float %.0.i1367, %3952, !dbg !194 + %3985 = fmul float %.0.i1370, %3953, !dbg !194 + %3986 = fmul float %.0.i1373, %3954, !dbg !194 + %3987 = fmul float %.0.i1376, %3955, !dbg !194 + %3988 = fmul float %.0.i1379, %3956, !dbg !194 + %3989 = fmul float %.0.i1382, %3957, !dbg !194 + %3990 = fmul float %.0.i1385, %3958, !dbg !194 + %3991 = fmul float %.0.i1388, %3959, !dbg 
!194 + %3992 = fmul float %.0.i1391, %3960, !dbg !194 + %3993 = fmul float %.0.i1394, %3961, !dbg !194 + %3994 = fmul float %.0.i1397, %3962, !dbg !194 + %3995 = fmul float %.0.i1400, %3963, !dbg !194 + %3996 = fmul float %.0.i1403, %3964, !dbg !194 + %3997 = fmul float %.0.i1406, %3965, !dbg !194 + %3998 = fmul float %.0.i1409, %3966, !dbg !194 + %3999 = fmul float %.0.i1412, %3967, !dbg !194 + %4000 = fmul float %.0.i1415, %3968, !dbg !194 + %4001 = fmul float %.0.i1418, %3969, !dbg !194 + %4002 = fmul float %.0.i1421, %3970, !dbg !194 + %4003 = fmul float %.0.i1424, %3971, !dbg !194 + %4004 = fmul float %.0.i1427, %3972, !dbg !194 + %4005 = fmul float %.0.i1430, %3973, !dbg !194 + %4006 = fptrunc float %3974 to bfloat, !dbg !195 + %4007 = select i1 %2856, bfloat %4006, bfloat 0xR0000, !dbg !196 + %4008 = fptrunc float %3975 to bfloat, !dbg !195 + %4009 = select i1 %2858, bfloat %4008, bfloat 0xR0000, !dbg !196 + %4010 = fptrunc float %3976 to bfloat, !dbg !195 + %4011 = select i1 %2856, bfloat %4010, bfloat 0xR0000, !dbg !196 + %4012 = fptrunc float %3977 to bfloat, !dbg !195 + %4013 = select i1 %2858, bfloat %4012, bfloat 0xR0000, !dbg !196 + %4014 = fptrunc float %3978 to bfloat, !dbg !195 + %4015 = select i1 %2860, bfloat %4014, bfloat 0xR0000, !dbg !196 + %4016 = fptrunc float %3979 to bfloat, !dbg !195 + %4017 = select i1 %2862, bfloat %4016, bfloat 0xR0000, !dbg !196 + %4018 = fptrunc float %3980 to bfloat, !dbg !195 + %4019 = select i1 %2860, bfloat %4018, bfloat 0xR0000, !dbg !196 + %4020 = fptrunc float %3981 to bfloat, !dbg !195 + %4021 = select i1 %2862, bfloat %4020, bfloat 0xR0000, !dbg !196 + %4022 = fptrunc float %3982 to bfloat, !dbg !195 + %4023 = select i1 %2864, bfloat %4022, bfloat 0xR0000, !dbg !196 + %4024 = fptrunc float %3983 to bfloat, !dbg !195 + %4025 = select i1 %2866, bfloat %4024, bfloat 0xR0000, !dbg !196 + %4026 = fptrunc float %3984 to bfloat, !dbg !195 + %4027 = select i1 %2864, bfloat %4026, bfloat 0xR0000, !dbg !196 + %4028 = 
fptrunc float %3985 to bfloat, !dbg !195 + %4029 = select i1 %2866, bfloat %4028, bfloat 0xR0000, !dbg !196 + %4030 = fptrunc float %3986 to bfloat, !dbg !195 + %4031 = select i1 %2868, bfloat %4030, bfloat 0xR0000, !dbg !196 + %4032 = fptrunc float %3987 to bfloat, !dbg !195 + %4033 = select i1 %2870, bfloat %4032, bfloat 0xR0000, !dbg !196 + %4034 = fptrunc float %3988 to bfloat, !dbg !195 + %4035 = select i1 %2868, bfloat %4034, bfloat 0xR0000, !dbg !196 + %4036 = fptrunc float %3989 to bfloat, !dbg !195 + %4037 = select i1 %2870, bfloat %4036, bfloat 0xR0000, !dbg !196 + %4038 = fptrunc float %3990 to bfloat, !dbg !195 + %4039 = select i1 %2872, bfloat %4038, bfloat 0xR0000, !dbg !196 + %4040 = fptrunc float %3991 to bfloat, !dbg !195 + %4041 = select i1 %2874, bfloat %4040, bfloat 0xR0000, !dbg !196 + %4042 = fptrunc float %3992 to bfloat, !dbg !195 + %4043 = select i1 %2872, bfloat %4042, bfloat 0xR0000, !dbg !196 + %4044 = fptrunc float %3993 to bfloat, !dbg !195 + %4045 = select i1 %2874, bfloat %4044, bfloat 0xR0000, !dbg !196 + %4046 = fptrunc float %3994 to bfloat, !dbg !195 + %4047 = select i1 %2876, bfloat %4046, bfloat 0xR0000, !dbg !196 + %4048 = fptrunc float %3995 to bfloat, !dbg !195 + %4049 = select i1 %2878, bfloat %4048, bfloat 0xR0000, !dbg !196 + %4050 = fptrunc float %3996 to bfloat, !dbg !195 + %4051 = select i1 %2876, bfloat %4050, bfloat 0xR0000, !dbg !196 + %4052 = fptrunc float %3997 to bfloat, !dbg !195 + %4053 = select i1 %2878, bfloat %4052, bfloat 0xR0000, !dbg !196 + %4054 = fptrunc float %3998 to bfloat, !dbg !195 + %4055 = select i1 %2880, bfloat %4054, bfloat 0xR0000, !dbg !196 + %4056 = fptrunc float %3999 to bfloat, !dbg !195 + %4057 = select i1 %2882, bfloat %4056, bfloat 0xR0000, !dbg !196 + %4058 = fptrunc float %4000 to bfloat, !dbg !195 + %4059 = select i1 %2880, bfloat %4058, bfloat 0xR0000, !dbg !196 + %4060 = fptrunc float %4001 to bfloat, !dbg !195 + %4061 = select i1 %2882, bfloat %4060, bfloat 0xR0000, !dbg !196 + 
%4062 = fptrunc float %4002 to bfloat, !dbg !195 + %4063 = select i1 %2884, bfloat %4062, bfloat 0xR0000, !dbg !196 + %4064 = fptrunc float %4003 to bfloat, !dbg !195 + %4065 = select i1 %2886, bfloat %4064, bfloat 0xR0000, !dbg !196 + %4066 = fptrunc float %4004 to bfloat, !dbg !195 + %4067 = select i1 %2884, bfloat %4066, bfloat 0xR0000, !dbg !196 + %4068 = fptrunc float %4005 to bfloat, !dbg !195 + %4069 = select i1 %2886, bfloat %4068, bfloat 0xR0000, !dbg !196 + %4070 = insertelement <2 x bfloat> poison, bfloat %4007, i64 0, !dbg !197 + %4071 = insertelement <2 x bfloat> %4070, bfloat %4009, i64 1, !dbg !197 + %4072 = bitcast <2 x bfloat> %4071 to i32, !dbg !197 + %4073 = insertelement <2 x bfloat> poison, bfloat %4011, i64 0, !dbg !197 + %4074 = insertelement <2 x bfloat> %4073, bfloat %4013, i64 1, !dbg !197 + %4075 = bitcast <2 x bfloat> %4074 to i32, !dbg !197 + %4076 = insertelement <2 x bfloat> poison, bfloat %4015, i64 0, !dbg !197 + %4077 = insertelement <2 x bfloat> %4076, bfloat %4017, i64 1, !dbg !197 + %4078 = bitcast <2 x bfloat> %4077 to i32, !dbg !197 + %4079 = insertelement <2 x bfloat> poison, bfloat %4019, i64 0, !dbg !197 + %4080 = insertelement <2 x bfloat> %4079, bfloat %4021, i64 1, !dbg !197 + %4081 = bitcast <2 x bfloat> %4080 to i32, !dbg !197 + %4082 = insertelement <2 x bfloat> poison, bfloat %4023, i64 0, !dbg !197 + %4083 = insertelement <2 x bfloat> %4082, bfloat %4025, i64 1, !dbg !197 + %4084 = bitcast <2 x bfloat> %4083 to i32, !dbg !197 + %4085 = insertelement <2 x bfloat> poison, bfloat %4027, i64 0, !dbg !197 + %4086 = insertelement <2 x bfloat> %4085, bfloat %4029, i64 1, !dbg !197 + %4087 = bitcast <2 x bfloat> %4086 to i32, !dbg !197 + %4088 = insertelement <2 x bfloat> poison, bfloat %4031, i64 0, !dbg !197 + %4089 = insertelement <2 x bfloat> %4088, bfloat %4033, i64 1, !dbg !197 + %4090 = bitcast <2 x bfloat> %4089 to i32, !dbg !197 + %4091 = insertelement <2 x bfloat> poison, bfloat %4035, i64 0, !dbg !197 + %4092 = 
insertelement <2 x bfloat> %4091, bfloat %4037, i64 1, !dbg !197 + %4093 = bitcast <2 x bfloat> %4092 to i32, !dbg !197 + %4094 = insertelement <2 x bfloat> poison, bfloat %4039, i64 0, !dbg !197 + %4095 = insertelement <2 x bfloat> %4094, bfloat %4041, i64 1, !dbg !197 + %4096 = bitcast <2 x bfloat> %4095 to i32, !dbg !197 + %4097 = insertelement <2 x bfloat> poison, bfloat %4043, i64 0, !dbg !197 + %4098 = insertelement <2 x bfloat> %4097, bfloat %4045, i64 1, !dbg !197 + %4099 = bitcast <2 x bfloat> %4098 to i32, !dbg !197 + %4100 = insertelement <2 x bfloat> poison, bfloat %4047, i64 0, !dbg !197 + %4101 = insertelement <2 x bfloat> %4100, bfloat %4049, i64 1, !dbg !197 + %4102 = bitcast <2 x bfloat> %4101 to i32, !dbg !197 + %4103 = insertelement <2 x bfloat> poison, bfloat %4051, i64 0, !dbg !197 + %4104 = insertelement <2 x bfloat> %4103, bfloat %4053, i64 1, !dbg !197 + %4105 = bitcast <2 x bfloat> %4104 to i32, !dbg !197 + %4106 = insertelement <2 x bfloat> poison, bfloat %4055, i64 0, !dbg !197 + %4107 = insertelement <2 x bfloat> %4106, bfloat %4057, i64 1, !dbg !197 + %4108 = bitcast <2 x bfloat> %4107 to i32, !dbg !197 + %4109 = insertelement <2 x bfloat> poison, bfloat %4059, i64 0, !dbg !197 + %4110 = insertelement <2 x bfloat> %4109, bfloat %4061, i64 1, !dbg !197 + %4111 = bitcast <2 x bfloat> %4110 to i32, !dbg !197 + %4112 = insertelement <2 x bfloat> poison, bfloat %4063, i64 0, !dbg !197 + %4113 = insertelement <2 x bfloat> %4112, bfloat %4065, i64 1, !dbg !197 + %4114 = bitcast <2 x bfloat> %4113 to i32, !dbg !197 + %4115 = insertelement <2 x bfloat> poison, bfloat %4067, i64 0, !dbg !197 + %4116 = insertelement <2 x bfloat> %4115, bfloat %4069, i64 1, !dbg !197 + %4117 = bitcast <2 x bfloat> %4116 to i32, !dbg !197 + tail call void @llvm.nvvm.wgmma.fence.sync.aligned(), !dbg !197 + %4118 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %.pn, float %.pn2450, float %.pn2451, float %.pn2452, float %.pn2453, float %.pn2454, float %.pn2455, float %.pn2456, float %.pn2457, float %.pn2458, float %.pn2459, float %.pn2460, float %.pn2461, float %.pn2462, float %.pn2463, float %.pn2464, float %.pn2465, float %.pn2466, float %.pn2467, float %.pn2468, float %.pn2469, float %.pn2470, float %.pn2471, float %.pn2472, float %.pn2473, float %.pn2474, float %.pn2475, float %.pn2476, float %.pn2477, float %.pn2478, float %.pn2479, float %.pn2480, float %.pn2481, float %.pn2482, float %.pn2483, float %.pn2484, float %.pn2485, float %.pn2486, float %.pn2487, float %.pn2488, float %.pn2489, float %.pn2490, float %.pn2491, float %.pn2492, float %.pn2493, float %.pn2494, float %.pn2495, float %.pn2496, float %.pn2497, float %.pn2498, float %.pn2499, float %.pn2500, float %.pn2501, float %.pn2502, float %.pn2503, float %.pn2504, float %.pn2505, float %.pn2506, float 
%.pn2507, float %.pn2508, float %.pn2509, float %.pn2510, float %.pn2511, float %.pn2512, i32 %4072, i32 %4075, i32 %4078, i32 %4081, i64 %2901, i1 true) #3, !dbg !197 + %4119 = add i32 %2897, 2048, !dbg !197 + %4120 = lshr exact i32 %4119, 4, !dbg !197 + %4121 = and i32 %4120, 16383, !dbg !197 + %4122 = zext nneg i32 %4121 to i64, !dbg !197 + %4123 = or disjoint i64 %4122, 4611686293338849280, !dbg !197 + %4124 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 0, !dbg !197 + %4125 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 1, !dbg !197 + %4126 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 2, !dbg !197 + %4127 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 3, !dbg !197 + %4128 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 4, !dbg !197 + %4129 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 5, !dbg !197 + %4130 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 6, !dbg !197 + %4131 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 7, !dbg !197 + %4132 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 8, !dbg !197 + %4133 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 9, !dbg !197 + %4134 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 10, !dbg !197 + %4135 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 11, !dbg !197 + %4136 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 12, !dbg !197 + %4137 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 13, !dbg !197 + %4138 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 14, !dbg !197 + %4139 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 15, !dbg !197 + %4140 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 16, !dbg !197 + %4141 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 17, !dbg !197 + %4142 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 18, !dbg !197 + %4143 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 19, !dbg !197 + %4144 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 20, !dbg !197 + %4145 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 21, !dbg !197 + %4146 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 22, !dbg !197 + %4147 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 23, !dbg !197 + %4148 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 24, !dbg !197 + %4149 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 25, !dbg !197 + %4150 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 26, !dbg !197 + %4151 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 27, !dbg !197 + %4152 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 28, !dbg !197 + %4153 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 29, !dbg !197 + %4154 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 30, !dbg !197 + %4155 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 31, !dbg !197 + %4156 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 32, !dbg !197 + %4157 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 33, !dbg !197 + %4158 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 34, !dbg !197 + %4159 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 35, !dbg !197 + %4160 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 36, !dbg !197 + %4161 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 37, !dbg !197 + %4162 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 38, !dbg !197 + %4163 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 39, !dbg !197 + %4164 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 40, !dbg !197 + %4165 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 41, !dbg !197 + %4166 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 42, !dbg !197 + %4167 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 43, !dbg !197 + %4168 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 44, !dbg !197 + %4169 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 45, !dbg !197 + %4170 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 46, !dbg !197 + %4171 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 47, !dbg !197 + %4172 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 48, !dbg !197 + %4173 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 49, !dbg !197 + %4174 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 50, !dbg !197 + %4175 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 51, !dbg !197 + %4176 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 52, !dbg !197 + %4177 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 53, !dbg !197 + %4178 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 54, !dbg !197 + %4179 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 55, !dbg !197 + %4180 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 56, !dbg !197 + %4181 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 57, !dbg !197 + %4182 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 58, !dbg !197 + %4183 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 59, !dbg !197 + %4184 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 60, !dbg !197 + %4185 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 61, !dbg !197 + %4186 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 62, !dbg !197 + %4187 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 63, !dbg !197 + %4188 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %4124, float %4125, float %4126, float %4127, float %4128, float %4129, float %4130, float %4131, float %4132, float %4133, float %4134, float %4135, float %4136, float %4137, float %4138, float %4139, float %4140, float %4141, float %4142, float %4143, float %4144, float %4145, float %4146, float %4147, float %4148, float %4149, float %4150, float %4151, float %4152, float %4153, float %4154, float %4155, float %4156, float %4157, float %4158, float %4159, float %4160, float %4161, float %4162, float %4163, float %4164, float %4165, float %4166, float %4167, float %4168, float %4169, float %4170, 
float %4171, float %4172, float %4173, float %4174, float %4175, float %4176, float %4177, float %4178, float %4179, float %4180, float %4181, float %4182, float %4183, float %4184, float %4185, float %4186, float %4187, i32 %4084, i32 %4087, i32 %4090, i32 %4093, i64 %4123, i1 true) #3, !dbg !197 + %4189 = add i32 %2897, 4096, !dbg !197 + %4190 = lshr exact i32 %4189, 4, !dbg !197 + %4191 = and i32 %4190, 16383, !dbg !197 + %4192 = zext nneg i32 %4191 to i64, !dbg !197 + %4193 = or disjoint i64 %4192, 4611686293338849280, !dbg !197 + %4194 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 0, !dbg !197 + %4195 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 1, !dbg !197 + %4196 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float 
} %4188, 2, !dbg !197 + %4197 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 3, !dbg !197 + %4198 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 4, !dbg !197 + %4199 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 5, !dbg !197 + %4200 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 6, !dbg !197 + 
%4201 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 7, !dbg !197 + %4202 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 8, !dbg !197 + %4203 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 9, !dbg !197 + %4204 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 10, !dbg !197 + %4205 = extractvalue { 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 11, !dbg !197 + %4206 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 12, !dbg !197 + %4207 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 13, !dbg !197 + %4208 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 14, !dbg !197 + %4209 = extractvalue { float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 15, !dbg !197 + %4210 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 16, !dbg !197 + %4211 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 17, !dbg !197 + %4212 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 18, !dbg !197 + %4213 = extractvalue { float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 19, !dbg !197 + %4214 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 20, !dbg !197 + %4215 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 21, !dbg !197 + %4216 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 22, !dbg !197 + %4217 = extractvalue { float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 23, !dbg !197 + %4218 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 24, !dbg !197 + %4219 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 25, !dbg !197 + %4220 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 26, !dbg !197 + %4221 = extractvalue { float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 27, !dbg !197 + %4222 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 28, !dbg !197 + %4223 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 29, !dbg !197 + %4224 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 30, !dbg !197 + %4225 = extractvalue { float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 31, !dbg !197 + %4226 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 32, !dbg !197 + %4227 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 33, !dbg !197 + %4228 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 34, !dbg !197 + %4229 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 35, !dbg !197 + %4230 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 36, !dbg !197 + %4231 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 37, !dbg !197 + %4232 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 38, !dbg !197 + %4233 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 39, !dbg !197 + %4234 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 40, !dbg !197 + %4235 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 41, !dbg !197 + %4236 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 42, !dbg !197 + %4237 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 43, !dbg !197 + %4238 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 44, !dbg !197 + %4239 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 45, !dbg !197 + %4240 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 46, !dbg !197 + %4241 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 47, !dbg !197 + %4242 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 48, !dbg !197 + %4243 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 49, !dbg !197 + %4244 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 50, !dbg !197 + %4245 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 51, !dbg !197 + %4246 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 52, !dbg !197 + %4247 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 53, !dbg !197 + %4248 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 54, !dbg !197 + %4249 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 55, !dbg !197 + %4250 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 56, !dbg !197 + %4251 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 57, !dbg !197 + %4252 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 58, !dbg !197 + %4253 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 59, !dbg !197 + %4254 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 60, !dbg !197 + %4255 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 61, !dbg !197 + %4256 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 62, !dbg !197 + %4257 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 63, !dbg !197 + %4258 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %4194, float %4195, float %4196, float %4197, float %4198, float %4199, float %4200, float %4201, float %4202, float %4203, float %4204, float %4205, float %4206, float %4207, float %4208, float %4209, float %4210, float %4211, float %4212, float %4213, float %4214, float %4215, float %4216, float %4217, float %4218, float %4219, float %4220, float %4221, float %4222, float %4223, float %4224, float %4225, float %4226, float %4227, float %4228, float %4229, float %4230, float 
%4231, float %4232, float %4233, float %4234, float %4235, float %4236, float %4237, float %4238, float %4239, float %4240, float %4241, float %4242, float %4243, float %4244, float %4245, float %4246, float %4247, float %4248, float %4249, float %4250, float %4251, float %4252, float %4253, float %4254, float %4255, float %4256, float %4257, i32 %4096, i32 %4099, i32 %4102, i32 %4105, i64 %4193, i1 true) #3, !dbg !197 + %4259 = add i32 %2897, 6144, !dbg !197 + %4260 = lshr exact i32 %4259, 4, !dbg !197 + %4261 = and i32 %4260, 16383, !dbg !197 + %4262 = zext nneg i32 %4261 to i64, !dbg !197 + %4263 = or disjoint i64 %4262, 4611686293338849280, !dbg !197 + %4264 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 0, !dbg !197 + %4265 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 1, !dbg !197 + %4266 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 2, !dbg !197 + %4267 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 3, !dbg !197 + %4268 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 4, !dbg !197 + %4269 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 5, !dbg !197 + %4270 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 6, !dbg !197 + %4271 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 7, !dbg !197 + %4272 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 8, !dbg !197 + %4273 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 9, !dbg !197 + %4274 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float } %4258, 10, !dbg !197 + %4275 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 11, !dbg !197 + %4276 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 12, !dbg !197 + %4277 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 13, !dbg !197 + %4278 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float } %4258, 14, !dbg !197 + %4279 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 15, !dbg !197 + %4280 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 16, !dbg !197 + %4281 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 17, !dbg !197 + %4282 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float } %4258, 18, !dbg !197 + %4283 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 19, !dbg !197 + %4284 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 20, !dbg !197 + %4285 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 21, !dbg !197 + %4286 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float } %4258, 22, !dbg !197 + %4287 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 23, !dbg !197 + %4288 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 24, !dbg !197 + %4289 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 25, !dbg !197 + %4290 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float } %4258, 26, !dbg !197 + %4291 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 27, !dbg !197 + %4292 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 28, !dbg !197 + %4293 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 29, !dbg !197 + %4294 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float } %4258, 30, !dbg !197 + %4295 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 31, !dbg !197 + %4296 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 32, !dbg !197 + %4297 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 33, !dbg !197 + %4298 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } 
%4258, 34, !dbg !197 + %4299 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 35, !dbg !197 + %4300 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 36, !dbg !197 + %4301 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 37, !dbg !197 + %4302 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 38, !dbg 
!197 + %4303 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 39, !dbg !197 + %4304 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 40, !dbg !197 + %4305 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 41, !dbg !197 + %4306 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 42, !dbg !197 + %4307 = 
extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 43, !dbg !197 + %4308 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 44, !dbg !197 + %4309 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 45, !dbg !197 + %4310 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 46, !dbg !197 + %4311 = extractvalue { 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 47, !dbg !197 + %4312 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 48, !dbg !197 + %4313 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 49, !dbg !197 + %4314 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 50, !dbg !197 + %4315 = extractvalue { float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 51, !dbg !197 + %4316 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 52, !dbg !197 + %4317 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 53, !dbg !197 + %4318 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 54, !dbg !197 + %4319 = extractvalue { float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 55, !dbg !197 + %4320 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 56, !dbg !197 + %4321 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 57, !dbg !197 + %4322 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 58, !dbg !197 + %4323 = extractvalue { float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 59, !dbg !197 + %4324 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 60, !dbg !197 + %4325 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 61, !dbg !197 + %4326 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 62, !dbg !197 + %4327 = extractvalue { float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 63, !dbg !197 + %4328 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %4264, float %4265, float %4266, float %4267, float %4268, float %4269, float %4270, float %4271, float %4272, float %4273, float %4274, float %4275, float %4276, float %4277, float %4278, float %4279, float %4280, float %4281, float %4282, float %4283, float %4284, float %4285, float %4286, float %4287, float %4288, float %4289, float %4290, float 
%4291, float %4292, float %4293, float %4294, float %4295, float %4296, float %4297, float %4298, float %4299, float %4300, float %4301, float %4302, float %4303, float %4304, float %4305, float %4306, float %4307, float %4308, float %4309, float %4310, float %4311, float %4312, float %4313, float %4314, float %4315, float %4316, float %4317, float %4318, float %4319, float %4320, float %4321, float %4322, float %4323, float %4324, float %4325, float %4326, float %4327, i32 %4108, i32 %4111, i32 %4114, i32 %4117, i64 %4263, i1 true) #3, !dbg !197 + %4329 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 0, !dbg !197 + %4330 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 1, !dbg !197 + %4331 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float } %4328, 2, !dbg !197 + %4332 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 3, !dbg !197 + %4333 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 4, !dbg !197 + %4334 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 5, !dbg !197 + %4335 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 
6, !dbg !197 + %4336 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 7, !dbg !197 + %4337 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 8, !dbg !197 + %4338 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 9, !dbg !197 + %4339 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 10, !dbg !197 + %4340 = 
extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 11, !dbg !197 + %4341 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 12, !dbg !197 + %4342 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 13, !dbg !197 + %4343 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 14, !dbg !197 + %4344 = extractvalue { 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 15, !dbg !197 + %4345 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 16, !dbg !197 + %4346 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 17, !dbg !197 + %4347 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 18, !dbg !197 + %4348 = extractvalue { float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 19, !dbg !197 + %4349 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 20, !dbg !197 + %4350 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 21, !dbg !197 + %4351 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 22, !dbg !197 + %4352 = extractvalue { float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 23, !dbg !197 + %4353 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 24, !dbg !197 + %4354 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 25, !dbg !197 + %4355 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 26, !dbg !197 + %4356 = extractvalue { float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 27, !dbg !197 + %4357 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 28, !dbg !197 + %4358 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 29, !dbg !197 + %4359 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 30, !dbg !197 + %4360 = extractvalue { float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 31, !dbg !197 + %4361 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 32, !dbg !197 + %4362 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 33, !dbg !197 + %4363 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 34, !dbg !197 + %4364 = extractvalue { float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 35, !dbg !197 + %4365 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 36, !dbg !197 + %4366 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 37, !dbg !197 + %4367 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 38, !dbg !197 + %4368 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 39, !dbg !197 + %4369 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 40, !dbg !197 + %4370 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 41, !dbg !197 + %4371 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 42, !dbg !197 + %4372 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 43, !dbg !197 + %4373 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 44, !dbg !197 + %4374 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 45, !dbg !197 + %4375 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 46, !dbg !197 + %4376 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 47, !dbg !197 + %4377 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 48, !dbg !197 + %4378 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 49, !dbg !197 + %4379 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 50, !dbg !197 + %4380 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 51, !dbg !197 + %4381 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 52, !dbg !197 + %4382 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 53, !dbg !197 + %4383 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 54, !dbg !197 + %4384 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 55, !dbg !197 + %4385 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 56, !dbg !197 + %4386 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 57, !dbg !197 + %4387 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 58, !dbg !197 + %4388 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 59, !dbg !197 + %4389 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 60, !dbg !197 + %4390 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 61, !dbg !197 + %4391 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 62, !dbg !197 + %4392 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 63, !dbg !197 + tail call void @llvm.nvvm.wgmma.commit_group.sync.aligned(), !dbg !197 + %4393 = insertelement <16 x i32> poison, i32 %2845, i64 0, !dbg !185 + %4394 = shufflevector <16 x i32> %4393, <16 x i32> poison, <16 x i32> zeroinitializer, !dbg !185 + %4395 = add <16 x i32> %4394, %2849, !dbg !185 + %4396 = add nuw nsw i32 %2848, 1, !dbg !180 + %4397 = lshr i32 %4396, 1, !dbg !198 + %4398 = zext nneg i32 %4397 to i64, !dbg !199 + %4399 = getelementptr i32, ptr addrspace(1) %2631, i64 %4398, !dbg !199 + %4400 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #3, !dbg !200 + %4401 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %4399, i64 %4400, i1 %2851) #3, !dbg !200 + %4402 = add nuw nsw i32 %4397, 1, !dbg !201 + %4403 = icmp slt i32 %4402, %2635, !dbg !202 + %4404 = getelementptr i8, ptr addrspace(1) %4399, i64 4, !dbg !203 + %4405 = and i1 %2851, %4403, !dbg !180 + %4406 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #3, !dbg !204 + %4407 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %4404, i64 %4406, i1 %4405) #3, !dbg !204 + %4408 = and i32 %2848, 1, !dbg !205 + %4409 = sub i32 %4407, %4401, !dbg !206 + %4410 = shl i32 %4409, 7, !dbg !207 + %4411 = add i32 %4410, -64, !dbg !208 + %4412 = xor i32 %4408, 1, !dbg !209 + %4413 = mul nuw nsw i32 %4411, %4412, !dbg !209 + %4414 = shl nuw nsw i32 %4408, 6, !dbg !210 + %4415 = add i32 
%4413, %4414, !dbg !211 + %4416 = shl i32 %4415, 7, !dbg !212 + %4417 = sext i32 %4416 to i64, !dbg !183 + %4418 = getelementptr bfloat, ptr addrspace(1) %.pn10421563, i64 %4417, !dbg !183 + %4419 = getelementptr bfloat, ptr addrspace(1) %.pn10261564, i64 %4417, !dbg !183 + %4420 = getelementptr bfloat, ptr addrspace(1) %.pn10101565, i64 %4417, !dbg !183 + %4421 = getelementptr bfloat, ptr addrspace(1) %.pn9941566, i64 %4417, !dbg !183 + %4422 = getelementptr bfloat, ptr addrspace(1) %.pn11141571, i64 %4417, !dbg !184 + %4423 = getelementptr bfloat, ptr addrspace(1) %.pn10981572, i64 %4417, !dbg !184 + %4424 = getelementptr bfloat, ptr addrspace(1) %.pn10821573, i64 %4417, !dbg !184 + %4425 = getelementptr bfloat, ptr addrspace(1) %.pn10661574, i64 %4417, !dbg !184 + %4426 = add i32 %4415, %.pn10501567, !dbg !185 + %4427 = add i32 %4415, %.pn10481568, !dbg !185 + %4428 = add i32 %4415, %.pn10461569, !dbg !185 + %4429 = add i32 %4415, %.pn10441570, !dbg !185 + %4430 = add i32 %2847, 1, !dbg !180 + %4431 = icmp sgt i32 %4430, 2, !dbg !180 + %4432 = select i1 %4431, i32 0, i32 %4430, !dbg !180 + %4433 = icmp slt i32 %4426, %19, !dbg !181 + %4434 = icmp slt i32 %4427, %19, !dbg !181 + %4435 = icmp slt i32 %4428, %19, !dbg !181 + %4436 = icmp slt i32 %4429, %19, !dbg !181 + %4437 = shl i32 %4432, 13, !dbg !182 + %4438 = getelementptr bfloat, ptr addrspace(3) @global_smem, i32 %4437, !dbg !182 + %4439 = and i1 %2850, %4433, !dbg !180 + %4440 = and i1 %2850, %4434, !dbg !180 + %4441 = and i1 %2850, %4435, !dbg !180 + %4442 = and i1 %2850, %4436, !dbg !180 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !182 + %4443 = getelementptr inbounds nuw i8, ptr addrspace(3) %4438, i32 %448, !dbg !182 + %4444 = select i1 %4439, i32 16, i32 0, !dbg !182 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %4443, ptr addrspace(1) %4418, i32 %4444) #3, !dbg !182 + %4445 = getelementptr inbounds nuw 
i8, ptr addrspace(3) %4438, i32 %451, !dbg !182 + %4446 = select i1 %4440, i32 16, i32 0, !dbg !182 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %4445, ptr addrspace(1) %4419, i32 %4446) #3, !dbg !182 + %4447 = getelementptr inbounds nuw i8, ptr addrspace(3) %4438, i32 %454, !dbg !182 + %4448 = select i1 %4441, i32 16, i32 0, !dbg !182 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %4447, ptr addrspace(1) %4420, i32 %4448) #3, !dbg !182 + %4449 = getelementptr inbounds nuw i8, ptr addrspace(3) %4438, i32 %457, !dbg !182 + %4450 = select i1 %4442, i32 16, i32 0, !dbg !182 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %4449, ptr addrspace(1) %4421, i32 %4450) #3, !dbg !182 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !182 + %4451 = getelementptr bfloat, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %4437, !dbg !182 + %4452 = getelementptr inbounds nuw i8, ptr addrspace(3) %4451, i32 %448, !dbg !182 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %4452, ptr addrspace(1) %4422, i32 %4444) #3, !dbg !182 + %4453 = getelementptr inbounds nuw i8, ptr addrspace(3) %4451, i32 %451, !dbg !182 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %4453, ptr addrspace(1) %4423, i32 %4446) #3, !dbg !182 + %4454 = getelementptr inbounds nuw i8, ptr addrspace(3) %4451, i32 %454, !dbg !182 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %4454, ptr addrspace(1) %4424, i32 %4448) #3, !dbg !182 + %4455 = getelementptr inbounds nuw i8, ptr addrspace(3) %4451, i32 %457, !dbg 
!182 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %4455, ptr addrspace(1) %4425, i32 %4450) #3, !dbg !182 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !182 + %exitcond2184.not = icmp eq i32 %4396, %smax2183, !dbg !180 + br i1 %exitcond2184.not, label %._crit_edge1593, label %2844, !dbg !180 + +._crit_edge1593: ; preds = %__nv_exp2f.exit1431, %._crit_edge + %4456 = phi float [ %2704, %._crit_edge ], [ %4329, %__nv_exp2f.exit1431 ], !dbg !99 + %4457 = phi float [ %2705, %._crit_edge ], [ %4330, %__nv_exp2f.exit1431 ], !dbg !99 + %4458 = phi float [ %2706, %._crit_edge ], [ %4331, %__nv_exp2f.exit1431 ], !dbg !99 + %4459 = phi float [ %2707, %._crit_edge ], [ %4332, %__nv_exp2f.exit1431 ], !dbg !99 + %4460 = phi float [ %2708, %._crit_edge ], [ %4333, %__nv_exp2f.exit1431 ], !dbg !99 + %4461 = phi float [ %2709, %._crit_edge ], [ %4334, %__nv_exp2f.exit1431 ], !dbg !99 + %4462 = phi float [ %2710, %._crit_edge ], [ %4335, %__nv_exp2f.exit1431 ], !dbg !99 + %4463 = phi float [ %2711, %._crit_edge ], [ %4336, %__nv_exp2f.exit1431 ], !dbg !99 + %4464 = phi float [ %2712, %._crit_edge ], [ %4337, %__nv_exp2f.exit1431 ], !dbg !99 + %4465 = phi float [ %2713, %._crit_edge ], [ %4338, %__nv_exp2f.exit1431 ], !dbg !99 + %4466 = phi float [ %2714, %._crit_edge ], [ %4339, %__nv_exp2f.exit1431 ], !dbg !99 + %4467 = phi float [ %2715, %._crit_edge ], [ %4340, %__nv_exp2f.exit1431 ], !dbg !99 + %4468 = phi float [ %2716, %._crit_edge ], [ %4341, %__nv_exp2f.exit1431 ], !dbg !99 + %4469 = phi float [ %2717, %._crit_edge ], [ %4342, %__nv_exp2f.exit1431 ], !dbg !99 + %4470 = phi float [ %2718, %._crit_edge ], [ %4343, %__nv_exp2f.exit1431 ], !dbg !99 + %4471 = phi float [ %2719, %._crit_edge ], [ %4344, %__nv_exp2f.exit1431 ], !dbg !99 + %4472 = phi float [ %2720, %._crit_edge ], [ %4345, %__nv_exp2f.exit1431 ], !dbg !99 + %4473 = phi float [ %2721, %._crit_edge ], [ %4346, 
%__nv_exp2f.exit1431 ], !dbg !99 + %4474 = phi float [ %2722, %._crit_edge ], [ %4347, %__nv_exp2f.exit1431 ], !dbg !99 + %4475 = phi float [ %2723, %._crit_edge ], [ %4348, %__nv_exp2f.exit1431 ], !dbg !99 + %4476 = phi float [ %2724, %._crit_edge ], [ %4349, %__nv_exp2f.exit1431 ], !dbg !99 + %4477 = phi float [ %2725, %._crit_edge ], [ %4350, %__nv_exp2f.exit1431 ], !dbg !99 + %4478 = phi float [ %2726, %._crit_edge ], [ %4351, %__nv_exp2f.exit1431 ], !dbg !99 + %4479 = phi float [ %2727, %._crit_edge ], [ %4352, %__nv_exp2f.exit1431 ], !dbg !99 + %4480 = phi float [ %2728, %._crit_edge ], [ %4353, %__nv_exp2f.exit1431 ], !dbg !99 + %4481 = phi float [ %2729, %._crit_edge ], [ %4354, %__nv_exp2f.exit1431 ], !dbg !99 + %4482 = phi float [ %2730, %._crit_edge ], [ %4355, %__nv_exp2f.exit1431 ], !dbg !99 + %4483 = phi float [ %2731, %._crit_edge ], [ %4356, %__nv_exp2f.exit1431 ], !dbg !99 + %4484 = phi float [ %2732, %._crit_edge ], [ %4357, %__nv_exp2f.exit1431 ], !dbg !99 + %4485 = phi float [ %2733, %._crit_edge ], [ %4358, %__nv_exp2f.exit1431 ], !dbg !99 + %4486 = phi float [ %2734, %._crit_edge ], [ %4359, %__nv_exp2f.exit1431 ], !dbg !99 + %4487 = phi float [ %2735, %._crit_edge ], [ %4360, %__nv_exp2f.exit1431 ], !dbg !99 + %4488 = phi float [ %2736, %._crit_edge ], [ %4361, %__nv_exp2f.exit1431 ], !dbg !99 + %4489 = phi float [ %2737, %._crit_edge ], [ %4362, %__nv_exp2f.exit1431 ], !dbg !99 + %4490 = phi float [ %2738, %._crit_edge ], [ %4363, %__nv_exp2f.exit1431 ], !dbg !99 + %4491 = phi float [ %2739, %._crit_edge ], [ %4364, %__nv_exp2f.exit1431 ], !dbg !99 + %4492 = phi float [ %2740, %._crit_edge ], [ %4365, %__nv_exp2f.exit1431 ], !dbg !99 + %4493 = phi float [ %2741, %._crit_edge ], [ %4366, %__nv_exp2f.exit1431 ], !dbg !99 + %4494 = phi float [ %2742, %._crit_edge ], [ %4367, %__nv_exp2f.exit1431 ], !dbg !99 + %4495 = phi float [ %2743, %._crit_edge ], [ %4368, %__nv_exp2f.exit1431 ], !dbg !99 + %4496 = phi float [ %2744, %._crit_edge ], [ 
%4369, %__nv_exp2f.exit1431 ], !dbg !99 + %4497 = phi float [ %2745, %._crit_edge ], [ %4370, %__nv_exp2f.exit1431 ], !dbg !99 + %4498 = phi float [ %2746, %._crit_edge ], [ %4371, %__nv_exp2f.exit1431 ], !dbg !99 + %4499 = phi float [ %2747, %._crit_edge ], [ %4372, %__nv_exp2f.exit1431 ], !dbg !99 + %4500 = phi float [ %2748, %._crit_edge ], [ %4373, %__nv_exp2f.exit1431 ], !dbg !99 + %4501 = phi float [ %2749, %._crit_edge ], [ %4374, %__nv_exp2f.exit1431 ], !dbg !99 + %4502 = phi float [ %2750, %._crit_edge ], [ %4375, %__nv_exp2f.exit1431 ], !dbg !99 + %4503 = phi float [ %2751, %._crit_edge ], [ %4376, %__nv_exp2f.exit1431 ], !dbg !99 + %4504 = phi float [ %2752, %._crit_edge ], [ %4377, %__nv_exp2f.exit1431 ], !dbg !99 + %4505 = phi float [ %2753, %._crit_edge ], [ %4378, %__nv_exp2f.exit1431 ], !dbg !99 + %4506 = phi float [ %2754, %._crit_edge ], [ %4379, %__nv_exp2f.exit1431 ], !dbg !99 + %4507 = phi float [ %2755, %._crit_edge ], [ %4380, %__nv_exp2f.exit1431 ], !dbg !99 + %4508 = phi float [ %2756, %._crit_edge ], [ %4381, %__nv_exp2f.exit1431 ], !dbg !99 + %4509 = phi float [ %2757, %._crit_edge ], [ %4382, %__nv_exp2f.exit1431 ], !dbg !99 + %4510 = phi float [ %2758, %._crit_edge ], [ %4383, %__nv_exp2f.exit1431 ], !dbg !99 + %4511 = phi float [ %2759, %._crit_edge ], [ %4384, %__nv_exp2f.exit1431 ], !dbg !99 + %4512 = phi float [ %2760, %._crit_edge ], [ %4385, %__nv_exp2f.exit1431 ], !dbg !99 + %4513 = phi float [ %2761, %._crit_edge ], [ %4386, %__nv_exp2f.exit1431 ], !dbg !99 + %4514 = phi float [ %2762, %._crit_edge ], [ %4387, %__nv_exp2f.exit1431 ], !dbg !99 + %4515 = phi float [ %2763, %._crit_edge ], [ %4388, %__nv_exp2f.exit1431 ], !dbg !99 + %4516 = phi float [ %2764, %._crit_edge ], [ %4389, %__nv_exp2f.exit1431 ], !dbg !99 + %4517 = phi float [ %2765, %._crit_edge ], [ %4390, %__nv_exp2f.exit1431 ], !dbg !99 + %4518 = phi float [ %2766, %._crit_edge ], [ %4391, %__nv_exp2f.exit1431 ], !dbg !99 + %4519 = phi float [ %2767, %._crit_edge ], 
[ %4392, %__nv_exp2f.exit1431 ], !dbg !99 + %4520 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "// wait for regs: $0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63\0A\09wgmma.wait_group.sync.aligned 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63"(float %4456, float %4457, float %4458, float %4459, float %4460, float %4461, float %4462, float %4463, float %4464, float %4465, float %4466, float %4467, float %4468, float %4469, float %4470, float %4471, float %4472, float %4473, float %4474, float %4475, float %4476, float %4477, float %4478, float %4479, float %4480, float %4481, float %4482, float %4483, float %4484, float %4485, float %4486, float %4487, float %4488, float %4489, float %4490, float %4491, float %4492, float %4493, float %4494, float %4495, float %4496, float %4497, float %4498, float %4499, float %4500, float %4501, float %4502, float %4503, float %4504, float %4505, float %4506, float %4507, float %4508, float %4509, float %4510, float %4511, float %4512, float %4513, float %4514, float %4515, float 
%4516, float %4517, float %4518, float %4519) #3, !dbg !180 + tail call void @llvm.nvvm.cp.async.wait.group(i32 0), !dbg !180 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !180 + %4521 = getelementptr bfloat, ptr addrspace(1) %98, i64 %120, !dbg !213 + %4522 = getelementptr bfloat, ptr addrspace(1) %98, i64 %122, !dbg !213 + %4523 = getelementptr bfloat, ptr addrspace(1) %98, i64 %124, !dbg !213 + %4524 = getelementptr bfloat, ptr addrspace(1) %98, i64 %126, !dbg !213 + %4525 = getelementptr bfloat, ptr addrspace(1) %98, i64 %128, !dbg !213 + %4526 = getelementptr bfloat, ptr addrspace(1) %98, i64 %130, !dbg !213 + %4527 = getelementptr bfloat, ptr addrspace(1) %98, i64 %132, !dbg !213 + %4528 = getelementptr bfloat, ptr addrspace(1) %98, i64 %134, !dbg !213 + %4529 = getelementptr bfloat, ptr addrspace(1) %4521, i64 %138, !dbg !214 + %4530 = getelementptr bfloat, ptr addrspace(1) %4522, i64 %138, !dbg !214 + %4531 = getelementptr bfloat, ptr addrspace(1) %4523, i64 %138, !dbg !214 + %4532 = getelementptr bfloat, ptr addrspace(1) %4524, i64 %138, !dbg !214 + %4533 = getelementptr bfloat, ptr addrspace(1) %4525, i64 %138, !dbg !214 + %4534 = getelementptr bfloat, ptr addrspace(1) %4526, i64 %138, !dbg !214 + %4535 = getelementptr bfloat, ptr addrspace(1) %4527, i64 %138, !dbg !214 + %4536 = getelementptr bfloat, ptr addrspace(1) %4528, i64 %138, !dbg !214 + %4537 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 0, !dbg !215 + %4538 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 1, !dbg !215 + %4539 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 2, !dbg !215 + %4540 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 3, !dbg !215 + %4541 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 4, !dbg !215 + %4542 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 5, !dbg !215 + %4543 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 6, !dbg !215 + %4544 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 7, !dbg !215 + %4545 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 8, !dbg !215 + %4546 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 9, !dbg !215 + %4547 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 10, !dbg !215 + %4548 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 11, !dbg !215 + %4549 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 12, !dbg !215 + %4550 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 13, !dbg !215 + %4551 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 14, !dbg !215 + %4552 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 15, !dbg !215 + %4553 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 16, !dbg !215 + %4554 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 17, !dbg !215 + %4555 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 18, !dbg !215 + %4556 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 19, !dbg !215 + %4557 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 20, !dbg !215 + %4558 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 21, !dbg !215 + %4559 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 22, !dbg !215 + %4560 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 23, !dbg !215 + %4561 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 24, !dbg !215 + %4562 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 25, !dbg !215 + %4563 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 26, !dbg !215 + %4564 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 27, !dbg !215 + %4565 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 28, !dbg !215 + %4566 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 29, !dbg !215 + %4567 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 30, !dbg !215 + %4568 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 31, !dbg !215 + %4569 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 32, !dbg !215 + %4570 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 33, !dbg !215 + %4571 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 34, !dbg !215 + %4572 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 35, !dbg !215 + %4573 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 36, !dbg !215 + %4574 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 37, !dbg !215 + %4575 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 38, !dbg !215 + %4576 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 39, !dbg !215 + %4577 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 40, !dbg !215 + %4578 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 41, !dbg !215 + %4579 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 42, !dbg !215 + %4580 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 43, !dbg !215 + %4581 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 44, !dbg !215 + %4582 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 45, !dbg !215 + %4583 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 46, !dbg !215 + %4584 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 47, !dbg !215 + %4585 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 48, !dbg !215 + %4586 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 49, !dbg !215 + %4587 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 50, !dbg !215 + %4588 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 51, !dbg !215 + %4589 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 52, !dbg !215 + %4590 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 53, !dbg !215 + %4591 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 54, !dbg !215 + %4592 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 55, !dbg !215 + %4593 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 56, !dbg !215 + %4594 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 57, !dbg !215 + %4595 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 58, !dbg !215 + %4596 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 59, !dbg !215 + %4597 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 60, !dbg !215 + %4598 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 61, !dbg !215 + %4599 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 62, !dbg !215 + %4600 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 63, !dbg !215 + %4601 = insertelement <2 x float> poison, float %4537, i64 0, !dbg !215 + %4602 = insertelement <2 x float> %4601, float %4538, i64 1, !dbg !215 + %4603 = fmul <2 x float> %4602, splat (float 0x3FB6A09E60000000), !dbg !215 + %4604 = fptrunc <2 x float> %4603 to <2 x bfloat>, !dbg !216 + %4605 = insertelement <2 x float> poison, float %4539, i64 0, !dbg !215 + %4606 = insertelement <2 x float> %4605, float %4540, i64 1, !dbg !215 + %4607 = fmul <2 x float> %4606, splat (float 0x3FB6A09E60000000), !dbg !215 + %4608 = fptrunc <2 x float> %4607 to <2 x bfloat>, !dbg !216 + %4609 = insertelement <2 x float> poison, float %4541, i64 0, !dbg !215 + %4610 = insertelement <2 x float> %4609, float %4542, i64 1, !dbg !215 + %4611 = fmul <2 x float> %4610, splat (float 0x3FB6A09E60000000), !dbg !215 + %4612 = fptrunc <2 x float> %4611 to <2 x 
bfloat>, !dbg !216 + %4613 = insertelement <2 x float> poison, float %4543, i64 0, !dbg !215 + %4614 = insertelement <2 x float> %4613, float %4544, i64 1, !dbg !215 + %4615 = fmul <2 x float> %4614, splat (float 0x3FB6A09E60000000), !dbg !215 + %4616 = fptrunc <2 x float> %4615 to <2 x bfloat>, !dbg !216 + %4617 = insertelement <2 x float> poison, float %4545, i64 0, !dbg !215 + %4618 = insertelement <2 x float> %4617, float %4546, i64 1, !dbg !215 + %4619 = fmul <2 x float> %4618, splat (float 0x3FB6A09E60000000), !dbg !215 + %4620 = fptrunc <2 x float> %4619 to <2 x bfloat>, !dbg !216 + %4621 = insertelement <2 x float> poison, float %4547, i64 0, !dbg !215 + %4622 = insertelement <2 x float> %4621, float %4548, i64 1, !dbg !215 + %4623 = fmul <2 x float> %4622, splat (float 0x3FB6A09E60000000), !dbg !215 + %4624 = fptrunc <2 x float> %4623 to <2 x bfloat>, !dbg !216 + %4625 = insertelement <2 x float> poison, float %4549, i64 0, !dbg !215 + %4626 = insertelement <2 x float> %4625, float %4550, i64 1, !dbg !215 + %4627 = fmul <2 x float> %4626, splat (float 0x3FB6A09E60000000), !dbg !215 + %4628 = fptrunc <2 x float> %4627 to <2 x bfloat>, !dbg !216 + %4629 = insertelement <2 x float> poison, float %4551, i64 0, !dbg !215 + %4630 = insertelement <2 x float> %4629, float %4552, i64 1, !dbg !215 + %4631 = fmul <2 x float> %4630, splat (float 0x3FB6A09E60000000), !dbg !215 + %4632 = fptrunc <2 x float> %4631 to <2 x bfloat>, !dbg !216 + %4633 = insertelement <2 x float> poison, float %4553, i64 0, !dbg !215 + %4634 = insertelement <2 x float> %4633, float %4554, i64 1, !dbg !215 + %4635 = fmul <2 x float> %4634, splat (float 0x3FB6A09E60000000), !dbg !215 + %4636 = fptrunc <2 x float> %4635 to <2 x bfloat>, !dbg !216 + %4637 = insertelement <2 x float> poison, float %4555, i64 0, !dbg !215 + %4638 = insertelement <2 x float> %4637, float %4556, i64 1, !dbg !215 + %4639 = fmul <2 x float> %4638, splat (float 0x3FB6A09E60000000), !dbg !215 + %4640 = fptrunc <2 x 
float> %4639 to <2 x bfloat>, !dbg !216 + %4641 = insertelement <2 x float> poison, float %4557, i64 0, !dbg !215 + %4642 = insertelement <2 x float> %4641, float %4558, i64 1, !dbg !215 + %4643 = fmul <2 x float> %4642, splat (float 0x3FB6A09E60000000), !dbg !215 + %4644 = fptrunc <2 x float> %4643 to <2 x bfloat>, !dbg !216 + %4645 = insertelement <2 x float> poison, float %4559, i64 0, !dbg !215 + %4646 = insertelement <2 x float> %4645, float %4560, i64 1, !dbg !215 + %4647 = fmul <2 x float> %4646, splat (float 0x3FB6A09E60000000), !dbg !215 + %4648 = fptrunc <2 x float> %4647 to <2 x bfloat>, !dbg !216 + %4649 = insertelement <2 x float> poison, float %4561, i64 0, !dbg !215 + %4650 = insertelement <2 x float> %4649, float %4562, i64 1, !dbg !215 + %4651 = fmul <2 x float> %4650, splat (float 0x3FB6A09E60000000), !dbg !215 + %4652 = fptrunc <2 x float> %4651 to <2 x bfloat>, !dbg !216 + %4653 = insertelement <2 x float> poison, float %4563, i64 0, !dbg !215 + %4654 = insertelement <2 x float> %4653, float %4564, i64 1, !dbg !215 + %4655 = fmul <2 x float> %4654, splat (float 0x3FB6A09E60000000), !dbg !215 + %4656 = fptrunc <2 x float> %4655 to <2 x bfloat>, !dbg !216 + %4657 = insertelement <2 x float> poison, float %4565, i64 0, !dbg !215 + %4658 = insertelement <2 x float> %4657, float %4566, i64 1, !dbg !215 + %4659 = fmul <2 x float> %4658, splat (float 0x3FB6A09E60000000), !dbg !215 + %4660 = fptrunc <2 x float> %4659 to <2 x bfloat>, !dbg !216 + %4661 = insertelement <2 x float> poison, float %4567, i64 0, !dbg !215 + %4662 = insertelement <2 x float> %4661, float %4568, i64 1, !dbg !215 + %4663 = fmul <2 x float> %4662, splat (float 0x3FB6A09E60000000), !dbg !215 + %4664 = fptrunc <2 x float> %4663 to <2 x bfloat>, !dbg !216 + %4665 = insertelement <2 x float> poison, float %4569, i64 0, !dbg !215 + %4666 = insertelement <2 x float> %4665, float %4570, i64 1, !dbg !215 + %4667 = fmul <2 x float> %4666, splat (float 0x3FB6A09E60000000), !dbg !215 + 
%4668 = fptrunc <2 x float> %4667 to <2 x bfloat>, !dbg !216 + %4669 = insertelement <2 x float> poison, float %4571, i64 0, !dbg !215 + %4670 = insertelement <2 x float> %4669, float %4572, i64 1, !dbg !215 + %4671 = fmul <2 x float> %4670, splat (float 0x3FB6A09E60000000), !dbg !215 + %4672 = fptrunc <2 x float> %4671 to <2 x bfloat>, !dbg !216 + %4673 = insertelement <2 x float> poison, float %4573, i64 0, !dbg !215 + %4674 = insertelement <2 x float> %4673, float %4574, i64 1, !dbg !215 + %4675 = fmul <2 x float> %4674, splat (float 0x3FB6A09E60000000), !dbg !215 + %4676 = fptrunc <2 x float> %4675 to <2 x bfloat>, !dbg !216 + %4677 = insertelement <2 x float> poison, float %4575, i64 0, !dbg !215 + %4678 = insertelement <2 x float> %4677, float %4576, i64 1, !dbg !215 + %4679 = fmul <2 x float> %4678, splat (float 0x3FB6A09E60000000), !dbg !215 + %4680 = fptrunc <2 x float> %4679 to <2 x bfloat>, !dbg !216 + %4681 = insertelement <2 x float> poison, float %4577, i64 0, !dbg !215 + %4682 = insertelement <2 x float> %4681, float %4578, i64 1, !dbg !215 + %4683 = fmul <2 x float> %4682, splat (float 0x3FB6A09E60000000), !dbg !215 + %4684 = fptrunc <2 x float> %4683 to <2 x bfloat>, !dbg !216 + %4685 = insertelement <2 x float> poison, float %4579, i64 0, !dbg !215 + %4686 = insertelement <2 x float> %4685, float %4580, i64 1, !dbg !215 + %4687 = fmul <2 x float> %4686, splat (float 0x3FB6A09E60000000), !dbg !215 + %4688 = fptrunc <2 x float> %4687 to <2 x bfloat>, !dbg !216 + %4689 = insertelement <2 x float> poison, float %4581, i64 0, !dbg !215 + %4690 = insertelement <2 x float> %4689, float %4582, i64 1, !dbg !215 + %4691 = fmul <2 x float> %4690, splat (float 0x3FB6A09E60000000), !dbg !215 + %4692 = fptrunc <2 x float> %4691 to <2 x bfloat>, !dbg !216 + %4693 = insertelement <2 x float> poison, float %4583, i64 0, !dbg !215 + %4694 = insertelement <2 x float> %4693, float %4584, i64 1, !dbg !215 + %4695 = fmul <2 x float> %4694, splat (float 
0x3FB6A09E60000000), !dbg !215 + %4696 = fptrunc <2 x float> %4695 to <2 x bfloat>, !dbg !216 + %4697 = insertelement <2 x float> poison, float %4585, i64 0, !dbg !215 + %4698 = insertelement <2 x float> %4697, float %4586, i64 1, !dbg !215 + %4699 = fmul <2 x float> %4698, splat (float 0x3FB6A09E60000000), !dbg !215 + %4700 = fptrunc <2 x float> %4699 to <2 x bfloat>, !dbg !216 + %4701 = insertelement <2 x float> poison, float %4587, i64 0, !dbg !215 + %4702 = insertelement <2 x float> %4701, float %4588, i64 1, !dbg !215 + %4703 = fmul <2 x float> %4702, splat (float 0x3FB6A09E60000000), !dbg !215 + %4704 = fptrunc <2 x float> %4703 to <2 x bfloat>, !dbg !216 + %4705 = insertelement <2 x float> poison, float %4589, i64 0, !dbg !215 + %4706 = insertelement <2 x float> %4705, float %4590, i64 1, !dbg !215 + %4707 = fmul <2 x float> %4706, splat (float 0x3FB6A09E60000000), !dbg !215 + %4708 = fptrunc <2 x float> %4707 to <2 x bfloat>, !dbg !216 + %4709 = insertelement <2 x float> poison, float %4591, i64 0, !dbg !215 + %4710 = insertelement <2 x float> %4709, float %4592, i64 1, !dbg !215 + %4711 = fmul <2 x float> %4710, splat (float 0x3FB6A09E60000000), !dbg !215 + %4712 = fptrunc <2 x float> %4711 to <2 x bfloat>, !dbg !216 + %4713 = insertelement <2 x float> poison, float %4593, i64 0, !dbg !215 + %4714 = insertelement <2 x float> %4713, float %4594, i64 1, !dbg !215 + %4715 = fmul <2 x float> %4714, splat (float 0x3FB6A09E60000000), !dbg !215 + %4716 = fptrunc <2 x float> %4715 to <2 x bfloat>, !dbg !216 + %4717 = insertelement <2 x float> poison, float %4595, i64 0, !dbg !215 + %4718 = insertelement <2 x float> %4717, float %4596, i64 1, !dbg !215 + %4719 = fmul <2 x float> %4718, splat (float 0x3FB6A09E60000000), !dbg !215 + %4720 = fptrunc <2 x float> %4719 to <2 x bfloat>, !dbg !216 + %4721 = insertelement <2 x float> poison, float %4597, i64 0, !dbg !215 + %4722 = insertelement <2 x float> %4721, float %4598, i64 1, !dbg !215 + %4723 = fmul <2 x float> 
%4722, splat (float 0x3FB6A09E60000000), !dbg !215 + %4724 = fptrunc <2 x float> %4723 to <2 x bfloat>, !dbg !216 + %4725 = insertelement <2 x float> poison, float %4599, i64 0, !dbg !215 + %4726 = insertelement <2 x float> %4725, float %4600, i64 1, !dbg !215 + %4727 = fmul <2 x float> %4726, splat (float 0x3FB6A09E60000000), !dbg !215 + %4728 = fptrunc <2 x float> %4727 to <2 x bfloat>, !dbg !216 + %4729 = shl nuw nsw i32 %390, 13, !dbg !216 + %4730 = shl nuw nsw i32 %56, 5, !dbg !216 + %4731 = and i32 %4730, 7264, !dbg !216 + %4732 = and i32 %56, 24, !dbg !216 + %4733 = shl nuw nsw i32 %4732, 4, !dbg !216 + %4734 = shl nuw nsw i32 %56, 2, !dbg !216 + %4735 = and i32 %4734, 16, !dbg !216 + %4736 = or disjoint i32 %4729, %4735, !dbg !216 + %4737 = or disjoint i32 %4731, %4733, !dbg !216 + %4738 = or disjoint i32 %4736, %4737, !dbg !216 + %4739 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %4738, !dbg !216 + %4740 = bitcast <2 x bfloat> %4604 to i32, !dbg !216 + %4741 = bitcast <2 x bfloat> %4612 to i32, !dbg !216 + %4742 = bitcast <2 x bfloat> %4620 to i32, !dbg !216 + %4743 = bitcast <2 x bfloat> %4628 to i32, !dbg !216 + %4744 = insertelement <4 x i32> poison, i32 %4740, i64 0, !dbg !216 + %4745 = insertelement <4 x i32> %4744, i32 %4741, i64 1, !dbg !216 + %4746 = insertelement <4 x i32> %4745, i32 %4742, i64 2, !dbg !216 + %4747 = insertelement <4 x i32> %4746, i32 %4743, i64 3, !dbg !216 + store <4 x i32> %4747, ptr addrspace(3) %4739, align 16, !dbg !216 + %4748 = getelementptr inbounds nuw i8, ptr addrspace(3) %4739, i32 512, !dbg !216 + %4749 = bitcast <2 x bfloat> %4608 to i32, !dbg !216 + %4750 = bitcast <2 x bfloat> %4616 to i32, !dbg !216 + %4751 = bitcast <2 x bfloat> %4624 to i32, !dbg !216 + %4752 = bitcast <2 x bfloat> %4632 to i32, !dbg !216 + %4753 = insertelement <4 x i32> poison, i32 %4749, i64 0, !dbg !216 + %4754 = insertelement <4 x i32> %4753, i32 %4750, i64 1, !dbg !216 + %4755 = insertelement <4 x i32> %4754, i32 
%4751, i64 2, !dbg !216 + %4756 = insertelement <4 x i32> %4755, i32 %4752, i64 3, !dbg !216 + store <4 x i32> %4756, ptr addrspace(3) %4748, align 16, !dbg !216 + %4757 = xor i32 %4738, 32, !dbg !216 + %4758 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %4757, !dbg !216 + %4759 = bitcast <2 x bfloat> %4636 to i32, !dbg !216 + %4760 = bitcast <2 x bfloat> %4644 to i32, !dbg !216 + %4761 = bitcast <2 x bfloat> %4652 to i32, !dbg !216 + %4762 = bitcast <2 x bfloat> %4660 to i32, !dbg !216 + %4763 = insertelement <4 x i32> poison, i32 %4759, i64 0, !dbg !216 + %4764 = insertelement <4 x i32> %4763, i32 %4760, i64 1, !dbg !216 + %4765 = insertelement <4 x i32> %4764, i32 %4761, i64 2, !dbg !216 + %4766 = insertelement <4 x i32> %4765, i32 %4762, i64 3, !dbg !216 + store <4 x i32> %4766, ptr addrspace(3) %4758, align 16, !dbg !216 + %4767 = getelementptr inbounds nuw i8, ptr addrspace(3) %4758, i32 512, !dbg !216 + %4768 = bitcast <2 x bfloat> %4640 to i32, !dbg !216 + %4769 = bitcast <2 x bfloat> %4648 to i32, !dbg !216 + %4770 = bitcast <2 x bfloat> %4656 to i32, !dbg !216 + %4771 = bitcast <2 x bfloat> %4664 to i32, !dbg !216 + %4772 = insertelement <4 x i32> poison, i32 %4768, i64 0, !dbg !216 + %4773 = insertelement <4 x i32> %4772, i32 %4769, i64 1, !dbg !216 + %4774 = insertelement <4 x i32> %4773, i32 %4770, i64 2, !dbg !216 + %4775 = insertelement <4 x i32> %4774, i32 %4771, i64 3, !dbg !216 + store <4 x i32> %4775, ptr addrspace(3) %4767, align 16, !dbg !216 + %4776 = xor i32 %4738, 64, !dbg !216 + %4777 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %4776, !dbg !216 + %4778 = bitcast <2 x bfloat> %4668 to i32, !dbg !216 + %4779 = bitcast <2 x bfloat> %4676 to i32, !dbg !216 + %4780 = bitcast <2 x bfloat> %4684 to i32, !dbg !216 + %4781 = bitcast <2 x bfloat> %4692 to i32, !dbg !216 + %4782 = insertelement <4 x i32> poison, i32 %4778, i64 0, !dbg !216 + %4783 = insertelement <4 x i32> %4782, i32 %4779, i64 1, !dbg 
!216 + %4784 = insertelement <4 x i32> %4783, i32 %4780, i64 2, !dbg !216 + %4785 = insertelement <4 x i32> %4784, i32 %4781, i64 3, !dbg !216 + store <4 x i32> %4785, ptr addrspace(3) %4777, align 16, !dbg !216 + %4786 = getelementptr inbounds nuw i8, ptr addrspace(3) %4777, i32 512, !dbg !216 + %4787 = bitcast <2 x bfloat> %4672 to i32, !dbg !216 + %4788 = bitcast <2 x bfloat> %4680 to i32, !dbg !216 + %4789 = bitcast <2 x bfloat> %4688 to i32, !dbg !216 + %4790 = bitcast <2 x bfloat> %4696 to i32, !dbg !216 + %4791 = insertelement <4 x i32> poison, i32 %4787, i64 0, !dbg !216 + %4792 = insertelement <4 x i32> %4791, i32 %4788, i64 1, !dbg !216 + %4793 = insertelement <4 x i32> %4792, i32 %4789, i64 2, !dbg !216 + %4794 = insertelement <4 x i32> %4793, i32 %4790, i64 3, !dbg !216 + store <4 x i32> %4794, ptr addrspace(3) %4786, align 16, !dbg !216 + %4795 = xor i32 %4738, 96, !dbg !216 + %4796 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %4795, !dbg !216 + %4797 = bitcast <2 x bfloat> %4700 to i32, !dbg !216 + %4798 = bitcast <2 x bfloat> %4708 to i32, !dbg !216 + %4799 = bitcast <2 x bfloat> %4716 to i32, !dbg !216 + %4800 = bitcast <2 x bfloat> %4724 to i32, !dbg !216 + %4801 = insertelement <4 x i32> poison, i32 %4797, i64 0, !dbg !216 + %4802 = insertelement <4 x i32> %4801, i32 %4798, i64 1, !dbg !216 + %4803 = insertelement <4 x i32> %4802, i32 %4799, i64 2, !dbg !216 + %4804 = insertelement <4 x i32> %4803, i32 %4800, i64 3, !dbg !216 + store <4 x i32> %4804, ptr addrspace(3) %4796, align 16, !dbg !216 + %4805 = getelementptr inbounds nuw i8, ptr addrspace(3) %4796, i32 512, !dbg !216 + %4806 = bitcast <2 x bfloat> %4704 to i32, !dbg !216 + %4807 = bitcast <2 x bfloat> %4712 to i32, !dbg !216 + %4808 = bitcast <2 x bfloat> %4720 to i32, !dbg !216 + %4809 = bitcast <2 x bfloat> %4728 to i32, !dbg !216 + %4810 = insertelement <4 x i32> poison, i32 %4806, i64 0, !dbg !216 + %4811 = insertelement <4 x i32> %4810, i32 %4807, i64 1, !dbg 
!216 + %4812 = insertelement <4 x i32> %4811, i32 %4808, i64 2, !dbg !216 + %4813 = insertelement <4 x i32> %4812, i32 %4809, i64 3, !dbg !216 + store <4 x i32> %4813, ptr addrspace(3) %4805, align 16, !dbg !216 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !216 + %4814 = shl nuw nsw i32 %4732, 10, !dbg !216 + %4815 = shl nuw nsw i32 %390, 5, !dbg !216 + %4816 = and i32 %4734, 1008, !dbg !216 + %4817 = or disjoint i32 %4814, %4815, !dbg !216 + %4818 = xor i32 %4817, %4816, !dbg !216 + %4819 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %4818, !dbg !216 + %4820 = ptrtoint ptr addrspace(3) %4819 to i32, !dbg !216 + %4821 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %4820) #3, !dbg !216 + %4822 = extractvalue { i32, i32, i32, i32 } %4821, 0, !dbg !216 + %4823 = extractvalue { i32, i32, i32, i32 } %4821, 1, !dbg !216 + %4824 = extractvalue { i32, i32, i32, i32 } %4821, 2, !dbg !216 + %4825 = extractvalue { i32, i32, i32, i32 } %4821, 3, !dbg !216 + %4826 = getelementptr inbounds nuw i8, ptr addrspace(3) %4819, i32 1024, !dbg !216 + %4827 = ptrtoint ptr addrspace(3) %4826 to i32, !dbg !216 + %4828 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %4827) #3, !dbg !216 + %4829 = extractvalue { i32, i32, i32, i32 } %4828, 0, !dbg !216 + %4830 = extractvalue { i32, i32, i32, i32 } %4828, 1, !dbg !216 + %4831 = extractvalue { i32, i32, i32, i32 } %4828, 2, !dbg !216 + %4832 = extractvalue { i32, i32, i32, i32 } %4828, 3, !dbg !216 + %4833 = getelementptr inbounds nuw i8, ptr addrspace(3) %4819, i32 2048, !dbg !216 + %4834 = ptrtoint ptr addrspace(3) %4833 to i32, !dbg !216 + %4835 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %4834) #3, !dbg !216 + %4836 = 
extractvalue { i32, i32, i32, i32 } %4835, 0, !dbg !216 + %4837 = extractvalue { i32, i32, i32, i32 } %4835, 1, !dbg !216 + %4838 = extractvalue { i32, i32, i32, i32 } %4835, 2, !dbg !216 + %4839 = extractvalue { i32, i32, i32, i32 } %4835, 3, !dbg !216 + %4840 = getelementptr inbounds nuw i8, ptr addrspace(3) %4819, i32 3072, !dbg !216 + %4841 = ptrtoint ptr addrspace(3) %4840 to i32, !dbg !216 + %4842 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %4841) #3, !dbg !216 + %4843 = extractvalue { i32, i32, i32, i32 } %4842, 0, !dbg !216 + %4844 = extractvalue { i32, i32, i32, i32 } %4842, 1, !dbg !216 + %4845 = extractvalue { i32, i32, i32, i32 } %4842, 2, !dbg !216 + %4846 = extractvalue { i32, i32, i32, i32 } %4842, 3, !dbg !216 + %4847 = getelementptr inbounds nuw i8, ptr addrspace(3) %4819, i32 4096, !dbg !216 + %4848 = ptrtoint ptr addrspace(3) %4847 to i32, !dbg !216 + %4849 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %4848) #3, !dbg !216 + %4850 = extractvalue { i32, i32, i32, i32 } %4849, 0, !dbg !216 + %4851 = extractvalue { i32, i32, i32, i32 } %4849, 1, !dbg !216 + %4852 = extractvalue { i32, i32, i32, i32 } %4849, 2, !dbg !216 + %4853 = extractvalue { i32, i32, i32, i32 } %4849, 3, !dbg !216 + %4854 = getelementptr inbounds nuw i8, ptr addrspace(3) %4819, i32 5120, !dbg !216 + %4855 = ptrtoint ptr addrspace(3) %4854 to i32, !dbg !216 + %4856 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %4855) #3, !dbg !216 + %4857 = extractvalue { i32, i32, i32, i32 } %4856, 0, !dbg !216 + %4858 = extractvalue { i32, i32, i32, i32 } %4856, 1, !dbg !216 + %4859 = extractvalue { i32, i32, i32, i32 } %4856, 2, !dbg !216 + %4860 = extractvalue { i32, i32, i32, i32 } %4856, 3, !dbg !216 + %4861 = 
getelementptr inbounds nuw i8, ptr addrspace(3) %4819, i32 6144, !dbg !216 + %4862 = ptrtoint ptr addrspace(3) %4861 to i32, !dbg !216 + %4863 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %4862) #3, !dbg !216 + %4864 = extractvalue { i32, i32, i32, i32 } %4863, 0, !dbg !216 + %4865 = extractvalue { i32, i32, i32, i32 } %4863, 1, !dbg !216 + %4866 = extractvalue { i32, i32, i32, i32 } %4863, 2, !dbg !216 + %4867 = extractvalue { i32, i32, i32, i32 } %4863, 3, !dbg !216 + %4868 = getelementptr inbounds nuw i8, ptr addrspace(3) %4819, i32 7168, !dbg !216 + %4869 = ptrtoint ptr addrspace(3) %4868 to i32, !dbg !216 + %4870 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %4869) #3, !dbg !216 + %4871 = extractvalue { i32, i32, i32, i32 } %4870, 0, !dbg !216 + %4872 = extractvalue { i32, i32, i32, i32 } %4870, 1, !dbg !216 + %4873 = extractvalue { i32, i32, i32, i32 } %4870, 2, !dbg !216 + %4874 = extractvalue { i32, i32, i32, i32 } %4870, 3, !dbg !216 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %4822, i32 %4823, i32 %4824, i32 %4825, ptr addrspace(1) %4529, i1 %147) #3, !dbg !216 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %4829, i32 %4830, i32 %4831, i32 %4832, ptr addrspace(1) %4530, i1 %148) #3, !dbg !216 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %4836, i32 %4837, i32 %4838, i32 %4839, ptr addrspace(1) %4531, i1 %149) #3, !dbg !216 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %4843, i32 %4844, i32 %4845, i32 %4846, ptr addrspace(1) %4532, i1 %150) #3, !dbg !216 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 
};", "r,r,r,r,l,b"(i32 %4850, i32 %4851, i32 %4852, i32 %4853, ptr addrspace(1) %4533, i1 %151) #3, !dbg !216 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %4857, i32 %4858, i32 %4859, i32 %4860, ptr addrspace(1) %4534, i1 %152) #3, !dbg !216 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %4864, i32 %4865, i32 %4866, i32 %4867, ptr addrspace(1) %4535, i1 %153) #3, !dbg !216 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %4871, i32 %4872, i32 %4873, i32 %4874, ptr addrspace(1) %4536, i1 %154) #3, !dbg !216 + br label %11899, !dbg !39 + +4875: ; preds = %29 + %4876 = shl nuw nsw i32 %40, 7, !dbg !217 + %4877 = or disjoint i32 %59, %4876, !dbg !218 + %4878 = or disjoint i32 %60, %4876, !dbg !218 + %4879 = or disjoint i32 %61, %4876, !dbg !218 + %4880 = or disjoint i32 %62, %4876, !dbg !218 + %4881 = or disjoint i32 %63, %4876, !dbg !218 + %4882 = or disjoint i32 %64, %4876, !dbg !218 + %4883 = or disjoint i32 %65, %4876, !dbg !218 + %4884 = or disjoint i32 %66, %4876, !dbg !218 + %4885 = shl i32 %4877, 7, !dbg !219 + %4886 = shl i32 %4878, 7, !dbg !219 + %4887 = shl i32 %4879, 7, !dbg !219 + %4888 = shl i32 %4880, 7, !dbg !219 + %4889 = shl i32 %4881, 7, !dbg !219 + %4890 = shl i32 %4882, 7, !dbg !219 + %4891 = shl i32 %4883, 7, !dbg !219 + %4892 = shl i32 %4884, 7, !dbg !219 + %4893 = sext i32 %4885 to i64, !dbg !221 + %4894 = getelementptr bfloat, ptr addrspace(1) %53, i64 %4893, !dbg !221 + %4895 = sext i32 %4886 to i64, !dbg !221 + %4896 = getelementptr bfloat, ptr addrspace(1) %53, i64 %4895, !dbg !221 + %4897 = sext i32 %4887 to i64, !dbg !221 + %4898 = getelementptr bfloat, ptr addrspace(1) %53, i64 %4897, !dbg !221 + %4899 = sext i32 %4888 to i64, !dbg !221 + %4900 = getelementptr bfloat, ptr addrspace(1) %53, i64 %4899, !dbg !221 + %4901 = sext i32 %4889 to i64, !dbg !221 + %4902 = 
getelementptr bfloat, ptr addrspace(1) %53, i64 %4901, !dbg !221 + %4903 = sext i32 %4890 to i64, !dbg !221 + %4904 = getelementptr bfloat, ptr addrspace(1) %53, i64 %4903, !dbg !221 + %4905 = sext i32 %4891 to i64, !dbg !221 + %4906 = getelementptr bfloat, ptr addrspace(1) %53, i64 %4905, !dbg !221 + %4907 = sext i32 %4892 to i64, !dbg !221 + %4908 = getelementptr bfloat, ptr addrspace(1) %53, i64 %4907, !dbg !221 + %4909 = shl nuw nsw i32 %56, 3, !dbg !222 + %4910 = and i32 %4909, 120, !dbg !222 + %4911 = zext nneg i32 %4910 to i64, !dbg !223 + %4912 = getelementptr bfloat, ptr addrspace(1) %4894, i64 %4911, !dbg !223 + %4913 = getelementptr bfloat, ptr addrspace(1) %4896, i64 %4911, !dbg !223 + %4914 = getelementptr bfloat, ptr addrspace(1) %4898, i64 %4911, !dbg !223 + %4915 = getelementptr bfloat, ptr addrspace(1) %4900, i64 %4911, !dbg !223 + %4916 = getelementptr bfloat, ptr addrspace(1) %4902, i64 %4911, !dbg !223 + %4917 = getelementptr bfloat, ptr addrspace(1) %4904, i64 %4911, !dbg !223 + %4918 = getelementptr bfloat, ptr addrspace(1) %4906, i64 %4911, !dbg !223 + %4919 = getelementptr bfloat, ptr addrspace(1) %4908, i64 %4911, !dbg !223 + %4920 = icmp slt i32 %4877, %19, !dbg !224 + %4921 = icmp slt i32 %4878, %19, !dbg !224 + %4922 = icmp slt i32 %4879, %19, !dbg !224 + %4923 = icmp slt i32 %4880, %19, !dbg !224 + %4924 = icmp slt i32 %4881, %19, !dbg !224 + %4925 = icmp slt i32 %4882, %19, !dbg !224 + %4926 = icmp slt i32 %4883, %19, !dbg !224 + %4927 = icmp slt i32 %4884, %19, !dbg !224 + %4928 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %4912, i1 %4920) #3, !dbg !225 + %4929 = extractvalue { i32, i32, i32, i32 } %4928, 0, !dbg !225 + %4930 = extractvalue { i32, i32, i32, i32 } %4928, 1, !dbg !225 + %4931 = extractvalue { i32, i32, i32, 
i32 } %4928, 2, !dbg !225 + %4932 = extractvalue { i32, i32, i32, i32 } %4928, 3, !dbg !225 + %4933 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %4913, i1 %4921) #3, !dbg !225 + %4934 = extractvalue { i32, i32, i32, i32 } %4933, 0, !dbg !225 + %4935 = extractvalue { i32, i32, i32, i32 } %4933, 1, !dbg !225 + %4936 = extractvalue { i32, i32, i32, i32 } %4933, 2, !dbg !225 + %4937 = extractvalue { i32, i32, i32, i32 } %4933, 3, !dbg !225 + %4938 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %4914, i1 %4922) #3, !dbg !225 + %4939 = extractvalue { i32, i32, i32, i32 } %4938, 0, !dbg !225 + %4940 = extractvalue { i32, i32, i32, i32 } %4938, 1, !dbg !225 + %4941 = extractvalue { i32, i32, i32, i32 } %4938, 2, !dbg !225 + %4942 = extractvalue { i32, i32, i32, i32 } %4938, 3, !dbg !225 + %4943 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %4915, i1 %4923) #3, !dbg !225 + %4944 = extractvalue { i32, i32, i32, i32 } %4943, 0, !dbg !225 + %4945 = extractvalue { i32, i32, i32, i32 } %4943, 1, !dbg !225 + %4946 = extractvalue { i32, i32, i32, i32 } %4943, 2, !dbg !225 + %4947 = extractvalue { i32, i32, i32, i32 } %4943, 3, !dbg !225 + %4948 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", 
"=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %4916, i1 %4924) #3, !dbg !225 + %4949 = extractvalue { i32, i32, i32, i32 } %4948, 0, !dbg !225 + %4950 = extractvalue { i32, i32, i32, i32 } %4948, 1, !dbg !225 + %4951 = extractvalue { i32, i32, i32, i32 } %4948, 2, !dbg !225 + %4952 = extractvalue { i32, i32, i32, i32 } %4948, 3, !dbg !225 + %4953 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %4917, i1 %4925) #3, !dbg !225 + %4954 = extractvalue { i32, i32, i32, i32 } %4953, 0, !dbg !225 + %4955 = extractvalue { i32, i32, i32, i32 } %4953, 1, !dbg !225 + %4956 = extractvalue { i32, i32, i32, i32 } %4953, 2, !dbg !225 + %4957 = extractvalue { i32, i32, i32, i32 } %4953, 3, !dbg !225 + %4958 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %4918, i1 %4926) #3, !dbg !225 + %4959 = extractvalue { i32, i32, i32, i32 } %4958, 0, !dbg !225 + %4960 = extractvalue { i32, i32, i32, i32 } %4958, 1, !dbg !225 + %4961 = extractvalue { i32, i32, i32, i32 } %4958, 2, !dbg !225 + %4962 = extractvalue { i32, i32, i32, i32 } %4958, 3, !dbg !225 + %4963 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %4919, i1 %4927) #3, !dbg !225 + %4964 = extractvalue { i32, i32, i32, i32 } %4963, 0, !dbg !225 + %4965 = extractvalue { i32, i32, i32, i32 } %4963, 1, !dbg !225 + %4966 = extractvalue { i32, i32, i32, i32 } %4963, 2, !dbg !225 + %4967 
= extractvalue { i32, i32, i32, i32 } %4963, 3, !dbg !225 + %4968 = shl nuw nsw i32 %56, 4, !dbg !225 + %4969 = and i32 %4968, 112, !dbg !225 + %4970 = shl nuw nsw i32 %58, 3, !dbg !225 + %4971 = and i32 %56, 112, !dbg !225 + %4972 = and i32 %56, 8, !dbg !225 + %4973 = shl nuw nsw i32 %4972, 11, !dbg !225 + %4974 = or disjoint i32 %4969, %4970, !dbg !225 + %4975 = xor i32 %4974, %4971, !dbg !225 + %4976 = or disjoint i32 %4975, %4973, !dbg !225 + %4977 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328), i32 %4976, !dbg !225 + %4978 = insertelement <4 x i32> poison, i32 %4929, i64 0, !dbg !225 + %4979 = insertelement <4 x i32> %4978, i32 %4930, i64 1, !dbg !225 + %4980 = insertelement <4 x i32> %4979, i32 %4931, i64 2, !dbg !225 + %4981 = insertelement <4 x i32> %4980, i32 %4932, i64 3, !dbg !225 + store <4 x i32> %4981, ptr addrspace(3) %4977, align 16, !dbg !225 + %4982 = or disjoint i32 %4976, 2048, !dbg !225 + %4983 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328), i32 %4982, !dbg !225 + %4984 = insertelement <4 x i32> poison, i32 %4934, i64 0, !dbg !225 + %4985 = insertelement <4 x i32> %4984, i32 %4935, i64 1, !dbg !225 + %4986 = insertelement <4 x i32> %4985, i32 %4936, i64 2, !dbg !225 + %4987 = insertelement <4 x i32> %4986, i32 %4937, i64 3, !dbg !225 + store <4 x i32> %4987, ptr addrspace(3) %4983, align 16, !dbg !225 + %4988 = or disjoint i32 %4976, 4096, !dbg !225 + %4989 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328), i32 %4988, !dbg !225 + %4990 = insertelement <4 x i32> poison, i32 %4939, i64 0, !dbg !225 + %4991 = insertelement <4 x i32> %4990, i32 %4940, i64 1, !dbg !225 + %4992 = insertelement <4 x i32> %4991, i32 %4941, i64 2, !dbg !225 + %4993 = insertelement <4 x i32> %4992, i32 %4942, i64 3, !dbg !225 + store <4 x i32> %4993, ptr addrspace(3) %4989, align 16, 
!dbg !225 + %4994 = or disjoint i32 %4976, 6144, !dbg !225 + %4995 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328), i32 %4994, !dbg !225 + %4996 = insertelement <4 x i32> poison, i32 %4944, i64 0, !dbg !225 + %4997 = insertelement <4 x i32> %4996, i32 %4945, i64 1, !dbg !225 + %4998 = insertelement <4 x i32> %4997, i32 %4946, i64 2, !dbg !225 + %4999 = insertelement <4 x i32> %4998, i32 %4947, i64 3, !dbg !225 + store <4 x i32> %4999, ptr addrspace(3) %4995, align 16, !dbg !225 + %5000 = or disjoint i32 %4976, 8192, !dbg !225 + %5001 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328), i32 %5000, !dbg !225 + %5002 = insertelement <4 x i32> poison, i32 %4949, i64 0, !dbg !225 + %5003 = insertelement <4 x i32> %5002, i32 %4950, i64 1, !dbg !225 + %5004 = insertelement <4 x i32> %5003, i32 %4951, i64 2, !dbg !225 + %5005 = insertelement <4 x i32> %5004, i32 %4952, i64 3, !dbg !225 + store <4 x i32> %5005, ptr addrspace(3) %5001, align 16, !dbg !225 + %5006 = or disjoint i32 %4976, 10240, !dbg !225 + %5007 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328), i32 %5006, !dbg !225 + %5008 = insertelement <4 x i32> poison, i32 %4954, i64 0, !dbg !225 + %5009 = insertelement <4 x i32> %5008, i32 %4955, i64 1, !dbg !225 + %5010 = insertelement <4 x i32> %5009, i32 %4956, i64 2, !dbg !225 + %5011 = insertelement <4 x i32> %5010, i32 %4957, i64 3, !dbg !225 + store <4 x i32> %5011, ptr addrspace(3) %5007, align 16, !dbg !225 + %5012 = or disjoint i32 %4976, 12288, !dbg !225 + %5013 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328), i32 %5012, !dbg !225 + %5014 = insertelement <4 x i32> poison, i32 %4959, i64 0, !dbg !225 + %5015 = insertelement <4 x i32> %5014, i32 %4960, i64 1, !dbg !225 + %5016 = insertelement <4 x i32> %5015, i32 
%4961, i64 2, !dbg !225 + %5017 = insertelement <4 x i32> %5016, i32 %4962, i64 3, !dbg !225 + store <4 x i32> %5017, ptr addrspace(3) %5013, align 16, !dbg !225 + %5018 = or disjoint i32 %4976, 14336, !dbg !225 + %5019 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328), i32 %5018, !dbg !225 + %5020 = insertelement <4 x i32> poison, i32 %4964, i64 0, !dbg !225 + %5021 = insertelement <4 x i32> %5020, i32 %4965, i64 1, !dbg !225 + %5022 = insertelement <4 x i32> %5021, i32 %4966, i64 2, !dbg !225 + %5023 = insertelement <4 x i32> %5022, i32 %4967, i64 3, !dbg !225 + store <4 x i32> %5023, ptr addrspace(3) %5019, align 16, !dbg !225 + %5024 = getelementptr bfloat, ptr addrspace(1) %54, i64 %4893, !dbg !226 + %5025 = getelementptr bfloat, ptr addrspace(1) %54, i64 %4895, !dbg !226 + %5026 = getelementptr bfloat, ptr addrspace(1) %54, i64 %4897, !dbg !226 + %5027 = getelementptr bfloat, ptr addrspace(1) %54, i64 %4899, !dbg !226 + %5028 = getelementptr bfloat, ptr addrspace(1) %54, i64 %4901, !dbg !226 + %5029 = getelementptr bfloat, ptr addrspace(1) %54, i64 %4903, !dbg !226 + %5030 = getelementptr bfloat, ptr addrspace(1) %54, i64 %4905, !dbg !226 + %5031 = getelementptr bfloat, ptr addrspace(1) %54, i64 %4907, !dbg !226 + %5032 = getelementptr bfloat, ptr addrspace(1) %5024, i64 %4911, !dbg !228 + %5033 = getelementptr bfloat, ptr addrspace(1) %5025, i64 %4911, !dbg !228 + %5034 = getelementptr bfloat, ptr addrspace(1) %5026, i64 %4911, !dbg !228 + %5035 = getelementptr bfloat, ptr addrspace(1) %5027, i64 %4911, !dbg !228 + %5036 = getelementptr bfloat, ptr addrspace(1) %5028, i64 %4911, !dbg !228 + %5037 = getelementptr bfloat, ptr addrspace(1) %5029, i64 %4911, !dbg !228 + %5038 = getelementptr bfloat, ptr addrspace(1) %5030, i64 %4911, !dbg !228 + %5039 = getelementptr bfloat, ptr addrspace(1) %5031, i64 %4911, !dbg !228 + %5040 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 
$4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %5032, i1 %4920) #3, !dbg !229 + %5041 = extractvalue { i32, i32, i32, i32 } %5040, 0, !dbg !229 + %5042 = extractvalue { i32, i32, i32, i32 } %5040, 1, !dbg !229 + %5043 = extractvalue { i32, i32, i32, i32 } %5040, 2, !dbg !229 + %5044 = extractvalue { i32, i32, i32, i32 } %5040, 3, !dbg !229 + %5045 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %5033, i1 %4921) #3, !dbg !229 + %5046 = extractvalue { i32, i32, i32, i32 } %5045, 0, !dbg !229 + %5047 = extractvalue { i32, i32, i32, i32 } %5045, 1, !dbg !229 + %5048 = extractvalue { i32, i32, i32, i32 } %5045, 2, !dbg !229 + %5049 = extractvalue { i32, i32, i32, i32 } %5045, 3, !dbg !229 + %5050 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %5034, i1 %4922) #3, !dbg !229 + %5051 = extractvalue { i32, i32, i32, i32 } %5050, 0, !dbg !229 + %5052 = extractvalue { i32, i32, i32, i32 } %5050, 1, !dbg !229 + %5053 = extractvalue { i32, i32, i32, i32 } %5050, 2, !dbg !229 + %5054 = extractvalue { i32, i32, i32, i32 } %5050, 3, !dbg !229 + %5055 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %5035, i1 %4923) #3, !dbg !229 + %5056 = extractvalue { i32, i32, i32, i32 } %5055, 0, !dbg !229 + %5057 = 
extractvalue { i32, i32, i32, i32 } %5055, 1, !dbg !229 + %5058 = extractvalue { i32, i32, i32, i32 } %5055, 2, !dbg !229 + %5059 = extractvalue { i32, i32, i32, i32 } %5055, 3, !dbg !229 + %5060 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %5036, i1 %4924) #3, !dbg !229 + %5061 = extractvalue { i32, i32, i32, i32 } %5060, 0, !dbg !229 + %5062 = extractvalue { i32, i32, i32, i32 } %5060, 1, !dbg !229 + %5063 = extractvalue { i32, i32, i32, i32 } %5060, 2, !dbg !229 + %5064 = extractvalue { i32, i32, i32, i32 } %5060, 3, !dbg !229 + %5065 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %5037, i1 %4925) #3, !dbg !229 + %5066 = extractvalue { i32, i32, i32, i32 } %5065, 0, !dbg !229 + %5067 = extractvalue { i32, i32, i32, i32 } %5065, 1, !dbg !229 + %5068 = extractvalue { i32, i32, i32, i32 } %5065, 2, !dbg !229 + %5069 = extractvalue { i32, i32, i32, i32 } %5065, 3, !dbg !229 + %5070 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %5038, i1 %4926) #3, !dbg !229 + %5071 = extractvalue { i32, i32, i32, i32 } %5070, 0, !dbg !229 + %5072 = extractvalue { i32, i32, i32, i32 } %5070, 1, !dbg !229 + %5073 = extractvalue { i32, i32, i32, i32 } %5070, 2, !dbg !229 + %5074 = extractvalue { i32, i32, i32, i32 } %5070, 3, !dbg !229 + %5075 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, 
$6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %5039, i1 %4927) #3, !dbg !229 + %5076 = extractvalue { i32, i32, i32, i32 } %5075, 0, !dbg !229 + %5077 = extractvalue { i32, i32, i32, i32 } %5075, 1, !dbg !229 + %5078 = extractvalue { i32, i32, i32, i32 } %5075, 2, !dbg !229 + %5079 = extractvalue { i32, i32, i32, i32 } %5075, 3, !dbg !229 + %5080 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096), i32 %4976, !dbg !229 + %5081 = insertelement <4 x i32> poison, i32 %5041, i64 0, !dbg !229 + %5082 = insertelement <4 x i32> %5081, i32 %5042, i64 1, !dbg !229 + %5083 = insertelement <4 x i32> %5082, i32 %5043, i64 2, !dbg !229 + %5084 = insertelement <4 x i32> %5083, i32 %5044, i64 3, !dbg !229 + store <4 x i32> %5084, ptr addrspace(3) %5080, align 16, !dbg !229 + %5085 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096), i32 %4982, !dbg !229 + %5086 = insertelement <4 x i32> poison, i32 %5046, i64 0, !dbg !229 + %5087 = insertelement <4 x i32> %5086, i32 %5047, i64 1, !dbg !229 + %5088 = insertelement <4 x i32> %5087, i32 %5048, i64 2, !dbg !229 + %5089 = insertelement <4 x i32> %5088, i32 %5049, i64 3, !dbg !229 + store <4 x i32> %5089, ptr addrspace(3) %5085, align 16, !dbg !229 + %5090 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096), i32 %4988, !dbg !229 + %5091 = insertelement <4 x i32> poison, i32 %5051, i64 0, !dbg !229 + %5092 = insertelement <4 x i32> %5091, i32 %5052, i64 1, !dbg !229 + %5093 = insertelement <4 x i32> %5092, i32 %5053, i64 2, !dbg !229 + %5094 = insertelement <4 x i32> %5093, i32 %5054, i64 3, !dbg !229 + store <4 x i32> %5094, ptr addrspace(3) %5090, align 16, !dbg !229 + %5095 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr 
addrspace(3) @global_smem, i32 132096), i32 %4994, !dbg !229 + %5096 = insertelement <4 x i32> poison, i32 %5056, i64 0, !dbg !229 + %5097 = insertelement <4 x i32> %5096, i32 %5057, i64 1, !dbg !229 + %5098 = insertelement <4 x i32> %5097, i32 %5058, i64 2, !dbg !229 + %5099 = insertelement <4 x i32> %5098, i32 %5059, i64 3, !dbg !229 + store <4 x i32> %5099, ptr addrspace(3) %5095, align 16, !dbg !229 + %5100 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096), i32 %5000, !dbg !229 + %5101 = insertelement <4 x i32> poison, i32 %5061, i64 0, !dbg !229 + %5102 = insertelement <4 x i32> %5101, i32 %5062, i64 1, !dbg !229 + %5103 = insertelement <4 x i32> %5102, i32 %5063, i64 2, !dbg !229 + %5104 = insertelement <4 x i32> %5103, i32 %5064, i64 3, !dbg !229 + store <4 x i32> %5104, ptr addrspace(3) %5100, align 16, !dbg !229 + %5105 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096), i32 %5006, !dbg !229 + %5106 = insertelement <4 x i32> poison, i32 %5066, i64 0, !dbg !229 + %5107 = insertelement <4 x i32> %5106, i32 %5067, i64 1, !dbg !229 + %5108 = insertelement <4 x i32> %5107, i32 %5068, i64 2, !dbg !229 + %5109 = insertelement <4 x i32> %5108, i32 %5069, i64 3, !dbg !229 + store <4 x i32> %5109, ptr addrspace(3) %5105, align 16, !dbg !229 + %5110 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096), i32 %5012, !dbg !229 + %5111 = insertelement <4 x i32> poison, i32 %5071, i64 0, !dbg !229 + %5112 = insertelement <4 x i32> %5111, i32 %5072, i64 1, !dbg !229 + %5113 = insertelement <4 x i32> %5112, i32 %5073, i64 2, !dbg !229 + %5114 = insertelement <4 x i32> %5113, i32 %5074, i64 3, !dbg !229 + store <4 x i32> %5114, ptr addrspace(3) %5110, align 16, !dbg !229 + %5115 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096), i32 
%5018, !dbg !229 + %5116 = insertelement <4 x i32> poison, i32 %5076, i64 0, !dbg !229 + %5117 = insertelement <4 x i32> %5116, i32 %5077, i64 1, !dbg !229 + %5118 = insertelement <4 x i32> %5117, i32 %5078, i64 2, !dbg !229 + %5119 = insertelement <4 x i32> %5118, i32 %5079, i64 3, !dbg !229 + store <4 x i32> %5119, ptr addrspace(3) %5115, align 16, !dbg !229 + %5120 = shl nuw nsw i32 %44, 2, !dbg !230 + %5121 = mul i32 %30, %43, !dbg !231 + %5122 = mul i32 %38, %43, !dbg !232 + %5123 = shl nuw nsw i32 %43, 5, !dbg !233 + %5124 = mul i32 %23, %45, !dbg !234 + %5125 = add i32 %5124, %40, !dbg !235 + %5126 = mul i32 %25, %45, !dbg !236 + %reass.add1528 = add i32 %5126, %40, !dbg !237 + %reass.mul1529 = mul i32 %reass.add1528, %24, !dbg !237 + %5127 = sext i32 %reass.mul1529 to i64, !dbg !238 + %5128 = getelementptr i32, ptr addrspace(1) %11, i64 %5127, !dbg !238 + %5129 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %5128, i1 true) #3, !dbg !239 + %5130 = shl i32 %5129, 7, !dbg !240 + %5131 = sext i32 %5125 to i64, !dbg !241 + %5132 = getelementptr i32, ptr addrspace(1) %10, i64 %5131, !dbg !241 + %5133 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %5132, i1 true) #3, !dbg !242 + %5134 = and i32 %56, 3, !dbg !243 + %5135 = shl nuw nsw i32 %5134, 1, !dbg !243 + %5136 = or disjoint i32 %5135, 1, !dbg !243 + %5137 = or disjoint i32 %5135, 8, !dbg !243 + %5138 = or disjoint i32 %5135, 9, !dbg !243 + %5139 = or disjoint i32 %5130, %5137, !dbg !244 + %5140 = or disjoint i32 %5130, %5138, !dbg !244 + %5141 = insertelement <4 x i32> poison, i32 %5135, i64 0, !dbg !243 + %5142 = shufflevector <4 x i32> %5141, <4 x i32> poison, <4 x i32> zeroinitializer, !dbg !243 + %5143 = or disjoint <4 x i32> %5142, , !dbg !243 + %5144 = insertelement <4 x i32> poison, i32 %5130, i64 0, !dbg !244 + %5145 = shufflevector <4 x i32> %5144, <4 x 
i32> poison, <4 x i32> zeroinitializer, !dbg !244 + %5146 = or disjoint <4 x i32> %5145, %5143, !dbg !244 + %5147 = or disjoint i32 %5130, %59, !dbg !244 + %5148 = or disjoint i32 %5130, %60, !dbg !244 + %5149 = or disjoint i32 %5130, %61, !dbg !244 + %5150 = or disjoint i32 %5130, %62, !dbg !244 + %5151 = shl i32 %5147, 12, !dbg !245 + %5152 = shl i32 %5148, 12, !dbg !245 + %5153 = shl i32 %5149, 12, !dbg !245 + %5154 = shl i32 %5150, 12, !dbg !245 + %5155 = shl i32 %5147, 7, !dbg !247 + %5156 = shl i32 %5148, 7, !dbg !247 + %5157 = shl i32 %5149, 7, !dbg !247 + %5158 = shl i32 %5150, 7, !dbg !247 + %5159 = shl i32 %5133, 1, !dbg !248 + %5160 = add i32 %18, 63, !dbg !249 + %5161 = sdiv i32 %5160, 64, !dbg !250 + %5162 = tail call i32 @llvm.smax.i32(i32 %5161, i32 1), !dbg !251 + %5163 = tail call i32 @llvm.smin.i32(i32 %5159, i32 %5162), !dbg !252 + %5164 = insertelement <2 x i32> poison, i32 %72, i64 0, !dbg !218 + %5165 = insertelement <2 x i32> %5164, i32 %71, i64 1, !dbg !218 + %5166 = insertelement <2 x i32> poison, i32 %4876, i64 0, !dbg !218 + %5167 = shufflevector <2 x i32> %5166, <2 x i32> poison, <2 x i32> zeroinitializer, !dbg !218 + %5168 = or disjoint <2 x i32> %5165, %5167, !dbg !218 + %5169 = insertelement <8 x i32> poison, i32 %5135, i64 0, !dbg !243 + %5170 = shufflevector <8 x i32> %5169, <8 x i32> poison, <8 x i32> zeroinitializer, !dbg !243 + %5171 = or disjoint <8 x i32> %5170, , !dbg !243 + %5172 = insertelement <8 x i32> poison, i32 %5130, i64 0, !dbg !244 + %5173 = shufflevector <8 x i32> %5172, <8 x i32> poison, <8 x i32> zeroinitializer, !dbg !244 + %5174 = or disjoint <8 x i32> %5173, %5171, !dbg !244 + %5175 = insertelement <2 x i32> poison, i32 %19, i64 0, !dbg !253 + %5176 = shufflevector <2 x i32> %5175, <2 x i32> poison, <2 x i32> zeroinitializer, !dbg !253 + %5177 = srem <2 x i32> %5168, %5176, !dbg !253 + %5178 = shufflevector <2 x i32> %5177, <2 x i32> poison, <16 x i32> , !dbg !253 + %5179 = zext nneg i32 %43 to i64, !dbg !254 + 
%5180 = getelementptr i64, ptr addrspace(1) %16, i64 %5179, !dbg !254 + %5181 = icmp sgt i32 %5159, 0, !dbg !255 + %5182 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %5180, i1 %5181) #3, !dbg !256 + %5183 = extractelement <2 x i32> %5177, i64 1, !dbg !257 + %5184 = icmp sge i32 %5183, %26, !dbg !258 + %5185 = extractelement <2 x i32> %5177, i64 0, !dbg !257 + %5186 = icmp sge i32 %5185, %26, !dbg !258 + %5187 = srem i32 %5183, %26, !dbg !257 + %5188 = srem i32 %5185, %26, !dbg !257 + %.not59 = icmp eq i32 %5187, 0, !dbg !259 + %.not60 = icmp eq i32 %5188, 0, !dbg !259 + %5189 = tail call i32 @llvm.smin.i32(i32 %26, i32 0), !dbg !260 + %5190 = select i1 %.not59, i32 0, i32 %5189, !dbg !260 + %5191 = add nsw i32 %5190, %5187, !dbg !260 + %5192 = select i1 %.not60, i32 0, i32 %5189, !dbg !260 + %5193 = add nsw i32 %5192, %5188, !dbg !260 + %5194 = sext i32 %5191 to i64, !dbg !261 + %5195 = sext i32 %5193 to i64, !dbg !261 + %5196 = icmp sgt i64 %5182, %5194, !dbg !262 + %5197 = icmp sgt i64 %5182, %5195, !dbg !262 + %5198 = and i1 %5184, %5196, !dbg !263 + %5199 = and i1 %5186, %5197, !dbg !263 + %5200 = getelementptr i32, ptr addrspace(1) %15, i64 %5127, !dbg !264 + %5201 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %5200, i1 true) #3, !dbg !265 + %5202 = shl i32 %5201, 7, !dbg !266 + %5203 = getelementptr i32, ptr addrspace(1) %14, i64 %5131, !dbg !267 + %5204 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %5203, i1 true) #3, !dbg !268 + %5205 = or disjoint i32 %5202, %5135, !dbg !269 + %5206 = or disjoint i32 %5202, %5136, !dbg !269 + %5207 = or disjoint i32 %5202, %5137, !dbg !269 + %5208 = or disjoint i32 %5202, %5138, !dbg !269 + %5209 = extractelement <4 x i32> %5143, i64 3, !dbg !269 + %5210 = or disjoint i32 %5202, %5209, !dbg !269 + %5211 
= extractelement <4 x i32> %5143, i64 2, !dbg !269 + %5212 = or disjoint i32 %5202, %5211, !dbg !269 + %5213 = extractelement <4 x i32> %5143, i64 1, !dbg !269 + %5214 = or disjoint i32 %5202, %5213, !dbg !269 + %5215 = extractelement <4 x i32> %5143, i64 0, !dbg !269 + %5216 = or disjoint i32 %5202, %5215, !dbg !269 + %5217 = extractelement <8 x i32> %5171, i64 7, !dbg !269 + %5218 = or disjoint i32 %5202, %5217, !dbg !269 + %5219 = extractelement <8 x i32> %5171, i64 6, !dbg !269 + %5220 = or disjoint i32 %5202, %5219, !dbg !269 + %5221 = extractelement <8 x i32> %5171, i64 5, !dbg !269 + %5222 = or disjoint i32 %5202, %5221, !dbg !269 + %5223 = extractelement <8 x i32> %5171, i64 4, !dbg !269 + %5224 = or disjoint i32 %5202, %5223, !dbg !269 + %5225 = extractelement <8 x i32> %5171, i64 3, !dbg !269 + %5226 = or disjoint i32 %5202, %5225, !dbg !269 + %5227 = extractelement <8 x i32> %5171, i64 2, !dbg !269 + %5228 = or disjoint i32 %5202, %5227, !dbg !269 + %5229 = extractelement <8 x i32> %5171, i64 1, !dbg !269 + %5230 = or disjoint i32 %5202, %5229, !dbg !269 + %5231 = extractelement <8 x i32> %5171, i64 0, !dbg !269 + %5232 = or disjoint i32 %5202, %5231, !dbg !269 + %5233 = or disjoint i32 %5202, %59, !dbg !269 + %5234 = or disjoint i32 %5202, %60, !dbg !269 + %5235 = or disjoint i32 %5202, %61, !dbg !269 + %5236 = or disjoint i32 %5202, %62, !dbg !269 + %5237 = shl i32 %5233, 12, !dbg !270 + %5238 = shl i32 %5234, 12, !dbg !270 + %5239 = shl i32 %5235, 12, !dbg !270 + %5240 = shl i32 %5236, 12, !dbg !270 + %5241 = shl i32 %5233, 7, !dbg !272 + %5242 = shl i32 %5234, 7, !dbg !272 + %5243 = shl i32 %5235, 7, !dbg !272 + %5244 = shl i32 %5236, 7, !dbg !272 + %5245 = shl i32 %5204, 1, !dbg !273 + %5246 = tail call i32 @llvm.smin.i32(i32 %5245, i32 %5162), !dbg !274 + tail call void asm sideeffect "fence.proxy.async.shared::cta;", ""() #3, !dbg !275 + %5247 = sext i32 %5151 to i64 + %5248 = sext i32 %5152 to i64 + %5249 = sext i32 %5153 to i64 + %5250 = sext 
i32 %5154 to i64 + %5251 = sext i32 %5155 to i64 + %5252 = sext i32 %5156 to i64 + %5253 = sext i32 %5157 to i64 + %5254 = sext i32 %5158 to i64 + %5255 = icmp slt i32 %5147, %18 + %5256 = icmp slt i32 %5148, %18 + %5257 = icmp slt i32 %5149, %18 + %5258 = icmp slt i32 %5150, %18 + %5259 = and i1 %5181, %5255 + %5260 = and i1 %5181, %5256 + %5261 = and i1 %5181, %5257 + %5262 = and i1 %5181, %5258 + %5263 = shl nuw nsw i32 %4972, 10 + %5264 = or disjoint i32 %4975, %5263 + %5265 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %5264 + %5266 = select i1 %5259, i32 16, i32 0 + %5267 = or disjoint i32 %5264, 2048 + %5268 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %5267 + %5269 = select i1 %5260, i32 16, i32 0 + %5270 = or disjoint i32 %5264, 4096 + %5271 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %5270 + %5272 = select i1 %5261, i32 16, i32 0 + %5273 = or disjoint i32 %5264, 6144 + %5274 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %5273 + %5275 = select i1 %5262, i32 16, i32 0 + %5276 = icmp slt i32 %5139, %18 + %5277 = icmp slt i32 %5140, %18 + %5278 = extractelement <4 x i32> %5146, i64 3 + %5279 = icmp slt i32 %5278, %18 + %5280 = extractelement <4 x i32> %5146, i64 2 + %5281 = icmp slt i32 %5280, %18 + %5282 = extractelement <4 x i32> %5146, i64 1 + %5283 = icmp slt i32 %5282, %18 + %5284 = extractelement <4 x i32> %5146, i64 0 + %5285 = icmp slt i32 %5284, %18 + %5286 = extractelement <8 x i32> %5174, i64 7 + %5287 = icmp slt i32 %5286, %18 + %5288 = extractelement <8 x i32> %5174, i64 6 + %5289 = icmp slt i32 %5288, %18 + %5290 = extractelement <8 x i32> %5174, i64 5 + %5291 = icmp slt i32 %5290, %18 + %5292 = extractelement <8 x i32> %5174, i64 4 + %5293 = icmp slt i32 %5292, %18 + %5294 = extractelement <8 x i32> %5174, i64 3 + %5295 = icmp slt i32 %5294, %18 + %5296 = extractelement <8 x i32> %5174, i64 2 + %5297 = icmp slt i32 %5296, %18 + %5298 = extractelement <8 x 
i32> %5174, i64 1 + %5299 = icmp slt i32 %5298, %18 + %5300 = extractelement <8 x i32> %5174, i64 0 + %5301 = icmp slt i32 %5300, %18 + %5302 = sext i32 %5139 to i64 + %5303 = sext i32 %5140 to i64 + %5304 = sext i32 %5278 to i64 + %5305 = sext i32 %5280 to i64 + %5306 = sext i32 %5282 to i64 + %5307 = sext i32 %5284 to i64 + %5308 = sext i32 %5286 to i64 + %5309 = sext i32 %5288 to i64 + %5310 = sext i32 %5290 to i64 + %5311 = sext i32 %5292 to i64 + %5312 = sext i32 %5294 to i64 + %5313 = sext i32 %5296 to i64 + %5314 = sext i32 %5298 to i64 + %5315 = sext i32 %5300 to i64 + %5316 = and i1 %5181, %5276 + %5317 = and i1 %5181, %5277 + %5318 = and i1 %5181, %5279 + %5319 = and i1 %5181, %5281 + %5320 = and i1 %5181, %5283 + %5321 = and i1 %5181, %5285 + %5322 = and i1 %5181, %5287 + %5323 = and i1 %5181, %5289 + %5324 = and i1 %5181, %5291 + %5325 = and i1 %5181, %5293 + %5326 = and i1 %5181, %5295 + %5327 = and i1 %5181, %5297 + %5328 = and i1 %5181, %5299 + %5329 = and i1 %5181, %5301 + %5330 = and i32 %56, 252 + %5331 = icmp eq i32 %5330, 0 + %5332 = shl nuw nsw i32 %5134, 3 + %5333 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %5332 + %5334 = or disjoint i32 %5332, 4 + %5335 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %5334 + %5336 = or disjoint i32 %5332, 32 + %5337 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %5336 + %5338 = select i1 %5316, i32 4, i32 0 + %5339 = or disjoint i32 %5332, 36 + %5340 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %5339 + %5341 = select i1 %5317, i32 4, i32 0 + %5342 = or disjoint i32 %5332, 64 + %5343 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %5342 + %5344 = select i1 %5318, i32 
4, i32 0 + %5345 = or disjoint i32 %5332, 68 + %5346 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %5345 + %5347 = select i1 %5319, i32 4, i32 0 + %5348 = or disjoint i32 %5332, 96 + %5349 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %5348 + %5350 = select i1 %5320, i32 4, i32 0 + %5351 = or disjoint i32 %5332, 100 + %5352 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %5351 + %5353 = select i1 %5321, i32 4, i32 0 + %5354 = or disjoint i32 %5332, 128 + %5355 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %5354 + %5356 = select i1 %5322, i32 4, i32 0 + %5357 = or disjoint i32 %5332, 132 + %5358 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %5357 + %5359 = select i1 %5323, i32 4, i32 0 + %5360 = or disjoint i32 %5332, 160 + %5361 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %5360 + %5362 = select i1 %5324, i32 4, i32 0 + %5363 = or disjoint i32 %5332, 164 + %5364 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %5363 + %5365 = select i1 %5325, i32 4, i32 0 + %5366 = or disjoint i32 %5332, 192 + %5367 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %5366 + %5368 = select i1 %5326, i32 4, i32 0 + %5369 = or disjoint i32 %5332, 196 + %5370 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %5369 + %5371 = select i1 %5327, i32 4, i32 0 + %5372 = or disjoint i32 %5332, 224 + %5373 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr 
addrspace(3) @global_smem, i32 98304), i32 %5372 + %5374 = select i1 %5328, i32 4, i32 0 + %5375 = or disjoint i32 %5332, 228 + %5376 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %5375 + %5377 = select i1 %5329, i32 4, i32 0 + %5378 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %5264 + %5379 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %5267 + %5380 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %5270 + %5381 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %5273 + %5382 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98816), i32 %5332 + %5383 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98816), i32 %5334 + %5384 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98816), i32 %5336 + %5385 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98816), i32 %5339 + %5386 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98816), i32 %5342 + %5387 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98816), i32 %5345 + %5388 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98816), i32 %5348 + %5389 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98816), i32 %5351 + %5390 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98816), i32 %5354 + %5391 = 
getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98816), i32 %5357 + %5392 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98816), i32 %5360 + %5393 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98816), i32 %5363 + %5394 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98816), i32 %5366 + %5395 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98816), i32 %5369 + %5396 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98816), i32 %5372 + %5397 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98816), i32 %5375 + %5398 = icmp sgt i32 %5163, 1 + %5399 = insertelement <2 x i32> poison, i32 %5130, i64 0, !dbg !244 + %5400 = shufflevector <2 x i32> %5399, <2 x i32> poison, <2 x i32> zeroinitializer, !dbg !244 + %5401 = insertelement <2 x i32> poison, i32 %5135, i64 0, !dbg !244 + %5402 = insertelement <2 x i32> %5401, i32 %5136, i64 1, !dbg !244 + %5403 = or disjoint <2 x i32> %5400, %5402, !dbg !244 + %5404 = extractelement <2 x i32> %5403, i64 0 + %5405 = icmp slt i32 %5404, %18 + %5406 = extractelement <2 x i32> %5403, i64 1 + %5407 = icmp slt i32 %5406, %18 + %5408 = sext i32 %5404 to i64 + %5409 = sext i32 %5406 to i64 + %5410 = and i1 %5181, %5405 + %5411 = and i1 %5181, %5407 + %5412 = select i1 %5410, i32 4, i32 0 + %5413 = select i1 %5411, i32 4, i32 0 + %5414 = or disjoint <2 x i32> %5403, splat (i32 64) + %5415 = or disjoint i32 %5139, 64 + %5416 = or disjoint i32 %5140, 64 + %5417 = or disjoint i32 %5278, 64 + %5418 = or disjoint i32 %5280, 64 + %5419 = or disjoint i32 %5282, 64 + %5420 = or disjoint i32 %5284, 64 + %5421 = or disjoint i32 %5286, 64 + %5422 = or disjoint i32 %5288, 
64 + %5423 = or disjoint i32 %5290, 64 + %5424 = or disjoint i32 %5292, 64 + %5425 = or disjoint i32 %5294, 64 + %5426 = or disjoint i32 %5296, 64 + %5427 = or disjoint i32 %5298, 64 + %5428 = or disjoint i32 %5300, 64 + %5429 = or disjoint i32 %5147, 64 + %5430 = or disjoint i32 %5148, 64 + %5431 = or disjoint i32 %5149, 64 + %5432 = or disjoint i32 %5150, 64 + %5433 = icmp slt i32 %5429, %18 + %5434 = icmp slt i32 %5430, %18 + %5435 = icmp slt i32 %5431, %18 + %5436 = icmp slt i32 %5432, %18 + %5437 = and i1 %5398, %5433 + %5438 = and i1 %5398, %5434 + %5439 = and i1 %5398, %5435 + %5440 = and i1 %5398, %5436 + %5441 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 16384), i32 %5264 + %5442 = select i1 %5437, i32 16, i32 0 + %5443 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 16384), i32 %5267 + %5444 = select i1 %5438, i32 16, i32 0 + %5445 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 16384), i32 %5270 + %5446 = select i1 %5439, i32 16, i32 0 + %5447 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 16384), i32 %5273 + %5448 = select i1 %5440, i32 16, i32 0 + %5449 = extractelement <2 x i32> %5414, i64 0 + %5450 = icmp slt i32 %5449, %18 + %5451 = extractelement <2 x i32> %5414, i64 1 + %5452 = icmp slt i32 %5451, %18 + %5453 = icmp slt i32 %5415, %18 + %5454 = icmp slt i32 %5416, %18 + %5455 = icmp slt i32 %5417, %18 + %5456 = icmp slt i32 %5418, %18 + %5457 = icmp slt i32 %5419, %18 + %5458 = icmp slt i32 %5420, %18 + %5459 = icmp slt i32 %5421, %18 + %5460 = icmp slt i32 %5422, %18 + %5461 = icmp slt i32 %5423, %18 + %5462 = icmp slt i32 %5424, %18 + %5463 = icmp slt i32 %5425, %18 + %5464 = icmp slt i32 %5426, %18 + %5465 = icmp slt i32 %5427, %18 + %5466 = icmp slt i32 %5428, %18 + %5467 = sext i32 %5449 to i64 + %5468 = sext i32 %5451 
to i64 + %5469 = sext i32 %5415 to i64 + %5470 = sext i32 %5416 to i64 + %5471 = sext i32 %5417 to i64 + %5472 = sext i32 %5418 to i64 + %5473 = sext i32 %5419 to i64 + %5474 = sext i32 %5420 to i64 + %5475 = sext i32 %5421 to i64 + %5476 = sext i32 %5422 to i64 + %5477 = sext i32 %5423 to i64 + %5478 = sext i32 %5424 to i64 + %5479 = sext i32 %5425 to i64 + %5480 = sext i32 %5426 to i64 + %5481 = sext i32 %5427 to i64 + %5482 = sext i32 %5428 to i64 + %5483 = and i1 %5398, %5450 + %5484 = and i1 %5398, %5452 + %5485 = and i1 %5398, %5453 + %5486 = and i1 %5398, %5454 + %5487 = and i1 %5398, %5455 + %5488 = and i1 %5398, %5456 + %5489 = and i1 %5398, %5457 + %5490 = and i1 %5398, %5458 + %5491 = and i1 %5398, %5459 + %5492 = and i1 %5398, %5460 + %5493 = and i1 %5398, %5461 + %5494 = and i1 %5398, %5462 + %5495 = and i1 %5398, %5463 + %5496 = and i1 %5398, %5464 + %5497 = and i1 %5398, %5465 + %5498 = and i1 %5398, %5466 + %5499 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98560), i32 %5332 + %5500 = select i1 %5483, i32 4, i32 0 + %5501 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98560), i32 %5334 + %5502 = select i1 %5484, i32 4, i32 0 + %5503 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98560), i32 %5336 + %5504 = select i1 %5485, i32 4, i32 0 + %5505 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98560), i32 %5339 + %5506 = select i1 %5486, i32 4, i32 0 + %5507 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98560), i32 %5342 + %5508 = select i1 %5487, i32 4, i32 0 + %5509 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98560), i32 %5345 + %5510 = select i1 %5488, i32 4, i32 0 + %5511 = getelementptr inbounds nuw i8, ptr 
addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98560), i32 %5348 + %5512 = select i1 %5489, i32 4, i32 0 + %5513 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98560), i32 %5351 + %5514 = select i1 %5490, i32 4, i32 0 + %5515 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98560), i32 %5354 + %5516 = select i1 %5491, i32 4, i32 0 + %5517 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98560), i32 %5357 + %5518 = select i1 %5492, i32 4, i32 0 + %5519 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98560), i32 %5360 + %5520 = select i1 %5493, i32 4, i32 0 + %5521 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98560), i32 %5363 + %5522 = select i1 %5494, i32 4, i32 0 + %5523 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98560), i32 %5366 + %5524 = select i1 %5495, i32 4, i32 0 + %5525 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98560), i32 %5369 + %5526 = select i1 %5496, i32 4, i32 0 + %5527 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98560), i32 %5372 + %5528 = select i1 %5497, i32 4, i32 0 + %5529 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98560), i32 %5375 + %5530 = select i1 %5498, i32 4, i32 0 + %5531 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 65536), i32 %5264 + %5532 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 65536), i32 %5267 + %5533 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr 
addrspace(3) @global_smem, i32 65536), i32 %5270 + %5534 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 65536), i32 %5273 + %5535 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99072), i32 %5332 + %5536 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99072), i32 %5334 + %5537 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99072), i32 %5336 + %5538 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99072), i32 %5339 + %5539 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99072), i32 %5342 + %5540 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99072), i32 %5345 + %5541 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99072), i32 %5348 + %5542 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99072), i32 %5351 + %5543 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99072), i32 %5354 + %5544 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99072), i32 %5357 + %5545 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99072), i32 %5360 + %5546 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99072), i32 %5363 + %5547 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99072), i32 %5366 + %5548 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99072), i32 %5369 
+ %5549 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99072), i32 %5372 + %5550 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99072), i32 %5375 + %5551 = add i32 %5163, -2 + %5552 = add nsw i32 %5163, -1 + %5553 = sext i32 %5237 to i64 + %5554 = sext i32 %5238 to i64 + %5555 = sext i32 %5239 to i64 + %5556 = sext i32 %5240 to i64 + %5557 = sext i32 %5241 to i64 + %5558 = sext i32 %5242 to i64 + %5559 = sext i32 %5243 to i64 + %5560 = sext i32 %5244 to i64 + %5561 = icmp sgt i32 %5245, 0 + %5562 = icmp slt i32 %5233, %18 + %5563 = icmp slt i32 %5234, %18 + %5564 = icmp slt i32 %5235, %18 + %5565 = icmp slt i32 %5236, %18 + %5566 = and i1 %5561, %5562 + %5567 = and i1 %5561, %5563 + %5568 = and i1 %5561, %5564 + %5569 = and i1 %5561, %5565 + %5570 = select i1 %5566, i32 16, i32 0 + %5571 = select i1 %5567, i32 16, i32 0 + %5572 = select i1 %5568, i32 16, i32 0 + %5573 = select i1 %5569, i32 16, i32 0 + %5574 = icmp slt i32 %5205, %18 + %5575 = icmp slt i32 %5206, %18 + %5576 = icmp slt i32 %5207, %18 + %5577 = icmp slt i32 %5208, %18 + %5578 = icmp slt i32 %5210, %18 + %5579 = icmp slt i32 %5212, %18 + %5580 = icmp slt i32 %5214, %18 + %5581 = icmp slt i32 %5216, %18 + %5582 = icmp slt i32 %5218, %18 + %5583 = icmp slt i32 %5220, %18 + %5584 = icmp slt i32 %5222, %18 + %5585 = icmp slt i32 %5224, %18 + %5586 = icmp slt i32 %5226, %18 + %5587 = icmp slt i32 %5228, %18 + %5588 = icmp slt i32 %5230, %18 + %5589 = icmp slt i32 %5232, %18 + %5590 = sext i32 %5205 to i64 + %5591 = sext i32 %5206 to i64 + %5592 = sext i32 %5207 to i64 + %5593 = sext i32 %5208 to i64 + %5594 = sext i32 %5210 to i64 + %5595 = sext i32 %5212 to i64 + %5596 = sext i32 %5214 to i64 + %5597 = sext i32 %5216 to i64 + %5598 = sext i32 %5218 to i64 + %5599 = sext i32 %5220 to i64 + %5600 = sext i32 %5222 to i64 + %5601 = sext i32 %5224 to i64 + %5602 = sext i32 %5226 to i64 + %5603 = 
sext i32 %5228 to i64 + %5604 = sext i32 %5230 to i64 + %5605 = sext i32 %5232 to i64 + %5606 = and i1 %5561, %5574 + %5607 = and i1 %5561, %5575 + %5608 = and i1 %5561, %5576 + %5609 = and i1 %5561, %5577 + %5610 = and i1 %5561, %5578 + %5611 = and i1 %5561, %5579 + %5612 = and i1 %5561, %5580 + %5613 = and i1 %5561, %5581 + %5614 = and i1 %5561, %5582 + %5615 = and i1 %5561, %5583 + %5616 = and i1 %5561, %5584 + %5617 = and i1 %5561, %5585 + %5618 = and i1 %5561, %5586 + %5619 = and i1 %5561, %5587 + %5620 = and i1 %5561, %5588 + %5621 = and i1 %5561, %5589 + %5622 = select i1 %5606, i32 4, i32 0 + %5623 = select i1 %5607, i32 4, i32 0 + %5624 = select i1 %5608, i32 4, i32 0 + %5625 = select i1 %5609, i32 4, i32 0 + %5626 = select i1 %5610, i32 4, i32 0 + %5627 = select i1 %5611, i32 4, i32 0 + %5628 = select i1 %5612, i32 4, i32 0 + %5629 = select i1 %5613, i32 4, i32 0 + %5630 = select i1 %5614, i32 4, i32 0 + %5631 = select i1 %5615, i32 4, i32 0 + %5632 = select i1 %5616, i32 4, i32 0 + %5633 = select i1 %5617, i32 4, i32 0 + %5634 = select i1 %5618, i32 4, i32 0 + %5635 = select i1 %5619, i32 4, i32 0 + %5636 = select i1 %5620, i32 4, i32 0 + %5637 = select i1 %5621, i32 4, i32 0 + %5638 = icmp sgt i32 %5246, 1 + %5639 = or disjoint i32 %5205, 64 + %5640 = or disjoint i32 %5206, 64 + %5641 = or disjoint i32 %5207, 64 + %5642 = or disjoint i32 %5208, 64 + %5643 = or disjoint i32 %5210, 64 + %5644 = or disjoint i32 %5212, 64 + %5645 = or disjoint i32 %5214, 64 + %5646 = or disjoint i32 %5216, 64 + %5647 = or disjoint i32 %5218, 64 + %5648 = or disjoint i32 %5220, 64 + %5649 = or disjoint i32 %5222, 64 + %5650 = or disjoint i32 %5224, 64 + %5651 = or disjoint i32 %5226, 64 + %5652 = or disjoint i32 %5228, 64 + %5653 = or disjoint i32 %5230, 64 + %5654 = or disjoint i32 %5232, 64 + %5655 = or disjoint i32 %5233, 64 + %5656 = or disjoint i32 %5234, 64 + %5657 = or disjoint i32 %5235, 64 + %5658 = or disjoint i32 %5236, 64 + %5659 = icmp slt i32 %5655, %18 + %5660 
= icmp slt i32 %5656, %18 + %5661 = icmp slt i32 %5657, %18 + %5662 = icmp slt i32 %5658, %18 + %5663 = and i1 %5638, %5659 + %5664 = and i1 %5638, %5660 + %5665 = and i1 %5638, %5661 + %5666 = and i1 %5638, %5662 + %5667 = select i1 %5663, i32 16, i32 0 + %5668 = select i1 %5664, i32 16, i32 0 + %5669 = select i1 %5665, i32 16, i32 0 + %5670 = select i1 %5666, i32 16, i32 0 + %5671 = icmp slt i32 %5639, %18 + %5672 = icmp slt i32 %5640, %18 + %5673 = icmp slt i32 %5641, %18 + %5674 = icmp slt i32 %5642, %18 + %5675 = icmp slt i32 %5643, %18 + %5676 = icmp slt i32 %5644, %18 + %5677 = icmp slt i32 %5645, %18 + %5678 = icmp slt i32 %5646, %18 + %5679 = icmp slt i32 %5647, %18 + %5680 = icmp slt i32 %5648, %18 + %5681 = icmp slt i32 %5649, %18 + %5682 = icmp slt i32 %5650, %18 + %5683 = icmp slt i32 %5651, %18 + %5684 = icmp slt i32 %5652, %18 + %5685 = icmp slt i32 %5653, %18 + %5686 = icmp slt i32 %5654, %18 + %5687 = sext i32 %5639 to i64 + %5688 = sext i32 %5640 to i64 + %5689 = sext i32 %5641 to i64 + %5690 = sext i32 %5642 to i64 + %5691 = sext i32 %5643 to i64 + %5692 = sext i32 %5644 to i64 + %5693 = sext i32 %5645 to i64 + %5694 = sext i32 %5646 to i64 + %5695 = sext i32 %5647 to i64 + %5696 = sext i32 %5648 to i64 + %5697 = sext i32 %5649 to i64 + %5698 = sext i32 %5650 to i64 + %5699 = sext i32 %5651 to i64 + %5700 = sext i32 %5652 to i64 + %5701 = sext i32 %5653 to i64 + %5702 = sext i32 %5654 to i64 + %5703 = and i1 %5638, %5671 + %5704 = and i1 %5638, %5672 + %5705 = and i1 %5638, %5673 + %5706 = and i1 %5638, %5674 + %5707 = and i1 %5638, %5675 + %5708 = and i1 %5638, %5676 + %5709 = and i1 %5638, %5677 + %5710 = and i1 %5638, %5678 + %5711 = and i1 %5638, %5679 + %5712 = and i1 %5638, %5680 + %5713 = and i1 %5638, %5681 + %5714 = and i1 %5638, %5682 + %5715 = and i1 %5638, %5683 + %5716 = and i1 %5638, %5684 + %5717 = and i1 %5638, %5685 + %5718 = and i1 %5638, %5686 + %5719 = select i1 %5703, i32 4, i32 0 + %5720 = select i1 %5704, i32 4, i32 0 + 
%5721 = select i1 %5705, i32 4, i32 0 + %5722 = select i1 %5706, i32 4, i32 0 + %5723 = select i1 %5707, i32 4, i32 0 + %5724 = select i1 %5708, i32 4, i32 0 + %5725 = select i1 %5709, i32 4, i32 0 + %5726 = select i1 %5710, i32 4, i32 0 + %5727 = select i1 %5711, i32 4, i32 0 + %5728 = select i1 %5712, i32 4, i32 0 + %5729 = select i1 %5713, i32 4, i32 0 + %5730 = select i1 %5714, i32 4, i32 0 + %5731 = select i1 %5715, i32 4, i32 0 + %5732 = select i1 %5716, i32 4, i32 0 + %5733 = select i1 %5717, i32 4, i32 0 + %5734 = select i1 %5718, i32 4, i32 0 + %5735 = add i32 %5246, -2 + %5736 = add nsw i32 %5246, -1 + %smax2185 = tail call i32 @llvm.smax.i32(i32 %5163, i32 1), !dbg !276 + %smax2187 = tail call i32 @llvm.smax.i32(i32 %5246, i32 1), !dbg !276 + %5737 = zext nneg i32 %5120 to i64, !dbg !276 + %5738 = insertelement <2 x i32> poison, i32 %18, i64 0 + %5739 = shufflevector <2 x i32> %5738, <2 x i32> poison, <2 x i32> zeroinitializer + %5740 = insertelement <4 x i32> poison, i32 %18, i64 0 + %5741 = shufflevector <4 x i32> %5740, <4 x i32> poison, <8 x i32> zeroinitializer + %5742 = shufflevector <2 x i32> %5177, <2 x i32> poison, <8 x i32> + %5743 = insertelement <8 x i32> poison, i32 %26, i64 0 + %5744 = shufflevector <8 x i32> %5743, <8 x i32> poison, <8 x i32> zeroinitializer + %5745 = insertelement <8 x i32> poison, i32 %18, i64 0 + %5746 = shufflevector <8 x i32> %5745, <8 x i32> poison, <8 x i32> zeroinitializer + %5747 = insertelement <8 x i64> poison, i64 %5182, i64 0 + %5748 = shufflevector <8 x i64> %5747, <8 x i64> poison, <8 x i32> zeroinitializer + %5749 = insertelement <16 x i32> poison, i32 %26, i64 0 + %5750 = shufflevector <16 x i32> %5749, <16 x i32> poison, <16 x i32> zeroinitializer + %5751 = shufflevector <2 x i32> %5177, <2 x i32> poison, <2 x i32> zeroinitializer + %5752 = insertelement <2 x i64> poison, i64 %5182, i64 0 + %5753 = shufflevector <2 x i64> %5752, <2 x i64> poison, <2 x i32> zeroinitializer + %5754 = insertelement <2 x i32> 
poison, i32 %26, i64 0 + %5755 = shufflevector <2 x i32> %5754, <2 x i32> poison, <2 x i32> zeroinitializer + %5756 = insertelement <2 x i1> poison, i1 %5199, i64 0 + %5757 = shufflevector <2 x i1> %5756, <2 x i1> poison, <2 x i32> zeroinitializer + %5758 = shufflevector <2 x i32> %5177, <2 x i32> poison, <2 x i32> + %5759 = insertelement <2 x i1> poison, i1 %5198, i64 0 + %5760 = shufflevector <2 x i1> %5759, <2 x i1> poison, <2 x i32> zeroinitializer + br label %5761, !dbg !276 + +5761: ; preds = %4875, %._crit_edge1794 + %indvars.iv = phi i64 [ 0, %4875 ], [ %indvars.iv.next, %._crit_edge1794 ] + %5762 = phi float [ 0.000000e+00, %4875 ], [ %11300, %._crit_edge1794 ] + %5763 = phi float [ 0.000000e+00, %4875 ], [ %11301, %._crit_edge1794 ] + %5764 = phi float [ 0.000000e+00, %4875 ], [ %11302, %._crit_edge1794 ] + %5765 = phi float [ 0.000000e+00, %4875 ], [ %11303, %._crit_edge1794 ] + %5766 = phi float [ 0.000000e+00, %4875 ], [ %11304, %._crit_edge1794 ] + %5767 = phi float [ 0.000000e+00, %4875 ], [ %11305, %._crit_edge1794 ] + %5768 = phi float [ 0.000000e+00, %4875 ], [ %11306, %._crit_edge1794 ] + %5769 = phi float [ 0.000000e+00, %4875 ], [ %11307, %._crit_edge1794 ] + %5770 = phi float [ 0.000000e+00, %4875 ], [ %11308, %._crit_edge1794 ] + %5771 = phi float [ 0.000000e+00, %4875 ], [ %11309, %._crit_edge1794 ] + %5772 = phi float [ 0.000000e+00, %4875 ], [ %11310, %._crit_edge1794 ] + %5773 = phi float [ 0.000000e+00, %4875 ], [ %11311, %._crit_edge1794 ] + %5774 = phi float [ 0.000000e+00, %4875 ], [ %11312, %._crit_edge1794 ] + %5775 = phi float [ 0.000000e+00, %4875 ], [ %11313, %._crit_edge1794 ] + %5776 = phi float [ 0.000000e+00, %4875 ], [ %11314, %._crit_edge1794 ] + %5777 = phi float [ 0.000000e+00, %4875 ], [ %11315, %._crit_edge1794 ] + %5778 = phi float [ 0.000000e+00, %4875 ], [ %11316, %._crit_edge1794 ] + %5779 = phi float [ 0.000000e+00, %4875 ], [ %11317, %._crit_edge1794 ] + %5780 = phi float [ 0.000000e+00, %4875 ], [ %11318, 
%._crit_edge1794 ] + %5781 = phi float [ 0.000000e+00, %4875 ], [ %11319, %._crit_edge1794 ] + %5782 = phi float [ 0.000000e+00, %4875 ], [ %11320, %._crit_edge1794 ] + %5783 = phi float [ 0.000000e+00, %4875 ], [ %11321, %._crit_edge1794 ] + %5784 = phi float [ 0.000000e+00, %4875 ], [ %11322, %._crit_edge1794 ] + %5785 = phi float [ 0.000000e+00, %4875 ], [ %11323, %._crit_edge1794 ] + %5786 = phi float [ 0.000000e+00, %4875 ], [ %11324, %._crit_edge1794 ] + %5787 = phi float [ 0.000000e+00, %4875 ], [ %11325, %._crit_edge1794 ] + %5788 = phi float [ 0.000000e+00, %4875 ], [ %11326, %._crit_edge1794 ] + %5789 = phi float [ 0.000000e+00, %4875 ], [ %11327, %._crit_edge1794 ] + %5790 = phi float [ 0.000000e+00, %4875 ], [ %11328, %._crit_edge1794 ] + %5791 = phi float [ 0.000000e+00, %4875 ], [ %11329, %._crit_edge1794 ] + %5792 = phi float [ 0.000000e+00, %4875 ], [ %11330, %._crit_edge1794 ] + %5793 = phi float [ 0.000000e+00, %4875 ], [ %11331, %._crit_edge1794 ] + %5794 = phi float [ 0.000000e+00, %4875 ], [ %11332, %._crit_edge1794 ] + %5795 = phi float [ 0.000000e+00, %4875 ], [ %11333, %._crit_edge1794 ] + %5796 = phi float [ 0.000000e+00, %4875 ], [ %11334, %._crit_edge1794 ] + %5797 = phi float [ 0.000000e+00, %4875 ], [ %11335, %._crit_edge1794 ] + %5798 = phi float [ 0.000000e+00, %4875 ], [ %11336, %._crit_edge1794 ] + %5799 = phi float [ 0.000000e+00, %4875 ], [ %11337, %._crit_edge1794 ] + %5800 = phi float [ 0.000000e+00, %4875 ], [ %11338, %._crit_edge1794 ] + %5801 = phi float [ 0.000000e+00, %4875 ], [ %11339, %._crit_edge1794 ] + %5802 = phi float [ 0.000000e+00, %4875 ], [ %11340, %._crit_edge1794 ] + %5803 = phi float [ 0.000000e+00, %4875 ], [ %11341, %._crit_edge1794 ] + %5804 = phi float [ 0.000000e+00, %4875 ], [ %11342, %._crit_edge1794 ] + %5805 = phi float [ 0.000000e+00, %4875 ], [ %11343, %._crit_edge1794 ] + %5806 = phi float [ 0.000000e+00, %4875 ], [ %11344, %._crit_edge1794 ] + %5807 = phi float [ 0.000000e+00, %4875 ], [ %11345, 
%._crit_edge1794 ] + %5808 = phi float [ 0.000000e+00, %4875 ], [ %11346, %._crit_edge1794 ] + %5809 = phi float [ 0.000000e+00, %4875 ], [ %11347, %._crit_edge1794 ] + %5810 = phi float [ 0.000000e+00, %4875 ], [ %11348, %._crit_edge1794 ] + %5811 = phi float [ 0.000000e+00, %4875 ], [ %11349, %._crit_edge1794 ] + %5812 = phi float [ 0.000000e+00, %4875 ], [ %11350, %._crit_edge1794 ] + %5813 = phi float [ 0.000000e+00, %4875 ], [ %11351, %._crit_edge1794 ] + %5814 = phi float [ 0.000000e+00, %4875 ], [ %11352, %._crit_edge1794 ] + %5815 = phi float [ 0.000000e+00, %4875 ], [ %11353, %._crit_edge1794 ] + %5816 = phi float [ 0.000000e+00, %4875 ], [ %11354, %._crit_edge1794 ] + %5817 = phi float [ 0.000000e+00, %4875 ], [ %11355, %._crit_edge1794 ] + %5818 = phi float [ 0.000000e+00, %4875 ], [ %11356, %._crit_edge1794 ] + %5819 = phi float [ 0.000000e+00, %4875 ], [ %11357, %._crit_edge1794 ] + %5820 = phi float [ 0.000000e+00, %4875 ], [ %11358, %._crit_edge1794 ] + %5821 = phi float [ 0.000000e+00, %4875 ], [ %11359, %._crit_edge1794 ] + %5822 = phi float [ 0.000000e+00, %4875 ], [ %11360, %._crit_edge1794 ] + %5823 = phi float [ 0.000000e+00, %4875 ], [ %11361, %._crit_edge1794 ] + %5824 = phi float [ 0.000000e+00, %4875 ], [ %11362, %._crit_edge1794 ] + %5825 = phi float [ 0.000000e+00, %4875 ], [ %11363, %._crit_edge1794 ] + %5826 = phi float [ 0.000000e+00, %4875 ], [ %11236, %._crit_edge1794 ] + %5827 = phi float [ 0.000000e+00, %4875 ], [ %11237, %._crit_edge1794 ] + %5828 = phi float [ 0.000000e+00, %4875 ], [ %11238, %._crit_edge1794 ] + %5829 = phi float [ 0.000000e+00, %4875 ], [ %11239, %._crit_edge1794 ] + %5830 = phi float [ 0.000000e+00, %4875 ], [ %11240, %._crit_edge1794 ] + %5831 = phi float [ 0.000000e+00, %4875 ], [ %11241, %._crit_edge1794 ] + %5832 = phi float [ 0.000000e+00, %4875 ], [ %11242, %._crit_edge1794 ] + %5833 = phi float [ 0.000000e+00, %4875 ], [ %11243, %._crit_edge1794 ] + %5834 = phi float [ 0.000000e+00, %4875 ], [ %11244, 
%._crit_edge1794 ] + %5835 = phi float [ 0.000000e+00, %4875 ], [ %11245, %._crit_edge1794 ] + %5836 = phi float [ 0.000000e+00, %4875 ], [ %11246, %._crit_edge1794 ] + %5837 = phi float [ 0.000000e+00, %4875 ], [ %11247, %._crit_edge1794 ] + %5838 = phi float [ 0.000000e+00, %4875 ], [ %11248, %._crit_edge1794 ] + %5839 = phi float [ 0.000000e+00, %4875 ], [ %11249, %._crit_edge1794 ] + %5840 = phi float [ 0.000000e+00, %4875 ], [ %11250, %._crit_edge1794 ] + %5841 = phi float [ 0.000000e+00, %4875 ], [ %11251, %._crit_edge1794 ] + %5842 = phi float [ 0.000000e+00, %4875 ], [ %11252, %._crit_edge1794 ] + %5843 = phi float [ 0.000000e+00, %4875 ], [ %11253, %._crit_edge1794 ] + %5844 = phi float [ 0.000000e+00, %4875 ], [ %11254, %._crit_edge1794 ] + %5845 = phi float [ 0.000000e+00, %4875 ], [ %11255, %._crit_edge1794 ] + %5846 = phi float [ 0.000000e+00, %4875 ], [ %11256, %._crit_edge1794 ] + %5847 = phi float [ 0.000000e+00, %4875 ], [ %11257, %._crit_edge1794 ] + %5848 = phi float [ 0.000000e+00, %4875 ], [ %11258, %._crit_edge1794 ] + %5849 = phi float [ 0.000000e+00, %4875 ], [ %11259, %._crit_edge1794 ] + %5850 = phi float [ 0.000000e+00, %4875 ], [ %11260, %._crit_edge1794 ] + %5851 = phi float [ 0.000000e+00, %4875 ], [ %11261, %._crit_edge1794 ] + %5852 = phi float [ 0.000000e+00, %4875 ], [ %11262, %._crit_edge1794 ] + %5853 = phi float [ 0.000000e+00, %4875 ], [ %11263, %._crit_edge1794 ] + %5854 = phi float [ 0.000000e+00, %4875 ], [ %11264, %._crit_edge1794 ] + %5855 = phi float [ 0.000000e+00, %4875 ], [ %11265, %._crit_edge1794 ] + %5856 = phi float [ 0.000000e+00, %4875 ], [ %11266, %._crit_edge1794 ] + %5857 = phi float [ 0.000000e+00, %4875 ], [ %11267, %._crit_edge1794 ] + %5858 = phi float [ 0.000000e+00, %4875 ], [ %11268, %._crit_edge1794 ] + %5859 = phi float [ 0.000000e+00, %4875 ], [ %11269, %._crit_edge1794 ] + %5860 = phi float [ 0.000000e+00, %4875 ], [ %11270, %._crit_edge1794 ] + %5861 = phi float [ 0.000000e+00, %4875 ], [ %11271, 
%._crit_edge1794 ] + %5862 = phi float [ 0.000000e+00, %4875 ], [ %11272, %._crit_edge1794 ] + %5863 = phi float [ 0.000000e+00, %4875 ], [ %11273, %._crit_edge1794 ] + %5864 = phi float [ 0.000000e+00, %4875 ], [ %11274, %._crit_edge1794 ] + %5865 = phi float [ 0.000000e+00, %4875 ], [ %11275, %._crit_edge1794 ] + %5866 = phi float [ 0.000000e+00, %4875 ], [ %11276, %._crit_edge1794 ] + %5867 = phi float [ 0.000000e+00, %4875 ], [ %11277, %._crit_edge1794 ] + %5868 = phi float [ 0.000000e+00, %4875 ], [ %11278, %._crit_edge1794 ] + %5869 = phi float [ 0.000000e+00, %4875 ], [ %11279, %._crit_edge1794 ] + %5870 = phi float [ 0.000000e+00, %4875 ], [ %11280, %._crit_edge1794 ] + %5871 = phi float [ 0.000000e+00, %4875 ], [ %11281, %._crit_edge1794 ] + %5872 = phi float [ 0.000000e+00, %4875 ], [ %11282, %._crit_edge1794 ] + %5873 = phi float [ 0.000000e+00, %4875 ], [ %11283, %._crit_edge1794 ] + %5874 = phi float [ 0.000000e+00, %4875 ], [ %11284, %._crit_edge1794 ] + %5875 = phi float [ 0.000000e+00, %4875 ], [ %11285, %._crit_edge1794 ] + %5876 = phi float [ 0.000000e+00, %4875 ], [ %11286, %._crit_edge1794 ] + %5877 = phi float [ 0.000000e+00, %4875 ], [ %11287, %._crit_edge1794 ] + %5878 = phi float [ 0.000000e+00, %4875 ], [ %11288, %._crit_edge1794 ] + %5879 = phi float [ 0.000000e+00, %4875 ], [ %11289, %._crit_edge1794 ] + %5880 = phi float [ 0.000000e+00, %4875 ], [ %11290, %._crit_edge1794 ] + %5881 = phi float [ 0.000000e+00, %4875 ], [ %11291, %._crit_edge1794 ] + %5882 = phi float [ 0.000000e+00, %4875 ], [ %11292, %._crit_edge1794 ] + %5883 = phi float [ 0.000000e+00, %4875 ], [ %11293, %._crit_edge1794 ] + %5884 = phi float [ 0.000000e+00, %4875 ], [ %11294, %._crit_edge1794 ] + %5885 = phi float [ 0.000000e+00, %4875 ], [ %11295, %._crit_edge1794 ] + %5886 = phi float [ 0.000000e+00, %4875 ], [ %11296, %._crit_edge1794 ] + %5887 = phi float [ 0.000000e+00, %4875 ], [ %11297, %._crit_edge1794 ] + %5888 = phi float [ 0.000000e+00, %4875 ], [ %11298, 
%._crit_edge1794 ] + %5889 = phi float [ 0.000000e+00, %4875 ], [ %11299, %._crit_edge1794 ] + %5890 = add nuw nsw i64 %indvars.iv, %5737, !dbg !277 + %.tr = trunc i64 %5890 to i32, !dbg !278 + %5891 = shl i32 %.tr, 7, !dbg !278 + %5892 = add i32 %5891, %5121, !dbg !278 + %5893 = sext i32 %5892 to i64, !dbg !279 + %5894 = trunc nuw nsw i64 %5890 to i32, !dbg !280 + %5895 = mul i32 %39, %5894, !dbg !280 + %5896 = add i32 %5895, %5122, !dbg !281 + %5897 = sext i32 %5896 to i64, !dbg !282 + %5898 = trunc i64 %5890 to i32, !dbg !283 + %5899 = add i32 %5123, %5898, !dbg !283 + %5900 = mul i32 %5899, %18, !dbg !283 + %5901 = sext i32 %5900 to i64, !dbg !284 + %5902 = getelementptr bfloat, ptr addrspace(1) %0, i64 %5893, !dbg !285 + %5903 = getelementptr bfloat, ptr addrspace(1) %5, i64 %5897, !dbg !286 + %5904 = getelementptr float, ptr addrspace(1) %3, i64 %5901, !dbg !287 + %5905 = getelementptr float, ptr addrspace(1) %4, i64 %5901, !dbg !288 + %5906 = getelementptr bfloat, ptr addrspace(1) %5902, i64 %5247, !dbg !289 + %5907 = getelementptr bfloat, ptr addrspace(1) %5902, i64 %5248, !dbg !289 + %5908 = getelementptr bfloat, ptr addrspace(1) %5902, i64 %5249, !dbg !289 + %5909 = getelementptr bfloat, ptr addrspace(1) %5902, i64 %5250, !dbg !289 + %5910 = getelementptr bfloat, ptr addrspace(1) %5906, i64 %4911, !dbg !290 + %5911 = getelementptr bfloat, ptr addrspace(1) %5907, i64 %4911, !dbg !290 + %5912 = getelementptr bfloat, ptr addrspace(1) %5908, i64 %4911, !dbg !290 + %5913 = getelementptr bfloat, ptr addrspace(1) %5909, i64 %4911, !dbg !290 + %5914 = getelementptr bfloat, ptr addrspace(1) %5903, i64 %5251, !dbg !291 + %5915 = getelementptr bfloat, ptr addrspace(1) %5903, i64 %5252, !dbg !291 + %5916 = getelementptr bfloat, ptr addrspace(1) %5903, i64 %5253, !dbg !291 + %5917 = getelementptr bfloat, ptr addrspace(1) %5903, i64 %5254, !dbg !291 + %5918 = getelementptr bfloat, ptr addrspace(1) %5914, i64 %4911, !dbg !292 + %5919 = getelementptr bfloat, ptr 
addrspace(1) %5915, i64 %4911, !dbg !292 + %5920 = getelementptr bfloat, ptr addrspace(1) %5916, i64 %4911, !dbg !292 + %5921 = getelementptr bfloat, ptr addrspace(1) %5917, i64 %4911, !dbg !292 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %5265, ptr addrspace(1) %5910, i32 %5266) #3, !dbg !293 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %5268, ptr addrspace(1) %5911, i32 %5269) #3, !dbg !293 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %5271, ptr addrspace(1) %5912, i32 %5272) #3, !dbg !293 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %5274, ptr addrspace(1) %5913, i32 %5275) #3, !dbg !293 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !293 + %5922 = getelementptr float, ptr addrspace(1) %5904, i64 %5408, !dbg !294 + %5923 = getelementptr float, ptr addrspace(1) %5904, i64 %5409, !dbg !294 + %5924 = getelementptr float, ptr addrspace(1) %5904, i64 %5302, !dbg !294 + %5925 = getelementptr float, ptr addrspace(1) %5904, i64 %5303, !dbg !294 + %5926 = getelementptr float, ptr addrspace(1) %5904, i64 %5304, !dbg !294 + %5927 = getelementptr float, ptr addrspace(1) %5904, i64 %5305, !dbg !294 + %5928 = getelementptr float, ptr addrspace(1) %5904, i64 %5306, !dbg !294 + %5929 = getelementptr float, ptr addrspace(1) %5904, i64 %5307, !dbg !294 + %5930 = getelementptr float, ptr addrspace(1) %5904, i64 %5308, !dbg !294 + %5931 = getelementptr float, ptr addrspace(1) %5904, i64 %5309, !dbg !294 + %5932 = getelementptr float, ptr addrspace(1) %5904, i64 %5310, !dbg !294 + %5933 = getelementptr float, ptr addrspace(1) %5904, i64 %5311, !dbg !294 + %5934 = getelementptr float, ptr addrspace(1) %5904, i64 %5312, !dbg !294 + %5935 = getelementptr 
float, ptr addrspace(1) %5904, i64 %5313, !dbg !294 + %5936 = getelementptr float, ptr addrspace(1) %5904, i64 %5314, !dbg !294 + %5937 = getelementptr float, ptr addrspace(1) %5904, i64 %5315, !dbg !294 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) %5333, ptr addrspace(1) %5922, i32 %5412, i1 %5331) #3, !dbg !295 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5335, ptr addrspace(1) %5923, i32 %5413, i1 %5331) #3, !dbg !295 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5337, ptr addrspace(1) %5924, i32 %5338, i1 %5331) #3, !dbg !295 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5340, ptr addrspace(1) %5925, i32 %5341, i1 %5331) #3, !dbg !295 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5343, ptr addrspace(1) %5926, i32 %5344, i1 %5331) #3, !dbg !295 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5346, ptr addrspace(1) %5927, i32 %5347, i1 %5331) #3, !dbg !295 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5349, ptr addrspace(1) %5928, i32 %5350, i1 %5331) #3, !dbg !295 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5352, ptr addrspace(1) %5929, i32 %5353, i1 %5331) #3, !dbg !295 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5355, ptr addrspace(1) %5930, i32 %5356, i1 %5331) #3, !dbg !295 + tail call 
void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5358, ptr addrspace(1) %5931, i32 %5359, i1 %5331) #3, !dbg !295 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5361, ptr addrspace(1) %5932, i32 %5362, i1 %5331) #3, !dbg !295 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5364, ptr addrspace(1) %5933, i32 %5365, i1 %5331) #3, !dbg !295 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5367, ptr addrspace(1) %5934, i32 %5368, i1 %5331) #3, !dbg !295 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5370, ptr addrspace(1) %5935, i32 %5371, i1 %5331) #3, !dbg !295 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5373, ptr addrspace(1) %5936, i32 %5374, i1 %5331) #3, !dbg !295 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5376, ptr addrspace(1) %5937, i32 %5377, i1 %5331) #3, !dbg !295 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !295 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %5378, ptr addrspace(1) %5918, i32 %5266) #3, !dbg !296 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %5379, ptr addrspace(1) %5919, i32 %5269) #3, !dbg !296 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %5380, ptr addrspace(1) %5920, i32 %5272) #3, !dbg !296 + tail call 
void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %5381, ptr addrspace(1) %5921, i32 %5275) #3, !dbg !296 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !296 + %5938 = getelementptr float, ptr addrspace(1) %5905, i64 %5408, !dbg !297 + %5939 = getelementptr float, ptr addrspace(1) %5905, i64 %5409, !dbg !297 + %5940 = getelementptr float, ptr addrspace(1) %5905, i64 %5302, !dbg !297 + %5941 = getelementptr float, ptr addrspace(1) %5905, i64 %5303, !dbg !297 + %5942 = getelementptr float, ptr addrspace(1) %5905, i64 %5304, !dbg !297 + %5943 = getelementptr float, ptr addrspace(1) %5905, i64 %5305, !dbg !297 + %5944 = getelementptr float, ptr addrspace(1) %5905, i64 %5306, !dbg !297 + %5945 = getelementptr float, ptr addrspace(1) %5905, i64 %5307, !dbg !297 + %5946 = getelementptr float, ptr addrspace(1) %5905, i64 %5308, !dbg !297 + %5947 = getelementptr float, ptr addrspace(1) %5905, i64 %5309, !dbg !297 + %5948 = getelementptr float, ptr addrspace(1) %5905, i64 %5310, !dbg !297 + %5949 = getelementptr float, ptr addrspace(1) %5905, i64 %5311, !dbg !297 + %5950 = getelementptr float, ptr addrspace(1) %5905, i64 %5312, !dbg !297 + %5951 = getelementptr float, ptr addrspace(1) %5905, i64 %5313, !dbg !297 + %5952 = getelementptr float, ptr addrspace(1) %5905, i64 %5314, !dbg !297 + %5953 = getelementptr float, ptr addrspace(1) %5905, i64 %5315, !dbg !297 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) %5382, ptr addrspace(1) %5938, i32 %5412, i1 %5331) #3, !dbg !298 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5383, ptr addrspace(1) %5939, i32 %5413, i1 %5331) #3, !dbg !298 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5384, ptr addrspace(1) 
%5940, i32 %5338, i1 %5331) #3, !dbg !298 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5385, ptr addrspace(1) %5941, i32 %5341, i1 %5331) #3, !dbg !298 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5386, ptr addrspace(1) %5942, i32 %5344, i1 %5331) #3, !dbg !298 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5387, ptr addrspace(1) %5943, i32 %5347, i1 %5331) #3, !dbg !298 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5388, ptr addrspace(1) %5944, i32 %5350, i1 %5331) #3, !dbg !298 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5389, ptr addrspace(1) %5945, i32 %5353, i1 %5331) #3, !dbg !298 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5390, ptr addrspace(1) %5946, i32 %5356, i1 %5331) #3, !dbg !298 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5391, ptr addrspace(1) %5947, i32 %5359, i1 %5331) #3, !dbg !298 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5392, ptr addrspace(1) %5948, i32 %5362, i1 %5331) #3, !dbg !298 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5393, ptr addrspace(1) %5949, i32 %5365, i1 %5331) #3, !dbg !298 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5394, ptr addrspace(1) %5950, 
i32 %5368, i1 %5331) #3, !dbg !298 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5395, ptr addrspace(1) %5951, i32 %5371, i1 %5331) #3, !dbg !298 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5396, ptr addrspace(1) %5952, i32 %5374, i1 %5331) #3, !dbg !298 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5397, ptr addrspace(1) %5953, i32 %5377, i1 %5331) #3, !dbg !298 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !298 + %5954 = getelementptr i8, ptr addrspace(1) %5910, i64 524288, !dbg !299 + %5955 = getelementptr i8, ptr addrspace(1) %5911, i64 524288, !dbg !299 + %5956 = getelementptr i8, ptr addrspace(1) %5912, i64 524288, !dbg !299 + %5957 = getelementptr i8, ptr addrspace(1) %5913, i64 524288, !dbg !299 + %5958 = getelementptr i8, ptr addrspace(1) %5918, i64 16384, !dbg !300 + %5959 = getelementptr i8, ptr addrspace(1) %5919, i64 16384, !dbg !300 + %5960 = getelementptr i8, ptr addrspace(1) %5920, i64 16384, !dbg !300 + %5961 = getelementptr i8, ptr addrspace(1) %5921, i64 16384, !dbg !300 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !293 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %5441, ptr addrspace(1) %5954, i32 %5442) #3, !dbg !293 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %5443, ptr addrspace(1) %5955, i32 %5444) #3, !dbg !293 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %5445, ptr addrspace(1) %5956, i32 %5446) #3, !dbg !293 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", 
"r,l,r"(ptr addrspace(3) nonnull %5447, ptr addrspace(1) %5957, i32 %5448) #3, !dbg !293 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !293 + %5962 = getelementptr float, ptr addrspace(1) %5904, i64 %5467, !dbg !294 + %5963 = getelementptr float, ptr addrspace(1) %5904, i64 %5468, !dbg !294 + %5964 = getelementptr float, ptr addrspace(1) %5904, i64 %5469, !dbg !294 + %5965 = getelementptr float, ptr addrspace(1) %5904, i64 %5470, !dbg !294 + %5966 = getelementptr float, ptr addrspace(1) %5904, i64 %5471, !dbg !294 + %5967 = getelementptr float, ptr addrspace(1) %5904, i64 %5472, !dbg !294 + %5968 = getelementptr float, ptr addrspace(1) %5904, i64 %5473, !dbg !294 + %5969 = getelementptr float, ptr addrspace(1) %5904, i64 %5474, !dbg !294 + %5970 = getelementptr float, ptr addrspace(1) %5904, i64 %5475, !dbg !294 + %5971 = getelementptr float, ptr addrspace(1) %5904, i64 %5476, !dbg !294 + %5972 = getelementptr float, ptr addrspace(1) %5904, i64 %5477, !dbg !294 + %5973 = getelementptr float, ptr addrspace(1) %5904, i64 %5478, !dbg !294 + %5974 = getelementptr float, ptr addrspace(1) %5904, i64 %5479, !dbg !294 + %5975 = getelementptr float, ptr addrspace(1) %5904, i64 %5480, !dbg !294 + %5976 = getelementptr float, ptr addrspace(1) %5904, i64 %5481, !dbg !294 + %5977 = getelementptr float, ptr addrspace(1) %5904, i64 %5482, !dbg !294 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) %5499, ptr addrspace(1) %5962, i32 %5500, i1 %5331) #3, !dbg !295 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5501, ptr addrspace(1) %5963, i32 %5502, i1 %5331) #3, !dbg !295 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5503, ptr addrspace(1) %5964, i32 %5504, i1 %5331) #3, !dbg !295 + tail call void asm sideeffect "@$3 
cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5505, ptr addrspace(1) %5965, i32 %5506, i1 %5331) #3, !dbg !295 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5507, ptr addrspace(1) %5966, i32 %5508, i1 %5331) #3, !dbg !295 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5509, ptr addrspace(1) %5967, i32 %5510, i1 %5331) #3, !dbg !295 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5511, ptr addrspace(1) %5968, i32 %5512, i1 %5331) #3, !dbg !295 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5513, ptr addrspace(1) %5969, i32 %5514, i1 %5331) #3, !dbg !295 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5515, ptr addrspace(1) %5970, i32 %5516, i1 %5331) #3, !dbg !295 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5517, ptr addrspace(1) %5971, i32 %5518, i1 %5331) #3, !dbg !295 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5519, ptr addrspace(1) %5972, i32 %5520, i1 %5331) #3, !dbg !295 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5521, ptr addrspace(1) %5973, i32 %5522, i1 %5331) #3, !dbg !295 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5523, ptr addrspace(1) %5974, i32 %5524, i1 %5331) #3, !dbg !295 + tail call void asm sideeffect "@$3 
cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5525, ptr addrspace(1) %5975, i32 %5526, i1 %5331) #3, !dbg !295 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5527, ptr addrspace(1) %5976, i32 %5528, i1 %5331) #3, !dbg !295 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5529, ptr addrspace(1) %5977, i32 %5530, i1 %5331) #3, !dbg !295 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !295 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %5531, ptr addrspace(1) %5958, i32 %5442) #3, !dbg !296 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %5532, ptr addrspace(1) %5959, i32 %5444) #3, !dbg !296 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %5533, ptr addrspace(1) %5960, i32 %5446) #3, !dbg !296 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %5534, ptr addrspace(1) %5961, i32 %5448) #3, !dbg !296 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !296 + %5978 = getelementptr float, ptr addrspace(1) %5905, i64 %5467, !dbg !297 + %5979 = getelementptr float, ptr addrspace(1) %5905, i64 %5468, !dbg !297 + %5980 = getelementptr float, ptr addrspace(1) %5905, i64 %5469, !dbg !297 + %5981 = getelementptr float, ptr addrspace(1) %5905, i64 %5470, !dbg !297 + %5982 = getelementptr float, ptr addrspace(1) %5905, i64 %5471, !dbg !297 + %5983 = getelementptr float, ptr addrspace(1) %5905, i64 %5472, !dbg !297 + %5984 = getelementptr float, ptr addrspace(1) %5905, i64 %5473, !dbg !297 + %5985 = getelementptr float, ptr addrspace(1) 
%5905, i64 %5474, !dbg !297 + %5986 = getelementptr float, ptr addrspace(1) %5905, i64 %5475, !dbg !297 + %5987 = getelementptr float, ptr addrspace(1) %5905, i64 %5476, !dbg !297 + %5988 = getelementptr float, ptr addrspace(1) %5905, i64 %5477, !dbg !297 + %5989 = getelementptr float, ptr addrspace(1) %5905, i64 %5478, !dbg !297 + %5990 = getelementptr float, ptr addrspace(1) %5905, i64 %5479, !dbg !297 + %5991 = getelementptr float, ptr addrspace(1) %5905, i64 %5480, !dbg !297 + %5992 = getelementptr float, ptr addrspace(1) %5905, i64 %5481, !dbg !297 + %5993 = getelementptr float, ptr addrspace(1) %5905, i64 %5482, !dbg !297 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) %5535, ptr addrspace(1) %5978, i32 %5500, i1 %5331) #3, !dbg !298 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5536, ptr addrspace(1) %5979, i32 %5502, i1 %5331) #3, !dbg !298 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5537, ptr addrspace(1) %5980, i32 %5504, i1 %5331) #3, !dbg !298 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5538, ptr addrspace(1) %5981, i32 %5506, i1 %5331) #3, !dbg !298 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5539, ptr addrspace(1) %5982, i32 %5508, i1 %5331) #3, !dbg !298 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5540, ptr addrspace(1) %5983, i32 %5510, i1 %5331) #3, !dbg !298 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5541, ptr addrspace(1) %5984, i32 %5512, i1 
%5331) #3, !dbg !298 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5542, ptr addrspace(1) %5985, i32 %5514, i1 %5331) #3, !dbg !298 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5543, ptr addrspace(1) %5986, i32 %5516, i1 %5331) #3, !dbg !298 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5544, ptr addrspace(1) %5987, i32 %5518, i1 %5331) #3, !dbg !298 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5545, ptr addrspace(1) %5988, i32 %5520, i1 %5331) #3, !dbg !298 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5546, ptr addrspace(1) %5989, i32 %5522, i1 %5331) #3, !dbg !298 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5547, ptr addrspace(1) %5990, i32 %5524, i1 %5331) #3, !dbg !298 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5548, ptr addrspace(1) %5991, i32 %5526, i1 %5331) #3, !dbg !298 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5549, ptr addrspace(1) %5992, i32 %5528, i1 %5331) #3, !dbg !298 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5550, ptr addrspace(1) %5993, i32 %5530, i1 %5331) #3, !dbg !298 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !298 + br i1 %5181, label %.lr.ph1620, label %._crit_edge1621, !dbg !255 + +.lr.ph1620: ; preds = %5761, %__nv_exp2f.exit1335 + 
%5994 = phi i32 [ %7716, %__nv_exp2f.exit1335 ], [ %5139, %5761 ] + %5995 = phi i32 [ %7717, %__nv_exp2f.exit1335 ], [ %5140, %5761 ] + %5996 = phi i32 [ %8469, %__nv_exp2f.exit1335 ], [ 64, %5761 ] + %5997 = phi i32 [ %.pn2151605, %__nv_exp2f.exit1335 ], [ %5139, %5761 ] + %5998 = phi i32 [ %.pn2131606, %__nv_exp2f.exit1335 ], [ %5140, %5761 ] + %5999 = phi i32 [ %.pn2111607, %__nv_exp2f.exit1335 ], [ %5278, %5761 ] + %6000 = phi i32 [ %.pn2091608, %__nv_exp2f.exit1335 ], [ %5280, %5761 ] + %6001 = phi i32 [ %.pn2071609, %__nv_exp2f.exit1335 ], [ %5282, %5761 ] + %6002 = phi i32 [ %.pn2051610, %__nv_exp2f.exit1335 ], [ %5284, %5761 ] + %6003 = phi i32 [ %.pn2031611, %__nv_exp2f.exit1335 ], [ %5286, %5761 ] + %6004 = phi i32 [ %.pn2011612, %__nv_exp2f.exit1335 ], [ %5288, %5761 ] + %6005 = phi i32 [ %.pn1991613, %__nv_exp2f.exit1335 ], [ %5290, %5761 ] + %6006 = phi i32 [ %.pn1971614, %__nv_exp2f.exit1335 ], [ %5292, %5761 ] + %6007 = phi i32 [ %.pn1951615, %__nv_exp2f.exit1335 ], [ %5294, %5761 ] + %6008 = phi i32 [ %.pn1931616, %__nv_exp2f.exit1335 ], [ %5296, %5761 ] + %6009 = phi i32 [ %.pn1911617, %__nv_exp2f.exit1335 ], [ %5298, %5761 ] + %6010 = phi i32 [ %.pn1891618, %__nv_exp2f.exit1335 ], [ %5300, %5761 ] + %6011 = phi i32 [ %6161, %__nv_exp2f.exit1335 ], [ -1, %5761 ] + %6012 = phi i32 [ %8509, %__nv_exp2f.exit1335 ], [ 1, %5761 ] + %6013 = phi i32 [ %6164, %__nv_exp2f.exit1335 ], [ -1, %5761 ] + %6014 = phi i32 [ %8512, %__nv_exp2f.exit1335 ], [ 1, %5761 ] + %.pn1891618 = phi i32 [ %8498, %__nv_exp2f.exit1335 ], [ %5428, %5761 ] + %.pn1911617 = phi i32 [ %8497, %__nv_exp2f.exit1335 ], [ %5427, %5761 ] + %.pn1931616 = phi i32 [ %8496, %__nv_exp2f.exit1335 ], [ %5426, %5761 ] + %.pn1951615 = phi i32 [ %8495, %__nv_exp2f.exit1335 ], [ %5425, %5761 ] + %.pn1971614 = phi i32 [ %8494, %__nv_exp2f.exit1335 ], [ %5424, %5761 ] + %.pn1991613 = phi i32 [ %8493, %__nv_exp2f.exit1335 ], [ %5423, %5761 ] + %.pn2011612 = phi i32 [ %8492, %__nv_exp2f.exit1335 ], [ 
%5422, %5761 ] + %.pn2031611 = phi i32 [ %8491, %__nv_exp2f.exit1335 ], [ %5421, %5761 ] + %.pn2051610 = phi i32 [ %8490, %__nv_exp2f.exit1335 ], [ %5420, %5761 ] + %.pn2071609 = phi i32 [ %8489, %__nv_exp2f.exit1335 ], [ %5419, %5761 ] + %.pn2091608 = phi i32 [ %8488, %__nv_exp2f.exit1335 ], [ %5418, %5761 ] + %.pn2111607 = phi i32 [ %8487, %__nv_exp2f.exit1335 ], [ %5417, %5761 ] + %.pn2131606 = phi i32 [ %8486, %__nv_exp2f.exit1335 ], [ %5416, %5761 ] + %.pn2151605 = phi i32 [ %8485, %__nv_exp2f.exit1335 ], [ %5415, %5761 ] + %6015 = phi i32 [ %8503, %__nv_exp2f.exit1335 ], [ %5429, %5761 ] + %6016 = phi i32 [ %8504, %__nv_exp2f.exit1335 ], [ %5430, %5761 ] + %6017 = phi i32 [ %8505, %__nv_exp2f.exit1335 ], [ %5431, %5761 ] + %6018 = phi i32 [ %8506, %__nv_exp2f.exit1335 ], [ %5432, %5761 ] + %.pn1391602 = phi ptr addrspace(1) [ %8481, %__nv_exp2f.exit1335 ], [ %5961, %5761 ] + %.pn1551601 = phi ptr addrspace(1) [ %8480, %__nv_exp2f.exit1335 ], [ %5960, %5761 ] + %.pn1711600 = phi ptr addrspace(1) [ %8479, %__nv_exp2f.exit1335 ], [ %5959, %5761 ] + %.pn1871599 = phi ptr addrspace(1) [ %8478, %__nv_exp2f.exit1335 ], [ %5958, %5761 ] + %6019 = phi i32 [ %8499, %__nv_exp2f.exit1335 ], [ %5429, %5761 ] + %6020 = phi i32 [ %8500, %__nv_exp2f.exit1335 ], [ %5430, %5761 ] + %6021 = phi i32 [ %8501, %__nv_exp2f.exit1335 ], [ %5431, %5761 ] + %6022 = phi i32 [ %8502, %__nv_exp2f.exit1335 ], [ %5432, %5761 ] + %.pn751598 = phi ptr addrspace(1) [ %8475, %__nv_exp2f.exit1335 ], [ %5957, %5761 ] + %.pn911597 = phi ptr addrspace(1) [ %8474, %__nv_exp2f.exit1335 ], [ %5956, %5761 ] + %.pn1071596 = phi ptr addrspace(1) [ %8473, %__nv_exp2f.exit1335 ], [ %5955, %5761 ] + %.pn1231595 = phi ptr addrspace(1) [ %8472, %__nv_exp2f.exit1335 ], [ %5954, %5761 ] + %6023 = phi float [ %7529, %__nv_exp2f.exit1335 ], [ %5826, %5761 ] + %6024 = phi float [ %7530, %__nv_exp2f.exit1335 ], [ %5827, %5761 ] + %6025 = phi float [ %7531, %__nv_exp2f.exit1335 ], [ %5828, %5761 ] + %6026 = phi 
float [ %7532, %__nv_exp2f.exit1335 ], [ %5829, %5761 ] + %6027 = phi float [ %7533, %__nv_exp2f.exit1335 ], [ %5830, %5761 ] + %6028 = phi float [ %7534, %__nv_exp2f.exit1335 ], [ %5831, %5761 ] + %6029 = phi float [ %7535, %__nv_exp2f.exit1335 ], [ %5832, %5761 ] + %6030 = phi float [ %7536, %__nv_exp2f.exit1335 ], [ %5833, %5761 ] + %6031 = phi float [ %7537, %__nv_exp2f.exit1335 ], [ %5834, %5761 ] + %6032 = phi float [ %7538, %__nv_exp2f.exit1335 ], [ %5835, %5761 ] + %6033 = phi float [ %7539, %__nv_exp2f.exit1335 ], [ %5836, %5761 ] + %6034 = phi float [ %7540, %__nv_exp2f.exit1335 ], [ %5837, %5761 ] + %6035 = phi float [ %7541, %__nv_exp2f.exit1335 ], [ %5838, %5761 ] + %6036 = phi float [ %7542, %__nv_exp2f.exit1335 ], [ %5839, %5761 ] + %6037 = phi float [ %7543, %__nv_exp2f.exit1335 ], [ %5840, %5761 ] + %6038 = phi float [ %7544, %__nv_exp2f.exit1335 ], [ %5841, %5761 ] + %6039 = phi float [ %7545, %__nv_exp2f.exit1335 ], [ %5842, %5761 ] + %6040 = phi float [ %7546, %__nv_exp2f.exit1335 ], [ %5843, %5761 ] + %6041 = phi float [ %7547, %__nv_exp2f.exit1335 ], [ %5844, %5761 ] + %6042 = phi float [ %7548, %__nv_exp2f.exit1335 ], [ %5845, %5761 ] + %6043 = phi float [ %7549, %__nv_exp2f.exit1335 ], [ %5846, %5761 ] + %6044 = phi float [ %7550, %__nv_exp2f.exit1335 ], [ %5847, %5761 ] + %6045 = phi float [ %7551, %__nv_exp2f.exit1335 ], [ %5848, %5761 ] + %6046 = phi float [ %7552, %__nv_exp2f.exit1335 ], [ %5849, %5761 ] + %6047 = phi float [ %7553, %__nv_exp2f.exit1335 ], [ %5850, %5761 ] + %6048 = phi float [ %7554, %__nv_exp2f.exit1335 ], [ %5851, %5761 ] + %6049 = phi float [ %7555, %__nv_exp2f.exit1335 ], [ %5852, %5761 ] + %6050 = phi float [ %7556, %__nv_exp2f.exit1335 ], [ %5853, %5761 ] + %6051 = phi float [ %7557, %__nv_exp2f.exit1335 ], [ %5854, %5761 ] + %6052 = phi float [ %7558, %__nv_exp2f.exit1335 ], [ %5855, %5761 ] + %6053 = phi float [ %7559, %__nv_exp2f.exit1335 ], [ %5856, %5761 ] + %6054 = phi float [ %7560, %__nv_exp2f.exit1335 ], 
[ %5857, %5761 ] + %6055 = phi float [ %7561, %__nv_exp2f.exit1335 ], [ %5858, %5761 ] + %6056 = phi float [ %7562, %__nv_exp2f.exit1335 ], [ %5859, %5761 ] + %6057 = phi float [ %7563, %__nv_exp2f.exit1335 ], [ %5860, %5761 ] + %6058 = phi float [ %7564, %__nv_exp2f.exit1335 ], [ %5861, %5761 ] + %6059 = phi float [ %7565, %__nv_exp2f.exit1335 ], [ %5862, %5761 ] + %6060 = phi float [ %7566, %__nv_exp2f.exit1335 ], [ %5863, %5761 ] + %6061 = phi float [ %7567, %__nv_exp2f.exit1335 ], [ %5864, %5761 ] + %6062 = phi float [ %7568, %__nv_exp2f.exit1335 ], [ %5865, %5761 ] + %6063 = phi float [ %7569, %__nv_exp2f.exit1335 ], [ %5866, %5761 ] + %6064 = phi float [ %7570, %__nv_exp2f.exit1335 ], [ %5867, %5761 ] + %6065 = phi float [ %7571, %__nv_exp2f.exit1335 ], [ %5868, %5761 ] + %6066 = phi float [ %7572, %__nv_exp2f.exit1335 ], [ %5869, %5761 ] + %6067 = phi float [ %7573, %__nv_exp2f.exit1335 ], [ %5870, %5761 ] + %6068 = phi float [ %7574, %__nv_exp2f.exit1335 ], [ %5871, %5761 ] + %6069 = phi float [ %7575, %__nv_exp2f.exit1335 ], [ %5872, %5761 ] + %6070 = phi float [ %7576, %__nv_exp2f.exit1335 ], [ %5873, %5761 ] + %6071 = phi float [ %7577, %__nv_exp2f.exit1335 ], [ %5874, %5761 ] + %6072 = phi float [ %7578, %__nv_exp2f.exit1335 ], [ %5875, %5761 ] + %6073 = phi float [ %7579, %__nv_exp2f.exit1335 ], [ %5876, %5761 ] + %6074 = phi float [ %7580, %__nv_exp2f.exit1335 ], [ %5877, %5761 ] + %6075 = phi float [ %7581, %__nv_exp2f.exit1335 ], [ %5878, %5761 ] + %6076 = phi float [ %7582, %__nv_exp2f.exit1335 ], [ %5879, %5761 ] + %6077 = phi float [ %7583, %__nv_exp2f.exit1335 ], [ %5880, %5761 ] + %6078 = phi float [ %7584, %__nv_exp2f.exit1335 ], [ %5881, %5761 ] + %6079 = phi float [ %7585, %__nv_exp2f.exit1335 ], [ %5882, %5761 ] + %6080 = phi float [ %7586, %__nv_exp2f.exit1335 ], [ %5883, %5761 ] + %6081 = phi float [ %7587, %__nv_exp2f.exit1335 ], [ %5884, %5761 ] + %6082 = phi float [ %7588, %__nv_exp2f.exit1335 ], [ %5885, %5761 ] + %6083 = phi float [ 
%7589, %__nv_exp2f.exit1335 ], [ %5886, %5761 ] + %6084 = phi float [ %7590, %__nv_exp2f.exit1335 ], [ %5887, %5761 ] + %6085 = phi float [ %7591, %__nv_exp2f.exit1335 ], [ %5888, %5761 ] + %6086 = phi float [ %7592, %__nv_exp2f.exit1335 ], [ %5889, %5761 ] + %6087 = phi float [ %8397, %__nv_exp2f.exit1335 ], [ %5762, %5761 ] + %6088 = phi float [ %8398, %__nv_exp2f.exit1335 ], [ %5763, %5761 ] + %6089 = phi float [ %8399, %__nv_exp2f.exit1335 ], [ %5764, %5761 ] + %6090 = phi float [ %8400, %__nv_exp2f.exit1335 ], [ %5765, %5761 ] + %6091 = phi float [ %8401, %__nv_exp2f.exit1335 ], [ %5766, %5761 ] + %6092 = phi float [ %8402, %__nv_exp2f.exit1335 ], [ %5767, %5761 ] + %6093 = phi float [ %8403, %__nv_exp2f.exit1335 ], [ %5768, %5761 ] + %6094 = phi float [ %8404, %__nv_exp2f.exit1335 ], [ %5769, %5761 ] + %6095 = phi float [ %8405, %__nv_exp2f.exit1335 ], [ %5770, %5761 ] + %6096 = phi float [ %8406, %__nv_exp2f.exit1335 ], [ %5771, %5761 ] + %6097 = phi float [ %8407, %__nv_exp2f.exit1335 ], [ %5772, %5761 ] + %6098 = phi float [ %8408, %__nv_exp2f.exit1335 ], [ %5773, %5761 ] + %6099 = phi float [ %8409, %__nv_exp2f.exit1335 ], [ %5774, %5761 ] + %6100 = phi float [ %8410, %__nv_exp2f.exit1335 ], [ %5775, %5761 ] + %6101 = phi float [ %8411, %__nv_exp2f.exit1335 ], [ %5776, %5761 ] + %6102 = phi float [ %8412, %__nv_exp2f.exit1335 ], [ %5777, %5761 ] + %6103 = phi float [ %8413, %__nv_exp2f.exit1335 ], [ %5778, %5761 ] + %6104 = phi float [ %8414, %__nv_exp2f.exit1335 ], [ %5779, %5761 ] + %6105 = phi float [ %8415, %__nv_exp2f.exit1335 ], [ %5780, %5761 ] + %6106 = phi float [ %8416, %__nv_exp2f.exit1335 ], [ %5781, %5761 ] + %6107 = phi float [ %8417, %__nv_exp2f.exit1335 ], [ %5782, %5761 ] + %6108 = phi float [ %8418, %__nv_exp2f.exit1335 ], [ %5783, %5761 ] + %6109 = phi float [ %8419, %__nv_exp2f.exit1335 ], [ %5784, %5761 ] + %6110 = phi float [ %8420, %__nv_exp2f.exit1335 ], [ %5785, %5761 ] + %6111 = phi float [ %8421, %__nv_exp2f.exit1335 ], [ %5786, 
%5761 ] + %6112 = phi float [ %8422, %__nv_exp2f.exit1335 ], [ %5787, %5761 ] + %6113 = phi float [ %8423, %__nv_exp2f.exit1335 ], [ %5788, %5761 ] + %6114 = phi float [ %8424, %__nv_exp2f.exit1335 ], [ %5789, %5761 ] + %6115 = phi float [ %8425, %__nv_exp2f.exit1335 ], [ %5790, %5761 ] + %6116 = phi float [ %8426, %__nv_exp2f.exit1335 ], [ %5791, %5761 ] + %6117 = phi float [ %8427, %__nv_exp2f.exit1335 ], [ %5792, %5761 ] + %6118 = phi float [ %8428, %__nv_exp2f.exit1335 ], [ %5793, %5761 ] + %6119 = phi float [ %8429, %__nv_exp2f.exit1335 ], [ %5794, %5761 ] + %6120 = phi float [ %8430, %__nv_exp2f.exit1335 ], [ %5795, %5761 ] + %6121 = phi float [ %8431, %__nv_exp2f.exit1335 ], [ %5796, %5761 ] + %6122 = phi float [ %8432, %__nv_exp2f.exit1335 ], [ %5797, %5761 ] + %6123 = phi float [ %8433, %__nv_exp2f.exit1335 ], [ %5798, %5761 ] + %6124 = phi float [ %8434, %__nv_exp2f.exit1335 ], [ %5799, %5761 ] + %6125 = phi float [ %8435, %__nv_exp2f.exit1335 ], [ %5800, %5761 ] + %6126 = phi float [ %8436, %__nv_exp2f.exit1335 ], [ %5801, %5761 ] + %6127 = phi float [ %8437, %__nv_exp2f.exit1335 ], [ %5802, %5761 ] + %6128 = phi float [ %8438, %__nv_exp2f.exit1335 ], [ %5803, %5761 ] + %6129 = phi float [ %8439, %__nv_exp2f.exit1335 ], [ %5804, %5761 ] + %6130 = phi float [ %8440, %__nv_exp2f.exit1335 ], [ %5805, %5761 ] + %6131 = phi float [ %8441, %__nv_exp2f.exit1335 ], [ %5806, %5761 ] + %6132 = phi float [ %8442, %__nv_exp2f.exit1335 ], [ %5807, %5761 ] + %6133 = phi float [ %8443, %__nv_exp2f.exit1335 ], [ %5808, %5761 ] + %6134 = phi float [ %8444, %__nv_exp2f.exit1335 ], [ %5809, %5761 ] + %6135 = phi float [ %8445, %__nv_exp2f.exit1335 ], [ %5810, %5761 ] + %6136 = phi float [ %8446, %__nv_exp2f.exit1335 ], [ %5811, %5761 ] + %6137 = phi float [ %8447, %__nv_exp2f.exit1335 ], [ %5812, %5761 ] + %6138 = phi float [ %8448, %__nv_exp2f.exit1335 ], [ %5813, %5761 ] + %6139 = phi float [ %8449, %__nv_exp2f.exit1335 ], [ %5814, %5761 ] + %6140 = phi float [ %8450, 
%__nv_exp2f.exit1335 ], [ %5815, %5761 ] + %6141 = phi float [ %8451, %__nv_exp2f.exit1335 ], [ %5816, %5761 ] + %6142 = phi float [ %8452, %__nv_exp2f.exit1335 ], [ %5817, %5761 ] + %6143 = phi float [ %8453, %__nv_exp2f.exit1335 ], [ %5818, %5761 ] + %6144 = phi float [ %8454, %__nv_exp2f.exit1335 ], [ %5819, %5761 ] + %6145 = phi float [ %8455, %__nv_exp2f.exit1335 ], [ %5820, %5761 ] + %6146 = phi float [ %8456, %__nv_exp2f.exit1335 ], [ %5821, %5761 ] + %6147 = phi float [ %8457, %__nv_exp2f.exit1335 ], [ %5822, %5761 ] + %6148 = phi float [ %8458, %__nv_exp2f.exit1335 ], [ %5823, %5761 ] + %6149 = phi float [ %8459, %__nv_exp2f.exit1335 ], [ %5824, %5761 ] + %6150 = phi float [ %8460, %__nv_exp2f.exit1335 ], [ %5825, %5761 ] + %6151 = phi i32 [ %7724, %__nv_exp2f.exit1335 ], [ 0, %5761 ] + %6152 = phi <2 x i32> [ %7715, %__nv_exp2f.exit1335 ], [ %5403, %5761 ] + %6153 = phi <2 x i32> [ %6154, %__nv_exp2f.exit1335 ], [ %5403, %5761 ] + %6154 = phi <2 x i32> [ %8484, %__nv_exp2f.exit1335 ], [ %5414, %5761 ] + %6155 = phi <8 x i32> [ %7723, %__nv_exp2f.exit1335 ], [ %5174, %5761 ] + %6156 = phi <4 x i32> [ %7720, %__nv_exp2f.exit1335 ], [ %5146, %5761 ] + %6157 = icmp slt i32 %6151, %5551, !dbg !255 + %6158 = icmp slt i32 %6151, %5552, !dbg !255 + %6159 = add i32 %6011, 1, !dbg !255 + %6160 = icmp sgt i32 %6159, 1, !dbg !255 + %6161 = select i1 %6160, i32 0, i32 %6159, !dbg !255 + %6162 = add i32 %6013, 1, !dbg !255 + %6163 = icmp sgt i32 %6162, 2, !dbg !255 + %6164 = select i1 %6163, i32 0, i32 %6162, !dbg !255 + %6165 = icmp slt <2 x i32> %6153, %5739, !dbg !301 + %6166 = icmp slt i32 %5997, %18, !dbg !301 + %6167 = icmp slt i32 %5998, %18, !dbg !301 + %6168 = icmp slt i32 %5999, %18, !dbg !301 + %6169 = icmp slt i32 %6000, %18, !dbg !301 + %6170 = icmp slt i32 %6001, %18, !dbg !301 + %6171 = icmp slt i32 %6002, %18, !dbg !301 + %6172 = icmp slt i32 %6003, %18, !dbg !301 + %6173 = icmp slt i32 %6004, %18, !dbg !301 + %6174 = icmp slt i32 %6005, %18, !dbg !301 
+ %6175 = icmp slt i32 %6006, %18, !dbg !301 + %6176 = icmp slt i32 %6007, %18, !dbg !301 + %6177 = icmp slt i32 %6008, %18, !dbg !301 + %6178 = icmp slt i32 %6009, %18, !dbg !301 + %6179 = icmp slt i32 %6010, %18, !dbg !301 + tail call void @llvm.nvvm.cp.async.wait.group(i32 4), !dbg !293 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !293 + %6180 = shl i32 %6164, 13, !dbg !293 + %6181 = getelementptr bfloat, ptr addrspace(3) @global_smem, i32 %6180, !dbg !293 + %6182 = shl i32 %6161, 6, !dbg !295 + %6183 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %6182, !dbg !295 + %6184 = getelementptr inbounds nuw i8, ptr addrspace(3) %6183, i32 %5332, !dbg !295 + %6185 = load float, ptr addrspace(3) %6184, align 8, !dbg !295 + %6186 = getelementptr inbounds nuw i8, ptr addrspace(3) %6184, i32 4, !dbg !295 + %6187 = load float, ptr addrspace(3) %6186, align 4, !dbg !295 + %6188 = getelementptr inbounds nuw i8, ptr addrspace(3) %6183, i32 %5336, !dbg !295 + %6189 = load float, ptr addrspace(3) %6188, align 8, !dbg !295 + %6190 = getelementptr inbounds nuw i8, ptr addrspace(3) %6188, i32 4, !dbg !295 + %6191 = load float, ptr addrspace(3) %6190, align 4, !dbg !295 + %6192 = getelementptr inbounds nuw i8, ptr addrspace(3) %6183, i32 %5342, !dbg !295 + %6193 = load float, ptr addrspace(3) %6192, align 8, !dbg !295 + %6194 = getelementptr inbounds nuw i8, ptr addrspace(3) %6192, i32 4, !dbg !295 + %6195 = load float, ptr addrspace(3) %6194, align 4, !dbg !295 + %6196 = getelementptr inbounds nuw i8, ptr addrspace(3) %6183, i32 %5348, !dbg !295 + %6197 = load float, ptr addrspace(3) %6196, align 8, !dbg !295 + %6198 = getelementptr inbounds nuw i8, ptr addrspace(3) %6196, i32 4, !dbg !295 + %6199 = load float, ptr addrspace(3) %6198, align 4, !dbg !295 + %6200 = getelementptr inbounds nuw i8, ptr addrspace(3) %6183, i32 %5354, !dbg !295 + %6201 = load float, ptr addrspace(3) %6200, align 8, !dbg 
!295 + %6202 = getelementptr inbounds nuw i8, ptr addrspace(3) %6200, i32 4, !dbg !295 + %6203 = load float, ptr addrspace(3) %6202, align 4, !dbg !295 + %6204 = getelementptr inbounds nuw i8, ptr addrspace(3) %6183, i32 %5360, !dbg !295 + %6205 = load float, ptr addrspace(3) %6204, align 8, !dbg !295 + %6206 = getelementptr inbounds nuw i8, ptr addrspace(3) %6204, i32 4, !dbg !295 + %6207 = load float, ptr addrspace(3) %6206, align 4, !dbg !295 + %6208 = getelementptr inbounds nuw i8, ptr addrspace(3) %6183, i32 %5366, !dbg !295 + %6209 = load float, ptr addrspace(3) %6208, align 8, !dbg !295 + %6210 = getelementptr inbounds nuw i8, ptr addrspace(3) %6208, i32 4, !dbg !295 + %6211 = load float, ptr addrspace(3) %6210, align 4, !dbg !295 + %6212 = getelementptr inbounds nuw i8, ptr addrspace(3) %6183, i32 %5372, !dbg !295 + %6213 = load float, ptr addrspace(3) %6212, align 8, !dbg !295 + %6214 = getelementptr inbounds nuw i8, ptr addrspace(3) %6212, i32 4, !dbg !295 + %6215 = load float, ptr addrspace(3) %6214, align 4, !dbg !295 + %6216 = fcmp oeq float %6185, 0xFFF0000000000000, !dbg !302 + %6217 = fcmp oeq float %6187, 0xFFF0000000000000, !dbg !302 + %6218 = fcmp oeq float %6189, 0xFFF0000000000000, !dbg !302 + %6219 = fcmp oeq float %6191, 0xFFF0000000000000, !dbg !302 + %6220 = fcmp oeq float %6193, 0xFFF0000000000000, !dbg !302 + %6221 = fcmp oeq float %6195, 0xFFF0000000000000, !dbg !302 + %6222 = fcmp oeq float %6197, 0xFFF0000000000000, !dbg !302 + %6223 = fcmp oeq float %6199, 0xFFF0000000000000, !dbg !302 + %6224 = fcmp oeq float %6201, 0xFFF0000000000000, !dbg !302 + %6225 = fcmp oeq float %6203, 0xFFF0000000000000, !dbg !302 + %6226 = fcmp oeq float %6205, 0xFFF0000000000000, !dbg !302 + %6227 = fcmp oeq float %6207, 0xFFF0000000000000, !dbg !302 + %6228 = fcmp oeq float %6209, 0xFFF0000000000000, !dbg !302 + %6229 = fcmp oeq float %6211, 0xFFF0000000000000, !dbg !302 + %6230 = fcmp oeq float %6213, 0xFFF0000000000000, !dbg !302 + %6231 = fcmp oeq 
float %6215, 0xFFF0000000000000, !dbg !302 + %6232 = select i1 %6216, float 0.000000e+00, float %6185, !dbg !303 + %6233 = select i1 %6217, float 0.000000e+00, float %6187, !dbg !303 + %6234 = select i1 %6218, float 0.000000e+00, float %6189, !dbg !303 + %6235 = select i1 %6219, float 0.000000e+00, float %6191, !dbg !303 + %6236 = select i1 %6220, float 0.000000e+00, float %6193, !dbg !303 + %6237 = select i1 %6221, float 0.000000e+00, float %6195, !dbg !303 + %6238 = select i1 %6222, float 0.000000e+00, float %6197, !dbg !303 + %6239 = select i1 %6223, float 0.000000e+00, float %6199, !dbg !303 + %6240 = select i1 %6224, float 0.000000e+00, float %6201, !dbg !303 + %6241 = select i1 %6225, float 0.000000e+00, float %6203, !dbg !303 + %6242 = select i1 %6226, float 0.000000e+00, float %6205, !dbg !303 + %6243 = select i1 %6227, float 0.000000e+00, float %6207, !dbg !303 + %6244 = select i1 %6228, float 0.000000e+00, float %6209, !dbg !303 + %6245 = select i1 %6229, float 0.000000e+00, float %6211, !dbg !303 + %6246 = select i1 %6230, float 0.000000e+00, float %6213, !dbg !303 + %6247 = select i1 %6231, float 0.000000e+00, float %6215, !dbg !303 + %6248 = tail call i32 @llvm.nvvm.shfl.sync.idx.i32(i32 -1, i32 %57, i32 0, i32 31), !dbg !275 + tail call void @llvm.nvvm.wgmma.fence.sync.aligned(), !dbg !275 + %6249 = shl i32 %6248, 11, !dbg !275 + %6250 = and i32 %6249, 8192, !dbg !275 + %6251 = add i32 %6250, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328) to i32), !dbg !275 + %6252 = lshr exact i32 %6251, 4, !dbg !275 + %6253 = and i32 %6252, 16383, !dbg !275 + %6254 = zext nneg i32 %6253 to i64, !dbg !275 + %6255 = or disjoint i64 %6254, 4611686293372403712, !dbg !275 + %6256 = ptrtoint ptr addrspace(3) %6181 to i32, !dbg !275 + %6257 = lshr exact i32 %6256, 4, !dbg !275 + %6258 = and i32 %6257, 16383, !dbg !275 + %6259 = zext nneg i32 %6258 to i64, !dbg !275 + %6260 = or disjoint i64 %6259, 4611686293338849280, !dbg !275 + 
%6261 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $32, $33, 0, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,l,l"(i64 %6255, i64 %6260) #3, !dbg !275 + %6262 = or disjoint i32 %6250, 32, !dbg !275 + %6263 = add i32 %6262, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328) to i32), !dbg !275 + %6264 = lshr exact i32 %6263, 4, !dbg !275 + %6265 = and i32 %6264, 16383, !dbg !275 + %6266 = zext nneg i32 %6265 to i64, !dbg !275 + %6267 = or disjoint i64 %6266, 4611686293372403712, !dbg !275 + %6268 = add i32 %6256, 32, !dbg !275 + %6269 = lshr exact i32 %6268, 4, !dbg !275 + %6270 = and i32 %6269, 16383, !dbg !275 + %6271 = zext nneg i32 %6270 to i64, !dbg !275 + %6272 = or disjoint i64 %6271, 4611686293338849280, !dbg !275 + %6273 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6261, 0, !dbg !275 + %6274 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6261, 1, !dbg !275 + %6275 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float } %6261, 2, !dbg !275 + %6276 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6261, 3, !dbg !275 + %6277 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6261, 4, !dbg !275 + %6278 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6261, 5, !dbg !275 + %6279 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6261, 6, !dbg !275 + %6280 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6261, 7, !dbg !275 + %6281 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6261, 8, !dbg !275 + %6282 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6261, 9, !dbg !275 + %6283 = extractvalue { float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6261, 10, !dbg !275 + %6284 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6261, 11, !dbg !275 + %6285 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6261, 12, !dbg !275 + %6286 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6261, 13, !dbg !275 + %6287 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6261, 14, !dbg !275 + %6288 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6261, 15, !dbg !275 + %6289 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6261, 16, !dbg !275 + %6290 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float } %6261, 17, !dbg !275 + %6291 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6261, 18, !dbg !275 + %6292 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6261, 19, !dbg !275 + %6293 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6261, 20, !dbg !275 + %6294 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6261, 21, !dbg !275 + %6295 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6261, 22, !dbg !275 + %6296 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6261, 23, !dbg !275 + %6297 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6261, 24, !dbg !275 + %6298 = 
extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6261, 25, !dbg !275 + %6299 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6261, 26, !dbg !275 + %6300 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6261, 27, !dbg !275 + %6301 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6261, 28, !dbg !275 + %6302 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6261, 29, !dbg !275 + %6303 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6261, 30, !dbg !275 + %6304 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6261, 31, !dbg !275 + %6305 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %6273, float %6274, float %6275, float %6276, float %6277, float %6278, float %6279, float %6280, float %6281, float %6282, float %6283, float %6284, float %6285, float %6286, float %6287, float %6288, float %6289, float %6290, float %6291, float %6292, float %6293, float %6294, float %6295, float %6296, float %6297, float %6298, float %6299, float %6300, float %6301, float %6302, float %6303, float %6304, i64 %6267, i64 %6272, i1 true) #3, !dbg !275 + %6306 = or disjoint i32 %6250, 64, !dbg !275 + %6307 = add i32 %6306, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328) to i32), !dbg !275 + %6308 = lshr exact i32 %6307, 4, !dbg !275 + %6309 = and i32 %6308, 16383, !dbg !275 + %6310 = zext nneg i32 %6309 to i64, !dbg !275 + %6311 = or disjoint i64 %6310, 4611686293372403712, !dbg !275 + %6312 = add i32 %6256, 64, !dbg !275 + %6313 = lshr exact i32 %6312, 4, !dbg !275 + %6314 = and i32 %6313, 16383, !dbg !275 + %6315 = zext nneg i32 %6314 to i64, !dbg !275 + %6316 = or disjoint i64 %6315, 4611686293338849280, !dbg !275 + %6317 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6305, 0, !dbg !275 + %6318 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6305, 1, !dbg !275 + %6319 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6305, 2, !dbg !275 + %6320 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6305, 3, !dbg !275 + %6321 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6305, 4, !dbg !275 + %6322 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6305, 5, !dbg !275 + %6323 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6305, 6, !dbg !275 + %6324 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6305, 7, !dbg !275 + %6325 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float } %6305, 8, !dbg !275 + %6326 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6305, 9, !dbg !275 + %6327 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6305, 10, !dbg !275 + %6328 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6305, 11, !dbg !275 + %6329 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6305, 12, !dbg !275 + %6330 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6305, 13, !dbg !275 + %6331 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6305, 14, !dbg !275 + %6332 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6305, 15, !dbg !275 + %6333 = extractvalue { float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6305, 16, !dbg !275 + %6334 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6305, 17, !dbg !275 + %6335 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6305, 18, !dbg !275 + %6336 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6305, 19, !dbg !275 + %6337 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6305, 20, !dbg !275 + %6338 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6305, 21, !dbg !275 + %6339 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6305, 22, !dbg !275 + %6340 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6305, 23, !dbg !275 + %6341 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6305, 24, !dbg !275 + %6342 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6305, 25, !dbg !275 + %6343 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6305, 26, !dbg !275 + %6344 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6305, 27, !dbg !275 + %6345 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6305, 28, !dbg !275 + %6346 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6305, 29, !dbg !275 + %6347 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float } %6305, 30, !dbg !275 + %6348 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6305, 31, !dbg !275 + %6349 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %6317, float %6318, float %6319, float %6320, float %6321, float %6322, float %6323, float %6324, float %6325, float %6326, float %6327, float %6328, float %6329, float %6330, float %6331, float %6332, float %6333, float %6334, float %6335, float %6336, float %6337, float %6338, float %6339, float %6340, float %6341, float %6342, float %6343, float %6344, float %6345, float %6346, float %6347, float %6348, i64 %6311, i64 %6316, i1 true) #3, !dbg !275 + %6350 = or disjoint i32 %6250, 96, !dbg !275 + %6351 = add i32 %6350, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328) to i32), !dbg !275 + %6352 = lshr exact i32 %6351, 4, !dbg !275 + %6353 = and i32 %6352, 16383, !dbg !275 + %6354 = zext nneg i32 %6353 to i64, !dbg !275 + %6355 = or disjoint i64 %6354, 4611686293372403712, !dbg !275 + %6356 = add i32 %6256, 96, !dbg !275 + %6357 = lshr exact i32 %6356, 4, !dbg !275 + %6358 = and i32 %6357, 16383, !dbg !275 + %6359 = zext nneg i32 %6358 to i64, !dbg !275 + %6360 = or disjoint i64 
%6359, 4611686293338849280, !dbg !275 + %6361 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6349, 0, !dbg !275 + %6362 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6349, 1, !dbg !275 + %6363 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6349, 2, !dbg !275 + %6364 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6349, 3, !dbg !275 + %6365 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6349, 4, !dbg !275 + %6366 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6349, 5, !dbg !275 + %6367 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6349, 6, !dbg !275 + %6368 = extractvalue { float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6349, 7, !dbg !275 + %6369 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6349, 8, !dbg !275 + %6370 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6349, 9, !dbg !275 + %6371 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6349, 10, !dbg !275 + %6372 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6349, 11, !dbg !275 + %6373 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6349, 12, !dbg !275 + %6374 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6349, 13, !dbg !275 + %6375 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float } %6349, 14, !dbg !275 + %6376 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6349, 15, !dbg !275 + %6377 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6349, 16, !dbg !275 + %6378 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6349, 17, !dbg !275 + %6379 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6349, 18, !dbg !275 + %6380 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6349, 19, !dbg !275 + %6381 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6349, 20, !dbg !275 + %6382 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6349, 21, !dbg !275 + 
%6383 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6349, 22, !dbg !275 + %6384 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6349, 23, !dbg !275 + %6385 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6349, 24, !dbg !275 + %6386 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6349, 25, !dbg !275 + %6387 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6349, 26, !dbg !275 + %6388 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6349, 27, !dbg !275 + %6389 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6349, 28, !dbg !275 + %6390 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6349, 29, !dbg !275 + %6391 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6349, 30, !dbg !275 + %6392 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6349, 31, !dbg !275 + %6393 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %6361, float %6362, float %6363, float %6364, float %6365, float %6366, float %6367, float %6368, float %6369, float %6370, float %6371, float %6372, float %6373, float %6374, float %6375, float %6376, float %6377, float %6378, float %6379, float %6380, float %6381, float %6382, float %6383, float %6384, float %6385, float %6386, float %6387, float %6388, float %6389, float %6390, float %6391, float %6392, i64 %6355, i64 %6360, i1 true) #3, !dbg !275 + %6394 = or disjoint i32 %6250, 16384, !dbg !275 + %6395 = add i32 %6394, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328) to i32), 
!dbg !275 + %6396 = lshr exact i32 %6395, 4, !dbg !275 + %6397 = and i32 %6396, 16383, !dbg !275 + %6398 = zext nneg i32 %6397 to i64, !dbg !275 + %6399 = or disjoint i64 %6398, 4611686293372403712, !dbg !275 + %6400 = add i32 %6256, 8192, !dbg !275 + %6401 = lshr exact i32 %6400, 4, !dbg !275 + %6402 = and i32 %6401, 16383, !dbg !275 + %6403 = zext nneg i32 %6402 to i64, !dbg !275 + %6404 = or disjoint i64 %6403, 4611686293338849280, !dbg !275 + %6405 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6393, 0, !dbg !275 + %6406 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6393, 1, !dbg !275 + %6407 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6393, 2, !dbg !275 + %6408 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6393, 3, !dbg !275 + %6409 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6393, 4, !dbg !275 + %6410 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float } %6393, 5, !dbg !275 + %6411 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6393, 6, !dbg !275 + %6412 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6393, 7, !dbg !275 + %6413 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6393, 8, !dbg !275 + %6414 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6393, 9, !dbg !275 + %6415 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6393, 10, !dbg !275 + %6416 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6393, 11, !dbg !275 + %6417 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6393, 12, !dbg !275 + %6418 = extractvalue { float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6393, 13, !dbg !275 + %6419 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6393, 14, !dbg !275 + %6420 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6393, 15, !dbg !275 + %6421 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6393, 16, !dbg !275 + %6422 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6393, 17, !dbg !275 + %6423 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6393, 18, !dbg !275 + %6424 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6393, 19, !dbg !275 + %6425 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6393, 20, !dbg !275 + %6426 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6393, 21, !dbg !275 + %6427 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6393, 22, !dbg !275 + %6428 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6393, 23, !dbg !275 + %6429 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6393, 24, !dbg !275 + %6430 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6393, 25, !dbg !275 + %6431 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6393, 26, !dbg !275 + %6432 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float } %6393, 27, !dbg !275 + %6433 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6393, 28, !dbg !275 + %6434 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6393, 29, !dbg !275 + %6435 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6393, 30, !dbg !275 + %6436 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6393, 31, !dbg !275 + %6437 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %6405, float %6406, float %6407, float %6408, float %6409, float %6410, float %6411, float %6412, float %6413, float %6414, float %6415, float %6416, float %6417, float %6418, float %6419, float %6420, float 
%6421, float %6422, float %6423, float %6424, float %6425, float %6426, float %6427, float %6428, float %6429, float %6430, float %6431, float %6432, float %6433, float %6434, float %6435, float %6436, i64 %6399, i64 %6404, i1 true) #3, !dbg !275 + %6438 = or disjoint i32 %6250, 16416, !dbg !275 + %6439 = add i32 %6438, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328) to i32), !dbg !275 + %6440 = lshr exact i32 %6439, 4, !dbg !275 + %6441 = and i32 %6440, 16383, !dbg !275 + %6442 = zext nneg i32 %6441 to i64, !dbg !275 + %6443 = or disjoint i64 %6442, 4611686293372403712, !dbg !275 + %6444 = add i32 %6256, 8224, !dbg !275 + %6445 = lshr exact i32 %6444, 4, !dbg !275 + %6446 = and i32 %6445, 16383, !dbg !275 + %6447 = zext nneg i32 %6446 to i64, !dbg !275 + %6448 = or disjoint i64 %6447, 4611686293338849280, !dbg !275 + %6449 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6437, 0, !dbg !275 + %6450 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6437, 1, !dbg !275 + %6451 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6437, 2, !dbg !275 + %6452 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6437, 3, !dbg !275 + %6453 = extractvalue { float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6437, 4, !dbg !275 + %6454 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6437, 5, !dbg !275 + %6455 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6437, 6, !dbg !275 + %6456 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6437, 7, !dbg !275 + %6457 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6437, 8, !dbg !275 + %6458 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6437, 9, !dbg !275 + %6459 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6437, 10, !dbg !275 + %6460 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6437, 11, !dbg !275 + %6461 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6437, 12, !dbg !275 + %6462 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6437, 13, !dbg !275 + %6463 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6437, 14, !dbg !275 + %6464 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6437, 15, !dbg !275 + %6465 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6437, 16, !dbg !275 + %6466 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6437, 17, !dbg !275 + %6467 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } 
%6437, 18, !dbg !275 + %6468 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6437, 19, !dbg !275 + %6469 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6437, 20, !dbg !275 + %6470 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6437, 21, !dbg !275 + %6471 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6437, 22, !dbg !275 + %6472 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6437, 23, !dbg !275 + %6473 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6437, 24, !dbg !275 + %6474 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6437, 25, !dbg !275 + %6475 = extractvalue { float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6437, 26, !dbg !275 + %6476 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6437, 27, !dbg !275 + %6477 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6437, 28, !dbg !275 + %6478 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6437, 29, !dbg !275 + %6479 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6437, 30, !dbg !275 + %6480 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6437, 31, !dbg !275 + %6481 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 
1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %6449, float %6450, float %6451, float %6452, float %6453, float %6454, float %6455, float %6456, float %6457, float %6458, float %6459, float %6460, float %6461, float %6462, float %6463, float %6464, float %6465, float %6466, float %6467, float %6468, float %6469, float %6470, float %6471, float %6472, float %6473, float %6474, float %6475, float %6476, float %6477, float %6478, float %6479, float %6480, i64 %6443, i64 %6448, i1 true) #3, !dbg !275 + %6482 = or disjoint i32 %6250, 16448, !dbg !275 + %6483 = add i32 %6482, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328) to i32), !dbg !275 + %6484 = lshr exact i32 %6483, 4, !dbg !275 + %6485 = and i32 %6484, 16383, !dbg !275 + %6486 = zext nneg i32 %6485 to i64, !dbg !275 + %6487 = or disjoint i64 %6486, 4611686293372403712, !dbg !275 + %6488 = add i32 %6256, 8256, !dbg !275 + %6489 = lshr exact i32 %6488, 4, !dbg !275 + %6490 = and i32 %6489, 16383, !dbg !275 + %6491 = zext nneg i32 %6490 to i64, !dbg !275 + %6492 = or disjoint i64 %6491, 4611686293338849280, !dbg !275 + %6493 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6481, 0, !dbg !275 + %6494 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6481, 1, !dbg !275 + %6495 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float } %6481, 2, !dbg !275 + %6496 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6481, 3, !dbg !275 + %6497 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6481, 4, !dbg !275 + %6498 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6481, 5, !dbg !275 + %6499 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6481, 6, !dbg !275 + %6500 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6481, 7, !dbg !275 + %6501 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6481, 8, !dbg !275 + %6502 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6481, 9, !dbg !275 + %6503 = 
extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6481, 10, !dbg !275 + %6504 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6481, 11, !dbg !275 + %6505 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6481, 12, !dbg !275 + %6506 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6481, 13, !dbg !275 + %6507 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6481, 14, !dbg !275 + %6508 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6481, 15, !dbg !275 + %6509 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6481, 16, !dbg !275 + %6510 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6481, 17, !dbg !275 + %6511 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6481, 18, !dbg !275 + %6512 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6481, 19, !dbg !275 + %6513 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6481, 20, !dbg !275 + %6514 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6481, 21, !dbg !275 + %6515 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6481, 22, !dbg !275 + %6516 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6481, 23, !dbg !275 + %6517 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float } %6481, 24, !dbg !275 + %6518 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6481, 25, !dbg !275 + %6519 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6481, 26, !dbg !275 + %6520 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6481, 27, !dbg !275 + %6521 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6481, 28, !dbg !275 + %6522 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6481, 29, !dbg !275 + %6523 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6481, 30, !dbg !275 + %6524 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6481, 31, !dbg !275 + %6525 = tail call { float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %6493, float %6494, float %6495, float %6496, float %6497, float %6498, float %6499, float %6500, float %6501, float %6502, float %6503, float %6504, float %6505, float %6506, float %6507, float %6508, float %6509, float %6510, float %6511, float %6512, float %6513, float %6514, float %6515, float %6516, float %6517, float %6518, float %6519, float %6520, float %6521, float %6522, float %6523, float %6524, i64 %6487, i64 %6492, i1 true) #3, !dbg !275 + %6526 = or disjoint i32 %6250, 16480, !dbg !275 + %6527 = add i32 %6526, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328) to i32), !dbg !275 + %6528 = lshr exact i32 %6527, 4, !dbg !275 + %6529 = and i32 %6528, 16383, !dbg !275 + %6530 = zext nneg i32 %6529 to i64, !dbg !275 + %6531 = or disjoint i64 %6530, 4611686293372403712, !dbg !275 + %6532 = add i32 %6256, 8288, !dbg !275 + %6533 = lshr exact i32 %6532, 4, !dbg !275 + %6534 = and i32 %6533, 16383, !dbg !275 + %6535 = zext nneg i32 %6534 to i64, !dbg !275 + %6536 = or disjoint i64 %6535, 4611686293338849280, !dbg !275 + %6537 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6525, 0, !dbg !275 + %6538 = extractvalue { 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6525, 1, !dbg !275 + %6539 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6525, 2, !dbg !275 + %6540 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6525, 3, !dbg !275 + %6541 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6525, 4, !dbg !275 + %6542 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6525, 5, !dbg !275 + %6543 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6525, 6, !dbg !275 + %6544 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6525, 7, !dbg !275 + %6545 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6525, 8, !dbg !275 + %6546 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6525, 9, !dbg !275 + %6547 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6525, 10, !dbg !275 + %6548 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6525, 11, !dbg !275 + %6549 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6525, 12, !dbg !275 + %6550 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6525, 13, !dbg !275 + %6551 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6525, 14, !dbg !275 + %6552 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float } %6525, 15, !dbg !275 + %6553 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6525, 16, !dbg !275 + %6554 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6525, 17, !dbg !275 + %6555 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6525, 18, !dbg !275 + %6556 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6525, 19, !dbg !275 + %6557 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6525, 20, !dbg !275 + %6558 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6525, 21, !dbg !275 + %6559 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6525, 22, !dbg !275 + %6560 = extractvalue { float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6525, 23, !dbg !275 + %6561 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6525, 24, !dbg !275 + %6562 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6525, 25, !dbg !275 + %6563 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6525, 26, !dbg !275 + %6564 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6525, 27, !dbg !275 + %6565 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6525, 28, !dbg !275 + %6566 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6525, 29, !dbg !275 + %6567 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float } %6525, 30, !dbg !275 + %6568 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6525, 31, !dbg !275 + %6569 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %6537, float %6538, float %6539, float %6540, float %6541, float %6542, float %6543, float %6544, float %6545, float %6546, float %6547, float %6548, float %6549, float %6550, float %6551, float %6552, float %6553, float %6554, float %6555, float %6556, float %6557, float %6558, float %6559, float %6560, float %6561, float %6562, float %6563, float %6564, float %6565, float %6566, float %6567, float %6568, i64 %6531, i64 %6536, i1 true) #3, !dbg !275 + %6570 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6569, 0, !dbg !275 + %6571 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float } %6569, 1, !dbg !275 + %6572 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6569, 2, !dbg !275 + %6573 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6569, 3, !dbg !275 + %6574 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6569, 4, !dbg !275 + %6575 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6569, 5, !dbg !275 + %6576 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6569, 6, !dbg !275 + %6577 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6569, 7, !dbg !275 + %6578 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6569, 8, !dbg !275 + %6579 = extractvalue { float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6569, 9, !dbg !275 + %6580 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6569, 10, !dbg !275 + %6581 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6569, 11, !dbg !275 + %6582 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6569, 12, !dbg !275 + %6583 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6569, 13, !dbg !275 + %6584 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6569, 14, !dbg !275 + %6585 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6569, 15, !dbg !275 + %6586 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float } %6569, 16, !dbg !275 + %6587 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6569, 17, !dbg !275 + %6588 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6569, 18, !dbg !275 + %6589 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6569, 19, !dbg !275 + %6590 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6569, 20, !dbg !275 + %6591 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6569, 21, !dbg !275 + %6592 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6569, 22, !dbg !275 + %6593 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6569, 23, !dbg !275 + 
%6594 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6569, 24, !dbg !275 + %6595 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6569, 25, !dbg !275 + %6596 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6569, 26, !dbg !275 + %6597 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6569, 27, !dbg !275 + %6598 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6569, 28, !dbg !275 + %6599 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6569, 29, !dbg !275 + %6600 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6569, 30, !dbg !275 + %6601 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6569, 31, !dbg !275 + tail call void @llvm.nvvm.wgmma.commit_group.sync.aligned(), !dbg !275 + %6602 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } asm sideeffect "// wait for regs: $0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37\0A\09wgmma.wait_group.sync.aligned 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37"(float %6570, float %6571, float %6572, float %6573, float %6574, float %6575, float %6576, float %6577, float %6578, float %6579, float %6580, float %6581, float %6582, float %6583, float %6584, float %6585, float %6586, float %6587, float %6588, float %6589, float %6590, float %6591, float %6592, float %6593, float %6594, float %6595, float %6596, float %6597, float %6598, float %6599, float %6600, float %6601, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328), i32 0, i32 0, ptr addrspace(3) %6181, i32 0, i32 0) #3, !dbg !275 + %6603 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6602, 0, !dbg !275 + %6604 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6602, 1, !dbg !275 + %6605 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6602, 2, !dbg !275 + %6606 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6602, 3, !dbg !275 + %6607 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6602, 4, !dbg !275 + %6608 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6602, 5, !dbg !275 + %6609 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6602, 6, !dbg !275 + %6610 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6602, 7, !dbg !275 + %6611 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6602, 8, !dbg !275 + %6612 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6602, 9, !dbg !275 + %6613 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6602, 10, !dbg !275 + %6614 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6602, 11, !dbg !275 + %6615 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6602, 12, !dbg !275 + %6616 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6602, 13, !dbg !275 + %6617 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6602, 14, !dbg !275 + %6618 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6602, 15, !dbg !275 + %6619 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6602, 16, !dbg !275 + %6620 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6602, 17, !dbg !275 + %6621 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6602, 18, !dbg !275 + %6622 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6602, 19, !dbg !275 + %6623 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6602, 20, !dbg !275 + %6624 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6602, 21, !dbg !275 + %6625 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6602, 22, !dbg !275 + %6626 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6602, 23, !dbg !275 + %6627 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6602, 24, !dbg !275 + %6628 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, 
ptr addrspace(3), i32, i32 } %6602, 25, !dbg !275 + %6629 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6602, 26, !dbg !275 + %6630 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6602, 27, !dbg !275 + %6631 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6602, 28, !dbg !275 + %6632 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6602, 29, !dbg !275 + %6633 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6602, 30, !dbg !275 + %6634 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6602, 
31, !dbg !275 + %6635 = fmul float %6603, 0x3FB6A09E60000000, !dbg !304 + %6636 = fmul float %6604, 0x3FB6A09E60000000, !dbg !304 + %6637 = fmul float %6605, 0x3FB6A09E60000000, !dbg !304 + %6638 = fmul float %6606, 0x3FB6A09E60000000, !dbg !304 + %6639 = fmul float %6607, 0x3FB6A09E60000000, !dbg !304 + %6640 = fmul float %6608, 0x3FB6A09E60000000, !dbg !304 + %6641 = fmul float %6609, 0x3FB6A09E60000000, !dbg !304 + %6642 = fmul float %6610, 0x3FB6A09E60000000, !dbg !304 + %6643 = fmul float %6611, 0x3FB6A09E60000000, !dbg !304 + %6644 = fmul float %6612, 0x3FB6A09E60000000, !dbg !304 + %6645 = fmul float %6613, 0x3FB6A09E60000000, !dbg !304 + %6646 = fmul float %6614, 0x3FB6A09E60000000, !dbg !304 + %6647 = fmul float %6615, 0x3FB6A09E60000000, !dbg !304 + %6648 = fmul float %6616, 0x3FB6A09E60000000, !dbg !304 + %6649 = fmul float %6617, 0x3FB6A09E60000000, !dbg !304 + %6650 = fmul float %6618, 0x3FB6A09E60000000, !dbg !304 + %6651 = fmul float %6619, 0x3FB6A09E60000000, !dbg !304 + %6652 = fmul float %6620, 0x3FB6A09E60000000, !dbg !304 + %6653 = fmul float %6621, 0x3FB6A09E60000000, !dbg !304 + %6654 = fmul float %6622, 0x3FB6A09E60000000, !dbg !304 + %6655 = fmul float %6623, 0x3FB6A09E60000000, !dbg !304 + %6656 = fmul float %6624, 0x3FB6A09E60000000, !dbg !304 + %6657 = fmul float %6625, 0x3FB6A09E60000000, !dbg !304 + %6658 = fmul float %6626, 0x3FB6A09E60000000, !dbg !304 + %6659 = fmul float %6627, 0x3FB6A09E60000000, !dbg !304 + %6660 = fmul float %6628, 0x3FB6A09E60000000, !dbg !304 + %6661 = fmul float %6629, 0x3FB6A09E60000000, !dbg !304 + %6662 = fmul float %6630, 0x3FB6A09E60000000, !dbg !304 + %6663 = fmul float %6631, 0x3FB6A09E60000000, !dbg !304 + %6664 = fmul float %6632, 0x3FB6A09E60000000, !dbg !304 + %6665 = fmul float %6633, 0x3FB6A09E60000000, !dbg !304 + %6666 = fmul float %6634, 0x3FB6A09E60000000, !dbg !304 + %6667 = srem i32 %5994, %18, !dbg !253 + %6668 = srem i32 %5995, %18, !dbg !253 + %6669 = icmp sge i32 %6667, %5183, !dbg !305 
+ %6670 = icmp sge i32 %6668, %5183, !dbg !305 + %6671 = icmp sge i32 %6667, %5185, !dbg !305 + %6672 = icmp sge i32 %6668, %5185, !dbg !305 + %6673 = sext i32 %6667 to i64, !dbg !306 + %6674 = sext i32 %6668 to i64, !dbg !306 + %6675 = icmp sgt i64 %5182, %6673, !dbg !307 + %6676 = icmp sgt i64 %5182, %6674, !dbg !307 + %6677 = and i1 %6669, %6675, !dbg !308 + %6678 = and i1 %6670, %6676, !dbg !308 + %6679 = and i1 %6671, %6675, !dbg !308 + %6680 = and i1 %6672, %6676, !dbg !308 + %6681 = sub i32 %5183, %6667, !dbg !309 + %6682 = sub i32 %5183, %6668, !dbg !309 + %6683 = sub i32 %5185, %6667, !dbg !309 + %6684 = sub i32 %5185, %6668, !dbg !309 + %6685 = srem i32 %6681, %26, !dbg !310 + %6686 = srem i32 %6682, %26, !dbg !310 + %6687 = srem i32 %6683, %26, !dbg !310 + %6688 = srem i32 %6684, %26, !dbg !310 + %6689 = icmp ne i32 %6685, 0, !dbg !311 + %6690 = icmp ne i32 %6686, 0, !dbg !311 + %6691 = icmp ne i32 %6687, 0, !dbg !311 + %6692 = icmp ne i32 %6688, 0, !dbg !311 + %6693 = shufflevector <4 x i32> %6156, <4 x i32> poison, <8 x i32> , !dbg !253 + %6694 = srem <8 x i32> %6693, %5741, !dbg !253 + %6695 = extractelement <8 x i32> %6694, i64 5, !dbg !306 + %6696 = icmp sge i32 %6695, %5183, !dbg !305 + %6697 = extractelement <8 x i32> %6694, i64 4, !dbg !306 + %6698 = icmp sge i32 %6697, %5183, !dbg !305 + %6699 = icmp sge i32 %6695, %5185, !dbg !305 + %6700 = icmp sge i32 %6697, %5185, !dbg !305 + %6701 = extractelement <8 x i32> %6694, i64 1, !dbg !306 + %6702 = icmp sge i32 %6701, %5183, !dbg !305 + %6703 = extractelement <8 x i32> %6694, i64 0, !dbg !306 + %6704 = icmp sge i32 %6703, %5183, !dbg !305 + %6705 = icmp sge i32 %6701, %5185, !dbg !305 + %6706 = icmp sge i32 %6703, %5185, !dbg !305 + %6707 = sext i32 %6695 to i64, !dbg !306 + %6708 = sext i32 %6697 to i64, !dbg !306 + %6709 = sext i32 %6701 to i64, !dbg !306 + %6710 = sext i32 %6703 to i64, !dbg !306 + %6711 = icmp sgt i64 %5182, %6707, !dbg !307 + %6712 = icmp sgt i64 %5182, %6708, !dbg !307 + 
%6713 = icmp sgt i64 %5182, %6709, !dbg !307 + %6714 = icmp sgt i64 %5182, %6710, !dbg !307 + %6715 = and i1 %6696, %6711, !dbg !308 + %6716 = and i1 %6698, %6712, !dbg !308 + %6717 = and i1 %6699, %6711, !dbg !308 + %6718 = and i1 %6700, %6712, !dbg !308 + %6719 = and i1 %6702, %6713, !dbg !308 + %6720 = and i1 %6704, %6714, !dbg !308 + %6721 = and i1 %6705, %6713, !dbg !308 + %6722 = and i1 %6706, %6714, !dbg !308 + %6723 = sub <8 x i32> %5742, %6694, !dbg !309 + %6724 = srem <8 x i32> %6723, %5744, !dbg !310 + %6725 = icmp ne <8 x i32> %6724, zeroinitializer, !dbg !311 + %6726 = xor i32 %6685, %26, !dbg !312 + %6727 = icmp slt i32 %6726, 0, !dbg !312 + %6728 = xor i32 %6686, %26, !dbg !312 + %6729 = icmp slt i32 %6728, 0, !dbg !312 + %6730 = xor i32 %6687, %26, !dbg !312 + %6731 = icmp slt i32 %6730, 0, !dbg !312 + %6732 = extractelement <8 x i32> %6724, i64 7, !dbg !313 + %6733 = extractelement <8 x i32> %6724, i64 6, !dbg !313 + %6734 = extractelement <8 x i32> %6724, i64 5, !dbg !313 + %6735 = extractelement <8 x i32> %6724, i64 4, !dbg !313 + %6736 = extractelement <8 x i32> %6724, i64 3, !dbg !313 + %6737 = extractelement <8 x i32> %6724, i64 2, !dbg !313 + %6738 = extractelement <8 x i32> %6724, i64 1, !dbg !313 + %6739 = insertelement <8 x i32> %6724, i32 %6688, i64 0, !dbg !312 + %6740 = shufflevector <8 x i32> %6739, <8 x i32> %6724, <8 x i32> , !dbg !312 + %6741 = xor <8 x i32> %6740, %5744, !dbg !312 + %6742 = icmp slt <8 x i32> %6741, zeroinitializer, !dbg !312 + %6743 = extractelement <8 x i32> %6724, i64 0, !dbg !313 + %6744 = xor i32 %6743, %26, !dbg !312 + %6745 = icmp slt i32 %6744, 0, !dbg !312 + %6746 = and i1 %6689, %6727, !dbg !314 + %6747 = and i1 %6690, %6729, !dbg !314 + %6748 = and i1 %6691, %6731, !dbg !314 + %6749 = extractelement <8 x i1> %6742, i64 0, !dbg !314 + %6750 = and i1 %6692, %6749, !dbg !314 + %foldExtExtBinop = and <8 x i1> %6725, %6742, !dbg !314 + %6751 = extractelement <8 x i1> %foldExtExtBinop, i64 7, !dbg !314 + 
%foldExtExtBinop3025 = and <8 x i1> %6725, %6742, !dbg !314 + %6752 = extractelement <8 x i1> %foldExtExtBinop3025, i64 6, !dbg !314 + %foldExtExtBinop3027 = and <8 x i1> %6725, %6742, !dbg !314 + %6753 = extractelement <8 x i1> %foldExtExtBinop3027, i64 5, !dbg !314 + %foldExtExtBinop3029 = and <8 x i1> %6725, %6742, !dbg !314 + %6754 = extractelement <8 x i1> %foldExtExtBinop3029, i64 4, !dbg !314 + %foldExtExtBinop3031 = and <8 x i1> %6725, %6742, !dbg !314 + %6755 = extractelement <8 x i1> %foldExtExtBinop3031, i64 3, !dbg !314 + %foldExtExtBinop3033 = and <8 x i1> %6725, %6742, !dbg !314 + %6756 = extractelement <8 x i1> %foldExtExtBinop3033, i64 2, !dbg !314 + %foldExtExtBinop3035 = and <8 x i1> %6725, %6742, !dbg !314 + %6757 = extractelement <8 x i1> %foldExtExtBinop3035, i64 1, !dbg !314 + %6758 = extractelement <8 x i1> %6725, i64 0, !dbg !314 + %6759 = and i1 %6758, %6745, !dbg !314 + %6760 = select i1 %6746, i32 %26, i32 0, !dbg !315 + %6761 = select i1 %6747, i32 %26, i32 0, !dbg !315 + %6762 = select i1 %6748, i32 %26, i32 0, !dbg !315 + %6763 = select i1 %6750, i32 %26, i32 0, !dbg !315 + %6764 = select i1 %6751, i32 %26, i32 0, !dbg !315 + %6765 = select i1 %6752, i32 %26, i32 0, !dbg !315 + %6766 = select i1 %6753, i32 %26, i32 0, !dbg !315 + %6767 = select i1 %6754, i32 %26, i32 0, !dbg !315 + %6768 = select i1 %6755, i32 %26, i32 0, !dbg !315 + %6769 = select i1 %6756, i32 %26, i32 0, !dbg !315 + %6770 = select i1 %6757, i32 %26, i32 0, !dbg !315 + %6771 = select i1 %6759, i32 %26, i32 0, !dbg !315 + %6772 = sub i32 0, %6760, !dbg !313 + %6773 = icmp eq i32 %6685, %6772, !dbg !313 + %6774 = sub i32 0, %6761, !dbg !313 + %6775 = icmp eq i32 %6686, %6774, !dbg !313 + %6776 = sub i32 0, %6762, !dbg !313 + %6777 = icmp eq i32 %6687, %6776, !dbg !313 + %6778 = sub i32 0, %6763, !dbg !313 + %6779 = icmp eq i32 %6688, %6778, !dbg !313 + %6780 = sub i32 0, %6764, !dbg !313 + %6781 = icmp eq i32 %6732, %6780, !dbg !313 + %6782 = sub i32 0, %6765, !dbg 
!313 + %6783 = icmp eq i32 %6733, %6782, !dbg !313 + %6784 = sub i32 0, %6766, !dbg !313 + %6785 = icmp eq i32 %6734, %6784, !dbg !313 + %6786 = sub i32 0, %6767, !dbg !313 + %6787 = icmp eq i32 %6735, %6786, !dbg !313 + %6788 = sub i32 0, %6768, !dbg !313 + %6789 = icmp eq i32 %6736, %6788, !dbg !313 + %6790 = sub i32 0, %6769, !dbg !313 + %6791 = icmp eq i32 %6737, %6790, !dbg !313 + %6792 = sub i32 0, %6770, !dbg !313 + %6793 = icmp eq i32 %6738, %6792, !dbg !313 + %6794 = sub i32 0, %6771, !dbg !313 + %6795 = icmp eq i32 %6743, %6794, !dbg !313 + %6796 = srem <8 x i32> %6155, %5746, !dbg !253 + %6797 = shufflevector <8 x i32> %6796, <8 x i32> poison, <16 x i32> , !dbg !253 + %6798 = extractelement <8 x i32> %6796, i64 7, !dbg !306 + %6799 = icmp sge i32 %6798, %5183, !dbg !305 + %6800 = extractelement <8 x i32> %6796, i64 6, !dbg !306 + %6801 = icmp sge i32 %6800, %5183, !dbg !305 + %6802 = icmp sge i32 %6798, %5185, !dbg !305 + %6803 = icmp sge i32 %6800, %5185, !dbg !305 + %6804 = extractelement <8 x i32> %6796, i64 5, !dbg !306 + %6805 = icmp sge i32 %6804, %5183, !dbg !305 + %6806 = extractelement <8 x i32> %6796, i64 4, !dbg !306 + %6807 = icmp sge i32 %6806, %5183, !dbg !305 + %6808 = icmp sge i32 %6804, %5185, !dbg !305 + %6809 = icmp sge i32 %6806, %5185, !dbg !305 + %6810 = extractelement <8 x i32> %6796, i64 3, !dbg !306 + %6811 = icmp sge i32 %6810, %5183, !dbg !305 + %6812 = extractelement <8 x i32> %6796, i64 2, !dbg !306 + %6813 = icmp sge i32 %6812, %5183, !dbg !305 + %6814 = icmp sge i32 %6810, %5185, !dbg !305 + %6815 = icmp sge i32 %6812, %5185, !dbg !305 + %6816 = extractelement <8 x i32> %6796, i64 1, !dbg !306 + %6817 = icmp sge i32 %6816, %5183, !dbg !305 + %6818 = extractelement <8 x i32> %6796, i64 0, !dbg !306 + %6819 = icmp sge i32 %6818, %5183, !dbg !305 + %6820 = icmp sge i32 %6816, %5185, !dbg !305 + %6821 = icmp sge i32 %6818, %5185, !dbg !305 + %6822 = sext <8 x i32> %6796 to <8 x i64>, !dbg !307 + %6823 = icmp sgt <8 x i64> 
%5748, %6822, !dbg !307 + %6824 = extractelement <8 x i1> %6823, i64 7, !dbg !308 + %6825 = and i1 %6799, %6824, !dbg !308 + %6826 = extractelement <8 x i1> %6823, i64 6, !dbg !308 + %6827 = and i1 %6801, %6826, !dbg !308 + %6828 = and i1 %6802, %6824, !dbg !308 + %6829 = and i1 %6803, %6826, !dbg !308 + %6830 = extractelement <8 x i1> %6823, i64 5, !dbg !308 + %6831 = and i1 %6805, %6830, !dbg !308 + %6832 = extractelement <8 x i1> %6823, i64 4, !dbg !308 + %6833 = and i1 %6807, %6832, !dbg !308 + %6834 = and i1 %6808, %6830, !dbg !308 + %6835 = and i1 %6809, %6832, !dbg !308 + %6836 = extractelement <8 x i1> %6823, i64 3, !dbg !308 + %6837 = and i1 %6811, %6836, !dbg !308 + %6838 = extractelement <8 x i1> %6823, i64 2, !dbg !308 + %6839 = and i1 %6813, %6838, !dbg !308 + %6840 = and i1 %6814, %6836, !dbg !308 + %6841 = and i1 %6815, %6838, !dbg !308 + %6842 = extractelement <8 x i1> %6823, i64 1, !dbg !308 + %6843 = and i1 %6817, %6842, !dbg !308 + %6844 = extractelement <8 x i1> %6823, i64 0, !dbg !308 + %6845 = and i1 %6819, %6844, !dbg !308 + %6846 = and i1 %6820, %6842, !dbg !308 + %6847 = and i1 %6821, %6844, !dbg !308 + %6848 = sub <16 x i32> %5178, %6797, !dbg !309 + %6849 = srem <16 x i32> %6848, %5750, !dbg !310 + %6850 = icmp ne <16 x i32> %6849, zeroinitializer, !dbg !311 + %6851 = xor <16 x i32> %6849, %5750, !dbg !312 + %6852 = icmp slt <16 x i32> %6851, zeroinitializer, !dbg !312 + %6853 = and <16 x i1> %6850, %6852, !dbg !314 + %6854 = select <16 x i1> %6853, <16 x i32> %5750, <16 x i32> zeroinitializer, !dbg !315 + %6855 = sub <16 x i32> zeroinitializer, %6854, !dbg !313 + %6856 = icmp eq <16 x i32> %6849, %6855, !dbg !313 + %6857 = and i1 %5198, %6773, !dbg !316 + %6858 = and i1 %5198, %6775, !dbg !316 + %6859 = and i1 %5199, %6777, !dbg !316 + %6860 = and i1 %5199, %6779, !dbg !316 + %6861 = and i1 %5198, %6781, !dbg !316 + %6862 = and i1 %5198, %6783, !dbg !316 + %6863 = and i1 %5199, %6785, !dbg !316 + %6864 = and i1 %5199, %6787, !dbg !316 + 
%6865 = and i1 %5198, %6789, !dbg !316 + %6866 = and i1 %5198, %6791, !dbg !316 + %6867 = and i1 %5199, %6793, !dbg !316 + %6868 = and i1 %5199, %6795, !dbg !316 + %6869 = extractelement <16 x i1> %6856, i64 15, !dbg !316 + %6870 = and i1 %5198, %6869, !dbg !316 + %6871 = extractelement <16 x i1> %6856, i64 14, !dbg !316 + %6872 = and i1 %5198, %6871, !dbg !316 + %6873 = extractelement <16 x i1> %6856, i64 13, !dbg !316 + %6874 = and i1 %5199, %6873, !dbg !316 + %6875 = extractelement <16 x i1> %6856, i64 12, !dbg !316 + %6876 = and i1 %5199, %6875, !dbg !316 + %6877 = extractelement <16 x i1> %6856, i64 11, !dbg !316 + %6878 = and i1 %5198, %6877, !dbg !316 + %6879 = extractelement <16 x i1> %6856, i64 10, !dbg !316 + %6880 = and i1 %5198, %6879, !dbg !316 + %6881 = extractelement <16 x i1> %6856, i64 9, !dbg !316 + %6882 = and i1 %5199, %6881, !dbg !316 + %6883 = extractelement <16 x i1> %6856, i64 8, !dbg !316 + %6884 = and i1 %5199, %6883, !dbg !316 + %6885 = extractelement <16 x i1> %6856, i64 7, !dbg !316 + %6886 = and i1 %5198, %6885, !dbg !316 + %6887 = extractelement <16 x i1> %6856, i64 6, !dbg !316 + %6888 = and i1 %5198, %6887, !dbg !316 + %6889 = extractelement <16 x i1> %6856, i64 5, !dbg !316 + %6890 = and i1 %5199, %6889, !dbg !316 + %6891 = extractelement <16 x i1> %6856, i64 4, !dbg !316 + %6892 = and i1 %5199, %6891, !dbg !316 + %6893 = extractelement <16 x i1> %6856, i64 3, !dbg !316 + %6894 = and i1 %5198, %6893, !dbg !316 + %6895 = extractelement <16 x i1> %6856, i64 2, !dbg !316 + %6896 = and i1 %5198, %6895, !dbg !316 + %6897 = extractelement <16 x i1> %6856, i64 1, !dbg !316 + %6898 = and i1 %5199, %6897, !dbg !316 + %6899 = extractelement <16 x i1> %6856, i64 0, !dbg !316 + %6900 = and i1 %5199, %6899, !dbg !316 + %6901 = or i1 %6677, %6857, !dbg !317 + %6902 = or i1 %6678, %6858, !dbg !317 + %6903 = or i1 %6679, %6859, !dbg !317 + %6904 = or i1 %6680, %6860, !dbg !317 + %6905 = or i1 %6715, %6861, !dbg !317 + %6906 = or i1 %6716, %6862, 
!dbg !317 + %6907 = or i1 %6717, %6863, !dbg !317 + %6908 = or i1 %6718, %6864, !dbg !317 + %6909 = or i1 %6719, %6865, !dbg !317 + %6910 = or i1 %6720, %6866, !dbg !317 + %6911 = or i1 %6721, %6867, !dbg !317 + %6912 = or i1 %6722, %6868, !dbg !317 + %6913 = or i1 %6825, %6870, !dbg !317 + %6914 = or i1 %6827, %6872, !dbg !317 + %6915 = or i1 %6828, %6874, !dbg !317 + %6916 = or i1 %6829, %6876, !dbg !317 + %6917 = or i1 %6831, %6878, !dbg !317 + %6918 = or i1 %6833, %6880, !dbg !317 + %6919 = or i1 %6834, %6882, !dbg !317 + %6920 = or i1 %6835, %6884, !dbg !317 + %6921 = or i1 %6837, %6886, !dbg !317 + %6922 = or i1 %6839, %6888, !dbg !317 + %6923 = or i1 %6840, %6890, !dbg !317 + %6924 = or i1 %6841, %6892, !dbg !317 + %6925 = or i1 %6843, %6894, !dbg !317 + %6926 = or i1 %6845, %6896, !dbg !317 + %6927 = or i1 %6846, %6898, !dbg !317 + %6928 = or i1 %6847, %6900, !dbg !317 + %6929 = srem <2 x i32> %6152, %5739, !dbg !253 + %6930 = icmp sge <2 x i32> %6929, %5751, !dbg !305 + %6931 = sext <2 x i32> %6929 to <2 x i64>, !dbg !307 + %6932 = icmp sgt <2 x i64> %5753, %6931, !dbg !307 + %6933 = and <2 x i1> %6930, %6932, !dbg !308 + %6934 = sub <2 x i32> %5751, %6929, !dbg !309 + %6935 = srem <2 x i32> %6934, %5755, !dbg !310 + %6936 = icmp ne <2 x i32> %6935, zeroinitializer, !dbg !311 + %6937 = xor <2 x i32> %6935, %5755, !dbg !312 + %6938 = icmp slt <2 x i32> %6937, zeroinitializer, !dbg !312 + %6939 = and <2 x i1> %6936, %6938, !dbg !314 + %6940 = select <2 x i1> %6939, <2 x i32> %5755, <2 x i32> zeroinitializer, !dbg !315 + %6941 = sub <2 x i32> zeroinitializer, %6940, !dbg !313 + %6942 = icmp eq <2 x i32> %6935, %6941, !dbg !313 + %6943 = and <2 x i1> %5757, %6942, !dbg !316 + %6944 = or <2 x i1> %6933, %6943, !dbg !317 + %6945 = icmp sge <2 x i32> %6929, %5758, !dbg !305 + %6946 = and <2 x i1> %6945, %6932, !dbg !308 + %6947 = sub <2 x i32> %5758, %6929, !dbg !309 + %6948 = srem <2 x i32> %6947, %5755, !dbg !310 + %6949 = icmp ne <2 x i32> %6948, 
zeroinitializer, !dbg !311 + %6950 = xor <2 x i32> %6948, %5755, !dbg !312 + %6951 = icmp slt <2 x i32> %6950, zeroinitializer, !dbg !312 + %6952 = and <2 x i1> %6949, %6951, !dbg !314 + %6953 = select <2 x i1> %6952, <2 x i32> %5755, <2 x i32> zeroinitializer, !dbg !315 + %6954 = sub <2 x i32> zeroinitializer, %6953, !dbg !313 + %6955 = icmp eq <2 x i32> %6948, %6954, !dbg !313 + %6956 = and <2 x i1> %5760, %6955, !dbg !316 + %6957 = or <2 x i1> %6946, %6956, !dbg !317 + %6958 = select <2 x i1> %6957, <2 x i1> %6165, <2 x i1> zeroinitializer, !dbg !318 + %6959 = select <2 x i1> %6944, <2 x i1> %6165, <2 x i1> zeroinitializer, !dbg !318 + %6960 = select i1 %6901, i1 %6166, i1 false, !dbg !318 + %6961 = select i1 %6902, i1 %6167, i1 false, !dbg !318 + %6962 = select i1 %6903, i1 %6166, i1 false, !dbg !318 + %6963 = select i1 %6904, i1 %6167, i1 false, !dbg !318 + %6964 = select i1 %6905, i1 %6168, i1 false, !dbg !318 + %6965 = select i1 %6906, i1 %6169, i1 false, !dbg !318 + %6966 = select i1 %6907, i1 %6168, i1 false, !dbg !318 + %6967 = select i1 %6908, i1 %6169, i1 false, !dbg !318 + %6968 = select i1 %6909, i1 %6170, i1 false, !dbg !318 + %6969 = select i1 %6910, i1 %6171, i1 false, !dbg !318 + %6970 = select i1 %6911, i1 %6170, i1 false, !dbg !318 + %6971 = select i1 %6912, i1 %6171, i1 false, !dbg !318 + %6972 = select i1 %6913, i1 %6172, i1 false, !dbg !318 + %6973 = select i1 %6914, i1 %6173, i1 false, !dbg !318 + %6974 = select i1 %6915, i1 %6172, i1 false, !dbg !318 + %6975 = select i1 %6916, i1 %6173, i1 false, !dbg !318 + %6976 = select i1 %6917, i1 %6174, i1 false, !dbg !318 + %6977 = select i1 %6918, i1 %6175, i1 false, !dbg !318 + %6978 = select i1 %6919, i1 %6174, i1 false, !dbg !318 + %6979 = select i1 %6920, i1 %6175, i1 false, !dbg !318 + %6980 = select i1 %6921, i1 %6176, i1 false, !dbg !318 + %6981 = select i1 %6922, i1 %6177, i1 false, !dbg !318 + %6982 = select i1 %6923, i1 %6176, i1 false, !dbg !318 + %6983 = select i1 %6924, i1 %6177, i1 
false, !dbg !318 + %6984 = select i1 %6925, i1 %6178, i1 false, !dbg !318 + %6985 = select i1 %6926, i1 %6179, i1 false, !dbg !318 + %6986 = select i1 %6927, i1 %6178, i1 false, !dbg !318 + %6987 = select i1 %6928, i1 %6179, i1 false, !dbg !318 + %6988 = fmul float %6635, 0x3FF7154760000000, !dbg !319 + %6989 = extractelement <2 x i1> %6958, i64 0, !dbg !318 + %6990 = select i1 %6989, float %6988, float 0xFFF0000000000000, !dbg !318 + %6991 = fmul float %6636, 0x3FF7154760000000, !dbg !319 + %6992 = extractelement <2 x i1> %6958, i64 1, !dbg !318 + %6993 = select i1 %6992, float %6991, float 0xFFF0000000000000, !dbg !318 + %6994 = fmul float %6637, 0x3FF7154760000000, !dbg !319 + %6995 = extractelement <2 x i1> %6959, i64 0, !dbg !318 + %6996 = select i1 %6995, float %6994, float 0xFFF0000000000000, !dbg !318 + %6997 = fmul float %6638, 0x3FF7154760000000, !dbg !319 + %6998 = extractelement <2 x i1> %6959, i64 1, !dbg !318 + %6999 = select i1 %6998, float %6997, float 0xFFF0000000000000, !dbg !318 + %7000 = fmul float %6639, 0x3FF7154760000000, !dbg !319 + %7001 = select i1 %6960, float %7000, float 0xFFF0000000000000, !dbg !318 + %7002 = fmul float %6640, 0x3FF7154760000000, !dbg !319 + %7003 = select i1 %6961, float %7002, float 0xFFF0000000000000, !dbg !318 + %7004 = fmul float %6641, 0x3FF7154760000000, !dbg !319 + %7005 = select i1 %6962, float %7004, float 0xFFF0000000000000, !dbg !318 + %7006 = fmul float %6642, 0x3FF7154760000000, !dbg !319 + %7007 = select i1 %6963, float %7006, float 0xFFF0000000000000, !dbg !318 + %7008 = fmul float %6643, 0x3FF7154760000000, !dbg !319 + %7009 = select i1 %6964, float %7008, float 0xFFF0000000000000, !dbg !318 + %7010 = fmul float %6644, 0x3FF7154760000000, !dbg !319 + %7011 = select i1 %6965, float %7010, float 0xFFF0000000000000, !dbg !318 + %7012 = fmul float %6645, 0x3FF7154760000000, !dbg !319 + %7013 = select i1 %6966, float %7012, float 0xFFF0000000000000, !dbg !318 + %7014 = fmul float %6646, 0x3FF7154760000000, 
!dbg !319 + %7015 = select i1 %6967, float %7014, float 0xFFF0000000000000, !dbg !318 + %7016 = fmul float %6647, 0x3FF7154760000000, !dbg !319 + %7017 = select i1 %6968, float %7016, float 0xFFF0000000000000, !dbg !318 + %7018 = fmul float %6648, 0x3FF7154760000000, !dbg !319 + %7019 = select i1 %6969, float %7018, float 0xFFF0000000000000, !dbg !318 + %7020 = fmul float %6649, 0x3FF7154760000000, !dbg !319 + %7021 = select i1 %6970, float %7020, float 0xFFF0000000000000, !dbg !318 + %7022 = fmul float %6650, 0x3FF7154760000000, !dbg !319 + %7023 = select i1 %6971, float %7022, float 0xFFF0000000000000, !dbg !318 + %7024 = fmul float %6651, 0x3FF7154760000000, !dbg !319 + %7025 = select i1 %6972, float %7024, float 0xFFF0000000000000, !dbg !318 + %7026 = fmul float %6652, 0x3FF7154760000000, !dbg !319 + %7027 = select i1 %6973, float %7026, float 0xFFF0000000000000, !dbg !318 + %7028 = fmul float %6653, 0x3FF7154760000000, !dbg !319 + %7029 = select i1 %6974, float %7028, float 0xFFF0000000000000, !dbg !318 + %7030 = fmul float %6654, 0x3FF7154760000000, !dbg !319 + %7031 = select i1 %6975, float %7030, float 0xFFF0000000000000, !dbg !318 + %7032 = fmul float %6655, 0x3FF7154760000000, !dbg !319 + %7033 = select i1 %6976, float %7032, float 0xFFF0000000000000, !dbg !318 + %7034 = fmul float %6656, 0x3FF7154760000000, !dbg !319 + %7035 = select i1 %6977, float %7034, float 0xFFF0000000000000, !dbg !318 + %7036 = fmul float %6657, 0x3FF7154760000000, !dbg !319 + %7037 = select i1 %6978, float %7036, float 0xFFF0000000000000, !dbg !318 + %7038 = fmul float %6658, 0x3FF7154760000000, !dbg !319 + %7039 = select i1 %6979, float %7038, float 0xFFF0000000000000, !dbg !318 + %7040 = fmul float %6659, 0x3FF7154760000000, !dbg !319 + %7041 = select i1 %6980, float %7040, float 0xFFF0000000000000, !dbg !318 + %7042 = fmul float %6660, 0x3FF7154760000000, !dbg !319 + %7043 = select i1 %6981, float %7042, float 0xFFF0000000000000, !dbg !318 + %7044 = fmul float %6661, 
0x3FF7154760000000, !dbg !319 + %7045 = select i1 %6982, float %7044, float 0xFFF0000000000000, !dbg !318 + %7046 = fmul float %6662, 0x3FF7154760000000, !dbg !319 + %7047 = select i1 %6983, float %7046, float 0xFFF0000000000000, !dbg !318 + %7048 = fmul float %6663, 0x3FF7154760000000, !dbg !319 + %7049 = select i1 %6984, float %7048, float 0xFFF0000000000000, !dbg !318 + %7050 = fmul float %6664, 0x3FF7154760000000, !dbg !319 + %7051 = select i1 %6985, float %7050, float 0xFFF0000000000000, !dbg !318 + %7052 = fmul float %6665, 0x3FF7154760000000, !dbg !319 + %7053 = select i1 %6986, float %7052, float 0xFFF0000000000000, !dbg !318 + %7054 = fmul float %6666, 0x3FF7154760000000, !dbg !319 + %7055 = select i1 %6987, float %7054, float 0xFFF0000000000000, !dbg !318 + %7056 = fsub float %6990, %6232, !dbg !320 + %7057 = fsub float %6993, %6233, !dbg !320 + %7058 = fsub float %6996, %6232, !dbg !320 + %7059 = fsub float %6999, %6233, !dbg !320 + %7060 = fsub float %7001, %6234, !dbg !320 + %7061 = fsub float %7003, %6235, !dbg !320 + %7062 = fsub float %7005, %6234, !dbg !320 + %7063 = fsub float %7007, %6235, !dbg !320 + %7064 = fsub float %7009, %6236, !dbg !320 + %7065 = fsub float %7011, %6237, !dbg !320 + %7066 = fsub float %7013, %6236, !dbg !320 + %7067 = fsub float %7015, %6237, !dbg !320 + %7068 = fsub float %7017, %6238, !dbg !320 + %7069 = fsub float %7019, %6239, !dbg !320 + %7070 = fsub float %7021, %6238, !dbg !320 + %7071 = fsub float %7023, %6239, !dbg !320 + %7072 = fsub float %7025, %6240, !dbg !320 + %7073 = fsub float %7027, %6241, !dbg !320 + %7074 = fsub float %7029, %6240, !dbg !320 + %7075 = fsub float %7031, %6241, !dbg !320 + %7076 = fsub float %7033, %6242, !dbg !320 + %7077 = fsub float %7035, %6243, !dbg !320 + %7078 = fsub float %7037, %6242, !dbg !320 + %7079 = fsub float %7039, %6243, !dbg !320 + %7080 = fsub float %7041, %6244, !dbg !320 + %7081 = fsub float %7043, %6245, !dbg !320 + %7082 = fsub float %7045, %6244, !dbg !320 + %7083 
= fsub float %7047, %6245, !dbg !320 + %7084 = fsub float %7049, %6246, !dbg !320 + %7085 = fsub float %7051, %6247, !dbg !320 + %7086 = fsub float %7053, %6246, !dbg !320 + %7087 = fsub float %7055, %6247, !dbg !320 + %7088 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !321 + %.not.i1240 = icmp eq i32 %7088, 0, !dbg !321 + br i1 %.not.i1240, label %7091, label %7089, !dbg !321 + +7089: ; preds = %.lr.ph1620 + %7090 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %7056) #3, !dbg !321 + br label %__nv_exp2f.exit1242, !dbg !321 + +7091: ; preds = %.lr.ph1620 + %7092 = tail call float @llvm.nvvm.ex2.approx.f(float %7056) #3, !dbg !321 + br label %__nv_exp2f.exit1242, !dbg !321 + +__nv_exp2f.exit1242: ; preds = %7089, %7091 + %.0.i1241 = phi float [ %7090, %7089 ], [ %7092, %7091 ], !dbg !321 + %7093 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !321 + %.not.i1243 = icmp eq i32 %7093, 0, !dbg !321 + br i1 %.not.i1243, label %7096, label %7094, !dbg !321 + +7094: ; preds = %__nv_exp2f.exit1242 + %7095 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %7057) #3, !dbg !321 + br label %__nv_exp2f.exit1245, !dbg !321 + +7096: ; preds = %__nv_exp2f.exit1242 + %7097 = tail call float @llvm.nvvm.ex2.approx.f(float %7057) #3, !dbg !321 + br label %__nv_exp2f.exit1245, !dbg !321 + +__nv_exp2f.exit1245: ; preds = %7094, %7096 + %.0.i1244 = phi float [ %7095, %7094 ], [ %7097, %7096 ], !dbg !321 + %7098 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !321 + %.not.i1246 = icmp eq i32 %7098, 0, !dbg !321 + br i1 %.not.i1246, label %7101, label %7099, !dbg !321 + +7099: ; preds = %__nv_exp2f.exit1245 + %7100 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %7058) #3, !dbg !321 + br label %__nv_exp2f.exit1248, !dbg !321 + +7101: ; preds = %__nv_exp2f.exit1245 + %7102 = tail call float @llvm.nvvm.ex2.approx.f(float %7058) #3, !dbg !321 + br label %__nv_exp2f.exit1248, !dbg !321 + +__nv_exp2f.exit1248: ; preds = %7099, %7101 + %.0.i1247 = 
phi float [ %7100, %7099 ], [ %7102, %7101 ], !dbg !321 + %7103 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !321 + %.not.i1249 = icmp eq i32 %7103, 0, !dbg !321 + br i1 %.not.i1249, label %7106, label %7104, !dbg !321 + +7104: ; preds = %__nv_exp2f.exit1248 + %7105 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %7059) #3, !dbg !321 + br label %__nv_exp2f.exit1251, !dbg !321 + +7106: ; preds = %__nv_exp2f.exit1248 + %7107 = tail call float @llvm.nvvm.ex2.approx.f(float %7059) #3, !dbg !321 + br label %__nv_exp2f.exit1251, !dbg !321 + +__nv_exp2f.exit1251: ; preds = %7104, %7106 + %.0.i1250 = phi float [ %7105, %7104 ], [ %7107, %7106 ], !dbg !321 + %7108 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !321 + %.not.i1252 = icmp eq i32 %7108, 0, !dbg !321 + br i1 %.not.i1252, label %7111, label %7109, !dbg !321 + +7109: ; preds = %__nv_exp2f.exit1251 + %7110 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %7060) #3, !dbg !321 + br label %__nv_exp2f.exit1254, !dbg !321 + +7111: ; preds = %__nv_exp2f.exit1251 + %7112 = tail call float @llvm.nvvm.ex2.approx.f(float %7060) #3, !dbg !321 + br label %__nv_exp2f.exit1254, !dbg !321 + +__nv_exp2f.exit1254: ; preds = %7109, %7111 + %.0.i1253 = phi float [ %7110, %7109 ], [ %7112, %7111 ], !dbg !321 + %7113 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !321 + %.not.i1255 = icmp eq i32 %7113, 0, !dbg !321 + br i1 %.not.i1255, label %7116, label %7114, !dbg !321 + +7114: ; preds = %__nv_exp2f.exit1254 + %7115 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %7061) #3, !dbg !321 + br label %__nv_exp2f.exit1257, !dbg !321 + +7116: ; preds = %__nv_exp2f.exit1254 + %7117 = tail call float @llvm.nvvm.ex2.approx.f(float %7061) #3, !dbg !321 + br label %__nv_exp2f.exit1257, !dbg !321 + +__nv_exp2f.exit1257: ; preds = %7114, %7116 + %.0.i1256 = phi float [ %7115, %7114 ], [ %7117, %7116 ], !dbg !321 + %7118 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !321 + %.not.i1258 
= icmp eq i32 %7118, 0, !dbg !321 + br i1 %.not.i1258, label %7121, label %7119, !dbg !321 + +7119: ; preds = %__nv_exp2f.exit1257 + %7120 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %7062) #3, !dbg !321 + br label %__nv_exp2f.exit1260, !dbg !321 + +7121: ; preds = %__nv_exp2f.exit1257 + %7122 = tail call float @llvm.nvvm.ex2.approx.f(float %7062) #3, !dbg !321 + br label %__nv_exp2f.exit1260, !dbg !321 + +__nv_exp2f.exit1260: ; preds = %7119, %7121 + %.0.i1259 = phi float [ %7120, %7119 ], [ %7122, %7121 ], !dbg !321 + %7123 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !321 + %.not.i1261 = icmp eq i32 %7123, 0, !dbg !321 + br i1 %.not.i1261, label %7126, label %7124, !dbg !321 + +7124: ; preds = %__nv_exp2f.exit1260 + %7125 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %7063) #3, !dbg !321 + br label %__nv_exp2f.exit1263, !dbg !321 + +7126: ; preds = %__nv_exp2f.exit1260 + %7127 = tail call float @llvm.nvvm.ex2.approx.f(float %7063) #3, !dbg !321 + br label %__nv_exp2f.exit1263, !dbg !321 + +__nv_exp2f.exit1263: ; preds = %7124, %7126 + %.0.i1262 = phi float [ %7125, %7124 ], [ %7127, %7126 ], !dbg !321 + %7128 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !321 + %.not.i1264 = icmp eq i32 %7128, 0, !dbg !321 + br i1 %.not.i1264, label %7131, label %7129, !dbg !321 + +7129: ; preds = %__nv_exp2f.exit1263 + %7130 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %7064) #3, !dbg !321 + br label %__nv_exp2f.exit1266, !dbg !321 + +7131: ; preds = %__nv_exp2f.exit1263 + %7132 = tail call float @llvm.nvvm.ex2.approx.f(float %7064) #3, !dbg !321 + br label %__nv_exp2f.exit1266, !dbg !321 + +__nv_exp2f.exit1266: ; preds = %7129, %7131 + %.0.i1265 = phi float [ %7130, %7129 ], [ %7132, %7131 ], !dbg !321 + %7133 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !321 + %.not.i1267 = icmp eq i32 %7133, 0, !dbg !321 + br i1 %.not.i1267, label %7136, label %7134, !dbg !321 + +7134: ; preds = %__nv_exp2f.exit1266 + %7135 = 
tail call float @llvm.nvvm.ex2.approx.ftz.f(float %7065) #3, !dbg !321 + br label %__nv_exp2f.exit1269, !dbg !321 + +7136: ; preds = %__nv_exp2f.exit1266 + %7137 = tail call float @llvm.nvvm.ex2.approx.f(float %7065) #3, !dbg !321 + br label %__nv_exp2f.exit1269, !dbg !321 + +__nv_exp2f.exit1269: ; preds = %7134, %7136 + %.0.i1268 = phi float [ %7135, %7134 ], [ %7137, %7136 ], !dbg !321 + %7138 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !321 + %.not.i1270 = icmp eq i32 %7138, 0, !dbg !321 + br i1 %.not.i1270, label %7141, label %7139, !dbg !321 + +7139: ; preds = %__nv_exp2f.exit1269 + %7140 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %7066) #3, !dbg !321 + br label %__nv_exp2f.exit1272, !dbg !321 + +7141: ; preds = %__nv_exp2f.exit1269 + %7142 = tail call float @llvm.nvvm.ex2.approx.f(float %7066) #3, !dbg !321 + br label %__nv_exp2f.exit1272, !dbg !321 + +__nv_exp2f.exit1272: ; preds = %7139, %7141 + %.0.i1271 = phi float [ %7140, %7139 ], [ %7142, %7141 ], !dbg !321 + %7143 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !321 + %.not.i1273 = icmp eq i32 %7143, 0, !dbg !321 + br i1 %.not.i1273, label %7146, label %7144, !dbg !321 + +7144: ; preds = %__nv_exp2f.exit1272 + %7145 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %7067) #3, !dbg !321 + br label %__nv_exp2f.exit1275, !dbg !321 + +7146: ; preds = %__nv_exp2f.exit1272 + %7147 = tail call float @llvm.nvvm.ex2.approx.f(float %7067) #3, !dbg !321 + br label %__nv_exp2f.exit1275, !dbg !321 + +__nv_exp2f.exit1275: ; preds = %7144, %7146 + %.0.i1274 = phi float [ %7145, %7144 ], [ %7147, %7146 ], !dbg !321 + %7148 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !321 + %.not.i1276 = icmp eq i32 %7148, 0, !dbg !321 + br i1 %.not.i1276, label %7151, label %7149, !dbg !321 + +7149: ; preds = %__nv_exp2f.exit1275 + %7150 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %7068) #3, !dbg !321 + br label %__nv_exp2f.exit1278, !dbg !321 + +7151: ; preds = 
%__nv_exp2f.exit1275 + %7152 = tail call float @llvm.nvvm.ex2.approx.f(float %7068) #3, !dbg !321 + br label %__nv_exp2f.exit1278, !dbg !321 + +__nv_exp2f.exit1278: ; preds = %7149, %7151 + %.0.i1277 = phi float [ %7150, %7149 ], [ %7152, %7151 ], !dbg !321 + %7153 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !321 + %.not.i1279 = icmp eq i32 %7153, 0, !dbg !321 + br i1 %.not.i1279, label %7156, label %7154, !dbg !321 + +7154: ; preds = %__nv_exp2f.exit1278 + %7155 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %7069) #3, !dbg !321 + br label %__nv_exp2f.exit1281, !dbg !321 + +7156: ; preds = %__nv_exp2f.exit1278 + %7157 = tail call float @llvm.nvvm.ex2.approx.f(float %7069) #3, !dbg !321 + br label %__nv_exp2f.exit1281, !dbg !321 + +__nv_exp2f.exit1281: ; preds = %7154, %7156 + %.0.i1280 = phi float [ %7155, %7154 ], [ %7157, %7156 ], !dbg !321 + %7158 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !321 + %.not.i1282 = icmp eq i32 %7158, 0, !dbg !321 + br i1 %.not.i1282, label %7161, label %7159, !dbg !321 + +7159: ; preds = %__nv_exp2f.exit1281 + %7160 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %7070) #3, !dbg !321 + br label %__nv_exp2f.exit1284, !dbg !321 + +7161: ; preds = %__nv_exp2f.exit1281 + %7162 = tail call float @llvm.nvvm.ex2.approx.f(float %7070) #3, !dbg !321 + br label %__nv_exp2f.exit1284, !dbg !321 + +__nv_exp2f.exit1284: ; preds = %7159, %7161 + %.0.i1283 = phi float [ %7160, %7159 ], [ %7162, %7161 ], !dbg !321 + %7163 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !321 + %.not.i1285 = icmp eq i32 %7163, 0, !dbg !321 + br i1 %.not.i1285, label %7166, label %7164, !dbg !321 + +7164: ; preds = %__nv_exp2f.exit1284 + %7165 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %7071) #3, !dbg !321 + br label %__nv_exp2f.exit1287, !dbg !321 + +7166: ; preds = %__nv_exp2f.exit1284 + %7167 = tail call float @llvm.nvvm.ex2.approx.f(float %7071) #3, !dbg !321 + br label %__nv_exp2f.exit1287, !dbg !321 + 
+__nv_exp2f.exit1287: ; preds = %7164, %7166 + %.0.i1286 = phi float [ %7165, %7164 ], [ %7167, %7166 ], !dbg !321 + %7168 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !321 + %.not.i1288 = icmp eq i32 %7168, 0, !dbg !321 + br i1 %.not.i1288, label %7171, label %7169, !dbg !321 + +7169: ; preds = %__nv_exp2f.exit1287 + %7170 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %7072) #3, !dbg !321 + br label %__nv_exp2f.exit1290, !dbg !321 + +7171: ; preds = %__nv_exp2f.exit1287 + %7172 = tail call float @llvm.nvvm.ex2.approx.f(float %7072) #3, !dbg !321 + br label %__nv_exp2f.exit1290, !dbg !321 + +__nv_exp2f.exit1290: ; preds = %7169, %7171 + %.0.i1289 = phi float [ %7170, %7169 ], [ %7172, %7171 ], !dbg !321 + %7173 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !321 + %.not.i1291 = icmp eq i32 %7173, 0, !dbg !321 + br i1 %.not.i1291, label %7176, label %7174, !dbg !321 + +7174: ; preds = %__nv_exp2f.exit1290 + %7175 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %7073) #3, !dbg !321 + br label %__nv_exp2f.exit1293, !dbg !321 + +7176: ; preds = %__nv_exp2f.exit1290 + %7177 = tail call float @llvm.nvvm.ex2.approx.f(float %7073) #3, !dbg !321 + br label %__nv_exp2f.exit1293, !dbg !321 + +__nv_exp2f.exit1293: ; preds = %7174, %7176 + %.0.i1292 = phi float [ %7175, %7174 ], [ %7177, %7176 ], !dbg !321 + %7178 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !321 + %.not.i1294 = icmp eq i32 %7178, 0, !dbg !321 + br i1 %.not.i1294, label %7181, label %7179, !dbg !321 + +7179: ; preds = %__nv_exp2f.exit1293 + %7180 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %7074) #3, !dbg !321 + br label %__nv_exp2f.exit1296, !dbg !321 + +7181: ; preds = %__nv_exp2f.exit1293 + %7182 = tail call float @llvm.nvvm.ex2.approx.f(float %7074) #3, !dbg !321 + br label %__nv_exp2f.exit1296, !dbg !321 + +__nv_exp2f.exit1296: ; preds = %7179, %7181 + %.0.i1295 = phi float [ %7180, %7179 ], [ %7182, %7181 ], !dbg !321 + %7183 = tail call i32 
@__nvvm_reflect(ptr nonnull @.str) #3, !dbg !321 + %.not.i1297 = icmp eq i32 %7183, 0, !dbg !321 + br i1 %.not.i1297, label %7186, label %7184, !dbg !321 + +7184: ; preds = %__nv_exp2f.exit1296 + %7185 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %7075) #3, !dbg !321 + br label %__nv_exp2f.exit1299, !dbg !321 + +7186: ; preds = %__nv_exp2f.exit1296 + %7187 = tail call float @llvm.nvvm.ex2.approx.f(float %7075) #3, !dbg !321 + br label %__nv_exp2f.exit1299, !dbg !321 + +__nv_exp2f.exit1299: ; preds = %7184, %7186 + %.0.i1298 = phi float [ %7185, %7184 ], [ %7187, %7186 ], !dbg !321 + %7188 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !321 + %.not.i1300 = icmp eq i32 %7188, 0, !dbg !321 + br i1 %.not.i1300, label %7191, label %7189, !dbg !321 + +7189: ; preds = %__nv_exp2f.exit1299 + %7190 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %7076) #3, !dbg !321 + br label %__nv_exp2f.exit1302, !dbg !321 + +7191: ; preds = %__nv_exp2f.exit1299 + %7192 = tail call float @llvm.nvvm.ex2.approx.f(float %7076) #3, !dbg !321 + br label %__nv_exp2f.exit1302, !dbg !321 + +__nv_exp2f.exit1302: ; preds = %7189, %7191 + %.0.i1301 = phi float [ %7190, %7189 ], [ %7192, %7191 ], !dbg !321 + %7193 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !321 + %.not.i1303 = icmp eq i32 %7193, 0, !dbg !321 + br i1 %.not.i1303, label %7196, label %7194, !dbg !321 + +7194: ; preds = %__nv_exp2f.exit1302 + %7195 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %7077) #3, !dbg !321 + br label %__nv_exp2f.exit1305, !dbg !321 + +7196: ; preds = %__nv_exp2f.exit1302 + %7197 = tail call float @llvm.nvvm.ex2.approx.f(float %7077) #3, !dbg !321 + br label %__nv_exp2f.exit1305, !dbg !321 + +__nv_exp2f.exit1305: ; preds = %7194, %7196 + %.0.i1304 = phi float [ %7195, %7194 ], [ %7197, %7196 ], !dbg !321 + %7198 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !321 + %.not.i1306 = icmp eq i32 %7198, 0, !dbg !321 + br i1 %.not.i1306, label %7201, label 
%7199, !dbg !321 + +7199: ; preds = %__nv_exp2f.exit1305 + %7200 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %7078) #3, !dbg !321 + br label %__nv_exp2f.exit1308, !dbg !321 + +7201: ; preds = %__nv_exp2f.exit1305 + %7202 = tail call float @llvm.nvvm.ex2.approx.f(float %7078) #3, !dbg !321 + br label %__nv_exp2f.exit1308, !dbg !321 + +__nv_exp2f.exit1308: ; preds = %7199, %7201 + %.0.i1307 = phi float [ %7200, %7199 ], [ %7202, %7201 ], !dbg !321 + %7203 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !321 + %.not.i1309 = icmp eq i32 %7203, 0, !dbg !321 + br i1 %.not.i1309, label %7206, label %7204, !dbg !321 + +7204: ; preds = %__nv_exp2f.exit1308 + %7205 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %7079) #3, !dbg !321 + br label %__nv_exp2f.exit1311, !dbg !321 + +7206: ; preds = %__nv_exp2f.exit1308 + %7207 = tail call float @llvm.nvvm.ex2.approx.f(float %7079) #3, !dbg !321 + br label %__nv_exp2f.exit1311, !dbg !321 + +__nv_exp2f.exit1311: ; preds = %7204, %7206 + %.0.i1310 = phi float [ %7205, %7204 ], [ %7207, %7206 ], !dbg !321 + %7208 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !321 + %.not.i1312 = icmp eq i32 %7208, 0, !dbg !321 + br i1 %.not.i1312, label %7211, label %7209, !dbg !321 + +7209: ; preds = %__nv_exp2f.exit1311 + %7210 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %7080) #3, !dbg !321 + br label %__nv_exp2f.exit1314, !dbg !321 + +7211: ; preds = %__nv_exp2f.exit1311 + %7212 = tail call float @llvm.nvvm.ex2.approx.f(float %7080) #3, !dbg !321 + br label %__nv_exp2f.exit1314, !dbg !321 + +__nv_exp2f.exit1314: ; preds = %7209, %7211 + %.0.i1313 = phi float [ %7210, %7209 ], [ %7212, %7211 ], !dbg !321 + %7213 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !321 + %.not.i1315 = icmp eq i32 %7213, 0, !dbg !321 + br i1 %.not.i1315, label %7216, label %7214, !dbg !321 + +7214: ; preds = %__nv_exp2f.exit1314 + %7215 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %7081) #3, !dbg !321 + br 
label %__nv_exp2f.exit1317, !dbg !321 + +7216: ; preds = %__nv_exp2f.exit1314 + %7217 = tail call float @llvm.nvvm.ex2.approx.f(float %7081) #3, !dbg !321 + br label %__nv_exp2f.exit1317, !dbg !321 + +__nv_exp2f.exit1317: ; preds = %7214, %7216 + %.0.i1316 = phi float [ %7215, %7214 ], [ %7217, %7216 ], !dbg !321 + %7218 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !321 + %.not.i1318 = icmp eq i32 %7218, 0, !dbg !321 + br i1 %.not.i1318, label %7221, label %7219, !dbg !321 + +7219: ; preds = %__nv_exp2f.exit1317 + %7220 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %7082) #3, !dbg !321 + br label %__nv_exp2f.exit1320, !dbg !321 + +7221: ; preds = %__nv_exp2f.exit1317 + %7222 = tail call float @llvm.nvvm.ex2.approx.f(float %7082) #3, !dbg !321 + br label %__nv_exp2f.exit1320, !dbg !321 + +__nv_exp2f.exit1320: ; preds = %7219, %7221 + %.0.i1319 = phi float [ %7220, %7219 ], [ %7222, %7221 ], !dbg !321 + %7223 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !321 + %.not.i1321 = icmp eq i32 %7223, 0, !dbg !321 + br i1 %.not.i1321, label %7226, label %7224, !dbg !321 + +7224: ; preds = %__nv_exp2f.exit1320 + %7225 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %7083) #3, !dbg !321 + br label %__nv_exp2f.exit1323, !dbg !321 + +7226: ; preds = %__nv_exp2f.exit1320 + %7227 = tail call float @llvm.nvvm.ex2.approx.f(float %7083) #3, !dbg !321 + br label %__nv_exp2f.exit1323, !dbg !321 + +__nv_exp2f.exit1323: ; preds = %7224, %7226 + %.0.i1322 = phi float [ %7225, %7224 ], [ %7227, %7226 ], !dbg !321 + %7228 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !321 + %.not.i1324 = icmp eq i32 %7228, 0, !dbg !321 + br i1 %.not.i1324, label %7231, label %7229, !dbg !321 + +7229: ; preds = %__nv_exp2f.exit1323 + %7230 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %7084) #3, !dbg !321 + br label %__nv_exp2f.exit1326, !dbg !321 + +7231: ; preds = %__nv_exp2f.exit1323 + %7232 = tail call float @llvm.nvvm.ex2.approx.f(float %7084) 
#3, !dbg !321 + br label %__nv_exp2f.exit1326, !dbg !321 + +__nv_exp2f.exit1326: ; preds = %7229, %7231 + %.0.i1325 = phi float [ %7230, %7229 ], [ %7232, %7231 ], !dbg !321 + %7233 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !321 + %.not.i1327 = icmp eq i32 %7233, 0, !dbg !321 + br i1 %.not.i1327, label %7236, label %7234, !dbg !321 + +7234: ; preds = %__nv_exp2f.exit1326 + %7235 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %7085) #3, !dbg !321 + br label %__nv_exp2f.exit1329, !dbg !321 + +7236: ; preds = %__nv_exp2f.exit1326 + %7237 = tail call float @llvm.nvvm.ex2.approx.f(float %7085) #3, !dbg !321 + br label %__nv_exp2f.exit1329, !dbg !321 + +__nv_exp2f.exit1329: ; preds = %7234, %7236 + %.0.i1328 = phi float [ %7235, %7234 ], [ %7237, %7236 ], !dbg !321 + %7238 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !321 + %.not.i1330 = icmp eq i32 %7238, 0, !dbg !321 + br i1 %.not.i1330, label %7241, label %7239, !dbg !321 + +7239: ; preds = %__nv_exp2f.exit1329 + %7240 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %7086) #3, !dbg !321 + br label %__nv_exp2f.exit1332, !dbg !321 + +7241: ; preds = %__nv_exp2f.exit1329 + %7242 = tail call float @llvm.nvvm.ex2.approx.f(float %7086) #3, !dbg !321 + br label %__nv_exp2f.exit1332, !dbg !321 + +__nv_exp2f.exit1332: ; preds = %7239, %7241 + %.0.i1331 = phi float [ %7240, %7239 ], [ %7242, %7241 ], !dbg !321 + %7243 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !321 + %.not.i1333 = icmp eq i32 %7243, 0, !dbg !321 + br i1 %.not.i1333, label %7246, label %7244, !dbg !321 + +7244: ; preds = %__nv_exp2f.exit1332 + %7245 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %7087) #3, !dbg !321 + br label %__nv_exp2f.exit1335, !dbg !321 + +7246: ; preds = %__nv_exp2f.exit1332 + %7247 = tail call float @llvm.nvvm.ex2.approx.f(float %7087) #3, !dbg !321 + br label %__nv_exp2f.exit1335, !dbg !321 + +__nv_exp2f.exit1335: ; preds = %7244, %7246 + %.0.i1334 = phi float [ %7245, %7244 
], [ %7247, %7246 ], !dbg !321 + %7248 = getelementptr bfloat, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %6180, !dbg !296 + %7249 = insertelement <2 x float> poison, float %.0.i1241, i64 0, !dbg !322 + %7250 = insertelement <2 x float> %7249, float %.0.i1244, i64 1, !dbg !322 + %7251 = fptrunc <2 x float> %7250 to <2 x bfloat>, !dbg !322 + %7252 = insertelement <2 x float> poison, float %.0.i1247, i64 0, !dbg !322 + %7253 = insertelement <2 x float> %7252, float %.0.i1250, i64 1, !dbg !322 + %7254 = fptrunc <2 x float> %7253 to <2 x bfloat>, !dbg !322 + %7255 = insertelement <2 x float> poison, float %.0.i1253, i64 0, !dbg !322 + %7256 = insertelement <2 x float> %7255, float %.0.i1256, i64 1, !dbg !322 + %7257 = fptrunc <2 x float> %7256 to <2 x bfloat>, !dbg !322 + %7258 = insertelement <2 x float> poison, float %.0.i1259, i64 0, !dbg !322 + %7259 = insertelement <2 x float> %7258, float %.0.i1262, i64 1, !dbg !322 + %7260 = fptrunc <2 x float> %7259 to <2 x bfloat>, !dbg !322 + %7261 = insertelement <2 x float> poison, float %.0.i1265, i64 0, !dbg !322 + %7262 = insertelement <2 x float> %7261, float %.0.i1268, i64 1, !dbg !322 + %7263 = fptrunc <2 x float> %7262 to <2 x bfloat>, !dbg !322 + %7264 = insertelement <2 x float> poison, float %.0.i1271, i64 0, !dbg !322 + %7265 = insertelement <2 x float> %7264, float %.0.i1274, i64 1, !dbg !322 + %7266 = fptrunc <2 x float> %7265 to <2 x bfloat>, !dbg !322 + %7267 = insertelement <2 x float> poison, float %.0.i1277, i64 0, !dbg !322 + %7268 = insertelement <2 x float> %7267, float %.0.i1280, i64 1, !dbg !322 + %7269 = fptrunc <2 x float> %7268 to <2 x bfloat>, !dbg !322 + %7270 = insertelement <2 x float> poison, float %.0.i1283, i64 0, !dbg !322 + %7271 = insertelement <2 x float> %7270, float %.0.i1286, i64 1, !dbg !322 + %7272 = fptrunc <2 x float> %7271 to <2 x bfloat>, !dbg !322 + %7273 = insertelement <2 x float> poison, float %.0.i1289, i64 0, !dbg !322 + %7274 = 
insertelement <2 x float> %7273, float %.0.i1292, i64 1, !dbg !322 + %7275 = fptrunc <2 x float> %7274 to <2 x bfloat>, !dbg !322 + %7276 = insertelement <2 x float> poison, float %.0.i1295, i64 0, !dbg !322 + %7277 = insertelement <2 x float> %7276, float %.0.i1298, i64 1, !dbg !322 + %7278 = fptrunc <2 x float> %7277 to <2 x bfloat>, !dbg !322 + %7279 = insertelement <2 x float> poison, float %.0.i1301, i64 0, !dbg !322 + %7280 = insertelement <2 x float> %7279, float %.0.i1304, i64 1, !dbg !322 + %7281 = fptrunc <2 x float> %7280 to <2 x bfloat>, !dbg !322 + %7282 = insertelement <2 x float> poison, float %.0.i1307, i64 0, !dbg !322 + %7283 = insertelement <2 x float> %7282, float %.0.i1310, i64 1, !dbg !322 + %7284 = fptrunc <2 x float> %7283 to <2 x bfloat>, !dbg !322 + %7285 = insertelement <2 x float> poison, float %.0.i1313, i64 0, !dbg !322 + %7286 = insertelement <2 x float> %7285, float %.0.i1316, i64 1, !dbg !322 + %7287 = fptrunc <2 x float> %7286 to <2 x bfloat>, !dbg !322 + %7288 = insertelement <2 x float> poison, float %.0.i1319, i64 0, !dbg !322 + %7289 = insertelement <2 x float> %7288, float %.0.i1322, i64 1, !dbg !322 + %7290 = fptrunc <2 x float> %7289 to <2 x bfloat>, !dbg !322 + %7291 = insertelement <2 x float> poison, float %.0.i1325, i64 0, !dbg !322 + %7292 = insertelement <2 x float> %7291, float %.0.i1328, i64 1, !dbg !322 + %7293 = fptrunc <2 x float> %7292 to <2 x bfloat>, !dbg !322 + %7294 = insertelement <2 x float> poison, float %.0.i1331, i64 0, !dbg !322 + %7295 = insertelement <2 x float> %7294, float %.0.i1334, i64 1, !dbg !322 + %7296 = fptrunc <2 x float> %7295 to <2 x bfloat>, !dbg !322 + %7297 = bitcast <2 x bfloat> %7251 to i32, !dbg !323 + %7298 = bitcast <2 x bfloat> %7254 to i32, !dbg !323 + %7299 = bitcast <2 x bfloat> %7257 to i32, !dbg !323 + %7300 = bitcast <2 x bfloat> %7260 to i32, !dbg !323 + %7301 = bitcast <2 x bfloat> %7263 to i32, !dbg !323 + %7302 = bitcast <2 x bfloat> %7266 to i32, !dbg !323 + %7303 = 
bitcast <2 x bfloat> %7269 to i32, !dbg !323 + %7304 = bitcast <2 x bfloat> %7272 to i32, !dbg !323 + %7305 = bitcast <2 x bfloat> %7275 to i32, !dbg !323 + %7306 = bitcast <2 x bfloat> %7278 to i32, !dbg !323 + %7307 = bitcast <2 x bfloat> %7281 to i32, !dbg !323 + %7308 = bitcast <2 x bfloat> %7284 to i32, !dbg !323 + %7309 = bitcast <2 x bfloat> %7287 to i32, !dbg !323 + %7310 = bitcast <2 x bfloat> %7290 to i32, !dbg !323 + %7311 = bitcast <2 x bfloat> %7293 to i32, !dbg !323 + %7312 = bitcast <2 x bfloat> %7296 to i32, !dbg !323 + tail call void @llvm.nvvm.wgmma.fence.sync.aligned(), !dbg !323 + %7313 = ptrtoint ptr addrspace(3) %7248 to i32, !dbg !323 + %7314 = lshr exact i32 %7313, 4, !dbg !323 + %7315 = and i32 %7314, 16383, !dbg !323 + %7316 = zext nneg i32 %7315 to i64, !dbg !323 + %7317 = or disjoint i64 %7316, 4611686293338849280, !dbg !323 + %7318 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", 
"=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %6023, float %6024, float %6025, float %6026, float %6027, float %6028, float %6029, float %6030, float %6031, float %6032, float %6033, float %6034, float %6035, float %6036, float %6037, float %6038, float %6039, float %6040, float %6041, float %6042, float %6043, float %6044, float %6045, float %6046, float %6047, float %6048, float %6049, float %6050, float %6051, float %6052, float %6053, float %6054, float %6055, float %6056, float %6057, float %6058, float %6059, float %6060, float %6061, float %6062, float %6063, float %6064, float %6065, float %6066, float %6067, float %6068, float %6069, float %6070, float %6071, float %6072, float %6073, float %6074, float %6075, float %6076, float %6077, float %6078, float %6079, float %6080, float %6081, float %6082, float %6083, float %6084, float %6085, float %6086, i32 %7297, i32 %7298, i32 %7299, i32 %7300, i64 %7317, i1 true) #3, !dbg !323 + %7319 = add i32 %7313, 2048, !dbg !323 + %7320 = lshr exact i32 %7319, 4, !dbg !323 + %7321 = and i32 %7320, 16383, !dbg !323 + %7322 = zext nneg i32 %7321 to i64, !dbg !323 + %7323 = or disjoint i64 %7322, 4611686293338849280, !dbg !323 + %7324 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float } %7318, 0, !dbg !323 + %7325 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 1, !dbg !323 + %7326 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 2, !dbg !323 + %7327 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 3, !dbg !323 + %7328 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 
4, !dbg !323 + %7329 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 5, !dbg !323 + %7330 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 6, !dbg !323 + %7331 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 7, !dbg !323 + %7332 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 8, !dbg !323 + %7333 = 
extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 9, !dbg !323 + %7334 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 10, !dbg !323 + %7335 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 11, !dbg !323 + %7336 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 12, !dbg !323 + %7337 = extractvalue { 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 13, !dbg !323 + %7338 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 14, !dbg !323 + %7339 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 15, !dbg !323 + %7340 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 16, !dbg !323 + %7341 = extractvalue { float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 17, !dbg !323 + %7342 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 18, !dbg !323 + %7343 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 19, !dbg !323 + %7344 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 20, !dbg !323 + %7345 = extractvalue { float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 21, !dbg !323 + %7346 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 22, !dbg !323 + %7347 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 23, !dbg !323 + %7348 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 24, !dbg !323 + %7349 = extractvalue { float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 25, !dbg !323 + %7350 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 26, !dbg !323 + %7351 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 27, !dbg !323 + %7352 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 28, !dbg !323 + %7353 = extractvalue { float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 29, !dbg !323 + %7354 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 30, !dbg !323 + %7355 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 31, !dbg !323 + %7356 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 32, !dbg !323 + %7357 = extractvalue { float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 33, !dbg !323 + %7358 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 34, !dbg !323 + %7359 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 35, !dbg !323 + %7360 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 36, !dbg !323 + %7361 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 37, !dbg !323 + %7362 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 38, !dbg !323 + %7363 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 39, !dbg !323 + %7364 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 40, !dbg !323 + %7365 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 41, !dbg !323 + %7366 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 42, !dbg !323 + %7367 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 43, !dbg !323 + %7368 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 44, !dbg !323 + %7369 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 45, !dbg !323 + %7370 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 46, !dbg !323 + %7371 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 47, !dbg !323 + %7372 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 48, !dbg !323 + %7373 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 49, !dbg !323 + %7374 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 50, !dbg !323 + %7375 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 51, !dbg !323 + %7376 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 52, !dbg !323 + %7377 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 53, !dbg !323 + %7378 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 54, !dbg !323 + %7379 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 55, !dbg !323 + %7380 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 56, !dbg !323 + %7381 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 57, !dbg !323 + %7382 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 58, !dbg !323 + %7383 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 59, !dbg !323 + %7384 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 60, !dbg !323 + %7385 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 61, !dbg !323 + %7386 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 62, !dbg !323 + %7387 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 63, !dbg !323 + %7388 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 
{$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %7324, float %7325, float %7326, float %7327, float %7328, float %7329, float %7330, float %7331, float %7332, float %7333, float %7334, float %7335, float %7336, float %7337, float %7338, float %7339, float %7340, float %7341, float %7342, float %7343, float %7344, float %7345, float %7346, float %7347, float %7348, float %7349, float %7350, float %7351, float %7352, float %7353, float %7354, float %7355, float %7356, float %7357, float %7358, float %7359, float %7360, float %7361, float %7362, float %7363, float %7364, float %7365, float %7366, float %7367, float %7368, float %7369, float %7370, float %7371, float %7372, float %7373, float %7374, float %7375, float %7376, float %7377, float %7378, float %7379, float %7380, float %7381, float %7382, float %7383, float %7384, float %7385, float %7386, float %7387, i32 %7301, i32 %7302, i32 %7303, i32 %7304, i64 %7323, i1 true) #3, !dbg !323 + %7389 = add i32 %7313, 4096, !dbg !323 + %7390 = lshr exact i32 %7389, 4, !dbg !323 + %7391 = and i32 %7390, 16383, !dbg !323 + %7392 = zext nneg i32 %7391 to i64, !dbg !323 + %7393 = or disjoint i64 %7392, 4611686293338849280, !dbg !323 + %7394 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 0, !dbg !323 + %7395 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 1, !dbg !323 + %7396 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 2, !dbg !323 + %7397 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 3, !dbg !323 + %7398 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 4, !dbg !323 + %7399 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 5, !dbg !323 + %7400 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 6, !dbg !323 + %7401 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 7, !dbg !323 + %7402 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 8, !dbg !323 + %7403 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 9, !dbg !323 + %7404 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 10, !dbg !323 + %7405 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 11, !dbg !323 + %7406 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 12, !dbg !323 + %7407 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 13, !dbg !323 + %7408 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 14, !dbg !323 + %7409 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 15, !dbg !323 + %7410 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 16, !dbg !323 + %7411 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 17, !dbg !323 + %7412 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 18, !dbg !323 + %7413 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 19, !dbg !323 + %7414 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 20, !dbg !323 + %7415 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 21, !dbg !323 + %7416 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 22, !dbg !323 + %7417 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 23, !dbg !323 + %7418 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 24, !dbg !323 + %7419 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 25, !dbg !323 + %7420 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 26, !dbg !323 + %7421 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 27, !dbg !323 + %7422 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 28, !dbg !323 + %7423 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 29, !dbg !323 + %7424 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 30, !dbg !323 + %7425 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 31, !dbg !323 + %7426 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 32, !dbg !323 + %7427 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 33, !dbg !323 + %7428 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 34, !dbg !323 + %7429 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 35, !dbg !323 + %7430 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 36, !dbg !323 + %7431 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 37, !dbg !323 + %7432 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 38, !dbg !323 + %7433 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 39, !dbg !323 + %7434 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 40, !dbg !323 + %7435 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 41, !dbg !323 + %7436 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 42, !dbg !323 + %7437 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 43, !dbg !323 + %7438 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 44, !dbg !323 + %7439 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 45, !dbg !323 + %7440 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 46, !dbg !323 + %7441 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 47, !dbg !323 + %7442 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 48, !dbg !323 + %7443 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 49, !dbg !323 + %7444 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 50, !dbg !323 + %7445 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 51, !dbg !323 + %7446 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 52, !dbg !323 + %7447 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 53, !dbg !323 + %7448 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 54, !dbg !323 + %7449 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 55, !dbg !323 + %7450 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 56, !dbg !323 + %7451 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 57, !dbg !323 + %7452 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 58, !dbg !323 + %7453 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 59, !dbg !323 + %7454 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float } %7388, 60, !dbg !323 + %7455 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 61, !dbg !323 + %7456 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 62, !dbg !323 + %7457 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 63, !dbg !323 + %7458 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %7394, float %7395, float %7396, float %7397, float %7398, float %7399, float %7400, float %7401, float %7402, float %7403, float %7404, float %7405, float %7406, float %7407, float %7408, float %7409, float %7410, float %7411, float %7412, float %7413, float %7414, float %7415, float %7416, float %7417, float %7418, float %7419, float %7420, float %7421, float %7422, float %7423, float %7424, float %7425, float %7426, float %7427, float %7428, float %7429, float %7430, float %7431, float %7432, float %7433, float %7434, float %7435, float %7436, float %7437, float %7438, float %7439, float %7440, float %7441, float %7442, float %7443, float %7444, float %7445, float %7446, float %7447, float %7448, float %7449, float %7450, float %7451, float %7452, float %7453, float %7454, float %7455, float %7456, float %7457, i32 %7305, i32 %7306, i32 %7307, i32 %7308, i64 %7393, i1 true) #3, !dbg !323 + %7459 = add i32 %7313, 6144, !dbg !323 + %7460 = lshr exact i32 %7459, 4, !dbg !323 + %7461 = and i32 %7460, 16383, !dbg !323 + %7462 = zext nneg i32 %7461 to i64, !dbg !323 + %7463 = or disjoint i64 %7462, 4611686293338849280, !dbg !323 + %7464 = extractvalue { float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 0, !dbg !323 + %7465 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 1, !dbg !323 + %7466 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 2, !dbg !323 + %7467 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 3, !dbg !323 + %7468 = extractvalue { float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 4, !dbg !323 + %7469 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 5, !dbg !323 + %7470 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 6, !dbg !323 + %7471 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 7, !dbg !323 + %7472 = extractvalue { float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 8, !dbg !323 + %7473 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 9, !dbg !323 + %7474 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 10, !dbg !323 + %7475 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 11, !dbg !323 + %7476 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 12, !dbg !323 + %7477 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 13, !dbg !323 + %7478 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 14, !dbg !323 + %7479 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 15, !dbg !323 + %7480 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 16, !dbg !323 + %7481 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 17, !dbg !323 + %7482 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 18, !dbg !323 + %7483 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 19, !dbg !323 + %7484 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 20, !dbg !323 + %7485 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 21, !dbg !323 + %7486 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 22, !dbg !323 + %7487 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 23, !dbg !323 + %7488 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 24, !dbg !323 + %7489 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 25, !dbg !323 + %7490 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 26, !dbg !323 + %7491 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 27, !dbg !323 + %7492 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 28, !dbg !323 + %7493 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 29, !dbg !323 + %7494 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 30, !dbg !323 + %7495 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 31, !dbg !323 + %7496 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 32, !dbg !323 + %7497 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 33, !dbg !323 + %7498 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 34, !dbg !323 + %7499 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 35, !dbg !323 + %7500 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 36, !dbg !323 + %7501 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 37, !dbg !323 + %7502 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 38, !dbg !323 + %7503 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 39, !dbg !323 + %7504 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 40, !dbg !323 + %7505 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 41, !dbg !323 + %7506 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 42, !dbg !323 + %7507 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 43, !dbg !323 + %7508 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 44, !dbg !323 + %7509 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 45, !dbg !323 + %7510 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 46, !dbg !323 + %7511 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 47, !dbg !323 + %7512 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 48, !dbg !323 + %7513 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 49, !dbg !323 + %7514 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 50, !dbg !323 + %7515 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 51, !dbg !323 + %7516 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 52, !dbg !323 + %7517 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 53, !dbg !323 + %7518 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 54, !dbg !323 + %7519 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 55, !dbg !323 + %7520 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 56, !dbg !323 + %7521 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 57, !dbg !323 + %7522 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 58, !dbg !323 + %7523 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 59, !dbg !323 + %7524 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 60, !dbg !323 + %7525 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 61, !dbg !323 + %7526 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 62, !dbg !323 + %7527 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 63, !dbg !323 + %7528 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %7464, float %7465, float %7466, float %7467, float %7468, float %7469, float %7470, float %7471, float %7472, float %7473, float %7474, float %7475, float %7476, float %7477, float %7478, float %7479, float %7480, float %7481, float %7482, float %7483, float %7484, float %7485, float %7486, float %7487, float %7488, float %7489, float %7490, float %7491, float %7492, float %7493, float %7494, float %7495, float %7496, float %7497, float %7498, float %7499, float %7500, float %7501, float %7502, float %7503, float %7504, float %7505, float %7506, float %7507, float %7508, float %7509, float %7510, float %7511, float %7512, float %7513, float %7514, float %7515, float %7516, float %7517, float %7518, float %7519, float %7520, float %7521, float %7522, float %7523, float %7524, float %7525, float %7526, float %7527, i32 %7309, i32 %7310, i32 %7311, i32 %7312, i64 %7463, i1 true) #3, !dbg !323 + %7529 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 0, !dbg !323 + %7530 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 1, !dbg !323 + %7531 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 2, !dbg !323 + %7532 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 3, !dbg !323 + %7533 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 4, !dbg !323 + %7534 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 5, !dbg !323 + %7535 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 6, !dbg !323 + %7536 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 7, !dbg !323 + %7537 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 8, !dbg !323 + %7538 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 9, !dbg !323 + %7539 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 10, !dbg !323 + %7540 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 11, !dbg !323 + %7541 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 12, !dbg !323 + %7542 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 13, !dbg !323 + %7543 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 14, !dbg !323 + %7544 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 15, !dbg !323 + %7545 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 16, !dbg !323 + %7546 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 17, !dbg !323 + %7547 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 18, !dbg !323 + %7548 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 19, !dbg !323 + %7549 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 20, !dbg !323 + %7550 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 21, !dbg !323 + %7551 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 22, !dbg !323 + %7552 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 23, !dbg !323 + %7553 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 24, !dbg !323 + %7554 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 25, !dbg !323 + %7555 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 26, !dbg !323 + %7556 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 27, !dbg !323 + %7557 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 28, !dbg !323 + %7558 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 29, !dbg !323 + %7559 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 30, !dbg !323 + %7560 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 31, !dbg !323 + %7561 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 32, !dbg !323 + %7562 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 33, !dbg !323 + %7563 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 34, !dbg !323 + %7564 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 35, !dbg !323 + %7565 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 36, !dbg !323 + %7566 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 37, !dbg !323 + %7567 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 38, !dbg !323 + %7568 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 39, !dbg !323 + %7569 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 40, !dbg !323 + %7570 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 41, !dbg !323 + %7571 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 42, !dbg !323 + %7572 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 43, !dbg !323 + %7573 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 44, !dbg !323 + %7574 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 45, !dbg !323 + %7575 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 46, !dbg !323 + %7576 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 47, !dbg !323 + %7577 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 48, !dbg !323 + %7578 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 49, !dbg !323 + %7579 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 50, !dbg !323 + %7580 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 51, !dbg !323 + %7581 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 52, !dbg !323 + %7582 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 53, !dbg !323 + %7583 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 54, !dbg !323 + %7584 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 55, !dbg !323 + %7585 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 56, !dbg !323 + %7586 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 57, !dbg !323 + %7587 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 58, !dbg !323 + %7588 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 59, !dbg !323 + %7589 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 60, !dbg !323 + %7590 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 61, !dbg !323 + %7591 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 62, !dbg !323 + %7592 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 63, !dbg !323 + tail call void @llvm.nvvm.wgmma.commit_group.sync.aligned(), !dbg !323 + %7593 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98816), i32 %6182, !dbg !298 + %7594 = getelementptr inbounds nuw i8, ptr addrspace(3) %7593, i32 %5332, !dbg !298 + %7595 = getelementptr inbounds nuw i8, ptr addrspace(3) %7593, i32 %5336, !dbg !298 + %7596 = load float, ptr 
addrspace(3) %7595, align 8, !dbg !298 + %7597 = getelementptr inbounds nuw i8, ptr addrspace(3) %7595, i32 4, !dbg !298 + %7598 = load float, ptr addrspace(3) %7597, align 4, !dbg !298 + %7599 = getelementptr inbounds nuw i8, ptr addrspace(3) %7593, i32 %5342, !dbg !298 + %7600 = load float, ptr addrspace(3) %7599, align 8, !dbg !298 + %7601 = getelementptr inbounds nuw i8, ptr addrspace(3) %7599, i32 4, !dbg !298 + %7602 = load float, ptr addrspace(3) %7601, align 4, !dbg !298 + %7603 = getelementptr inbounds nuw i8, ptr addrspace(3) %7593, i32 %5348, !dbg !298 + %7604 = load float, ptr addrspace(3) %7603, align 8, !dbg !298 + %7605 = getelementptr inbounds nuw i8, ptr addrspace(3) %7603, i32 4, !dbg !298 + %7606 = load float, ptr addrspace(3) %7605, align 4, !dbg !298 + %7607 = getelementptr inbounds nuw i8, ptr addrspace(3) %7593, i32 %5354, !dbg !298 + %7608 = load float, ptr addrspace(3) %7607, align 8, !dbg !298 + %7609 = getelementptr inbounds nuw i8, ptr addrspace(3) %7607, i32 4, !dbg !298 + %7610 = load float, ptr addrspace(3) %7609, align 4, !dbg !298 + %7611 = getelementptr inbounds nuw i8, ptr addrspace(3) %7593, i32 %5360, !dbg !298 + %7612 = load float, ptr addrspace(3) %7611, align 8, !dbg !298 + %7613 = getelementptr inbounds nuw i8, ptr addrspace(3) %7611, i32 4, !dbg !298 + %7614 = load float, ptr addrspace(3) %7613, align 4, !dbg !298 + %7615 = getelementptr inbounds nuw i8, ptr addrspace(3) %7593, i32 %5366, !dbg !298 + %7616 = load float, ptr addrspace(3) %7615, align 8, !dbg !298 + %7617 = getelementptr inbounds nuw i8, ptr addrspace(3) %7615, i32 4, !dbg !298 + %7618 = load float, ptr addrspace(3) %7617, align 4, !dbg !298 + %7619 = getelementptr inbounds nuw i8, ptr addrspace(3) %7593, i32 %5372, !dbg !298 + %7620 = load float, ptr addrspace(3) %7619, align 8, !dbg !298 + %7621 = getelementptr inbounds nuw i8, ptr addrspace(3) %7619, i32 4, !dbg !298 + %7622 = load float, ptr addrspace(3) %7621, align 4, !dbg !298 + %7623 = add i32 %6250, 
ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096) to i32), !dbg !324 + %7624 = lshr exact i32 %7623, 4, !dbg !324 + %7625 = and i32 %7624, 16383, !dbg !324 + %7626 = zext nneg i32 %7625 to i64, !dbg !324 + %7627 = or disjoint i64 %7626, 4611686293372403712, !dbg !324 + %7628 = add i32 %6262, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096) to i32), !dbg !324 + %7629 = lshr exact i32 %7628, 4, !dbg !324 + %7630 = and i32 %7629, 16383, !dbg !324 + %7631 = zext nneg i32 %7630 to i64, !dbg !324 + %7632 = or disjoint i64 %7631, 4611686293372403712, !dbg !324 + %7633 = add i32 %7313, 32, !dbg !324 + %7634 = lshr exact i32 %7633, 4, !dbg !324 + %7635 = and i32 %7634, 16383, !dbg !324 + %7636 = zext nneg i32 %7635 to i64, !dbg !324 + %7637 = or disjoint i64 %7636, 4611686293338849280, !dbg !324 + %7638 = add i32 %6306, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096) to i32), !dbg !324 + %7639 = lshr exact i32 %7638, 4, !dbg !324 + %7640 = and i32 %7639, 16383, !dbg !324 + %7641 = zext nneg i32 %7640 to i64, !dbg !324 + %7642 = or disjoint i64 %7641, 4611686293372403712, !dbg !324 + %7643 = add i32 %7313, 64, !dbg !324 + %7644 = lshr exact i32 %7643, 4, !dbg !324 + %7645 = and i32 %7644, 16383, !dbg !324 + %7646 = zext nneg i32 %7645 to i64, !dbg !324 + %7647 = or disjoint i64 %7646, 4611686293338849280, !dbg !324 + %7648 = add i32 %6350, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096) to i32), !dbg !324 + %7649 = lshr exact i32 %7648, 4, !dbg !324 + %7650 = and i32 %7649, 16383, !dbg !324 + %7651 = zext nneg i32 %7650 to i64, !dbg !324 + %7652 = or disjoint i64 %7651, 4611686293372403712, !dbg !324 + %7653 = add i32 %7313, 96, !dbg !324 + %7654 = lshr exact i32 %7653, 4, !dbg !324 + %7655 = and i32 %7654, 16383, !dbg !324 + %7656 = zext nneg i32 %7655 to i64, !dbg !324 + %7657 = or disjoint i64 %7656, 
4611686293338849280, !dbg !324 + %7658 = add i32 %6394, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096) to i32), !dbg !324 + %7659 = lshr exact i32 %7658, 4, !dbg !324 + %7660 = and i32 %7659, 16383, !dbg !324 + %7661 = zext nneg i32 %7660 to i64, !dbg !324 + %7662 = or disjoint i64 %7661, 4611686293372403712, !dbg !324 + %7663 = add i32 %7313, 8192, !dbg !324 + %7664 = lshr exact i32 %7663, 4, !dbg !324 + %7665 = and i32 %7664, 16383, !dbg !324 + %7666 = zext nneg i32 %7665 to i64, !dbg !324 + %7667 = or disjoint i64 %7666, 4611686293338849280, !dbg !324 + %7668 = add i32 %6438, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096) to i32), !dbg !324 + %7669 = lshr exact i32 %7668, 4, !dbg !324 + %7670 = and i32 %7669, 16383, !dbg !324 + %7671 = zext nneg i32 %7670 to i64, !dbg !324 + %7672 = or disjoint i64 %7671, 4611686293372403712, !dbg !324 + %7673 = add i32 %7313, 8224, !dbg !324 + %7674 = lshr exact i32 %7673, 4, !dbg !324 + %7675 = and i32 %7674, 16383, !dbg !324 + %7676 = zext nneg i32 %7675 to i64, !dbg !324 + %7677 = or disjoint i64 %7676, 4611686293338849280, !dbg !324 + %7678 = add i32 %6482, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096) to i32), !dbg !324 + %7679 = lshr exact i32 %7678, 4, !dbg !324 + %7680 = and i32 %7679, 16383, !dbg !324 + %7681 = zext nneg i32 %7680 to i64, !dbg !324 + %7682 = or disjoint i64 %7681, 4611686293372403712, !dbg !324 + %7683 = add i32 %7313, 8256, !dbg !324 + %7684 = lshr exact i32 %7683, 4, !dbg !324 + %7685 = and i32 %7684, 16383, !dbg !324 + %7686 = zext nneg i32 %7685 to i64, !dbg !324 + %7687 = or disjoint i64 %7686, 4611686293338849280, !dbg !324 + %7688 = add i32 %6526, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096) to i32), !dbg !324 + %7689 = lshr exact i32 %7688, 4, !dbg !324 + %7690 = and i32 %7689, 16383, !dbg !324 + %7691 = zext nneg i32 %7690 to 
i64, !dbg !324 + %7692 = or disjoint i64 %7691, 4611686293372403712, !dbg !324 + %7693 = add i32 %7313, 8288, !dbg !324 + %7694 = lshr exact i32 %7693, 4, !dbg !324 + %7695 = and i32 %7694, 16383, !dbg !324 + %7696 = zext nneg i32 %7695 to i64, !dbg !324 + %7697 = or disjoint i64 %7696, 4611686293338849280, !dbg !324 + %7698 = add i32 %6256, 2048, !dbg !325 + %7699 = lshr exact i32 %7698, 4, !dbg !325 + %7700 = and i32 %7699, 16383, !dbg !325 + %7701 = zext nneg i32 %7700 to i64, !dbg !325 + %7702 = or disjoint i64 %7701, 4611686293338849280, !dbg !325 + %7703 = add i32 %6256, 4096, !dbg !325 + %7704 = lshr exact i32 %7703, 4, !dbg !325 + %7705 = and i32 %7704, 16383, !dbg !325 + %7706 = zext nneg i32 %7705 to i64, !dbg !325 + %7707 = or disjoint i64 %7706, 4611686293338849280, !dbg !325 + %7708 = add i32 %6256, 6144, !dbg !325 + %7709 = lshr exact i32 %7708, 4, !dbg !325 + %7710 = and i32 %7709, 16383, !dbg !325 + %7711 = zext nneg i32 %7710 to i64, !dbg !325 + %7712 = or disjoint i64 %7711, 4611686293338849280, !dbg !325 + %7713 = insertelement <2 x i32> poison, i32 %5996, i64 0, !dbg !326 + %7714 = shufflevector <2 x i32> %7713, <2 x i32> poison, <2 x i32> zeroinitializer, !dbg !326 + %7715 = add <2 x i32> %6152, %7714, !dbg !326 + %7716 = add i32 %5994, %5996, !dbg !326 + %7717 = add i32 %5995, %5996, !dbg !326 + %7718 = insertelement <4 x i32> poison, i32 %5996, i64 0, !dbg !326 + %7719 = shufflevector <4 x i32> %7718, <4 x i32> poison, <4 x i32> zeroinitializer, !dbg !326 + %7720 = add <4 x i32> %6156, %7719, !dbg !326 + %7721 = insertelement <8 x i32> poison, i32 %5996, i64 0, !dbg !326 + %7722 = shufflevector <8 x i32> %7721, <8 x i32> poison, <8 x i32> zeroinitializer, !dbg !326 + %7723 = add <8 x i32> %6155, %7722, !dbg !326 + %7724 = add nuw nsw i32 %6151, 1, !dbg !255 + %7725 = lshr i32 %7724, 1, !dbg !327 + %7726 = zext nneg i32 %7725 to i64, !dbg !328 + %7727 = getelementptr i32, ptr addrspace(1) %5128, i64 %7726, !dbg !328 + %7728 = add nuw nsw i32 
%7725, 1, !dbg !329 + %7729 = icmp slt i32 %7728, %5133, !dbg !330 + %7730 = getelementptr i8, ptr addrspace(1) %7727, i64 4, !dbg !331 + %7731 = and i1 %6158, %7729, !dbg !255 + %7732 = and i32 %6151, 1, !dbg !332 + %7733 = xor i32 %7732, 1, !dbg !333 + %7734 = shl nuw nsw i32 %7732, 6, !dbg !334 + %7735 = load <2 x float>, ptr addrspace(3) %7594, align 8, !dbg !298 + tail call void @llvm.nvvm.wgmma.fence.sync.aligned(), !dbg !324 + %7736 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $32, $33, 0, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,l,l"(i64 %7627, i64 %7317) #3, !dbg !324 + %7737 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7736, 0, !dbg !324 + %7738 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7736, 1, !dbg !324 + %7739 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7736, 2, !dbg !324 + %7740 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7736, 3, !dbg !324 + %7741 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7736, 4, !dbg !324 + %7742 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7736, 5, !dbg !324 + %7743 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7736, 6, !dbg !324 + %7744 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7736, 7, !dbg !324 + %7745 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7736, 8, !dbg !324 + %7746 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7736, 9, !dbg !324 + %7747 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7736, 10, 
!dbg !324 + %7748 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7736, 11, !dbg !324 + %7749 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7736, 12, !dbg !324 + %7750 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7736, 13, !dbg !324 + %7751 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7736, 14, !dbg !324 + %7752 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7736, 15, !dbg !324 + %7753 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7736, 16, !dbg !324 + %7754 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7736, 17, !dbg !324 + %7755 = extractvalue { float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7736, 18, !dbg !324 + %7756 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7736, 19, !dbg !324 + %7757 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7736, 20, !dbg !324 + %7758 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7736, 21, !dbg !324 + %7759 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7736, 22, !dbg !324 + %7760 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7736, 23, !dbg !324 + %7761 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7736, 24, !dbg !324 + %7762 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float } %7736, 25, !dbg !324 + %7763 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7736, 26, !dbg !324 + %7764 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7736, 27, !dbg !324 + %7765 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7736, 28, !dbg !324 + %7766 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7736, 29, !dbg !324 + %7767 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7736, 30, !dbg !324 + %7768 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7736, 31, !dbg !324 + %7769 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect 
"wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %7737, float %7738, float %7739, float %7740, float %7741, float %7742, float %7743, float %7744, float %7745, float %7746, float %7747, float %7748, float %7749, float %7750, float %7751, float %7752, float %7753, float %7754, float %7755, float %7756, float %7757, float %7758, float %7759, float %7760, float %7761, float %7762, float %7763, float %7764, float %7765, float %7766, float %7767, float %7768, i64 %7632, i64 %7637, i1 true) #3, !dbg !324 + %7770 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7769, 0, !dbg !324 + %7771 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7769, 1, !dbg !324 + %7772 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7769, 2, !dbg !324 + %7773 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7769, 3, !dbg !324 + %7774 = extractvalue { float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7769, 4, !dbg !324 + %7775 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7769, 5, !dbg !324 + %7776 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7769, 6, !dbg !324 + %7777 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7769, 7, !dbg !324 + %7778 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7769, 8, !dbg !324 + %7779 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7769, 9, !dbg !324 + %7780 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7769, 10, !dbg !324 + %7781 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float } %7769, 11, !dbg !324 + %7782 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7769, 12, !dbg !324 + %7783 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7769, 13, !dbg !324 + %7784 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7769, 14, !dbg !324 + %7785 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7769, 15, !dbg !324 + %7786 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7769, 16, !dbg !324 + %7787 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7769, 17, !dbg !324 + %7788 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7769, 18, !dbg 
!324 + %7789 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7769, 19, !dbg !324 + %7790 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7769, 20, !dbg !324 + %7791 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7769, 21, !dbg !324 + %7792 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7769, 22, !dbg !324 + %7793 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7769, 23, !dbg !324 + %7794 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7769, 24, !dbg !324 + %7795 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7769, 25, !dbg !324 + %7796 = extractvalue { float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7769, 26, !dbg !324 + %7797 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7769, 27, !dbg !324 + %7798 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7769, 28, !dbg !324 + %7799 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7769, 29, !dbg !324 + %7800 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7769, 30, !dbg !324 + %7801 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7769, 31, !dbg !324 + %7802 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", 
"=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %7770, float %7771, float %7772, float %7773, float %7774, float %7775, float %7776, float %7777, float %7778, float %7779, float %7780, float %7781, float %7782, float %7783, float %7784, float %7785, float %7786, float %7787, float %7788, float %7789, float %7790, float %7791, float %7792, float %7793, float %7794, float %7795, float %7796, float %7797, float %7798, float %7799, float %7800, float %7801, i64 %7642, i64 %7647, i1 true) #3, !dbg !324 + %7803 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7802, 0, !dbg !324 + %7804 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7802, 1, !dbg !324 + %7805 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7802, 2, !dbg !324 + %7806 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7802, 3, !dbg !324 + %7807 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7802, 4, !dbg 
!324 + %7808 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7802, 5, !dbg !324 + %7809 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7802, 6, !dbg !324 + %7810 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7802, 7, !dbg !324 + %7811 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7802, 8, !dbg !324 + %7812 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7802, 9, !dbg !324 + %7813 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7802, 10, !dbg !324 + %7814 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7802, 11, !dbg !324 + %7815 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7802, 12, !dbg !324 + %7816 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7802, 13, !dbg !324 + %7817 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7802, 14, !dbg !324 + %7818 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7802, 15, !dbg !324 + %7819 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7802, 16, !dbg !324 + %7820 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7802, 17, !dbg !324 + %7821 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7802, 18, !dbg !324 + %7822 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float } %7802, 19, !dbg !324 + %7823 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7802, 20, !dbg !324 + %7824 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7802, 21, !dbg !324 + %7825 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7802, 22, !dbg !324 + %7826 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7802, 23, !dbg !324 + %7827 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7802, 24, !dbg !324 + %7828 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7802, 25, !dbg !324 + %7829 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7802, 26, !dbg !324 + %7830 = extractvalue { float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7802, 27, !dbg !324 + %7831 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7802, 28, !dbg !324 + %7832 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7802, 29, !dbg !324 + %7833 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7802, 30, !dbg !324 + %7834 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7802, 31, !dbg !324 + %7835 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %7803, float 
%7804, float %7805, float %7806, float %7807, float %7808, float %7809, float %7810, float %7811, float %7812, float %7813, float %7814, float %7815, float %7816, float %7817, float %7818, float %7819, float %7820, float %7821, float %7822, float %7823, float %7824, float %7825, float %7826, float %7827, float %7828, float %7829, float %7830, float %7831, float %7832, float %7833, float %7834, i64 %7652, i64 %7657, i1 true) #3, !dbg !324 + %7836 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7835, 0, !dbg !324 + %7837 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7835, 1, !dbg !324 + %7838 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7835, 2, !dbg !324 + %7839 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7835, 3, !dbg !324 + %7840 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7835, 4, !dbg !324 + %7841 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float } %7835, 5, !dbg !324 + %7842 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7835, 6, !dbg !324 + %7843 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7835, 7, !dbg !324 + %7844 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7835, 8, !dbg !324 + %7845 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7835, 9, !dbg !324 + %7846 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7835, 10, !dbg !324 + %7847 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7835, 11, !dbg !324 + %7848 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7835, 12, !dbg !324 + %7849 = extractvalue { float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7835, 13, !dbg !324 + %7850 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7835, 14, !dbg !324 + %7851 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7835, 15, !dbg !324 + %7852 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7835, 16, !dbg !324 + %7853 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7835, 17, !dbg !324 + %7854 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7835, 18, !dbg !324 + %7855 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7835, 19, !dbg !324 + %7856 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7835, 20, !dbg !324 + %7857 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7835, 21, !dbg !324 + %7858 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7835, 22, !dbg !324 + %7859 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7835, 23, !dbg !324 + %7860 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7835, 24, !dbg !324 + %7861 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7835, 25, !dbg !324 + %7862 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7835, 26, !dbg !324 + %7863 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float } %7835, 27, !dbg !324 + %7864 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7835, 28, !dbg !324 + %7865 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7835, 29, !dbg !324 + %7866 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7835, 30, !dbg !324 + %7867 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7835, 31, !dbg !324 + %7868 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %7836, float %7837, float %7838, float %7839, float %7840, float %7841, float %7842, float %7843, float %7844, float %7845, float %7846, float %7847, float %7848, float %7849, float %7850, float %7851, float %7852, 
float %7853, float %7854, float %7855, float %7856, float %7857, float %7858, float %7859, float %7860, float %7861, float %7862, float %7863, float %7864, float %7865, float %7866, float %7867, i64 %7662, i64 %7667, i1 true) #3, !dbg !324 + %7869 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7868, 0, !dbg !324 + %7870 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7868, 1, !dbg !324 + %7871 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7868, 2, !dbg !324 + %7872 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7868, 3, !dbg !324 + %7873 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7868, 4, !dbg !324 + %7874 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7868, 5, !dbg !324 + %7875 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7868, 6, !dbg !324 + %7876 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7868, 7, !dbg !324 + %7877 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7868, 8, !dbg !324 + %7878 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7868, 9, !dbg !324 + %7879 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7868, 10, !dbg !324 + %7880 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7868, 11, !dbg !324 + %7881 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7868, 12, !dbg !324 + %7882 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float } %7868, 13, !dbg !324 + %7883 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7868, 14, !dbg !324 + %7884 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7868, 15, !dbg !324 + %7885 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7868, 16, !dbg !324 + %7886 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7868, 17, !dbg !324 + %7887 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7868, 18, !dbg !324 + %7888 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7868, 19, !dbg !324 + %7889 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7868, 20, !dbg !324 + %7890 = extractvalue { float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7868, 21, !dbg !324 + %7891 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7868, 22, !dbg !324 + %7892 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7868, 23, !dbg !324 + %7893 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7868, 24, !dbg !324 + %7894 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7868, 25, !dbg !324 + %7895 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7868, 26, !dbg !324 + %7896 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7868, 27, !dbg !324 + %7897 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float } %7868, 28, !dbg !324 + %7898 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7868, 29, !dbg !324 + %7899 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7868, 30, !dbg !324 + %7900 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7868, 31, !dbg !324 + %7901 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %7869, float %7870, float %7871, float %7872, float %7873, float %7874, float %7875, float %7876, float %7877, float %7878, float %7879, float %7880, float %7881, float %7882, float %7883, float %7884, float %7885, float %7886, float %7887, float %7888, float %7889, float %7890, float %7891, float %7892, float %7893, float %7894, float %7895, float %7896, float %7897, float %7898, float %7899, float %7900, i64 %7672, i64 
%7677, i1 true) #3, !dbg !324 + %7902 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7901, 0, !dbg !324 + %7903 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7901, 1, !dbg !324 + %7904 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7901, 2, !dbg !324 + %7905 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7901, 3, !dbg !324 + %7906 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7901, 4, !dbg !324 + %7907 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7901, 5, !dbg !324 + %7908 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7901, 6, !dbg !324 + %7909 = extractvalue { float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7901, 7, !dbg !324 + %7910 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7901, 8, !dbg !324 + %7911 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7901, 9, !dbg !324 + %7912 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7901, 10, !dbg !324 + %7913 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7901, 11, !dbg !324 + %7914 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7901, 12, !dbg !324 + %7915 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7901, 13, !dbg !324 + %7916 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float } %7901, 14, !dbg !324 + %7917 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7901, 15, !dbg !324 + %7918 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7901, 16, !dbg !324 + %7919 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7901, 17, !dbg !324 + %7920 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7901, 18, !dbg !324 + %7921 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7901, 19, !dbg !324 + %7922 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7901, 20, !dbg !324 + %7923 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7901, 21, !dbg !324 + %7924 = 
extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7901, 22, !dbg !324 + %7925 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7901, 23, !dbg !324 + %7926 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7901, 24, !dbg !324 + %7927 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7901, 25, !dbg !324 + %7928 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7901, 26, !dbg !324 + %7929 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7901, 27, !dbg !324 + %7930 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7901, 28, !dbg !324 + %7931 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7901, 29, !dbg !324 + %7932 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7901, 30, !dbg !324 + %7933 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7901, 31, !dbg !324 + %7934 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %7902, float %7903, float %7904, float %7905, float %7906, float %7907, float %7908, float %7909, float %7910, float %7911, float %7912, float %7913, float %7914, float %7915, float %7916, float %7917, float %7918, float %7919, float %7920, float %7921, float %7922, float %7923, float %7924, float %7925, float %7926, float %7927, float %7928, float %7929, float %7930, float %7931, float %7932, float %7933, i64 %7682, i64 %7687, i1 true) #3, !dbg !324 + %7935 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float } %7934, 0, !dbg !324 + %7936 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7934, 1, !dbg !324 + %7937 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7934, 2, !dbg !324 + %7938 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7934, 3, !dbg !324 + %7939 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7934, 4, !dbg !324 + %7940 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7934, 5, !dbg !324 + %7941 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7934, 6, !dbg !324 + %7942 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7934, 7, !dbg !324 + %7943 = 
extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7934, 8, !dbg !324 + %7944 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7934, 9, !dbg !324 + %7945 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7934, 10, !dbg !324 + %7946 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7934, 11, !dbg !324 + %7947 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7934, 12, !dbg !324 + %7948 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7934, 13, !dbg !324 + %7949 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7934, 14, !dbg !324 + %7950 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7934, 15, !dbg !324 + %7951 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7934, 16, !dbg !324 + %7952 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7934, 17, !dbg !324 + %7953 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7934, 18, !dbg !324 + %7954 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7934, 19, !dbg !324 + %7955 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7934, 20, !dbg !324 + %7956 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7934, 21, !dbg !324 + %7957 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float } %7934, 22, !dbg !324 + %7958 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7934, 23, !dbg !324 + %7959 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7934, 24, !dbg !324 + %7960 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7934, 25, !dbg !324 + %7961 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7934, 26, !dbg !324 + %7962 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7934, 27, !dbg !324 + %7963 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7934, 28, !dbg !324 + %7964 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7934, 29, !dbg !324 + %7965 = extractvalue { float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7934, 30, !dbg !324 + %7966 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7934, 31, !dbg !324 + %7967 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %7935, float %7936, float %7937, float %7938, float %7939, float %7940, float %7941, float %7942, float %7943, float %7944, float %7945, float %7946, float %7947, float %7948, float %7949, float %7950, float %7951, float %7952, float %7953, float %7954, float %7955, float %7956, float %7957, float %7958, float %7959, float %7960, float %7961, float %7962, float %7963, float %7964, float %7965, float %7966, i64 %7692, i64 %7697, i1 true) #3, !dbg !324 + %7968 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7967, 0, !dbg !324 + %7969 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7967, 1, !dbg !324 + %7970 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7967, 2, !dbg !324 + %7971 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7967, 3, !dbg !324 + %7972 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7967, 4, !dbg !324 + %7973 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7967, 5, !dbg !324 + %7974 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7967, 6, !dbg !324 + %7975 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7967, 7, !dbg !324 + %7976 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float } %7967, 8, !dbg !324 + %7977 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7967, 9, !dbg !324 + %7978 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7967, 10, !dbg !324 + %7979 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7967, 11, !dbg !324 + %7980 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7967, 12, !dbg !324 + %7981 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7967, 13, !dbg !324 + %7982 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7967, 14, !dbg !324 + %7983 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7967, 15, !dbg !324 + %7984 = extractvalue { float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7967, 16, !dbg !324 + %7985 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7967, 17, !dbg !324 + %7986 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7967, 18, !dbg !324 + %7987 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7967, 19, !dbg !324 + %7988 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7967, 20, !dbg !324 + %7989 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7967, 21, !dbg !324 + %7990 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7967, 22, !dbg !324 + %7991 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7967, 23, !dbg !324 + %7992 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7967, 24, !dbg !324 + %7993 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7967, 25, !dbg !324 + %7994 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7967, 26, !dbg !324 + %7995 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7967, 27, !dbg !324 + %7996 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7967, 28, !dbg !324 + %7997 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7967, 29, !dbg !324 + %7998 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float } %7967, 30, !dbg !324 + %7999 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7967, 31, !dbg !324 + tail call void @llvm.nvvm.wgmma.commit_group.sync.aligned(), !dbg !324 + %8000 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } asm sideeffect "// wait for regs: $0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37\0A\09wgmma.wait_group.sync.aligned 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37"(float %7968, float %7969, float %7970, float %7971, float %7972, float %7973, float %7974, float %7975, float %7976, float %7977, float %7978, float %7979, float %7980, float %7981, float %7982, float %7983, float %7984, float %7985, float %7986, float %7987, float %7988, float %7989, float %7990, float %7991, float %7992, float %7993, float %7994, float %7995, float %7996, float %7997, float %7998, float %7999, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096), i32 0, i32 0, ptr addrspace(3) %7248, i32 0, i32 0) #3, !dbg !324 + %8001 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8000, 0, !dbg !324 
+ %8002 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8000, 1, !dbg !324 + %8003 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8000, 2, !dbg !324 + %8004 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8000, 3, !dbg !324 + %8005 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8000, 4, !dbg !324 + %8006 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8000, 5, !dbg !324 + %8007 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8000, 6, !dbg !324 + %8008 = extractvalue { float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8000, 7, !dbg !324 + %8009 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8000, 8, !dbg !324 + %8010 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8000, 9, !dbg !324 + %8011 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8000, 10, !dbg !324 + %8012 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8000, 11, !dbg !324 + %8013 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8000, 12, !dbg !324 + %8014 = extractvalue { float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8000, 13, !dbg !324 + %8015 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8000, 14, !dbg !324 + %8016 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8000, 15, !dbg !324 + %8017 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8000, 16, !dbg !324 + %8018 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8000, 17, !dbg !324 + %8019 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8000, 18, !dbg !324 + %8020 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8000, 19, !dbg !324 + %8021 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8000, 20, !dbg !324 + %8022 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8000, 21, !dbg !324 + %8023 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8000, 22, !dbg !324 + %8024 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8000, 23, !dbg !324 + %8025 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8000, 24, !dbg !324 + %8026 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8000, 25, !dbg !324 + %8027 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8000, 26, !dbg !324 + %8028 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8000, 27, !dbg !324 + %8029 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8000, 28, !dbg !324 + %8030 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8000, 29, !dbg !324 + %8031 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8000, 30, !dbg !324 + %8032 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8000, 31, !dbg !324 + %8033 = insertelement <2 x float> poison, float %8003, i64 0, !dbg !335 + %8034 = insertelement <2 x float> %8033, float %8004, i64 1, !dbg !335 + %8035 = fsub <2 x float> %8034, %7735, !dbg !335 + %8036 = fsub float %8005, %7596, !dbg !335 + %8037 = fsub float %8006, %7598, !dbg !335 + %8038 = fsub float %8007, %7596, !dbg !335 + %8039 = fsub float %8008, %7598, !dbg !335 + %8040 = fsub float %8009, %7600, !dbg !335 + %8041 = fsub float %8010, %7602, !dbg !335 + %8042 = fsub float %8011, %7600, !dbg !335 + %8043 = fsub float %8012, %7602, !dbg !335 + %8044 = fsub float %8013, %7604, !dbg !335 + %8045 = fsub float %8014, %7606, !dbg !335 + %8046 = fsub float %8015, %7604, !dbg !335 + %8047 = fsub float %8016, %7606, !dbg !335 + %8048 = fsub float %8017, %7608, !dbg !335 + %8049 = fsub float %8018, %7610, !dbg !335 + %8050 = fsub float %8019, %7608, !dbg !335 + %8051 = fsub float %8020, %7610, !dbg !335 + %8052 = fsub float %8021, %7612, !dbg !335 + %8053 = fsub float %8022, %7614, !dbg !335 + %8054 = fsub float %8023, %7612, !dbg !335 + %8055 = fsub float %8024, %7614, !dbg !335 + %8056 = fsub float %8025, %7616, !dbg !335 + %8057 = fsub float %8026, %7618, !dbg !335 + %8058 = fsub float %8027, %7616, !dbg !335 + %8059 = fsub float %8028, %7618, !dbg !335 + %8060 = fsub float %8029, %7620, !dbg !335 + %8061 = fsub float %8030, %7622, !dbg !335 + %8062 = fsub float %8031, %7620, !dbg !335 + %8063 = fsub float %8032, %7622, !dbg !335 + %8064 = fmul <2 x float> %7253, %8035, !dbg !336 + %8065 = fmul float %.0.i1253, %8036, !dbg !336 + %8066 = fmul float %.0.i1256, %8037, !dbg !336 + %8067 = fmul float %.0.i1259, %8038, !dbg !336 + %8068 = fmul float %.0.i1262, %8039, !dbg !336 + %8069 = fmul float %.0.i1265, %8040, !dbg !336 + %8070 = fmul float %.0.i1268, %8041, !dbg !336 + %8071 = fmul float %.0.i1271, %8042, !dbg !336 
+ %8072 = fmul float %.0.i1274, %8043, !dbg !336 + %8073 = fmul float %.0.i1277, %8044, !dbg !336 + %8074 = fmul float %.0.i1280, %8045, !dbg !336 + %8075 = fmul float %.0.i1283, %8046, !dbg !336 + %8076 = fmul float %.0.i1286, %8047, !dbg !336 + %8077 = fmul float %.0.i1289, %8048, !dbg !336 + %8078 = fmul float %.0.i1292, %8049, !dbg !336 + %8079 = fmul float %.0.i1295, %8050, !dbg !336 + %8080 = fmul float %.0.i1298, %8051, !dbg !336 + %8081 = fmul float %.0.i1301, %8052, !dbg !336 + %8082 = fmul float %.0.i1304, %8053, !dbg !336 + %8083 = fmul float %.0.i1307, %8054, !dbg !336 + %8084 = fmul float %.0.i1310, %8055, !dbg !336 + %8085 = fmul float %.0.i1313, %8056, !dbg !336 + %8086 = fmul float %.0.i1316, %8057, !dbg !336 + %8087 = fmul float %.0.i1319, %8058, !dbg !336 + %8088 = fmul float %.0.i1322, %8059, !dbg !336 + %8089 = fmul float %.0.i1325, %8060, !dbg !336 + %8090 = fmul float %.0.i1328, %8061, !dbg !336 + %8091 = fmul float %.0.i1331, %8062, !dbg !336 + %8092 = fmul float %.0.i1334, %8063, !dbg !336 + %8093 = insertelement <2 x float> poison, float %8001, i64 0, !dbg !335 + %8094 = insertelement <2 x float> %8093, float %8002, i64 1, !dbg !335 + %8095 = fsub <2 x float> %8094, %7735, !dbg !335 + %8096 = fmul <2 x float> %7250, %8095, !dbg !336 + %8097 = fptrunc <2 x float> %8096 to <2 x bfloat>, !dbg !337 + %8098 = select <2 x i1> %6958, <2 x bfloat> %8097, <2 x bfloat> zeroinitializer, !dbg !338 + %8099 = fptrunc <2 x float> %8064 to <2 x bfloat>, !dbg !337 + %8100 = select <2 x i1> %6959, <2 x bfloat> %8099, <2 x bfloat> zeroinitializer, !dbg !338 + %8101 = fptrunc float %8065 to bfloat, !dbg !337 + %8102 = select i1 %6960, bfloat %8101, bfloat 0xR0000, !dbg !338 + %8103 = fptrunc float %8066 to bfloat, !dbg !337 + %8104 = select i1 %6961, bfloat %8103, bfloat 0xR0000, !dbg !338 + %8105 = fptrunc float %8067 to bfloat, !dbg !337 + %8106 = select i1 %6962, bfloat %8105, bfloat 0xR0000, !dbg !338 + %8107 = fptrunc float %8068 to bfloat, !dbg !337 + 
%8108 = select i1 %6963, bfloat %8107, bfloat 0xR0000, !dbg !338 + %8109 = fptrunc float %8069 to bfloat, !dbg !337 + %8110 = select i1 %6964, bfloat %8109, bfloat 0xR0000, !dbg !338 + %8111 = fptrunc float %8070 to bfloat, !dbg !337 + %8112 = select i1 %6965, bfloat %8111, bfloat 0xR0000, !dbg !338 + %8113 = fptrunc float %8071 to bfloat, !dbg !337 + %8114 = select i1 %6966, bfloat %8113, bfloat 0xR0000, !dbg !338 + %8115 = fptrunc float %8072 to bfloat, !dbg !337 + %8116 = select i1 %6967, bfloat %8115, bfloat 0xR0000, !dbg !338 + %8117 = fptrunc float %8073 to bfloat, !dbg !337 + %8118 = select i1 %6968, bfloat %8117, bfloat 0xR0000, !dbg !338 + %8119 = fptrunc float %8074 to bfloat, !dbg !337 + %8120 = select i1 %6969, bfloat %8119, bfloat 0xR0000, !dbg !338 + %8121 = fptrunc float %8075 to bfloat, !dbg !337 + %8122 = select i1 %6970, bfloat %8121, bfloat 0xR0000, !dbg !338 + %8123 = fptrunc float %8076 to bfloat, !dbg !337 + %8124 = select i1 %6971, bfloat %8123, bfloat 0xR0000, !dbg !338 + %8125 = fptrunc float %8077 to bfloat, !dbg !337 + %8126 = select i1 %6972, bfloat %8125, bfloat 0xR0000, !dbg !338 + %8127 = fptrunc float %8078 to bfloat, !dbg !337 + %8128 = select i1 %6973, bfloat %8127, bfloat 0xR0000, !dbg !338 + %8129 = fptrunc float %8079 to bfloat, !dbg !337 + %8130 = select i1 %6974, bfloat %8129, bfloat 0xR0000, !dbg !338 + %8131 = fptrunc float %8080 to bfloat, !dbg !337 + %8132 = select i1 %6975, bfloat %8131, bfloat 0xR0000, !dbg !338 + %8133 = fptrunc float %8081 to bfloat, !dbg !337 + %8134 = select i1 %6976, bfloat %8133, bfloat 0xR0000, !dbg !338 + %8135 = fptrunc float %8082 to bfloat, !dbg !337 + %8136 = select i1 %6977, bfloat %8135, bfloat 0xR0000, !dbg !338 + %8137 = fptrunc float %8083 to bfloat, !dbg !337 + %8138 = select i1 %6978, bfloat %8137, bfloat 0xR0000, !dbg !338 + %8139 = fptrunc float %8084 to bfloat, !dbg !337 + %8140 = select i1 %6979, bfloat %8139, bfloat 0xR0000, !dbg !338 + %8141 = fptrunc float %8085 to bfloat, !dbg 
!337 + %8142 = select i1 %6980, bfloat %8141, bfloat 0xR0000, !dbg !338 + %8143 = fptrunc float %8086 to bfloat, !dbg !337 + %8144 = select i1 %6981, bfloat %8143, bfloat 0xR0000, !dbg !338 + %8145 = fptrunc float %8087 to bfloat, !dbg !337 + %8146 = select i1 %6982, bfloat %8145, bfloat 0xR0000, !dbg !338 + %8147 = fptrunc float %8088 to bfloat, !dbg !337 + %8148 = select i1 %6983, bfloat %8147, bfloat 0xR0000, !dbg !338 + %8149 = fptrunc float %8089 to bfloat, !dbg !337 + %8150 = select i1 %6984, bfloat %8149, bfloat 0xR0000, !dbg !338 + %8151 = fptrunc float %8090 to bfloat, !dbg !337 + %8152 = select i1 %6985, bfloat %8151, bfloat 0xR0000, !dbg !338 + %8153 = fptrunc float %8091 to bfloat, !dbg !337 + %8154 = select i1 %6986, bfloat %8153, bfloat 0xR0000, !dbg !338 + %8155 = fptrunc float %8092 to bfloat, !dbg !337 + %8156 = select i1 %6987, bfloat %8155, bfloat 0xR0000, !dbg !338 + %8157 = bitcast <2 x bfloat> %8098 to i32, !dbg !325 + %8158 = bitcast <2 x bfloat> %8100 to i32, !dbg !325 + %8159 = insertelement <2 x bfloat> poison, bfloat %8102, i64 0, !dbg !325 + %8160 = insertelement <2 x bfloat> %8159, bfloat %8104, i64 1, !dbg !325 + %8161 = bitcast <2 x bfloat> %8160 to i32, !dbg !325 + %8162 = insertelement <2 x bfloat> poison, bfloat %8106, i64 0, !dbg !325 + %8163 = insertelement <2 x bfloat> %8162, bfloat %8108, i64 1, !dbg !325 + %8164 = bitcast <2 x bfloat> %8163 to i32, !dbg !325 + %8165 = insertelement <2 x bfloat> poison, bfloat %8110, i64 0, !dbg !325 + %8166 = insertelement <2 x bfloat> %8165, bfloat %8112, i64 1, !dbg !325 + %8167 = bitcast <2 x bfloat> %8166 to i32, !dbg !325 + %8168 = insertelement <2 x bfloat> poison, bfloat %8114, i64 0, !dbg !325 + %8169 = insertelement <2 x bfloat> %8168, bfloat %8116, i64 1, !dbg !325 + %8170 = bitcast <2 x bfloat> %8169 to i32, !dbg !325 + %8171 = insertelement <2 x bfloat> poison, bfloat %8118, i64 0, !dbg !325 + %8172 = insertelement <2 x bfloat> %8171, bfloat %8120, i64 1, !dbg !325 + %8173 = 
bitcast <2 x bfloat> %8172 to i32, !dbg !325 + %8174 = insertelement <2 x bfloat> poison, bfloat %8122, i64 0, !dbg !325 + %8175 = insertelement <2 x bfloat> %8174, bfloat %8124, i64 1, !dbg !325 + %8176 = bitcast <2 x bfloat> %8175 to i32, !dbg !325 + %8177 = insertelement <2 x bfloat> poison, bfloat %8126, i64 0, !dbg !325 + %8178 = insertelement <2 x bfloat> %8177, bfloat %8128, i64 1, !dbg !325 + %8179 = bitcast <2 x bfloat> %8178 to i32, !dbg !325 + %8180 = insertelement <2 x bfloat> poison, bfloat %8130, i64 0, !dbg !325 + %8181 = insertelement <2 x bfloat> %8180, bfloat %8132, i64 1, !dbg !325 + %8182 = bitcast <2 x bfloat> %8181 to i32, !dbg !325 + %8183 = insertelement <2 x bfloat> poison, bfloat %8134, i64 0, !dbg !325 + %8184 = insertelement <2 x bfloat> %8183, bfloat %8136, i64 1, !dbg !325 + %8185 = bitcast <2 x bfloat> %8184 to i32, !dbg !325 + %8186 = insertelement <2 x bfloat> poison, bfloat %8138, i64 0, !dbg !325 + %8187 = insertelement <2 x bfloat> %8186, bfloat %8140, i64 1, !dbg !325 + %8188 = bitcast <2 x bfloat> %8187 to i32, !dbg !325 + %8189 = insertelement <2 x bfloat> poison, bfloat %8142, i64 0, !dbg !325 + %8190 = insertelement <2 x bfloat> %8189, bfloat %8144, i64 1, !dbg !325 + %8191 = bitcast <2 x bfloat> %8190 to i32, !dbg !325 + %8192 = insertelement <2 x bfloat> poison, bfloat %8146, i64 0, !dbg !325 + %8193 = insertelement <2 x bfloat> %8192, bfloat %8148, i64 1, !dbg !325 + %8194 = bitcast <2 x bfloat> %8193 to i32, !dbg !325 + %8195 = insertelement <2 x bfloat> poison, bfloat %8150, i64 0, !dbg !325 + %8196 = insertelement <2 x bfloat> %8195, bfloat %8152, i64 1, !dbg !325 + %8197 = bitcast <2 x bfloat> %8196 to i32, !dbg !325 + %8198 = insertelement <2 x bfloat> poison, bfloat %8154, i64 0, !dbg !325 + %8199 = insertelement <2 x bfloat> %8198, bfloat %8156, i64 1, !dbg !325 + %8200 = bitcast <2 x bfloat> %8199 to i32, !dbg !325 + tail call void @llvm.nvvm.wgmma.fence.sync.aligned(), !dbg !325 + %8201 = tail call { float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %6087, float %6088, float %6089, float %6090, float %6091, float %6092, float %6093, float %6094, float %6095, float %6096, float %6097, float %6098, float %6099, float %6100, float %6101, float %6102, float %6103, float %6104, float %6105, float %6106, float %6107, float %6108, float %6109, float %6110, float %6111, float %6112, float %6113, float %6114, float %6115, float %6116, float %6117, float %6118, float %6119, float %6120, float %6121, float %6122, float %6123, float %6124, float %6125, float %6126, float %6127, float %6128, float %6129, float %6130, float %6131, float %6132, float %6133, float %6134, float %6135, float %6136, float %6137, float %6138, float %6139, float %6140, float %6141, float %6142, float %6143, float %6144, float %6145, float %6146, float %6147, float 
%6148, float %6149, float %6150, i32 %8157, i32 %8158, i32 %8161, i32 %8164, i64 %6260, i1 true) #3, !dbg !325 + %8202 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 0, !dbg !325 + %8203 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 1, !dbg !325 + %8204 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 2, !dbg !325 + %8205 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float } %8201, 3, !dbg !325 + %8206 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 4, !dbg !325 + %8207 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 5, !dbg !325 + %8208 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 6, !dbg !325 + %8209 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float } %8201, 7, !dbg !325 + %8210 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 8, !dbg !325 + %8211 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 9, !dbg !325 + %8212 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 10, !dbg !325 + %8213 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float } %8201, 11, !dbg !325 + %8214 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 12, !dbg !325 + %8215 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 13, !dbg !325 + %8216 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 14, !dbg !325 + %8217 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float } %8201, 15, !dbg !325 + %8218 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 16, !dbg !325 + %8219 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 17, !dbg !325 + %8220 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 18, !dbg !325 + %8221 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } 
%8201, 19, !dbg !325 + %8222 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 20, !dbg !325 + %8223 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 21, !dbg !325 + %8224 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 22, !dbg !325 + %8225 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 23, !dbg 
!325 + %8226 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 24, !dbg !325 + %8227 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 25, !dbg !325 + %8228 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 26, !dbg !325 + %8229 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 27, !dbg !325 + %8230 = 
extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 28, !dbg !325 + %8231 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 29, !dbg !325 + %8232 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 30, !dbg !325 + %8233 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 31, !dbg !325 + %8234 = extractvalue { 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 32, !dbg !325 + %8235 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 33, !dbg !325 + %8236 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 34, !dbg !325 + %8237 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 35, !dbg !325 + %8238 = extractvalue { float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 36, !dbg !325 + %8239 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 37, !dbg !325 + %8240 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 38, !dbg !325 + %8241 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 39, !dbg !325 + %8242 = extractvalue { float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 40, !dbg !325 + %8243 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 41, !dbg !325 + %8244 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 42, !dbg !325 + %8245 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 43, !dbg !325 + %8246 = extractvalue { float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 44, !dbg !325 + %8247 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 45, !dbg !325 + %8248 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 46, !dbg !325 + %8249 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 47, !dbg !325 + %8250 = extractvalue { float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 48, !dbg !325 + %8251 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 49, !dbg !325 + %8252 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 50, !dbg !325 + %8253 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 51, !dbg !325 + %8254 = extractvalue { float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 52, !dbg !325 + %8255 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 53, !dbg !325 + %8256 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 54, !dbg !325 + %8257 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 55, !dbg !325 + %8258 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 56, !dbg !325 + %8259 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 57, !dbg !325 + %8260 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 58, !dbg !325 + %8261 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 59, !dbg !325 + %8262 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 60, !dbg !325 + %8263 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 61, !dbg !325 + %8264 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 62, !dbg !325 + %8265 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 63, !dbg !325 + %8266 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %8202, float %8203, float %8204, float %8205, float %8206, float %8207, float %8208, float %8209, float %8210, float %8211, float %8212, float %8213, float %8214, float %8215, float %8216, float %8217, float %8218, float %8219, float %8220, float %8221, float %8222, float %8223, float %8224, float %8225, float %8226, float %8227, float %8228, float %8229, float %8230, float %8231, float %8232, float %8233, float %8234, float %8235, float %8236, float %8237, float %8238, float %8239, float %8240, float %8241, float %8242, float %8243, float %8244, float %8245, float %8246, float %8247, float %8248, float %8249, float %8250, float %8251, float %8252, float %8253, float %8254, float %8255, float %8256, float %8257, float %8258, float %8259, float %8260, float %8261, float %8262, float %8263, float %8264, float %8265, i32 %8167, i32 %8170, i32 %8173, i32 %8176, i64 %7702, i1 true) #3, !dbg !325 + 
%8267 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 0, !dbg !325 + %8268 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 1, !dbg !325 + %8269 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 2, !dbg !325 + %8270 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 3, !dbg !325 + %8271 = extractvalue { 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 4, !dbg !325 + %8272 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 5, !dbg !325 + %8273 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 6, !dbg !325 + %8274 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 7, !dbg !325 + %8275 = extractvalue { float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 8, !dbg !325 + %8276 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 9, !dbg !325 + %8277 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 10, !dbg !325 + %8278 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 11, !dbg !325 + %8279 = extractvalue { float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 12, !dbg !325 + %8280 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 13, !dbg !325 + %8281 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 14, !dbg !325 + %8282 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 15, !dbg !325 + %8283 = extractvalue { float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 16, !dbg !325 + %8284 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 17, !dbg !325 + %8285 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 18, !dbg !325 + %8286 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 19, !dbg !325 + %8287 = extractvalue { float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 20, !dbg !325 + %8288 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 21, !dbg !325 + %8289 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 22, !dbg !325 + %8290 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 23, !dbg !325 + %8291 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 24, !dbg !325 + %8292 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 25, !dbg !325 + %8293 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 26, !dbg !325 + %8294 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 27, !dbg !325 + %8295 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 28, !dbg !325 + %8296 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 29, !dbg !325 + %8297 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 30, !dbg !325 + %8298 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 31, !dbg !325 + %8299 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 32, !dbg !325 + %8300 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 33, !dbg !325 + %8301 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 34, !dbg !325 + %8302 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 35, !dbg !325 + %8303 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 36, !dbg !325 + %8304 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 37, !dbg !325 + %8305 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 38, !dbg !325 + %8306 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 39, !dbg !325 + %8307 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 40, !dbg !325 + %8308 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 41, !dbg !325 + %8309 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 42, !dbg !325 + %8310 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 43, !dbg !325 + %8311 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 44, !dbg !325 + %8312 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 45, !dbg !325 + %8313 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 46, !dbg !325 + %8314 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 47, !dbg !325 + %8315 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 48, !dbg !325 + %8316 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 49, !dbg !325 + %8317 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 50, !dbg !325 + %8318 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 51, !dbg !325 + %8319 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 52, !dbg !325 + %8320 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 53, !dbg !325 + %8321 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 54, !dbg !325 + %8322 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 55, !dbg !325 + %8323 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 56, !dbg !325 + %8324 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 57, !dbg !325 + %8325 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 58, !dbg !325 + %8326 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 59, !dbg !325 + %8327 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 60, !dbg !325 + %8328 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 61, !dbg !325 + %8329 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 62, !dbg !325 + %8330 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 63, !dbg !325 + %8331 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %8267, float %8268, float %8269, float %8270, float %8271, float %8272, float %8273, float %8274, float %8275, float %8276, float %8277, float %8278, float %8279, float %8280, float %8281, float %8282, float %8283, float %8284, float %8285, float %8286, float %8287, float %8288, float %8289, float %8290, float %8291, float %8292, float %8293, float %8294, float %8295, float %8296, float %8297, float %8298, float %8299, float %8300, float %8301, float %8302, float %8303, float %8304, float %8305, float %8306, float %8307, float %8308, float %8309, float %8310, float %8311, float %8312, float %8313, float %8314, float %8315, float %8316, float %8317, float %8318, float %8319, float %8320, float %8321, float %8322, float %8323, float %8324, float %8325, float %8326, float %8327, float %8328, float %8329, float %8330, i32 %8179, i32 %8182, i32 %8185, i32 %8188, i64 %7707, i1 true) #3, !dbg !325 + %8332 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 0, !dbg !325 + %8333 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 1, !dbg !325 + %8334 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 2, !dbg !325 + %8335 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 3, !dbg !325 + %8336 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 4, !dbg !325 + %8337 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 5, !dbg !325 + %8338 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 6, !dbg !325 + %8339 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 7, !dbg !325 + %8340 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 8, !dbg !325 + %8341 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 9, !dbg !325 + %8342 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 10, !dbg !325 + %8343 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 11, !dbg !325 + %8344 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 12, !dbg !325 + %8345 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 13, !dbg !325 + %8346 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 14, !dbg !325 + %8347 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 15, !dbg !325 + %8348 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 16, !dbg !325 + %8349 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 17, !dbg !325 + %8350 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 18, !dbg !325 + %8351 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 19, !dbg !325 + %8352 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 20, !dbg !325 + %8353 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 21, !dbg !325 + %8354 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 22, !dbg !325 + %8355 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 23, !dbg !325 + %8356 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 24, !dbg !325 + %8357 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 25, !dbg !325 + %8358 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 26, !dbg !325 + %8359 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 27, !dbg !325 + %8360 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 28, !dbg !325 + %8361 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 29, !dbg !325 + %8362 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 30, !dbg !325 + %8363 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 31, !dbg !325 + %8364 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 32, !dbg !325 + %8365 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 33, !dbg !325 + %8366 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 34, !dbg !325 + %8367 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 35, !dbg !325 + %8368 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 36, !dbg !325 + %8369 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 37, !dbg !325 + %8370 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 38, !dbg !325 + %8371 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 39, !dbg !325 + %8372 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 40, !dbg !325 + %8373 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 41, !dbg !325 + %8374 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 42, !dbg !325 + %8375 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 43, !dbg !325 + %8376 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 44, !dbg !325 + %8377 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 45, !dbg !325 + %8378 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 46, !dbg !325 + %8379 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 47, !dbg !325 + %8380 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 48, !dbg !325 + %8381 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 49, !dbg !325 + %8382 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 50, !dbg !325 + %8383 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 51, !dbg !325 + %8384 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 52, !dbg !325 + %8385 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 53, !dbg !325 + %8386 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 54, !dbg !325 + %8387 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 55, !dbg !325 + %8388 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 56, !dbg !325 + %8389 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 57, !dbg !325 + %8390 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 58, !dbg !325 + %8391 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 59, !dbg !325 + %8392 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 60, !dbg !325 + %8393 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 61, !dbg !325 + %8394 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 62, !dbg !325 + %8395 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 63, !dbg !325 + %8396 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %8332, float %8333, float %8334, float %8335, float %8336, float %8337, float %8338, float %8339, float %8340, float %8341, float %8342, float %8343, float %8344, float %8345, float %8346, float %8347, float %8348, float %8349, float %8350, float %8351, float %8352, float %8353, float %8354, float %8355, float %8356, float %8357, float %8358, float %8359, float %8360, float %8361, float %8362, float %8363, float %8364, float %8365, float %8366, float %8367, float %8368, float %8369, float %8370, float %8371, float %8372, float %8373, float %8374, float %8375, float %8376, float %8377, float %8378, float %8379, float %8380, float %8381, float %8382, float %8383, float %8384, float %8385, float %8386, float %8387, float %8388, float %8389, float %8390, float %8391, float %8392, float %8393, float %8394, float %8395, i32 %8191, i32 %8194, i32 %8197, i32 %8200, i64 %7712, i1 true) #3, !dbg !325 + %8397 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 0, !dbg !325 + %8398 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 1, !dbg !325 + %8399 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 2, !dbg !325 + %8400 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 3, !dbg !325 + %8401 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 4, !dbg !325 + %8402 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 5, !dbg !325 + %8403 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 6, !dbg !325 + %8404 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 7, !dbg !325 + %8405 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 8, !dbg !325 + %8406 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 9, !dbg !325 + %8407 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 10, !dbg !325 + %8408 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 11, !dbg !325 + %8409 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 12, !dbg !325 + %8410 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 13, !dbg !325 + %8411 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 14, !dbg !325 + %8412 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 15, !dbg !325 + %8413 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 16, !dbg !325 + %8414 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 17, !dbg !325 + %8415 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 18, !dbg !325 + %8416 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 19, !dbg !325 + %8417 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 20, !dbg !325 + %8418 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 21, !dbg !325 + %8419 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 22, !dbg !325 + %8420 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 23, !dbg !325 + %8421 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 24, !dbg !325 + %8422 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 25, !dbg !325 + %8423 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 26, !dbg !325 + %8424 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 27, !dbg !325 + %8425 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 28, !dbg !325 + %8426 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 29, !dbg !325 + %8427 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 30, !dbg !325 + %8428 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 31, !dbg !325 + %8429 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 32, !dbg !325 + %8430 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 33, !dbg !325 + %8431 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 34, !dbg !325 + %8432 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 35, !dbg !325 + %8433 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 36, !dbg !325 + %8434 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 37, !dbg !325 + %8435 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 38, !dbg !325 + %8436 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 39, !dbg !325 + %8437 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float } %8396, 40, !dbg !325 + %8438 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 41, !dbg !325 + %8439 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 42, !dbg !325 + %8440 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 43, !dbg !325 + %8441 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float } %8396, 44, !dbg !325 + %8442 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 45, !dbg !325 + %8443 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 46, !dbg !325 + %8444 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 47, !dbg !325 + %8445 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float } %8396, 48, !dbg !325 + %8446 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 49, !dbg !325 + %8447 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 50, !dbg !325 + %8448 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 51, !dbg !325 + %8449 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float } %8396, 52, !dbg !325 + %8450 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 53, !dbg !325 + %8451 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 54, !dbg !325 + %8452 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 55, !dbg !325 + %8453 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float } %8396, 56, !dbg !325 + %8454 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 57, !dbg !325 + %8455 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 58, !dbg !325 + %8456 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 59, !dbg !325 + %8457 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float } %8396, 60, !dbg !325 + %8458 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 61, !dbg !325 + %8459 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 62, !dbg !325 + %8460 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 63, !dbg !325 + tail call void @llvm.nvvm.wgmma.commit_group.sync.aligned(), !dbg !325 + %8461 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #3, !dbg !339 + %8462 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %7727, i64 %8461, i1 %6158) #3, !dbg !339 + %8463 = tail call i64 asm sideeffect "mov.u64 $0, 
0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #3, !dbg !340 + %8464 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %7730, i64 %8463, i1 %7731) #3, !dbg !340 + %8465 = sub i32 %8464, %8462, !dbg !341 + %8466 = shl i32 %8465, 7, !dbg !342 + %8467 = add i32 %8466, -64, !dbg !343 + %8468 = mul nuw nsw i32 %8467, %7733, !dbg !333 + %8469 = add i32 %8468, %7734, !dbg !344 + %8470 = shl i32 %8469, 12, !dbg !345 + %8471 = sext i32 %8470 to i64, !dbg !299 + %8472 = getelementptr bfloat, ptr addrspace(1) %.pn1231595, i64 %8471, !dbg !299 + %8473 = getelementptr bfloat, ptr addrspace(1) %.pn1071596, i64 %8471, !dbg !299 + %8474 = getelementptr bfloat, ptr addrspace(1) %.pn911597, i64 %8471, !dbg !299 + %8475 = getelementptr bfloat, ptr addrspace(1) %.pn751598, i64 %8471, !dbg !299 + %8476 = shl i32 %8469, 7, !dbg !346 + %8477 = sext i32 %8476 to i64, !dbg !300 + %8478 = getelementptr bfloat, ptr addrspace(1) %.pn1871599, i64 %8477, !dbg !300 + %8479 = getelementptr bfloat, ptr addrspace(1) %.pn1711600, i64 %8477, !dbg !300 + %8480 = getelementptr bfloat, ptr addrspace(1) %.pn1551601, i64 %8477, !dbg !300 + %8481 = getelementptr bfloat, ptr addrspace(1) %.pn1391602, i64 %8477, !dbg !300 + %8482 = insertelement <2 x i32> poison, i32 %8469, i64 0, !dbg !326 + %8483 = shufflevector <2 x i32> %8482, <2 x i32> poison, <2 x i32> zeroinitializer, !dbg !326 + %8484 = add <2 x i32> %8483, %6154, !dbg !326 + %8485 = add i32 %8469, %.pn2151605, !dbg !326 + %8486 = add i32 %8469, %.pn2131606, !dbg !326 + %8487 = add i32 %8469, %.pn2111607, !dbg !326 + %8488 = add i32 %8469, %.pn2091608, !dbg !326 + %8489 = add i32 %8469, %.pn2071609, !dbg !326 + %8490 = add i32 %8469, %.pn2051610, !dbg !326 + %8491 = add i32 %8469, %.pn2031611, !dbg !326 + %8492 = add i32 %8469, %.pn2011612, !dbg !326 + %8493 = add i32 %8469, %.pn1991613, !dbg !326 + %8494 = add i32 %8469, 
%.pn1971614, !dbg !326 + %8495 = add i32 %8469, %.pn1951615, !dbg !326 + %8496 = add i32 %8469, %.pn1931616, !dbg !326 + %8497 = add i32 %8469, %.pn1911617, !dbg !326 + %8498 = add i32 %8469, %.pn1891618, !dbg !326 + %8499 = add i32 %8469, %6019, !dbg !326 + %8500 = add i32 %8469, %6020, !dbg !326 + %8501 = add i32 %8469, %6021, !dbg !326 + %8502 = add i32 %8469, %6022, !dbg !326 + %8503 = add i32 %8469, %6015, !dbg !326 + %8504 = add i32 %8469, %6016, !dbg !326 + %8505 = add i32 %8469, %6017, !dbg !326 + %8506 = add i32 %8469, %6018, !dbg !326 + %8507 = add i32 %6012, 1, !dbg !255 + %8508 = icmp sgt i32 %8507, 1, !dbg !255 + %8509 = select i1 %8508, i32 0, i32 %8507, !dbg !255 + %8510 = add i32 %6014, 1, !dbg !255 + %8511 = icmp sgt i32 %8510, 2, !dbg !255 + %8512 = select i1 %8511, i32 0, i32 %8510, !dbg !255 + %8513 = icmp slt i32 %8499, %18, !dbg !301 + %8514 = icmp slt i32 %8500, %18, !dbg !301 + %8515 = icmp slt i32 %8501, %18, !dbg !301 + %8516 = icmp slt i32 %8502, %18, !dbg !301 + %8517 = shl i32 %8512, 13, !dbg !293 + %8518 = getelementptr bfloat, ptr addrspace(3) @global_smem, i32 %8517, !dbg !293 + %8519 = and i1 %6157, %8513, !dbg !255 + %8520 = and i1 %6157, %8514, !dbg !255 + %8521 = and i1 %6157, %8515, !dbg !255 + %8522 = and i1 %6157, %8516, !dbg !255 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !293 + %8523 = getelementptr inbounds nuw i8, ptr addrspace(3) %8518, i32 %5264, !dbg !293 + %8524 = select i1 %8519, i32 16, i32 0, !dbg !293 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %8523, ptr addrspace(1) %8472, i32 %8524) #3, !dbg !293 + %8525 = getelementptr inbounds nuw i8, ptr addrspace(3) %8518, i32 %5267, !dbg !293 + %8526 = select i1 %8520, i32 16, i32 0, !dbg !293 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %8525, ptr addrspace(1) %8473, i32 %8526) #3, !dbg 
!293 + %8527 = getelementptr inbounds nuw i8, ptr addrspace(3) %8518, i32 %5270, !dbg !293 + %8528 = select i1 %8521, i32 16, i32 0, !dbg !293 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %8527, ptr addrspace(1) %8474, i32 %8528) #3, !dbg !293 + %8529 = getelementptr inbounds nuw i8, ptr addrspace(3) %8518, i32 %5273, !dbg !293 + %8530 = select i1 %8522, i32 16, i32 0, !dbg !293 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %8529, ptr addrspace(1) %8475, i32 %8530) #3, !dbg !293 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !293 + %8531 = extractelement <2 x i32> %8484, i64 0, !dbg !294 + %8532 = icmp slt i32 %8531, %18, !dbg !347 + %8533 = extractelement <2 x i32> %8484, i64 1, !dbg !294 + %8534 = icmp slt i32 %8533, %18, !dbg !347 + %8535 = icmp slt i32 %8485, %18, !dbg !347 + %8536 = icmp slt i32 %8486, %18, !dbg !347 + %8537 = icmp slt i32 %8487, %18, !dbg !347 + %8538 = icmp slt i32 %8488, %18, !dbg !347 + %8539 = icmp slt i32 %8489, %18, !dbg !347 + %8540 = icmp slt i32 %8490, %18, !dbg !347 + %8541 = icmp slt i32 %8491, %18, !dbg !347 + %8542 = icmp slt i32 %8492, %18, !dbg !347 + %8543 = icmp slt i32 %8493, %18, !dbg !347 + %8544 = icmp slt i32 %8494, %18, !dbg !347 + %8545 = icmp slt i32 %8495, %18, !dbg !347 + %8546 = icmp slt i32 %8496, %18, !dbg !347 + %8547 = icmp slt i32 %8497, %18, !dbg !347 + %8548 = icmp slt i32 %8498, %18, !dbg !347 + %8549 = sext i32 %8531 to i64, !dbg !294 + %8550 = getelementptr float, ptr addrspace(1) %5904, i64 %8549, !dbg !294 + %8551 = sext i32 %8533 to i64, !dbg !294 + %8552 = getelementptr float, ptr addrspace(1) %5904, i64 %8551, !dbg !294 + %8553 = sext i32 %8485 to i64, !dbg !294 + %8554 = getelementptr float, ptr addrspace(1) %5904, i64 %8553, !dbg !294 + %8555 = sext i32 %8486 to i64, !dbg !294 + %8556 = getelementptr float, ptr addrspace(1) 
%5904, i64 %8555, !dbg !294 + %8557 = sext i32 %8487 to i64, !dbg !294 + %8558 = getelementptr float, ptr addrspace(1) %5904, i64 %8557, !dbg !294 + %8559 = sext i32 %8488 to i64, !dbg !294 + %8560 = getelementptr float, ptr addrspace(1) %5904, i64 %8559, !dbg !294 + %8561 = sext i32 %8489 to i64, !dbg !294 + %8562 = getelementptr float, ptr addrspace(1) %5904, i64 %8561, !dbg !294 + %8563 = sext i32 %8490 to i64, !dbg !294 + %8564 = getelementptr float, ptr addrspace(1) %5904, i64 %8563, !dbg !294 + %8565 = sext i32 %8491 to i64, !dbg !294 + %8566 = getelementptr float, ptr addrspace(1) %5904, i64 %8565, !dbg !294 + %8567 = sext i32 %8492 to i64, !dbg !294 + %8568 = getelementptr float, ptr addrspace(1) %5904, i64 %8567, !dbg !294 + %8569 = sext i32 %8493 to i64, !dbg !294 + %8570 = getelementptr float, ptr addrspace(1) %5904, i64 %8569, !dbg !294 + %8571 = sext i32 %8494 to i64, !dbg !294 + %8572 = getelementptr float, ptr addrspace(1) %5904, i64 %8571, !dbg !294 + %8573 = sext i32 %8495 to i64, !dbg !294 + %8574 = getelementptr float, ptr addrspace(1) %5904, i64 %8573, !dbg !294 + %8575 = sext i32 %8496 to i64, !dbg !294 + %8576 = getelementptr float, ptr addrspace(1) %5904, i64 %8575, !dbg !294 + %8577 = sext i32 %8497 to i64, !dbg !294 + %8578 = getelementptr float, ptr addrspace(1) %5904, i64 %8577, !dbg !294 + %8579 = sext i32 %8498 to i64, !dbg !294 + %8580 = getelementptr float, ptr addrspace(1) %5904, i64 %8579, !dbg !294 + %8581 = shl i32 %8509, 6, !dbg !295 + %8582 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %8581, !dbg !295 + %8583 = and i1 %6157, %8532, !dbg !255 + %8584 = and i1 %6157, %8534, !dbg !255 + %8585 = and i1 %6157, %8535, !dbg !255 + %8586 = and i1 %6157, %8536, !dbg !255 + %8587 = and i1 %6157, %8537, !dbg !255 + %8588 = and i1 %6157, %8538, !dbg !255 + %8589 = and i1 %6157, %8539, !dbg !255 + %8590 = and i1 %6157, %8540, !dbg !255 + %8591 = and i1 %6157, %8541, !dbg !255 + 
%8592 = and i1 %6157, %8542, !dbg !255 + %8593 = and i1 %6157, %8543, !dbg !255 + %8594 = and i1 %6157, %8544, !dbg !255 + %8595 = and i1 %6157, %8545, !dbg !255 + %8596 = and i1 %6157, %8546, !dbg !255 + %8597 = and i1 %6157, %8547, !dbg !255 + %8598 = and i1 %6157, %8548, !dbg !255 + %8599 = getelementptr inbounds nuw i8, ptr addrspace(3) %8582, i32 %5332, !dbg !295 + %8600 = select i1 %8583, i32 4, i32 0, !dbg !295 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) %8599, ptr addrspace(1) %8550, i32 %8600, i1 %5331) #3, !dbg !295 + %8601 = getelementptr inbounds nuw i8, ptr addrspace(3) %8582, i32 %5334, !dbg !295 + %8602 = select i1 %8584, i32 4, i32 0, !dbg !295 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %8601, ptr addrspace(1) %8552, i32 %8602, i1 %5331) #3, !dbg !295 + %8603 = getelementptr inbounds nuw i8, ptr addrspace(3) %8582, i32 %5336, !dbg !295 + %8604 = select i1 %8585, i32 4, i32 0, !dbg !295 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %8603, ptr addrspace(1) %8554, i32 %8604, i1 %5331) #3, !dbg !295 + %8605 = getelementptr inbounds nuw i8, ptr addrspace(3) %8582, i32 %5339, !dbg !295 + %8606 = select i1 %8586, i32 4, i32 0, !dbg !295 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %8605, ptr addrspace(1) %8556, i32 %8606, i1 %5331) #3, !dbg !295 + %8607 = getelementptr inbounds nuw i8, ptr addrspace(3) %8582, i32 %5342, !dbg !295 + %8608 = select i1 %8587, i32 4, i32 0, !dbg !295 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %8607, ptr addrspace(1) %8558, i32 %8608, i1 %5331) #3, !dbg !295 + %8609 = getelementptr inbounds nuw i8, 
ptr addrspace(3) %8582, i32 %5345, !dbg !295 + %8610 = select i1 %8588, i32 4, i32 0, !dbg !295 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %8609, ptr addrspace(1) %8560, i32 %8610, i1 %5331) #3, !dbg !295 + %8611 = getelementptr inbounds nuw i8, ptr addrspace(3) %8582, i32 %5348, !dbg !295 + %8612 = select i1 %8589, i32 4, i32 0, !dbg !295 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %8611, ptr addrspace(1) %8562, i32 %8612, i1 %5331) #3, !dbg !295 + %8613 = getelementptr inbounds nuw i8, ptr addrspace(3) %8582, i32 %5351, !dbg !295 + %8614 = select i1 %8590, i32 4, i32 0, !dbg !295 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %8613, ptr addrspace(1) %8564, i32 %8614, i1 %5331) #3, !dbg !295 + %8615 = getelementptr inbounds nuw i8, ptr addrspace(3) %8582, i32 %5354, !dbg !295 + %8616 = select i1 %8591, i32 4, i32 0, !dbg !295 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %8615, ptr addrspace(1) %8566, i32 %8616, i1 %5331) #3, !dbg !295 + %8617 = getelementptr inbounds nuw i8, ptr addrspace(3) %8582, i32 %5357, !dbg !295 + %8618 = select i1 %8592, i32 4, i32 0, !dbg !295 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %8617, ptr addrspace(1) %8568, i32 %8618, i1 %5331) #3, !dbg !295 + %8619 = getelementptr inbounds nuw i8, ptr addrspace(3) %8582, i32 %5360, !dbg !295 + %8620 = select i1 %8593, i32 4, i32 0, !dbg !295 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %8619, ptr addrspace(1) %8570, i32 %8620, i1 %5331) #3, !dbg !295 + %8621 = getelementptr 
inbounds nuw i8, ptr addrspace(3) %8582, i32 %5363, !dbg !295 + %8622 = select i1 %8594, i32 4, i32 0, !dbg !295 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %8621, ptr addrspace(1) %8572, i32 %8622, i1 %5331) #3, !dbg !295 + %8623 = getelementptr inbounds nuw i8, ptr addrspace(3) %8582, i32 %5366, !dbg !295 + %8624 = select i1 %8595, i32 4, i32 0, !dbg !295 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %8623, ptr addrspace(1) %8574, i32 %8624, i1 %5331) #3, !dbg !295 + %8625 = getelementptr inbounds nuw i8, ptr addrspace(3) %8582, i32 %5369, !dbg !295 + %8626 = select i1 %8596, i32 4, i32 0, !dbg !295 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %8625, ptr addrspace(1) %8576, i32 %8626, i1 %5331) #3, !dbg !295 + %8627 = getelementptr inbounds nuw i8, ptr addrspace(3) %8582, i32 %5372, !dbg !295 + %8628 = select i1 %8597, i32 4, i32 0, !dbg !295 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %8627, ptr addrspace(1) %8578, i32 %8628, i1 %5331) #3, !dbg !295 + %8629 = getelementptr inbounds nuw i8, ptr addrspace(3) %8582, i32 %5375, !dbg !295 + %8630 = select i1 %8598, i32 4, i32 0, !dbg !295 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %8629, ptr addrspace(1) %8580, i32 %8630, i1 %5331) #3, !dbg !295 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !295 + %8631 = icmp slt i32 %8503, %18, !dbg !348 + %8632 = icmp slt i32 %8504, %18, !dbg !348 + %8633 = icmp slt i32 %8505, %18, !dbg !348 + %8634 = icmp slt i32 %8506, %18, !dbg !348 + %8635 = getelementptr bfloat, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, 
i32 49152), i32 %8517, !dbg !296 + %8636 = and i1 %6157, %8631, !dbg !255 + %8637 = and i1 %6157, %8632, !dbg !255 + %8638 = and i1 %6157, %8633, !dbg !255 + %8639 = and i1 %6157, %8634, !dbg !255 + %8640 = getelementptr inbounds nuw i8, ptr addrspace(3) %8635, i32 %5264, !dbg !296 + %8641 = select i1 %8636, i32 16, i32 0, !dbg !296 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %8640, ptr addrspace(1) %8478, i32 %8641) #3, !dbg !296 + %8642 = getelementptr inbounds nuw i8, ptr addrspace(3) %8635, i32 %5267, !dbg !296 + %8643 = select i1 %8637, i32 16, i32 0, !dbg !296 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %8642, ptr addrspace(1) %8479, i32 %8643) #3, !dbg !296 + %8644 = getelementptr inbounds nuw i8, ptr addrspace(3) %8635, i32 %5270, !dbg !296 + %8645 = select i1 %8638, i32 16, i32 0, !dbg !296 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %8644, ptr addrspace(1) %8480, i32 %8645) #3, !dbg !296 + %8646 = getelementptr inbounds nuw i8, ptr addrspace(3) %8635, i32 %5273, !dbg !296 + %8647 = select i1 %8639, i32 16, i32 0, !dbg !296 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %8646, ptr addrspace(1) %8481, i32 %8647) #3, !dbg !296 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !296 + %8648 = getelementptr float, ptr addrspace(1) %5905, i64 %8549, !dbg !297 + %8649 = getelementptr float, ptr addrspace(1) %5905, i64 %8551, !dbg !297 + %8650 = getelementptr float, ptr addrspace(1) %5905, i64 %8553, !dbg !297 + %8651 = getelementptr float, ptr addrspace(1) %5905, i64 %8555, !dbg !297 + %8652 = getelementptr float, ptr addrspace(1) %5905, i64 %8557, !dbg !297 + %8653 = getelementptr float, ptr addrspace(1) %5905, i64 %8559, !dbg !297 + 
%8654 = getelementptr float, ptr addrspace(1) %5905, i64 %8561, !dbg !297 + %8655 = getelementptr float, ptr addrspace(1) %5905, i64 %8563, !dbg !297 + %8656 = getelementptr float, ptr addrspace(1) %5905, i64 %8565, !dbg !297 + %8657 = getelementptr float, ptr addrspace(1) %5905, i64 %8567, !dbg !297 + %8658 = getelementptr float, ptr addrspace(1) %5905, i64 %8569, !dbg !297 + %8659 = getelementptr float, ptr addrspace(1) %5905, i64 %8571, !dbg !297 + %8660 = getelementptr float, ptr addrspace(1) %5905, i64 %8573, !dbg !297 + %8661 = getelementptr float, ptr addrspace(1) %5905, i64 %8575, !dbg !297 + %8662 = getelementptr float, ptr addrspace(1) %5905, i64 %8577, !dbg !297 + %8663 = getelementptr float, ptr addrspace(1) %5905, i64 %8579, !dbg !297 + %8664 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98816), i32 %8581, !dbg !298 + %8665 = getelementptr inbounds nuw i8, ptr addrspace(3) %8664, i32 %5332, !dbg !298 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) %8665, ptr addrspace(1) %8648, i32 %8600, i1 %5331) #3, !dbg !298 + %8666 = getelementptr inbounds nuw i8, ptr addrspace(3) %8664, i32 %5334, !dbg !298 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %8666, ptr addrspace(1) %8649, i32 %8602, i1 %5331) #3, !dbg !298 + %8667 = getelementptr inbounds nuw i8, ptr addrspace(3) %8664, i32 %5336, !dbg !298 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %8667, ptr addrspace(1) %8650, i32 %8604, i1 %5331) #3, !dbg !298 + %8668 = getelementptr inbounds nuw i8, ptr addrspace(3) %8664, i32 %5339, !dbg !298 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %8668, ptr addrspace(1) %8651, i32 %8606, i1 
%5331) #3, !dbg !298 + %8669 = getelementptr inbounds nuw i8, ptr addrspace(3) %8664, i32 %5342, !dbg !298 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %8669, ptr addrspace(1) %8652, i32 %8608, i1 %5331) #3, !dbg !298 + %8670 = getelementptr inbounds nuw i8, ptr addrspace(3) %8664, i32 %5345, !dbg !298 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %8670, ptr addrspace(1) %8653, i32 %8610, i1 %5331) #3, !dbg !298 + %8671 = getelementptr inbounds nuw i8, ptr addrspace(3) %8664, i32 %5348, !dbg !298 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %8671, ptr addrspace(1) %8654, i32 %8612, i1 %5331) #3, !dbg !298 + %8672 = getelementptr inbounds nuw i8, ptr addrspace(3) %8664, i32 %5351, !dbg !298 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %8672, ptr addrspace(1) %8655, i32 %8614, i1 %5331) #3, !dbg !298 + %8673 = getelementptr inbounds nuw i8, ptr addrspace(3) %8664, i32 %5354, !dbg !298 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %8673, ptr addrspace(1) %8656, i32 %8616, i1 %5331) #3, !dbg !298 + %8674 = getelementptr inbounds nuw i8, ptr addrspace(3) %8664, i32 %5357, !dbg !298 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %8674, ptr addrspace(1) %8657, i32 %8618, i1 %5331) #3, !dbg !298 + %8675 = getelementptr inbounds nuw i8, ptr addrspace(3) %8664, i32 %5360, !dbg !298 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %8675, ptr addrspace(1) %8658, i32 %8620, i1 
%5331) #3, !dbg !298 + %8676 = getelementptr inbounds nuw i8, ptr addrspace(3) %8664, i32 %5363, !dbg !298 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %8676, ptr addrspace(1) %8659, i32 %8622, i1 %5331) #3, !dbg !298 + %8677 = getelementptr inbounds nuw i8, ptr addrspace(3) %8664, i32 %5366, !dbg !298 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %8677, ptr addrspace(1) %8660, i32 %8624, i1 %5331) #3, !dbg !298 + %8678 = getelementptr inbounds nuw i8, ptr addrspace(3) %8664, i32 %5369, !dbg !298 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %8678, ptr addrspace(1) %8661, i32 %8626, i1 %5331) #3, !dbg !298 + %8679 = getelementptr inbounds nuw i8, ptr addrspace(3) %8664, i32 %5372, !dbg !298 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %8679, ptr addrspace(1) %8662, i32 %8628, i1 %5331) #3, !dbg !298 + %8680 = getelementptr inbounds nuw i8, ptr addrspace(3) %8664, i32 %5375, !dbg !298 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %8680, ptr addrspace(1) %8663, i32 %8630, i1 %5331) #3, !dbg !298 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !298 + %exitcond2186.not = icmp eq i32 %7724, %smax2185, !dbg !255 + br i1 %exitcond2186.not, label %._crit_edge1621, label %.lr.ph1620, !dbg !255 + +._crit_edge1621: ; preds = %__nv_exp2f.exit1335, %5761 + %8681 = phi float [ %5762, %5761 ], [ %8397, %__nv_exp2f.exit1335 ] + %8682 = phi float [ %5763, %5761 ], [ %8398, %__nv_exp2f.exit1335 ] + %8683 = phi float [ %5764, %5761 ], [ %8399, %__nv_exp2f.exit1335 ] + %8684 = phi float [ %5765, %5761 ], [ %8400, %__nv_exp2f.exit1335 ] + %8685 = 
phi float [ %5766, %5761 ], [ %8401, %__nv_exp2f.exit1335 ] + %8686 = phi float [ %5767, %5761 ], [ %8402, %__nv_exp2f.exit1335 ] + %8687 = phi float [ %5768, %5761 ], [ %8403, %__nv_exp2f.exit1335 ] + %8688 = phi float [ %5769, %5761 ], [ %8404, %__nv_exp2f.exit1335 ] + %8689 = phi float [ %5770, %5761 ], [ %8405, %__nv_exp2f.exit1335 ] + %8690 = phi float [ %5771, %5761 ], [ %8406, %__nv_exp2f.exit1335 ] + %8691 = phi float [ %5772, %5761 ], [ %8407, %__nv_exp2f.exit1335 ] + %8692 = phi float [ %5773, %5761 ], [ %8408, %__nv_exp2f.exit1335 ] + %8693 = phi float [ %5774, %5761 ], [ %8409, %__nv_exp2f.exit1335 ] + %8694 = phi float [ %5775, %5761 ], [ %8410, %__nv_exp2f.exit1335 ] + %8695 = phi float [ %5776, %5761 ], [ %8411, %__nv_exp2f.exit1335 ] + %8696 = phi float [ %5777, %5761 ], [ %8412, %__nv_exp2f.exit1335 ] + %8697 = phi float [ %5778, %5761 ], [ %8413, %__nv_exp2f.exit1335 ] + %8698 = phi float [ %5779, %5761 ], [ %8414, %__nv_exp2f.exit1335 ] + %8699 = phi float [ %5780, %5761 ], [ %8415, %__nv_exp2f.exit1335 ] + %8700 = phi float [ %5781, %5761 ], [ %8416, %__nv_exp2f.exit1335 ] + %8701 = phi float [ %5782, %5761 ], [ %8417, %__nv_exp2f.exit1335 ] + %8702 = phi float [ %5783, %5761 ], [ %8418, %__nv_exp2f.exit1335 ] + %8703 = phi float [ %5784, %5761 ], [ %8419, %__nv_exp2f.exit1335 ] + %8704 = phi float [ %5785, %5761 ], [ %8420, %__nv_exp2f.exit1335 ] + %8705 = phi float [ %5786, %5761 ], [ %8421, %__nv_exp2f.exit1335 ] + %8706 = phi float [ %5787, %5761 ], [ %8422, %__nv_exp2f.exit1335 ] + %8707 = phi float [ %5788, %5761 ], [ %8423, %__nv_exp2f.exit1335 ] + %8708 = phi float [ %5789, %5761 ], [ %8424, %__nv_exp2f.exit1335 ] + %8709 = phi float [ %5790, %5761 ], [ %8425, %__nv_exp2f.exit1335 ] + %8710 = phi float [ %5791, %5761 ], [ %8426, %__nv_exp2f.exit1335 ] + %8711 = phi float [ %5792, %5761 ], [ %8427, %__nv_exp2f.exit1335 ] + %8712 = phi float [ %5793, %5761 ], [ %8428, %__nv_exp2f.exit1335 ] + %8713 = phi float [ %5794, %5761 ], [ %8429, 
%__nv_exp2f.exit1335 ] + %8714 = phi float [ %5795, %5761 ], [ %8430, %__nv_exp2f.exit1335 ] + %8715 = phi float [ %5796, %5761 ], [ %8431, %__nv_exp2f.exit1335 ] + %8716 = phi float [ %5797, %5761 ], [ %8432, %__nv_exp2f.exit1335 ] + %8717 = phi float [ %5798, %5761 ], [ %8433, %__nv_exp2f.exit1335 ] + %8718 = phi float [ %5799, %5761 ], [ %8434, %__nv_exp2f.exit1335 ] + %8719 = phi float [ %5800, %5761 ], [ %8435, %__nv_exp2f.exit1335 ] + %8720 = phi float [ %5801, %5761 ], [ %8436, %__nv_exp2f.exit1335 ] + %8721 = phi float [ %5802, %5761 ], [ %8437, %__nv_exp2f.exit1335 ] + %8722 = phi float [ %5803, %5761 ], [ %8438, %__nv_exp2f.exit1335 ] + %8723 = phi float [ %5804, %5761 ], [ %8439, %__nv_exp2f.exit1335 ] + %8724 = phi float [ %5805, %5761 ], [ %8440, %__nv_exp2f.exit1335 ] + %8725 = phi float [ %5806, %5761 ], [ %8441, %__nv_exp2f.exit1335 ] + %8726 = phi float [ %5807, %5761 ], [ %8442, %__nv_exp2f.exit1335 ] + %8727 = phi float [ %5808, %5761 ], [ %8443, %__nv_exp2f.exit1335 ] + %8728 = phi float [ %5809, %5761 ], [ %8444, %__nv_exp2f.exit1335 ] + %8729 = phi float [ %5810, %5761 ], [ %8445, %__nv_exp2f.exit1335 ] + %8730 = phi float [ %5811, %5761 ], [ %8446, %__nv_exp2f.exit1335 ] + %8731 = phi float [ %5812, %5761 ], [ %8447, %__nv_exp2f.exit1335 ] + %8732 = phi float [ %5813, %5761 ], [ %8448, %__nv_exp2f.exit1335 ] + %8733 = phi float [ %5814, %5761 ], [ %8449, %__nv_exp2f.exit1335 ] + %8734 = phi float [ %5815, %5761 ], [ %8450, %__nv_exp2f.exit1335 ] + %8735 = phi float [ %5816, %5761 ], [ %8451, %__nv_exp2f.exit1335 ] + %8736 = phi float [ %5817, %5761 ], [ %8452, %__nv_exp2f.exit1335 ] + %8737 = phi float [ %5818, %5761 ], [ %8453, %__nv_exp2f.exit1335 ] + %8738 = phi float [ %5819, %5761 ], [ %8454, %__nv_exp2f.exit1335 ] + %8739 = phi float [ %5820, %5761 ], [ %8455, %__nv_exp2f.exit1335 ] + %8740 = phi float [ %5821, %5761 ], [ %8456, %__nv_exp2f.exit1335 ] + %8741 = phi float [ %5822, %5761 ], [ %8457, %__nv_exp2f.exit1335 ] + %8742 = phi 
float [ %5823, %5761 ], [ %8458, %__nv_exp2f.exit1335 ] + %8743 = phi float [ %5824, %5761 ], [ %8459, %__nv_exp2f.exit1335 ] + %8744 = phi float [ %5825, %5761 ], [ %8460, %__nv_exp2f.exit1335 ] + %8745 = phi float [ %5826, %5761 ], [ %7529, %__nv_exp2f.exit1335 ] + %8746 = phi float [ %5827, %5761 ], [ %7530, %__nv_exp2f.exit1335 ] + %8747 = phi float [ %5828, %5761 ], [ %7531, %__nv_exp2f.exit1335 ] + %8748 = phi float [ %5829, %5761 ], [ %7532, %__nv_exp2f.exit1335 ] + %8749 = phi float [ %5830, %5761 ], [ %7533, %__nv_exp2f.exit1335 ] + %8750 = phi float [ %5831, %5761 ], [ %7534, %__nv_exp2f.exit1335 ] + %8751 = phi float [ %5832, %5761 ], [ %7535, %__nv_exp2f.exit1335 ] + %8752 = phi float [ %5833, %5761 ], [ %7536, %__nv_exp2f.exit1335 ] + %8753 = phi float [ %5834, %5761 ], [ %7537, %__nv_exp2f.exit1335 ] + %8754 = phi float [ %5835, %5761 ], [ %7538, %__nv_exp2f.exit1335 ] + %8755 = phi float [ %5836, %5761 ], [ %7539, %__nv_exp2f.exit1335 ] + %8756 = phi float [ %5837, %5761 ], [ %7540, %__nv_exp2f.exit1335 ] + %8757 = phi float [ %5838, %5761 ], [ %7541, %__nv_exp2f.exit1335 ] + %8758 = phi float [ %5839, %5761 ], [ %7542, %__nv_exp2f.exit1335 ] + %8759 = phi float [ %5840, %5761 ], [ %7543, %__nv_exp2f.exit1335 ] + %8760 = phi float [ %5841, %5761 ], [ %7544, %__nv_exp2f.exit1335 ] + %8761 = phi float [ %5842, %5761 ], [ %7545, %__nv_exp2f.exit1335 ] + %8762 = phi float [ %5843, %5761 ], [ %7546, %__nv_exp2f.exit1335 ] + %8763 = phi float [ %5844, %5761 ], [ %7547, %__nv_exp2f.exit1335 ] + %8764 = phi float [ %5845, %5761 ], [ %7548, %__nv_exp2f.exit1335 ] + %8765 = phi float [ %5846, %5761 ], [ %7549, %__nv_exp2f.exit1335 ] + %8766 = phi float [ %5847, %5761 ], [ %7550, %__nv_exp2f.exit1335 ] + %8767 = phi float [ %5848, %5761 ], [ %7551, %__nv_exp2f.exit1335 ] + %8768 = phi float [ %5849, %5761 ], [ %7552, %__nv_exp2f.exit1335 ] + %8769 = phi float [ %5850, %5761 ], [ %7553, %__nv_exp2f.exit1335 ] + %8770 = phi float [ %5851, %5761 ], [ %7554, 
%__nv_exp2f.exit1335 ] + %8771 = phi float [ %5852, %5761 ], [ %7555, %__nv_exp2f.exit1335 ] + %8772 = phi float [ %5853, %5761 ], [ %7556, %__nv_exp2f.exit1335 ] + %8773 = phi float [ %5854, %5761 ], [ %7557, %__nv_exp2f.exit1335 ] + %8774 = phi float [ %5855, %5761 ], [ %7558, %__nv_exp2f.exit1335 ] + %8775 = phi float [ %5856, %5761 ], [ %7559, %__nv_exp2f.exit1335 ] + %8776 = phi float [ %5857, %5761 ], [ %7560, %__nv_exp2f.exit1335 ] + %8777 = phi float [ %5858, %5761 ], [ %7561, %__nv_exp2f.exit1335 ] + %8778 = phi float [ %5859, %5761 ], [ %7562, %__nv_exp2f.exit1335 ] + %8779 = phi float [ %5860, %5761 ], [ %7563, %__nv_exp2f.exit1335 ] + %8780 = phi float [ %5861, %5761 ], [ %7564, %__nv_exp2f.exit1335 ] + %8781 = phi float [ %5862, %5761 ], [ %7565, %__nv_exp2f.exit1335 ] + %8782 = phi float [ %5863, %5761 ], [ %7566, %__nv_exp2f.exit1335 ] + %8783 = phi float [ %5864, %5761 ], [ %7567, %__nv_exp2f.exit1335 ] + %8784 = phi float [ %5865, %5761 ], [ %7568, %__nv_exp2f.exit1335 ] + %8785 = phi float [ %5866, %5761 ], [ %7569, %__nv_exp2f.exit1335 ] + %8786 = phi float [ %5867, %5761 ], [ %7570, %__nv_exp2f.exit1335 ] + %8787 = phi float [ %5868, %5761 ], [ %7571, %__nv_exp2f.exit1335 ] + %8788 = phi float [ %5869, %5761 ], [ %7572, %__nv_exp2f.exit1335 ] + %8789 = phi float [ %5870, %5761 ], [ %7573, %__nv_exp2f.exit1335 ] + %8790 = phi float [ %5871, %5761 ], [ %7574, %__nv_exp2f.exit1335 ] + %8791 = phi float [ %5872, %5761 ], [ %7575, %__nv_exp2f.exit1335 ] + %8792 = phi float [ %5873, %5761 ], [ %7576, %__nv_exp2f.exit1335 ] + %8793 = phi float [ %5874, %5761 ], [ %7577, %__nv_exp2f.exit1335 ] + %8794 = phi float [ %5875, %5761 ], [ %7578, %__nv_exp2f.exit1335 ] + %8795 = phi float [ %5876, %5761 ], [ %7579, %__nv_exp2f.exit1335 ] + %8796 = phi float [ %5877, %5761 ], [ %7580, %__nv_exp2f.exit1335 ] + %8797 = phi float [ %5878, %5761 ], [ %7581, %__nv_exp2f.exit1335 ] + %8798 = phi float [ %5879, %5761 ], [ %7582, %__nv_exp2f.exit1335 ] + %8799 = phi 
float [ %5880, %5761 ], [ %7583, %__nv_exp2f.exit1335 ] + %8800 = phi float [ %5881, %5761 ], [ %7584, %__nv_exp2f.exit1335 ] + %8801 = phi float [ %5882, %5761 ], [ %7585, %__nv_exp2f.exit1335 ] + %8802 = phi float [ %5883, %5761 ], [ %7586, %__nv_exp2f.exit1335 ] + %8803 = phi float [ %5884, %5761 ], [ %7587, %__nv_exp2f.exit1335 ] + %8804 = phi float [ %5885, %5761 ], [ %7588, %__nv_exp2f.exit1335 ] + %8805 = phi float [ %5886, %5761 ], [ %7589, %__nv_exp2f.exit1335 ] + %8806 = phi float [ %5887, %5761 ], [ %7590, %__nv_exp2f.exit1335 ] + %8807 = phi float [ %5888, %5761 ], [ %7591, %__nv_exp2f.exit1335 ] + %8808 = phi float [ %5889, %5761 ], [ %7592, %__nv_exp2f.exit1335 ] + %8809 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "// wait for regs: 
$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63,$64,$65,$66,$67,$68,$69,$70,$71,$72,$73,$74,$75,$76,$77,$78,$79,$80,$81,$82,$83,$84,$85,$86,$87,$88,$89,$90,$91,$92,$93,$94,$95,$96,$97,$98,$99,$100,$101,$102,$103,$104,$105,$106,$107,$108,$109,$110,$111,$112,$113,$114,$115,$116,$117,$118,$119,$120,$121,$122,$123,$124,$125,$126,$127\0A\09wgmma.wait_group.sync.aligned 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127"(float %8745, float %8746, float %8747, float %8748, float %8749, float %8750, float %8751, float %8752, float %8753, float %8754, float %8755, float %8756, float %8757, float %8758, float %8759, float %8760, float %8761, float %8762, float %8763, float %8764, float %8765, float %8766, float %8767, float %8768, float %8769, float %8770, float %8771, float %8772, float %8773, float %8774, float %8775, float %8776, float %8777, float %8778, float %8779, float %8780, float %8781, float %8782, float %8783, float %8784, float %8785, float %8786, float %8787, float %8788, float %8789, float %8790, float %8791, float %8792, float %8793, 
float %8794, float %8795, float %8796, float %8797, float %8798, float %8799, float %8800, float %8801, float %8802, float %8803, float %8804, float %8805, float %8806, float %8807, float %8808, float %8681, float %8682, float %8683, float %8684, float %8685, float %8686, float %8687, float %8688, float %8689, float %8690, float %8691, float %8692, float %8693, float %8694, float %8695, float %8696, float %8697, float %8698, float %8699, float %8700, float %8701, float %8702, float %8703, float %8704, float %8705, float %8706, float %8707, float %8708, float %8709, float %8710, float %8711, float %8712, float %8713, float %8714, float %8715, float %8716, float %8717, float %8718, float %8719, float %8720, float %8721, float %8722, float %8723, float %8724, float %8725, float %8726, float %8727, float %8728, float %8729, float %8730, float %8731, float %8732, float %8733, float %8734, float %8735, float %8736, float %8737, float %8738, float %8739, float %8740, float %8741, float %8742, float %8743, float %8744) #3, !dbg !255 + %8810 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 0, !dbg !255 + %8811 = 
extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 1, !dbg !255 + %8812 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 2, !dbg !255 + %8813 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 3, !dbg !255 + %8814 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 4, !dbg !255 + %8815 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 5, !dbg !255 + %8816 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 6, !dbg !255 + %8817 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 7, !dbg !255 + %8818 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 8, !dbg !255 + %8819 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 9, !dbg !255 + %8820 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 10, !dbg !255 + %8821 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 11, !dbg !255 + %8822 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 12, !dbg !255 + %8823 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 13, !dbg !255 + %8824 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 14, !dbg !255 + %8825 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 15, !dbg !255 + %8826 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 16, !dbg !255 + %8827 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float } %8809, 17, !dbg !255 + %8828 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 18, !dbg !255 + %8829 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 19, !dbg !255 + %8830 = extractvalue { float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 20, !dbg !255 + %8831 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 21, !dbg !255 + %8832 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 22, !dbg !255 + %8833 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 23, !dbg !255 + %8834 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 24, !dbg !255 + %8835 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 25, !dbg !255 + %8836 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 26, !dbg !255 + %8837 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 27, !dbg !255 + %8838 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 28, !dbg !255 + %8839 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 29, !dbg !255 + %8840 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 30, !dbg !255 + %8841 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 31, !dbg !255 + %8842 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 32, !dbg !255 + %8843 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 33, !dbg !255 + %8844 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float } %8809, 34, !dbg !255 + %8845 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 35, !dbg !255 + %8846 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 36, !dbg !255 + %8847 = extractvalue { 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 37, !dbg !255 + %8848 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 38, !dbg !255 + %8849 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 39, !dbg !255 + %8850 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 40, !dbg !255 + %8851 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 41, !dbg !255 + %8852 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 42, !dbg !255 + %8853 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 43, !dbg !255 + %8854 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 44, !dbg !255 + %8855 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 45, !dbg !255 + %8856 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 46, !dbg !255 + %8857 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 47, !dbg !255 + %8858 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 48, !dbg !255 + %8859 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 49, !dbg !255 + %8860 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 50, !dbg !255 + %8861 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 51, !dbg !255 + %8862 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 52, !dbg !255 + %8863 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } 
%8809, 53, !dbg !255 + %8864 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 54, !dbg !255 + %8865 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 55, !dbg !255 + %8866 = extractvalue { float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 56, !dbg !255 + %8867 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 57, !dbg !255 + %8868 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 58, !dbg !255 + %8869 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 59, !dbg !255 + %8870 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 60, !dbg !255 + %8871 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 61, !dbg !255 + %8872 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 62, !dbg !255 + %8873 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 63, !dbg !255 + %8874 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 64, !dbg !255 + %8875 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 65, !dbg !255 + %8876 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 66, !dbg !255 + %8877 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 67, !dbg !255 + %8878 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 68, !dbg !255 + %8879 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 69, !dbg !255 + %8880 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float } %8809, 70, !dbg !255 + %8881 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 71, !dbg !255 + %8882 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 72, !dbg !255 + %8883 = extractvalue { float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 73, !dbg !255 + %8884 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 74, !dbg !255 + %8885 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 75, !dbg !255 + %8886 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 76, !dbg !255 + %8887 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 77, !dbg !255 + %8888 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 78, !dbg !255 + %8889 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 79, !dbg !255 + %8890 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 80, !dbg !255 + %8891 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 81, !dbg !255 + %8892 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 82, !dbg !255 + %8893 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 83, !dbg !255 + %8894 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 84, !dbg !255 + %8895 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 85, !dbg !255 + %8896 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 86, !dbg !255 + %8897 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 87, !dbg !255 + %8898 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 88, !dbg !255 + %8899 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 89, 
!dbg !255 + %8900 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 90, !dbg !255 + %8901 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 91, !dbg !255 + %8902 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 92, !dbg !255 + %8903 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 93, !dbg !255 + %8904 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 94, !dbg !255 + %8905 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 95, !dbg !255 + %8906 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 96, !dbg !255 + %8907 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 97, !dbg !255 + %8908 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 98, !dbg !255 + %8909 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 99, !dbg !255 + %8910 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 100, !dbg !255 + %8911 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 101, !dbg !255 + %8912 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 102, !dbg !255 + %8913 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 103, !dbg !255 + %8914 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 104, !dbg !255 + %8915 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 105, !dbg !255 + %8916 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float } %8809, 106, !dbg !255 + %8917 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 107, !dbg !255 + %8918 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 108, !dbg !255 + %8919 = extractvalue { float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 109, !dbg !255 + %8920 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 110, !dbg !255 + %8921 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 111, !dbg !255 + %8922 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 112, !dbg !255 + %8923 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 113, !dbg !255 + %8924 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 114, !dbg !255 + %8925 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 115, !dbg !255 + %8926 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 116, !dbg !255 + %8927 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 117, !dbg !255 + %8928 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 118, !dbg !255 + %8929 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 119, !dbg !255 + %8930 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 120, !dbg !255 + %8931 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 121, !dbg !255 + %8932 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 122, !dbg !255 + %8933 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float } %8809, 123, !dbg !255 + %8934 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 124, !dbg !255 + %8935 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 125, !dbg !255 + 
%8936 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 126, !dbg !255 + %8937 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 127, !dbg !255 + tail call void @llvm.nvvm.cp.async.wait.group(i32 0), !dbg !255 + tail call void 
@llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !255 + %8938 = getelementptr bfloat, ptr addrspace(1) %5902, i64 %5553, !dbg !349 + %8939 = getelementptr bfloat, ptr addrspace(1) %5902, i64 %5554, !dbg !349 + %8940 = getelementptr bfloat, ptr addrspace(1) %5902, i64 %5555, !dbg !349 + %8941 = getelementptr bfloat, ptr addrspace(1) %5902, i64 %5556, !dbg !349 + %8942 = getelementptr bfloat, ptr addrspace(1) %8938, i64 %4911, !dbg !350 + %8943 = getelementptr bfloat, ptr addrspace(1) %8939, i64 %4911, !dbg !350 + %8944 = getelementptr bfloat, ptr addrspace(1) %8940, i64 %4911, !dbg !350 + %8945 = getelementptr bfloat, ptr addrspace(1) %8941, i64 %4911, !dbg !350 + %8946 = getelementptr bfloat, ptr addrspace(1) %5903, i64 %5557, !dbg !351 + %8947 = getelementptr bfloat, ptr addrspace(1) %5903, i64 %5558, !dbg !351 + %8948 = getelementptr bfloat, ptr addrspace(1) %5903, i64 %5559, !dbg !351 + %8949 = getelementptr bfloat, ptr addrspace(1) %5903, i64 %5560, !dbg !351 + %8950 = getelementptr bfloat, ptr addrspace(1) %8946, i64 %4911, !dbg !352 + %8951 = getelementptr bfloat, ptr addrspace(1) %8947, i64 %4911, !dbg !352 + %8952 = getelementptr bfloat, ptr addrspace(1) %8948, i64 %4911, !dbg !352 + %8953 = getelementptr bfloat, ptr addrspace(1) %8949, i64 %4911, !dbg !352 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %5265, ptr addrspace(1) %8942, i32 %5570) #3, !dbg !353 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %5268, ptr addrspace(1) %8943, i32 %5571) #3, !dbg !353 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %5271, ptr addrspace(1) %8944, i32 %5572) #3, !dbg !353 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %5274, ptr addrspace(1) %8945, i32 
%5573) #3, !dbg !353 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !353 + %8954 = getelementptr float, ptr addrspace(1) %5904, i64 %5590, !dbg !354 + %8955 = getelementptr float, ptr addrspace(1) %5904, i64 %5591, !dbg !354 + %8956 = getelementptr float, ptr addrspace(1) %5904, i64 %5592, !dbg !354 + %8957 = getelementptr float, ptr addrspace(1) %5904, i64 %5593, !dbg !354 + %8958 = getelementptr float, ptr addrspace(1) %5904, i64 %5594, !dbg !354 + %8959 = getelementptr float, ptr addrspace(1) %5904, i64 %5595, !dbg !354 + %8960 = getelementptr float, ptr addrspace(1) %5904, i64 %5596, !dbg !354 + %8961 = getelementptr float, ptr addrspace(1) %5904, i64 %5597, !dbg !354 + %8962 = getelementptr float, ptr addrspace(1) %5904, i64 %5598, !dbg !354 + %8963 = getelementptr float, ptr addrspace(1) %5904, i64 %5599, !dbg !354 + %8964 = getelementptr float, ptr addrspace(1) %5904, i64 %5600, !dbg !354 + %8965 = getelementptr float, ptr addrspace(1) %5904, i64 %5601, !dbg !354 + %8966 = getelementptr float, ptr addrspace(1) %5904, i64 %5602, !dbg !354 + %8967 = getelementptr float, ptr addrspace(1) %5904, i64 %5603, !dbg !354 + %8968 = getelementptr float, ptr addrspace(1) %5904, i64 %5604, !dbg !354 + %8969 = getelementptr float, ptr addrspace(1) %5904, i64 %5605, !dbg !354 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) %5333, ptr addrspace(1) %8954, i32 %5622, i1 %5331) #3, !dbg !355 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5335, ptr addrspace(1) %8955, i32 %5623, i1 %5331) #3, !dbg !355 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5337, ptr addrspace(1) %8956, i32 %5624, i1 %5331) #3, !dbg !355 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", 
"r,l,r,b"(ptr addrspace(3) nonnull %5340, ptr addrspace(1) %8957, i32 %5625, i1 %5331) #3, !dbg !355 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5343, ptr addrspace(1) %8958, i32 %5626, i1 %5331) #3, !dbg !355 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5346, ptr addrspace(1) %8959, i32 %5627, i1 %5331) #3, !dbg !355 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5349, ptr addrspace(1) %8960, i32 %5628, i1 %5331) #3, !dbg !355 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5352, ptr addrspace(1) %8961, i32 %5629, i1 %5331) #3, !dbg !355 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5355, ptr addrspace(1) %8962, i32 %5630, i1 %5331) #3, !dbg !355 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5358, ptr addrspace(1) %8963, i32 %5631, i1 %5331) #3, !dbg !355 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5361, ptr addrspace(1) %8964, i32 %5632, i1 %5331) #3, !dbg !355 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5364, ptr addrspace(1) %8965, i32 %5633, i1 %5331) #3, !dbg !355 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5367, ptr addrspace(1) %8966, i32 %5634, i1 %5331) #3, !dbg !355 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", 
"r,l,r,b"(ptr addrspace(3) nonnull %5370, ptr addrspace(1) %8967, i32 %5635, i1 %5331) #3, !dbg !355 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5373, ptr addrspace(1) %8968, i32 %5636, i1 %5331) #3, !dbg !355 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5376, ptr addrspace(1) %8969, i32 %5637, i1 %5331) #3, !dbg !355 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !355 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %5378, ptr addrspace(1) %8950, i32 %5570) #3, !dbg !356 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %5379, ptr addrspace(1) %8951, i32 %5571) #3, !dbg !356 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %5380, ptr addrspace(1) %8952, i32 %5572) #3, !dbg !356 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %5381, ptr addrspace(1) %8953, i32 %5573) #3, !dbg !356 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !356 + %8970 = getelementptr float, ptr addrspace(1) %5905, i64 %5590, !dbg !357 + %8971 = getelementptr float, ptr addrspace(1) %5905, i64 %5591, !dbg !357 + %8972 = getelementptr float, ptr addrspace(1) %5905, i64 %5592, !dbg !357 + %8973 = getelementptr float, ptr addrspace(1) %5905, i64 %5593, !dbg !357 + %8974 = getelementptr float, ptr addrspace(1) %5905, i64 %5594, !dbg !357 + %8975 = getelementptr float, ptr addrspace(1) %5905, i64 %5595, !dbg !357 + %8976 = getelementptr float, ptr addrspace(1) %5905, i64 %5596, !dbg !357 + %8977 = getelementptr float, ptr addrspace(1) %5905, i64 %5597, !dbg !357 + %8978 = getelementptr float, ptr 
addrspace(1) %5905, i64 %5598, !dbg !357 + %8979 = getelementptr float, ptr addrspace(1) %5905, i64 %5599, !dbg !357 + %8980 = getelementptr float, ptr addrspace(1) %5905, i64 %5600, !dbg !357 + %8981 = getelementptr float, ptr addrspace(1) %5905, i64 %5601, !dbg !357 + %8982 = getelementptr float, ptr addrspace(1) %5905, i64 %5602, !dbg !357 + %8983 = getelementptr float, ptr addrspace(1) %5905, i64 %5603, !dbg !357 + %8984 = getelementptr float, ptr addrspace(1) %5905, i64 %5604, !dbg !357 + %8985 = getelementptr float, ptr addrspace(1) %5905, i64 %5605, !dbg !357 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) %5382, ptr addrspace(1) %8970, i32 %5622, i1 %5331) #3, !dbg !358 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5383, ptr addrspace(1) %8971, i32 %5623, i1 %5331) #3, !dbg !358 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5384, ptr addrspace(1) %8972, i32 %5624, i1 %5331) #3, !dbg !358 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5385, ptr addrspace(1) %8973, i32 %5625, i1 %5331) #3, !dbg !358 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5386, ptr addrspace(1) %8974, i32 %5626, i1 %5331) #3, !dbg !358 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5387, ptr addrspace(1) %8975, i32 %5627, i1 %5331) #3, !dbg !358 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5388, ptr addrspace(1) %8976, i32 %5628, i1 %5331) #3, !dbg !358 + tail call void asm sideeffect "@$3 
cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5389, ptr addrspace(1) %8977, i32 %5629, i1 %5331) #3, !dbg !358 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5390, ptr addrspace(1) %8978, i32 %5630, i1 %5331) #3, !dbg !358 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5391, ptr addrspace(1) %8979, i32 %5631, i1 %5331) #3, !dbg !358 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5392, ptr addrspace(1) %8980, i32 %5632, i1 %5331) #3, !dbg !358 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5393, ptr addrspace(1) %8981, i32 %5633, i1 %5331) #3, !dbg !358 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5394, ptr addrspace(1) %8982, i32 %5634, i1 %5331) #3, !dbg !358 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5395, ptr addrspace(1) %8983, i32 %5635, i1 %5331) #3, !dbg !358 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5396, ptr addrspace(1) %8984, i32 %5636, i1 %5331) #3, !dbg !358 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5397, ptr addrspace(1) %8985, i32 %5637, i1 %5331) #3, !dbg !358 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !358 + %8986 = getelementptr i8, ptr addrspace(1) %8942, i64 524288, !dbg !359 + %8987 = getelementptr i8, ptr addrspace(1) %8943, i64 524288, !dbg !359 + %8988 = getelementptr i8, ptr 
addrspace(1) %8944, i64 524288, !dbg !359 + %8989 = getelementptr i8, ptr addrspace(1) %8945, i64 524288, !dbg !359 + %8990 = getelementptr i8, ptr addrspace(1) %8950, i64 16384, !dbg !360 + %8991 = getelementptr i8, ptr addrspace(1) %8951, i64 16384, !dbg !360 + %8992 = getelementptr i8, ptr addrspace(1) %8952, i64 16384, !dbg !360 + %8993 = getelementptr i8, ptr addrspace(1) %8953, i64 16384, !dbg !360 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !353 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %5441, ptr addrspace(1) %8986, i32 %5667) #3, !dbg !353 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %5443, ptr addrspace(1) %8987, i32 %5668) #3, !dbg !353 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %5445, ptr addrspace(1) %8988, i32 %5669) #3, !dbg !353 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %5447, ptr addrspace(1) %8989, i32 %5670) #3, !dbg !353 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !353 + %8994 = getelementptr float, ptr addrspace(1) %5904, i64 %5687, !dbg !354 + %8995 = getelementptr float, ptr addrspace(1) %5904, i64 %5688, !dbg !354 + %8996 = getelementptr float, ptr addrspace(1) %5904, i64 %5689, !dbg !354 + %8997 = getelementptr float, ptr addrspace(1) %5904, i64 %5690, !dbg !354 + %8998 = getelementptr float, ptr addrspace(1) %5904, i64 %5691, !dbg !354 + %8999 = getelementptr float, ptr addrspace(1) %5904, i64 %5692, !dbg !354 + %9000 = getelementptr float, ptr addrspace(1) %5904, i64 %5693, !dbg !354 + %9001 = getelementptr float, ptr addrspace(1) %5904, i64 %5694, !dbg !354 + %9002 = getelementptr float, ptr addrspace(1) %5904, i64 %5695, !dbg !354 + %9003 = getelementptr float, ptr 
addrspace(1) %5904, i64 %5696, !dbg !354 + %9004 = getelementptr float, ptr addrspace(1) %5904, i64 %5697, !dbg !354 + %9005 = getelementptr float, ptr addrspace(1) %5904, i64 %5698, !dbg !354 + %9006 = getelementptr float, ptr addrspace(1) %5904, i64 %5699, !dbg !354 + %9007 = getelementptr float, ptr addrspace(1) %5904, i64 %5700, !dbg !354 + %9008 = getelementptr float, ptr addrspace(1) %5904, i64 %5701, !dbg !354 + %9009 = getelementptr float, ptr addrspace(1) %5904, i64 %5702, !dbg !354 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) %5499, ptr addrspace(1) %8994, i32 %5719, i1 %5331) #3, !dbg !355 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5501, ptr addrspace(1) %8995, i32 %5720, i1 %5331) #3, !dbg !355 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5503, ptr addrspace(1) %8996, i32 %5721, i1 %5331) #3, !dbg !355 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5505, ptr addrspace(1) %8997, i32 %5722, i1 %5331) #3, !dbg !355 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5507, ptr addrspace(1) %8998, i32 %5723, i1 %5331) #3, !dbg !355 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5509, ptr addrspace(1) %8999, i32 %5724, i1 %5331) #3, !dbg !355 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5511, ptr addrspace(1) %9000, i32 %5725, i1 %5331) #3, !dbg !355 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr 
addrspace(3) nonnull %5513, ptr addrspace(1) %9001, i32 %5726, i1 %5331) #3, !dbg !355 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5515, ptr addrspace(1) %9002, i32 %5727, i1 %5331) #3, !dbg !355 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5517, ptr addrspace(1) %9003, i32 %5728, i1 %5331) #3, !dbg !355 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5519, ptr addrspace(1) %9004, i32 %5729, i1 %5331) #3, !dbg !355 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5521, ptr addrspace(1) %9005, i32 %5730, i1 %5331) #3, !dbg !355 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5523, ptr addrspace(1) %9006, i32 %5731, i1 %5331) #3, !dbg !355 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5525, ptr addrspace(1) %9007, i32 %5732, i1 %5331) #3, !dbg !355 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5527, ptr addrspace(1) %9008, i32 %5733, i1 %5331) #3, !dbg !355 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5529, ptr addrspace(1) %9009, i32 %5734, i1 %5331) #3, !dbg !355 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !355 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %5531, ptr addrspace(1) %8990, i32 %5667) #3, !dbg !356 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 
], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %5532, ptr addrspace(1) %8991, i32 %5668) #3, !dbg !356 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %5533, ptr addrspace(1) %8992, i32 %5669) #3, !dbg !356 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %5534, ptr addrspace(1) %8993, i32 %5670) #3, !dbg !356 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !356 + %9010 = getelementptr float, ptr addrspace(1) %5905, i64 %5687, !dbg !357 + %9011 = getelementptr float, ptr addrspace(1) %5905, i64 %5688, !dbg !357 + %9012 = getelementptr float, ptr addrspace(1) %5905, i64 %5689, !dbg !357 + %9013 = getelementptr float, ptr addrspace(1) %5905, i64 %5690, !dbg !357 + %9014 = getelementptr float, ptr addrspace(1) %5905, i64 %5691, !dbg !357 + %9015 = getelementptr float, ptr addrspace(1) %5905, i64 %5692, !dbg !357 + %9016 = getelementptr float, ptr addrspace(1) %5905, i64 %5693, !dbg !357 + %9017 = getelementptr float, ptr addrspace(1) %5905, i64 %5694, !dbg !357 + %9018 = getelementptr float, ptr addrspace(1) %5905, i64 %5695, !dbg !357 + %9019 = getelementptr float, ptr addrspace(1) %5905, i64 %5696, !dbg !357 + %9020 = getelementptr float, ptr addrspace(1) %5905, i64 %5697, !dbg !357 + %9021 = getelementptr float, ptr addrspace(1) %5905, i64 %5698, !dbg !357 + %9022 = getelementptr float, ptr addrspace(1) %5905, i64 %5699, !dbg !357 + %9023 = getelementptr float, ptr addrspace(1) %5905, i64 %5700, !dbg !357 + %9024 = getelementptr float, ptr addrspace(1) %5905, i64 %5701, !dbg !357 + %9025 = getelementptr float, ptr addrspace(1) %5905, i64 %5702, !dbg !357 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) %5535, ptr addrspace(1) %9010, i32 %5719, i1 %5331) #3, !dbg !358 + tail call void asm sideeffect "@$3 
cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5536, ptr addrspace(1) %9011, i32 %5720, i1 %5331) #3, !dbg !358 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5537, ptr addrspace(1) %9012, i32 %5721, i1 %5331) #3, !dbg !358 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5538, ptr addrspace(1) %9013, i32 %5722, i1 %5331) #3, !dbg !358 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5539, ptr addrspace(1) %9014, i32 %5723, i1 %5331) #3, !dbg !358 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5540, ptr addrspace(1) %9015, i32 %5724, i1 %5331) #3, !dbg !358 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5541, ptr addrspace(1) %9016, i32 %5725, i1 %5331) #3, !dbg !358 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5542, ptr addrspace(1) %9017, i32 %5726, i1 %5331) #3, !dbg !358 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5543, ptr addrspace(1) %9018, i32 %5727, i1 %5331) #3, !dbg !358 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5544, ptr addrspace(1) %9019, i32 %5728, i1 %5331) #3, !dbg !358 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5545, ptr addrspace(1) %9020, i32 %5729, i1 %5331) #3, !dbg !358 + tail call void asm sideeffect "@$3 
cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5546, ptr addrspace(1) %9021, i32 %5730, i1 %5331) #3, !dbg !358 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5547, ptr addrspace(1) %9022, i32 %5731, i1 %5331) #3, !dbg !358 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5548, ptr addrspace(1) %9023, i32 %5732, i1 %5331) #3, !dbg !358 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5549, ptr addrspace(1) %9024, i32 %5733, i1 %5331) #3, !dbg !358 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5550, ptr addrspace(1) %9025, i32 %5734, i1 %5331) #3, !dbg !358 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !358 + br i1 %5561, label %.lr.ph1793, label %._crit_edge1794, !dbg !361 + +.lr.ph1793: ; preds = %._crit_edge1621, %__nv_exp2f.exit1239 + %.pn605.pn1791 = phi i32 [ %.pn6051775, %__nv_exp2f.exit1239 ], [ %5232, %._crit_edge1621 ] + %.pn607.pn1790 = phi i32 [ %.pn6071774, %__nv_exp2f.exit1239 ], [ %5230, %._crit_edge1621 ] + %.pn609.pn1789 = phi i32 [ %.pn6091773, %__nv_exp2f.exit1239 ], [ %5228, %._crit_edge1621 ] + %.pn611.pn1788 = phi i32 [ %.pn6111772, %__nv_exp2f.exit1239 ], [ %5226, %._crit_edge1621 ] + %.pn613.pn1787 = phi i32 [ %.pn6131771, %__nv_exp2f.exit1239 ], [ %5224, %._crit_edge1621 ] + %.pn615.pn1786 = phi i32 [ %.pn6151770, %__nv_exp2f.exit1239 ], [ %5222, %._crit_edge1621 ] + %.pn617.pn1785 = phi i32 [ %.pn6171769, %__nv_exp2f.exit1239 ], [ %5220, %._crit_edge1621 ] + %.pn619.pn1784 = phi i32 [ %.pn6191768, %__nv_exp2f.exit1239 ], [ %5218, %._crit_edge1621 ] + %.pn621.pn1783 = phi i32 [ %.pn6211767, %__nv_exp2f.exit1239 ], [ %5216, %._crit_edge1621 ] 
+ %.pn623.pn1782 = phi i32 [ %.pn6231766, %__nv_exp2f.exit1239 ], [ %5214, %._crit_edge1621 ] + %.pn625.pn1781 = phi i32 [ %.pn6251765, %__nv_exp2f.exit1239 ], [ %5212, %._crit_edge1621 ] + %.pn627.pn1780 = phi i32 [ %.pn6271764, %__nv_exp2f.exit1239 ], [ %5210, %._crit_edge1621 ] + %.pn629.pn1779 = phi i32 [ %.pn6291763, %__nv_exp2f.exit1239 ], [ %5208, %._crit_edge1621 ] + %.pn631.pn1778 = phi i32 [ %.pn6311762, %__nv_exp2f.exit1239 ], [ %5207, %._crit_edge1621 ] + %.pn633.pn1777 = phi i32 [ %.pn6331761, %__nv_exp2f.exit1239 ], [ %5206, %._crit_edge1621 ] + %.pn635.pn1776 = phi i32 [ %.pn6351760, %__nv_exp2f.exit1239 ], [ %5205, %._crit_edge1621 ] + %9026 = phi i32 [ %9043, %__nv_exp2f.exit1239 ], [ -1, %._crit_edge1621 ] + %9027 = phi i32 [ %11065, %__nv_exp2f.exit1239 ], [ 1, %._crit_edge1621 ] + %9028 = phi i32 [ %9046, %__nv_exp2f.exit1239 ], [ -1, %._crit_edge1621 ] + %9029 = phi i32 [ %11068, %__nv_exp2f.exit1239 ], [ 1, %._crit_edge1621 ] + %.pn6051775 = phi i32 [ %11054, %__nv_exp2f.exit1239 ], [ %5654, %._crit_edge1621 ] + %.pn6071774 = phi i32 [ %11053, %__nv_exp2f.exit1239 ], [ %5653, %._crit_edge1621 ] + %.pn6091773 = phi i32 [ %11052, %__nv_exp2f.exit1239 ], [ %5652, %._crit_edge1621 ] + %.pn6111772 = phi i32 [ %11051, %__nv_exp2f.exit1239 ], [ %5651, %._crit_edge1621 ] + %.pn6131771 = phi i32 [ %11050, %__nv_exp2f.exit1239 ], [ %5650, %._crit_edge1621 ] + %.pn6151770 = phi i32 [ %11049, %__nv_exp2f.exit1239 ], [ %5649, %._crit_edge1621 ] + %.pn6171769 = phi i32 [ %11048, %__nv_exp2f.exit1239 ], [ %5648, %._crit_edge1621 ] + %.pn6191768 = phi i32 [ %11047, %__nv_exp2f.exit1239 ], [ %5647, %._crit_edge1621 ] + %.pn6211767 = phi i32 [ %11046, %__nv_exp2f.exit1239 ], [ %5646, %._crit_edge1621 ] + %.pn6231766 = phi i32 [ %11045, %__nv_exp2f.exit1239 ], [ %5645, %._crit_edge1621 ] + %.pn6251765 = phi i32 [ %11044, %__nv_exp2f.exit1239 ], [ %5644, %._crit_edge1621 ] + %.pn6271764 = phi i32 [ %11043, %__nv_exp2f.exit1239 ], [ %5643, %._crit_edge1621 ] + 
%.pn6291763 = phi i32 [ %11042, %__nv_exp2f.exit1239 ], [ %5642, %._crit_edge1621 ] + %.pn6311762 = phi i32 [ %11041, %__nv_exp2f.exit1239 ], [ %5641, %._crit_edge1621 ] + %.pn6331761 = phi i32 [ %11040, %__nv_exp2f.exit1239 ], [ %5640, %._crit_edge1621 ] + %.pn6351760 = phi i32 [ %11039, %__nv_exp2f.exit1239 ], [ %5639, %._crit_edge1621 ] + %9030 = phi i32 [ %11059, %__nv_exp2f.exit1239 ], [ %5655, %._crit_edge1621 ] + %9031 = phi i32 [ %11060, %__nv_exp2f.exit1239 ], [ %5656, %._crit_edge1621 ] + %9032 = phi i32 [ %11061, %__nv_exp2f.exit1239 ], [ %5657, %._crit_edge1621 ] + %9033 = phi i32 [ %11062, %__nv_exp2f.exit1239 ], [ %5658, %._crit_edge1621 ] + %.pn5551759 = phi ptr addrspace(1) [ %11038, %__nv_exp2f.exit1239 ], [ %8993, %._crit_edge1621 ] + %.pn5711758 = phi ptr addrspace(1) [ %11037, %__nv_exp2f.exit1239 ], [ %8992, %._crit_edge1621 ] + %.pn5871757 = phi ptr addrspace(1) [ %11036, %__nv_exp2f.exit1239 ], [ %8991, %._crit_edge1621 ] + %.pn6031756 = phi ptr addrspace(1) [ %11035, %__nv_exp2f.exit1239 ], [ %8990, %._crit_edge1621 ] + %9034 = phi i32 [ %11055, %__nv_exp2f.exit1239 ], [ %5655, %._crit_edge1621 ] + %9035 = phi i32 [ %11056, %__nv_exp2f.exit1239 ], [ %5656, %._crit_edge1621 ] + %9036 = phi i32 [ %11057, %__nv_exp2f.exit1239 ], [ %5657, %._crit_edge1621 ] + %9037 = phi i32 [ %11058, %__nv_exp2f.exit1239 ], [ %5658, %._crit_edge1621 ] + %.pn4911755 = phi ptr addrspace(1) [ %11032, %__nv_exp2f.exit1239 ], [ %8989, %._crit_edge1621 ] + %.pn5071754 = phi ptr addrspace(1) [ %11031, %__nv_exp2f.exit1239 ], [ %8988, %._crit_edge1621 ] + %.pn5231753 = phi ptr addrspace(1) [ %11030, %__nv_exp2f.exit1239 ], [ %8987, %._crit_edge1621 ] + %.pn5391752 = phi ptr addrspace(1) [ %11029, %__nv_exp2f.exit1239 ], [ %8986, %._crit_edge1621 ] + %.pn3491751 = phi float [ %10150, %__nv_exp2f.exit1239 ], [ %8873, %._crit_edge1621 ] + %.pn3511750 = phi float [ %10149, %__nv_exp2f.exit1239 ], [ %8872, %._crit_edge1621 ] + %.pn3531749 = phi float [ %10148, 
%__nv_exp2f.exit1239 ], [ %8871, %._crit_edge1621 ] + %.pn3551748 = phi float [ %10147, %__nv_exp2f.exit1239 ], [ %8870, %._crit_edge1621 ] + %.pn3571747 = phi float [ %10146, %__nv_exp2f.exit1239 ], [ %8869, %._crit_edge1621 ] + %.pn3591746 = phi float [ %10145, %__nv_exp2f.exit1239 ], [ %8868, %._crit_edge1621 ] + %.pn3611745 = phi float [ %10144, %__nv_exp2f.exit1239 ], [ %8867, %._crit_edge1621 ] + %.pn3631744 = phi float [ %10143, %__nv_exp2f.exit1239 ], [ %8866, %._crit_edge1621 ] + %.pn3651743 = phi float [ %10142, %__nv_exp2f.exit1239 ], [ %8865, %._crit_edge1621 ] + %.pn3671742 = phi float [ %10141, %__nv_exp2f.exit1239 ], [ %8864, %._crit_edge1621 ] + %.pn3691741 = phi float [ %10140, %__nv_exp2f.exit1239 ], [ %8863, %._crit_edge1621 ] + %.pn3711740 = phi float [ %10139, %__nv_exp2f.exit1239 ], [ %8862, %._crit_edge1621 ] + %.pn3731739 = phi float [ %10138, %__nv_exp2f.exit1239 ], [ %8861, %._crit_edge1621 ] + %.pn3751738 = phi float [ %10137, %__nv_exp2f.exit1239 ], [ %8860, %._crit_edge1621 ] + %.pn3771737 = phi float [ %10136, %__nv_exp2f.exit1239 ], [ %8859, %._crit_edge1621 ] + %.pn3791736 = phi float [ %10135, %__nv_exp2f.exit1239 ], [ %8858, %._crit_edge1621 ] + %.pn3811735 = phi float [ %10134, %__nv_exp2f.exit1239 ], [ %8857, %._crit_edge1621 ] + %.pn3831734 = phi float [ %10133, %__nv_exp2f.exit1239 ], [ %8856, %._crit_edge1621 ] + %.pn3851733 = phi float [ %10132, %__nv_exp2f.exit1239 ], [ %8855, %._crit_edge1621 ] + %.pn3871732 = phi float [ %10131, %__nv_exp2f.exit1239 ], [ %8854, %._crit_edge1621 ] + %.pn3891731 = phi float [ %10130, %__nv_exp2f.exit1239 ], [ %8853, %._crit_edge1621 ] + %.pn3911730 = phi float [ %10129, %__nv_exp2f.exit1239 ], [ %8852, %._crit_edge1621 ] + %.pn3931729 = phi float [ %10128, %__nv_exp2f.exit1239 ], [ %8851, %._crit_edge1621 ] + %.pn3951728 = phi float [ %10127, %__nv_exp2f.exit1239 ], [ %8850, %._crit_edge1621 ] + %.pn3971727 = phi float [ %10126, %__nv_exp2f.exit1239 ], [ %8849, %._crit_edge1621 ] + 
%.pn3991726 = phi float [ %10125, %__nv_exp2f.exit1239 ], [ %8848, %._crit_edge1621 ] + %.pn4011725 = phi float [ %10124, %__nv_exp2f.exit1239 ], [ %8847, %._crit_edge1621 ] + %.pn4031724 = phi float [ %10123, %__nv_exp2f.exit1239 ], [ %8846, %._crit_edge1621 ] + %.pn4051723 = phi float [ %10122, %__nv_exp2f.exit1239 ], [ %8845, %._crit_edge1621 ] + %.pn4071722 = phi float [ %10121, %__nv_exp2f.exit1239 ], [ %8844, %._crit_edge1621 ] + %.pn4091721 = phi float [ %10120, %__nv_exp2f.exit1239 ], [ %8843, %._crit_edge1621 ] + %.pn4111720 = phi float [ %10119, %__nv_exp2f.exit1239 ], [ %8842, %._crit_edge1621 ] + %.pn4131719 = phi float [ %10118, %__nv_exp2f.exit1239 ], [ %8841, %._crit_edge1621 ] + %.pn4151718 = phi float [ %10117, %__nv_exp2f.exit1239 ], [ %8840, %._crit_edge1621 ] + %.pn4171717 = phi float [ %10116, %__nv_exp2f.exit1239 ], [ %8839, %._crit_edge1621 ] + %.pn4191716 = phi float [ %10115, %__nv_exp2f.exit1239 ], [ %8838, %._crit_edge1621 ] + %.pn4211715 = phi float [ %10114, %__nv_exp2f.exit1239 ], [ %8837, %._crit_edge1621 ] + %.pn4231714 = phi float [ %10113, %__nv_exp2f.exit1239 ], [ %8836, %._crit_edge1621 ] + %.pn4251713 = phi float [ %10112, %__nv_exp2f.exit1239 ], [ %8835, %._crit_edge1621 ] + %.pn4271712 = phi float [ %10111, %__nv_exp2f.exit1239 ], [ %8834, %._crit_edge1621 ] + %.pn4291711 = phi float [ %10110, %__nv_exp2f.exit1239 ], [ %8833, %._crit_edge1621 ] + %.pn4311710 = phi float [ %10109, %__nv_exp2f.exit1239 ], [ %8832, %._crit_edge1621 ] + %.pn4331709 = phi float [ %10108, %__nv_exp2f.exit1239 ], [ %8831, %._crit_edge1621 ] + %.pn4351708 = phi float [ %10107, %__nv_exp2f.exit1239 ], [ %8830, %._crit_edge1621 ] + %.pn4371707 = phi float [ %10106, %__nv_exp2f.exit1239 ], [ %8829, %._crit_edge1621 ] + %.pn4391706 = phi float [ %10105, %__nv_exp2f.exit1239 ], [ %8828, %._crit_edge1621 ] + %.pn4411705 = phi float [ %10104, %__nv_exp2f.exit1239 ], [ %8827, %._crit_edge1621 ] + %.pn4431704 = phi float [ %10103, %__nv_exp2f.exit1239 ], [ 
%8826, %._crit_edge1621 ] + %.pn4451703 = phi float [ %10102, %__nv_exp2f.exit1239 ], [ %8825, %._crit_edge1621 ] + %.pn4471702 = phi float [ %10101, %__nv_exp2f.exit1239 ], [ %8824, %._crit_edge1621 ] + %.pn4491701 = phi float [ %10100, %__nv_exp2f.exit1239 ], [ %8823, %._crit_edge1621 ] + %.pn4511700 = phi float [ %10099, %__nv_exp2f.exit1239 ], [ %8822, %._crit_edge1621 ] + %.pn4531699 = phi float [ %10098, %__nv_exp2f.exit1239 ], [ %8821, %._crit_edge1621 ] + %.pn4551698 = phi float [ %10097, %__nv_exp2f.exit1239 ], [ %8820, %._crit_edge1621 ] + %.pn4571697 = phi float [ %10096, %__nv_exp2f.exit1239 ], [ %8819, %._crit_edge1621 ] + %.pn4591696 = phi float [ %10095, %__nv_exp2f.exit1239 ], [ %8818, %._crit_edge1621 ] + %.pn4611695 = phi float [ %10094, %__nv_exp2f.exit1239 ], [ %8817, %._crit_edge1621 ] + %.pn4631694 = phi float [ %10093, %__nv_exp2f.exit1239 ], [ %8816, %._crit_edge1621 ] + %.pn4651693 = phi float [ %10092, %__nv_exp2f.exit1239 ], [ %8815, %._crit_edge1621 ] + %.pn4671692 = phi float [ %10091, %__nv_exp2f.exit1239 ], [ %8814, %._crit_edge1621 ] + %.pn4691691 = phi float [ %10090, %__nv_exp2f.exit1239 ], [ %8813, %._crit_edge1621 ] + %.pn4711690 = phi float [ %10089, %__nv_exp2f.exit1239 ], [ %8812, %._crit_edge1621 ] + %.pn4731689 = phi float [ %10088, %__nv_exp2f.exit1239 ], [ %8811, %._crit_edge1621 ] + %.pn4751688 = phi float [ %10087, %__nv_exp2f.exit1239 ], [ %8810, %._crit_edge1621 ] + %.pn2211687 = phi float [ %11006, %__nv_exp2f.exit1239 ], [ %8937, %._crit_edge1621 ] + %.pn2231686 = phi float [ %11005, %__nv_exp2f.exit1239 ], [ %8936, %._crit_edge1621 ] + %.pn2251685 = phi float [ %11004, %__nv_exp2f.exit1239 ], [ %8935, %._crit_edge1621 ] + %.pn2271684 = phi float [ %11003, %__nv_exp2f.exit1239 ], [ %8934, %._crit_edge1621 ] + %.pn2291683 = phi float [ %11002, %__nv_exp2f.exit1239 ], [ %8933, %._crit_edge1621 ] + %.pn2311682 = phi float [ %11001, %__nv_exp2f.exit1239 ], [ %8932, %._crit_edge1621 ] + %.pn2331681 = phi float [ %11000, 
%__nv_exp2f.exit1239 ], [ %8931, %._crit_edge1621 ] + %.pn2351680 = phi float [ %10999, %__nv_exp2f.exit1239 ], [ %8930, %._crit_edge1621 ] + %.pn2371679 = phi float [ %10998, %__nv_exp2f.exit1239 ], [ %8929, %._crit_edge1621 ] + %.pn2391678 = phi float [ %10997, %__nv_exp2f.exit1239 ], [ %8928, %._crit_edge1621 ] + %.pn2411677 = phi float [ %10996, %__nv_exp2f.exit1239 ], [ %8927, %._crit_edge1621 ] + %.pn2431676 = phi float [ %10995, %__nv_exp2f.exit1239 ], [ %8926, %._crit_edge1621 ] + %.pn2451675 = phi float [ %10994, %__nv_exp2f.exit1239 ], [ %8925, %._crit_edge1621 ] + %.pn2471674 = phi float [ %10993, %__nv_exp2f.exit1239 ], [ %8924, %._crit_edge1621 ] + %.pn2491673 = phi float [ %10992, %__nv_exp2f.exit1239 ], [ %8923, %._crit_edge1621 ] + %.pn2511672 = phi float [ %10991, %__nv_exp2f.exit1239 ], [ %8922, %._crit_edge1621 ] + %.pn2531671 = phi float [ %10990, %__nv_exp2f.exit1239 ], [ %8921, %._crit_edge1621 ] + %.pn2551670 = phi float [ %10989, %__nv_exp2f.exit1239 ], [ %8920, %._crit_edge1621 ] + %.pn2571669 = phi float [ %10988, %__nv_exp2f.exit1239 ], [ %8919, %._crit_edge1621 ] + %.pn2591668 = phi float [ %10987, %__nv_exp2f.exit1239 ], [ %8918, %._crit_edge1621 ] + %.pn2611667 = phi float [ %10986, %__nv_exp2f.exit1239 ], [ %8917, %._crit_edge1621 ] + %.pn2631666 = phi float [ %10985, %__nv_exp2f.exit1239 ], [ %8916, %._crit_edge1621 ] + %.pn2651665 = phi float [ %10984, %__nv_exp2f.exit1239 ], [ %8915, %._crit_edge1621 ] + %.pn2671664 = phi float [ %10983, %__nv_exp2f.exit1239 ], [ %8914, %._crit_edge1621 ] + %.pn2691663 = phi float [ %10982, %__nv_exp2f.exit1239 ], [ %8913, %._crit_edge1621 ] + %.pn2711662 = phi float [ %10981, %__nv_exp2f.exit1239 ], [ %8912, %._crit_edge1621 ] + %.pn2731661 = phi float [ %10980, %__nv_exp2f.exit1239 ], [ %8911, %._crit_edge1621 ] + %.pn2751660 = phi float [ %10979, %__nv_exp2f.exit1239 ], [ %8910, %._crit_edge1621 ] + %.pn2771659 = phi float [ %10978, %__nv_exp2f.exit1239 ], [ %8909, %._crit_edge1621 ] + 
%.pn2791658 = phi float [ %10977, %__nv_exp2f.exit1239 ], [ %8908, %._crit_edge1621 ] + %.pn2811657 = phi float [ %10976, %__nv_exp2f.exit1239 ], [ %8907, %._crit_edge1621 ] + %.pn2831656 = phi float [ %10975, %__nv_exp2f.exit1239 ], [ %8906, %._crit_edge1621 ] + %.pn2851655 = phi float [ %10974, %__nv_exp2f.exit1239 ], [ %8905, %._crit_edge1621 ] + %.pn2871654 = phi float [ %10973, %__nv_exp2f.exit1239 ], [ %8904, %._crit_edge1621 ] + %.pn2891653 = phi float [ %10972, %__nv_exp2f.exit1239 ], [ %8903, %._crit_edge1621 ] + %.pn2911652 = phi float [ %10971, %__nv_exp2f.exit1239 ], [ %8902, %._crit_edge1621 ] + %.pn2931651 = phi float [ %10970, %__nv_exp2f.exit1239 ], [ %8901, %._crit_edge1621 ] + %.pn2951650 = phi float [ %10969, %__nv_exp2f.exit1239 ], [ %8900, %._crit_edge1621 ] + %.pn2971649 = phi float [ %10968, %__nv_exp2f.exit1239 ], [ %8899, %._crit_edge1621 ] + %.pn2991648 = phi float [ %10967, %__nv_exp2f.exit1239 ], [ %8898, %._crit_edge1621 ] + %.pn3011647 = phi float [ %10966, %__nv_exp2f.exit1239 ], [ %8897, %._crit_edge1621 ] + %.pn3031646 = phi float [ %10965, %__nv_exp2f.exit1239 ], [ %8896, %._crit_edge1621 ] + %.pn3051645 = phi float [ %10964, %__nv_exp2f.exit1239 ], [ %8895, %._crit_edge1621 ] + %.pn3071644 = phi float [ %10963, %__nv_exp2f.exit1239 ], [ %8894, %._crit_edge1621 ] + %.pn3091643 = phi float [ %10962, %__nv_exp2f.exit1239 ], [ %8893, %._crit_edge1621 ] + %.pn3111642 = phi float [ %10961, %__nv_exp2f.exit1239 ], [ %8892, %._crit_edge1621 ] + %.pn3131641 = phi float [ %10960, %__nv_exp2f.exit1239 ], [ %8891, %._crit_edge1621 ] + %.pn3151640 = phi float [ %10959, %__nv_exp2f.exit1239 ], [ %8890, %._crit_edge1621 ] + %.pn3171639 = phi float [ %10958, %__nv_exp2f.exit1239 ], [ %8889, %._crit_edge1621 ] + %.pn3191638 = phi float [ %10957, %__nv_exp2f.exit1239 ], [ %8888, %._crit_edge1621 ] + %.pn3211637 = phi float [ %10956, %__nv_exp2f.exit1239 ], [ %8887, %._crit_edge1621 ] + %.pn3231636 = phi float [ %10955, %__nv_exp2f.exit1239 ], [ 
%8886, %._crit_edge1621 ] + %.pn3251635 = phi float [ %10954, %__nv_exp2f.exit1239 ], [ %8885, %._crit_edge1621 ] + %.pn3271634 = phi float [ %10953, %__nv_exp2f.exit1239 ], [ %8884, %._crit_edge1621 ] + %.pn3291633 = phi float [ %10952, %__nv_exp2f.exit1239 ], [ %8883, %._crit_edge1621 ] + %.pn3311632 = phi float [ %10951, %__nv_exp2f.exit1239 ], [ %8882, %._crit_edge1621 ] + %.pn3331631 = phi float [ %10950, %__nv_exp2f.exit1239 ], [ %8881, %._crit_edge1621 ] + %.pn3351630 = phi float [ %10949, %__nv_exp2f.exit1239 ], [ %8880, %._crit_edge1621 ] + %.pn3371629 = phi float [ %10948, %__nv_exp2f.exit1239 ], [ %8879, %._crit_edge1621 ] + %.pn3391628 = phi float [ %10947, %__nv_exp2f.exit1239 ], [ %8878, %._crit_edge1621 ] + %.pn3411627 = phi float [ %10946, %__nv_exp2f.exit1239 ], [ %8877, %._crit_edge1621 ] + %.pn3431626 = phi float [ %10945, %__nv_exp2f.exit1239 ], [ %8876, %._crit_edge1621 ] + %.pn3451625 = phi float [ %10944, %__nv_exp2f.exit1239 ], [ %8875, %._crit_edge1621 ] + %.pn3471624 = phi float [ %10943, %__nv_exp2f.exit1239 ], [ %8874, %._crit_edge1621 ] + %9038 = phi i32 [ %11007, %__nv_exp2f.exit1239 ], [ 0, %._crit_edge1621 ] + %9039 = icmp slt i32 %9038, %5735, !dbg !361 + %9040 = icmp slt i32 %9038, %5736, !dbg !361 + %9041 = add i32 %9026, 1, !dbg !361 + %9042 = icmp sgt i32 %9041, 1, !dbg !361 + %9043 = select i1 %9042, i32 0, i32 %9041, !dbg !361 + %9044 = add i32 %9028, 1, !dbg !361 + %9045 = icmp sgt i32 %9044, 2, !dbg !361 + %9046 = select i1 %9045, i32 0, i32 %9044, !dbg !361 + %9047 = icmp slt i32 %.pn635.pn1776, %18, !dbg !362 + %9048 = icmp slt i32 %.pn633.pn1777, %18, !dbg !362 + %9049 = icmp slt i32 %.pn631.pn1778, %18, !dbg !362 + %9050 = icmp slt i32 %.pn629.pn1779, %18, !dbg !362 + %9051 = icmp slt i32 %.pn627.pn1780, %18, !dbg !362 + %9052 = icmp slt i32 %.pn625.pn1781, %18, !dbg !362 + %9053 = icmp slt i32 %.pn623.pn1782, %18, !dbg !362 + %9054 = icmp slt i32 %.pn621.pn1783, %18, !dbg !362 + %9055 = icmp slt i32 %.pn619.pn1784, %18, 
!dbg !362 + %9056 = icmp slt i32 %.pn617.pn1785, %18, !dbg !362 + %9057 = icmp slt i32 %.pn615.pn1786, %18, !dbg !362 + %9058 = icmp slt i32 %.pn613.pn1787, %18, !dbg !362 + %9059 = icmp slt i32 %.pn611.pn1788, %18, !dbg !362 + %9060 = icmp slt i32 %.pn609.pn1789, %18, !dbg !362 + %9061 = icmp slt i32 %.pn607.pn1790, %18, !dbg !362 + %9062 = icmp slt i32 %.pn605.pn1791, %18, !dbg !362 + tail call void @llvm.nvvm.cp.async.wait.group(i32 4), !dbg !353 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !353 + %9063 = shl i32 %9046, 13, !dbg !353 + %9064 = getelementptr bfloat, ptr addrspace(3) @global_smem, i32 %9063, !dbg !353 + %9065 = shl i32 %9043, 6, !dbg !355 + %9066 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %9065, !dbg !355 + %9067 = getelementptr inbounds nuw i8, ptr addrspace(3) %9066, i32 %5332, !dbg !355 + %9068 = load float, ptr addrspace(3) %9067, align 8, !dbg !355 + %9069 = getelementptr inbounds nuw i8, ptr addrspace(3) %9067, i32 4, !dbg !355 + %9070 = load float, ptr addrspace(3) %9069, align 4, !dbg !355 + %9071 = getelementptr inbounds nuw i8, ptr addrspace(3) %9066, i32 %5336, !dbg !355 + %9072 = load float, ptr addrspace(3) %9071, align 8, !dbg !355 + %9073 = getelementptr inbounds nuw i8, ptr addrspace(3) %9071, i32 4, !dbg !355 + %9074 = load float, ptr addrspace(3) %9073, align 4, !dbg !355 + %9075 = getelementptr inbounds nuw i8, ptr addrspace(3) %9066, i32 %5342, !dbg !355 + %9076 = load float, ptr addrspace(3) %9075, align 8, !dbg !355 + %9077 = getelementptr inbounds nuw i8, ptr addrspace(3) %9075, i32 4, !dbg !355 + %9078 = load float, ptr addrspace(3) %9077, align 4, !dbg !355 + %9079 = getelementptr inbounds nuw i8, ptr addrspace(3) %9066, i32 %5348, !dbg !355 + %9080 = load float, ptr addrspace(3) %9079, align 8, !dbg !355 + %9081 = getelementptr inbounds nuw i8, ptr addrspace(3) %9079, i32 4, !dbg !355 + %9082 = load float, ptr addrspace(3) %9081, align 
4, !dbg !355 + %9083 = getelementptr inbounds nuw i8, ptr addrspace(3) %9066, i32 %5354, !dbg !355 + %9084 = load float, ptr addrspace(3) %9083, align 8, !dbg !355 + %9085 = getelementptr inbounds nuw i8, ptr addrspace(3) %9083, i32 4, !dbg !355 + %9086 = load float, ptr addrspace(3) %9085, align 4, !dbg !355 + %9087 = getelementptr inbounds nuw i8, ptr addrspace(3) %9066, i32 %5360, !dbg !355 + %9088 = load float, ptr addrspace(3) %9087, align 8, !dbg !355 + %9089 = getelementptr inbounds nuw i8, ptr addrspace(3) %9087, i32 4, !dbg !355 + %9090 = load float, ptr addrspace(3) %9089, align 4, !dbg !355 + %9091 = getelementptr inbounds nuw i8, ptr addrspace(3) %9066, i32 %5366, !dbg !355 + %9092 = load float, ptr addrspace(3) %9091, align 8, !dbg !355 + %9093 = getelementptr inbounds nuw i8, ptr addrspace(3) %9091, i32 4, !dbg !355 + %9094 = load float, ptr addrspace(3) %9093, align 4, !dbg !355 + %9095 = getelementptr inbounds nuw i8, ptr addrspace(3) %9066, i32 %5372, !dbg !355 + %9096 = load float, ptr addrspace(3) %9095, align 8, !dbg !355 + %9097 = getelementptr inbounds nuw i8, ptr addrspace(3) %9095, i32 4, !dbg !355 + %9098 = load float, ptr addrspace(3) %9097, align 4, !dbg !355 + %9099 = fcmp oeq float %9068, 0xFFF0000000000000, !dbg !363 + %9100 = fcmp oeq float %9070, 0xFFF0000000000000, !dbg !363 + %9101 = fcmp oeq float %9072, 0xFFF0000000000000, !dbg !363 + %9102 = fcmp oeq float %9074, 0xFFF0000000000000, !dbg !363 + %9103 = fcmp oeq float %9076, 0xFFF0000000000000, !dbg !363 + %9104 = fcmp oeq float %9078, 0xFFF0000000000000, !dbg !363 + %9105 = fcmp oeq float %9080, 0xFFF0000000000000, !dbg !363 + %9106 = fcmp oeq float %9082, 0xFFF0000000000000, !dbg !363 + %9107 = fcmp oeq float %9084, 0xFFF0000000000000, !dbg !363 + %9108 = fcmp oeq float %9086, 0xFFF0000000000000, !dbg !363 + %9109 = fcmp oeq float %9088, 0xFFF0000000000000, !dbg !363 + %9110 = fcmp oeq float %9090, 0xFFF0000000000000, !dbg !363 + %9111 = fcmp oeq float %9092, 
0xFFF0000000000000, !dbg !363 + %9112 = fcmp oeq float %9094, 0xFFF0000000000000, !dbg !363 + %9113 = fcmp oeq float %9096, 0xFFF0000000000000, !dbg !363 + %9114 = fcmp oeq float %9098, 0xFFF0000000000000, !dbg !363 + %9115 = select i1 %9099, float 0.000000e+00, float %9068, !dbg !364 + %9116 = select i1 %9100, float 0.000000e+00, float %9070, !dbg !364 + %9117 = select i1 %9101, float 0.000000e+00, float %9072, !dbg !364 + %9118 = select i1 %9102, float 0.000000e+00, float %9074, !dbg !364 + %9119 = select i1 %9103, float 0.000000e+00, float %9076, !dbg !364 + %9120 = select i1 %9104, float 0.000000e+00, float %9078, !dbg !364 + %9121 = select i1 %9105, float 0.000000e+00, float %9080, !dbg !364 + %9122 = select i1 %9106, float 0.000000e+00, float %9082, !dbg !364 + %9123 = select i1 %9107, float 0.000000e+00, float %9084, !dbg !364 + %9124 = select i1 %9108, float 0.000000e+00, float %9086, !dbg !364 + %9125 = select i1 %9109, float 0.000000e+00, float %9088, !dbg !364 + %9126 = select i1 %9110, float 0.000000e+00, float %9090, !dbg !364 + %9127 = select i1 %9111, float 0.000000e+00, float %9092, !dbg !364 + %9128 = select i1 %9112, float 0.000000e+00, float %9094, !dbg !364 + %9129 = select i1 %9113, float 0.000000e+00, float %9096, !dbg !364 + %9130 = select i1 %9114, float 0.000000e+00, float %9098, !dbg !364 + %9131 = tail call i32 @llvm.nvvm.shfl.sync.idx.i32(i32 -1, i32 %57, i32 0, i32 31), !dbg !365 + tail call void @llvm.nvvm.wgmma.fence.sync.aligned(), !dbg !365 + %9132 = shl i32 %9131, 11, !dbg !365 + %9133 = and i32 %9132, 8192, !dbg !365 + %9134 = add i32 %9133, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328) to i32), !dbg !365 + %9135 = lshr exact i32 %9134, 4, !dbg !365 + %9136 = and i32 %9135, 16383, !dbg !365 + %9137 = zext nneg i32 %9136 to i64, !dbg !365 + %9138 = or disjoint i64 %9137, 4611686293372403712, !dbg !365 + %9139 = ptrtoint ptr addrspace(3) %9064 to i32, !dbg !365 + %9140 = lshr exact i32 
%9139, 4, !dbg !365 + %9141 = and i32 %9140, 16383, !dbg !365 + %9142 = zext nneg i32 %9141 to i64, !dbg !365 + %9143 = or disjoint i64 %9142, 4611686293338849280, !dbg !365 + %9144 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $32, $33, 0, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,l,l"(i64 %9138, i64 %9143) #3, !dbg !365 + %9145 = or disjoint i32 %9133, 32, !dbg !365 + %9146 = add i32 %9145, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328) to i32), !dbg !365 + %9147 = lshr exact i32 %9146, 4, !dbg !365 + %9148 = and i32 %9147, 16383, !dbg !365 + %9149 = zext nneg i32 %9148 to i64, !dbg !365 + %9150 = or disjoint i64 %9149, 4611686293372403712, !dbg !365 + %9151 = add i32 %9139, 32, !dbg !365 + %9152 = lshr exact i32 %9151, 4, !dbg !365 + %9153 = and i32 %9152, 16383, !dbg !365 + %9154 = zext nneg i32 %9153 to i64, !dbg !365 + %9155 = or disjoint i64 %9154, 4611686293338849280, !dbg !365 + %9156 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9144, 0, !dbg !365 + %9157 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9144, 1, !dbg !365 + %9158 = extractvalue { float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9144, 2, !dbg !365 + %9159 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9144, 3, !dbg !365 + %9160 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9144, 4, !dbg !365 + %9161 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9144, 5, !dbg !365 + %9162 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9144, 6, !dbg !365 + %9163 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9144, 7, !dbg !365 + %9164 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9144, 8, !dbg !365 + %9165 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float } %9144, 9, !dbg !365 + %9166 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9144, 10, !dbg !365 + %9167 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9144, 11, !dbg !365 + %9168 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9144, 12, !dbg !365 + %9169 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9144, 13, !dbg !365 + %9170 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9144, 14, !dbg !365 + %9171 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9144, 15, !dbg !365 + %9172 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9144, 16, !dbg !365 + 
%9173 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9144, 17, !dbg !365 + %9174 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9144, 18, !dbg !365 + %9175 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9144, 19, !dbg !365 + %9176 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9144, 20, !dbg !365 + %9177 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9144, 21, !dbg !365 + %9178 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9144, 22, !dbg !365 + %9179 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9144, 23, !dbg !365 + %9180 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9144, 24, !dbg !365 + %9181 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9144, 25, !dbg !365 + %9182 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9144, 26, !dbg !365 + %9183 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9144, 27, !dbg !365 + %9184 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9144, 28, !dbg !365 + %9185 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9144, 29, !dbg !365 + %9186 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9144, 30, !dbg !365 + %9187 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float } %9144, 31, !dbg !365 + %9188 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %9156, float %9157, float %9158, float %9159, float %9160, float %9161, float %9162, float %9163, float %9164, float %9165, float %9166, float %9167, float %9168, float %9169, float %9170, float %9171, float %9172, float %9173, float %9174, float %9175, float %9176, float %9177, float %9178, float %9179, float %9180, float %9181, float %9182, float %9183, float %9184, float %9185, float %9186, float %9187, i64 %9150, i64 %9155, i1 true) #3, !dbg !365 + %9189 = or disjoint i32 %9133, 64, !dbg !365 + %9190 = add i32 %9189, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328) to i32), !dbg !365 + %9191 = lshr exact i32 %9190, 4, !dbg !365 + %9192 = and i32 %9191, 16383, !dbg !365 + %9193 = zext nneg i32 %9192 to i64, !dbg !365 + %9194 = or disjoint i64 %9193, 4611686293372403712, !dbg !365 + %9195 = add i32 %9139, 64, !dbg !365 + %9196 = lshr exact i32 %9195, 4, !dbg !365 + %9197 = and i32 %9196, 16383, !dbg !365 + %9198 = zext nneg i32 %9197 to i64, !dbg !365 + %9199 = or disjoint i64 %9198, 4611686293338849280, !dbg !365 + %9200 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float } %9188, 0, !dbg !365 + %9201 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9188, 1, !dbg !365 + %9202 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9188, 2, !dbg !365 + %9203 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9188, 3, !dbg !365 + %9204 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9188, 4, !dbg !365 + %9205 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9188, 5, !dbg !365 + %9206 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9188, 6, !dbg !365 + %9207 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9188, 7, !dbg !365 + %9208 = extractvalue { float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9188, 8, !dbg !365 + %9209 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9188, 9, !dbg !365 + %9210 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9188, 10, !dbg !365 + %9211 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9188, 11, !dbg !365 + %9212 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9188, 12, !dbg !365 + %9213 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9188, 13, !dbg !365 + %9214 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9188, 14, !dbg !365 + %9215 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9188, 15, !dbg !365 + %9216 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9188, 16, !dbg !365 + %9217 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9188, 17, !dbg !365 + %9218 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9188, 18, !dbg !365 + %9219 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9188, 19, !dbg !365 + %9220 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9188, 20, !dbg !365 + %9221 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9188, 21, !dbg !365 + %9222 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float } %9188, 22, !dbg !365 + %9223 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9188, 23, !dbg !365 + %9224 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9188, 24, !dbg !365 + %9225 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9188, 25, !dbg !365 + %9226 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9188, 26, !dbg !365 + %9227 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9188, 27, !dbg !365 + %9228 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9188, 28, !dbg !365 + %9229 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9188, 29, !dbg !365 + %9230 = extractvalue { float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9188, 30, !dbg !365 + %9231 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9188, 31, !dbg !365 + %9232 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %9200, float %9201, float %9202, float %9203, float %9204, float %9205, float %9206, float %9207, float %9208, float %9209, float %9210, float %9211, float %9212, float %9213, float %9214, float %9215, float %9216, float %9217, float %9218, float %9219, float %9220, float %9221, float %9222, float %9223, float %9224, float %9225, float %9226, float %9227, float %9228, float %9229, float %9230, float %9231, i64 %9194, i64 %9199, i1 true) #3, !dbg !365 + %9233 = or disjoint i32 %9133, 96, !dbg !365 + %9234 = add i32 %9233, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328) to i32), !dbg !365 + %9235 = lshr exact i32 %9234, 4, !dbg !365 + %9236 = and i32 %9235, 16383, !dbg !365 + %9237 = zext nneg i32 %9236 to i64, !dbg !365 + %9238 = or disjoint i64 %9237, 4611686293372403712, !dbg !365 + %9239 = add i32 %9139, 96, 
!dbg !365 + %9240 = lshr exact i32 %9239, 4, !dbg !365 + %9241 = and i32 %9240, 16383, !dbg !365 + %9242 = zext nneg i32 %9241 to i64, !dbg !365 + %9243 = or disjoint i64 %9242, 4611686293338849280, !dbg !365 + %9244 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9232, 0, !dbg !365 + %9245 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9232, 1, !dbg !365 + %9246 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9232, 2, !dbg !365 + %9247 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9232, 3, !dbg !365 + %9248 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9232, 4, !dbg !365 + %9249 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9232, 5, !dbg !365 + %9250 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float } %9232, 6, !dbg !365 + %9251 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9232, 7, !dbg !365 + %9252 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9232, 8, !dbg !365 + %9253 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9232, 9, !dbg !365 + %9254 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9232, 10, !dbg !365 + %9255 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9232, 11, !dbg !365 + %9256 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9232, 12, !dbg !365 + %9257 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9232, 13, !dbg !365 
+ %9258 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9232, 14, !dbg !365 + %9259 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9232, 15, !dbg !365 + %9260 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9232, 16, !dbg !365 + %9261 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9232, 17, !dbg !365 + %9262 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9232, 18, !dbg !365 + %9263 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9232, 19, !dbg !365 + %9264 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9232, 20, !dbg !365 + %9265 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9232, 21, !dbg !365 + %9266 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9232, 22, !dbg !365 + %9267 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9232, 23, !dbg !365 + %9268 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9232, 24, !dbg !365 + %9269 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9232, 25, !dbg !365 + %9270 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9232, 26, !dbg !365 + %9271 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9232, 27, !dbg !365 + %9272 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float } %9232, 28, !dbg !365 + %9273 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9232, 29, !dbg !365 + %9274 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9232, 30, !dbg !365 + %9275 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9232, 31, !dbg !365 + %9276 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %9244, float %9245, float %9246, float %9247, float %9248, float %9249, float %9250, float %9251, float %9252, float %9253, float %9254, float %9255, float %9256, float %9257, float %9258, float %9259, float %9260, float %9261, float %9262, float %9263, float %9264, float %9265, float %9266, float %9267, float %9268, float %9269, float %9270, float %9271, float %9272, float %9273, float %9274, float %9275, i64 %9238, i64 %9243, i1 true) #3, !dbg 
!365 + %9277 = or disjoint i32 %9133, 16384, !dbg !365 + %9278 = add i32 %9277, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328) to i32), !dbg !365 + %9279 = lshr exact i32 %9278, 4, !dbg !365 + %9280 = and i32 %9279, 16383, !dbg !365 + %9281 = zext nneg i32 %9280 to i64, !dbg !365 + %9282 = or disjoint i64 %9281, 4611686293372403712, !dbg !365 + %9283 = add i32 %9139, 8192, !dbg !365 + %9284 = lshr exact i32 %9283, 4, !dbg !365 + %9285 = and i32 %9284, 16383, !dbg !365 + %9286 = zext nneg i32 %9285 to i64, !dbg !365 + %9287 = or disjoint i64 %9286, 4611686293338849280, !dbg !365 + %9288 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9276, 0, !dbg !365 + %9289 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9276, 1, !dbg !365 + %9290 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9276, 2, !dbg !365 + %9291 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9276, 3, !dbg !365 + %9292 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9276, 4, !dbg !365 + %9293 = extractvalue 
{ float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9276, 5, !dbg !365 + %9294 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9276, 6, !dbg !365 + %9295 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9276, 7, !dbg !365 + %9296 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9276, 8, !dbg !365 + %9297 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9276, 9, !dbg !365 + %9298 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9276, 10, !dbg !365 + %9299 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9276, 11, !dbg !365 + %9300 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9276, 12, !dbg !365 + %9301 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9276, 13, !dbg !365 + %9302 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9276, 14, !dbg !365 + %9303 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9276, 15, !dbg !365 + %9304 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9276, 16, !dbg !365 + %9305 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9276, 17, !dbg !365 + %9306 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9276, 18, !dbg !365 + %9307 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float } %9276, 19, !dbg !365 + %9308 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9276, 20, !dbg !365 + %9309 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9276, 21, !dbg !365 + %9310 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9276, 22, !dbg !365 + %9311 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9276, 23, !dbg !365 + %9312 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9276, 24, !dbg !365 + %9313 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9276, 25, !dbg !365 + %9314 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9276, 26, !dbg !365 + %9315 = extractvalue { float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9276, 27, !dbg !365 + %9316 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9276, 28, !dbg !365 + %9317 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9276, 29, !dbg !365 + %9318 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9276, 30, !dbg !365 + %9319 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9276, 31, !dbg !365 + %9320 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %9288, float %9289, float %9290, float 
%9291, float %9292, float %9293, float %9294, float %9295, float %9296, float %9297, float %9298, float %9299, float %9300, float %9301, float %9302, float %9303, float %9304, float %9305, float %9306, float %9307, float %9308, float %9309, float %9310, float %9311, float %9312, float %9313, float %9314, float %9315, float %9316, float %9317, float %9318, float %9319, i64 %9282, i64 %9287, i1 true) #3, !dbg !365 + %9321 = or disjoint i32 %9133, 16416, !dbg !365 + %9322 = add i32 %9321, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328) to i32), !dbg !365 + %9323 = lshr exact i32 %9322, 4, !dbg !365 + %9324 = and i32 %9323, 16383, !dbg !365 + %9325 = zext nneg i32 %9324 to i64, !dbg !365 + %9326 = or disjoint i64 %9325, 4611686293372403712, !dbg !365 + %9327 = add i32 %9139, 8224, !dbg !365 + %9328 = lshr exact i32 %9327, 4, !dbg !365 + %9329 = and i32 %9328, 16383, !dbg !365 + %9330 = zext nneg i32 %9329 to i64, !dbg !365 + %9331 = or disjoint i64 %9330, 4611686293338849280, !dbg !365 + %9332 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9320, 0, !dbg !365 + %9333 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9320, 1, !dbg !365 + %9334 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9320, 2, !dbg !365 + %9335 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9320, 3, !dbg !365 + %9336 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9320, 4, !dbg !365 + %9337 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9320, 5, !dbg !365 + %9338 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9320, 6, !dbg !365 + %9339 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9320, 7, !dbg !365 + %9340 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9320, 8, !dbg !365 + %9341 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9320, 9, !dbg !365 + %9342 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9320, 10, 
!dbg !365 + %9343 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9320, 11, !dbg !365 + %9344 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9320, 12, !dbg !365 + %9345 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9320, 13, !dbg !365 + %9346 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9320, 14, !dbg !365 + %9347 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9320, 15, !dbg !365 + %9348 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9320, 16, !dbg !365 + %9349 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9320, 17, !dbg !365 + %9350 = extractvalue { float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9320, 18, !dbg !365 + %9351 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9320, 19, !dbg !365 + %9352 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9320, 20, !dbg !365 + %9353 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9320, 21, !dbg !365 + %9354 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9320, 22, !dbg !365 + %9355 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9320, 23, !dbg !365 + %9356 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9320, 24, !dbg !365 + %9357 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float } %9320, 25, !dbg !365 + %9358 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9320, 26, !dbg !365 + %9359 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9320, 27, !dbg !365 + %9360 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9320, 28, !dbg !365 + %9361 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9320, 29, !dbg !365 + %9362 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9320, 30, !dbg !365 + %9363 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9320, 31, !dbg !365 + %9364 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect 
"wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %9332, float %9333, float %9334, float %9335, float %9336, float %9337, float %9338, float %9339, float %9340, float %9341, float %9342, float %9343, float %9344, float %9345, float %9346, float %9347, float %9348, float %9349, float %9350, float %9351, float %9352, float %9353, float %9354, float %9355, float %9356, float %9357, float %9358, float %9359, float %9360, float %9361, float %9362, float %9363, i64 %9326, i64 %9331, i1 true) #3, !dbg !365 + %9365 = or disjoint i32 %9133, 16448, !dbg !365 + %9366 = add i32 %9365, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328) to i32), !dbg !365 + %9367 = lshr exact i32 %9366, 4, !dbg !365 + %9368 = and i32 %9367, 16383, !dbg !365 + %9369 = zext nneg i32 %9368 to i64, !dbg !365 + %9370 = or disjoint i64 %9369, 4611686293372403712, !dbg !365 + %9371 = add i32 %9139, 8256, !dbg !365 + %9372 = lshr exact i32 %9371, 4, !dbg !365 + %9373 = and i32 %9372, 16383, !dbg !365 + %9374 = zext nneg i32 %9373 to i64, !dbg !365 + %9375 = or disjoint i64 %9374, 4611686293338849280, !dbg !365 + %9376 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9364, 0, !dbg !365 + %9377 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } 
%9364, 1, !dbg !365 + %9378 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9364, 2, !dbg !365 + %9379 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9364, 3, !dbg !365 + %9380 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9364, 4, !dbg !365 + %9381 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9364, 5, !dbg !365 + %9382 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9364, 6, !dbg !365 + %9383 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9364, 7, !dbg !365 + %9384 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9364, 8, !dbg !365 + %9385 = extractvalue { float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9364, 9, !dbg !365 + %9386 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9364, 10, !dbg !365 + %9387 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9364, 11, !dbg !365 + %9388 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9364, 12, !dbg !365 + %9389 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9364, 13, !dbg !365 + %9390 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9364, 14, !dbg !365 + %9391 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9364, 15, !dbg !365 + %9392 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float } %9364, 16, !dbg !365 + %9393 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9364, 17, !dbg !365 + %9394 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9364, 18, !dbg !365 + %9395 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9364, 19, !dbg !365 + %9396 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9364, 20, !dbg !365 + %9397 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9364, 21, !dbg !365 + %9398 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9364, 22, !dbg !365 + %9399 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9364, 23, !dbg !365 + %9400 = 
extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9364, 24, !dbg !365 + %9401 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9364, 25, !dbg !365 + %9402 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9364, 26, !dbg !365 + %9403 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9364, 27, !dbg !365 + %9404 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9364, 28, !dbg !365 + %9405 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9364, 29, !dbg !365 + %9406 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9364, 30, !dbg !365 + %9407 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9364, 31, !dbg !365 + %9408 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %9376, float %9377, float %9378, float %9379, float %9380, float %9381, float %9382, float %9383, float %9384, float %9385, float %9386, float %9387, float %9388, float %9389, float %9390, float %9391, float %9392, float %9393, float %9394, float %9395, float %9396, float %9397, float %9398, float %9399, float %9400, float %9401, float %9402, float %9403, float %9404, float %9405, float %9406, float %9407, i64 %9370, i64 %9375, i1 true) #3, !dbg !365 + %9409 = or disjoint i32 %9133, 16480, !dbg !365 + %9410 = add i32 %9409, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328) to i32), !dbg !365 + %9411 = lshr exact i32 %9410, 4, !dbg !365 + %9412 = and i32 %9411, 16383, !dbg !365 + %9413 = zext nneg i32 %9412 to i64, !dbg !365 + %9414 = or disjoint i64 %9413, 4611686293372403712, !dbg !365 + %9415 = add i32 %9139, 8288, !dbg !365 + %9416 = lshr exact i32 %9415, 4, !dbg !365 + %9417 = and i32 %9416, 16383, !dbg !365 + %9418 = zext nneg i32 %9417 to i64, !dbg !365 + %9419 = or disjoint i64 %9418, 4611686293338849280, !dbg !365 + %9420 = extractvalue { float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9408, 0, !dbg !365 + %9421 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9408, 1, !dbg !365 + %9422 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9408, 2, !dbg !365 + %9423 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9408, 3, !dbg !365 + %9424 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9408, 4, !dbg !365 + %9425 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9408, 5, !dbg !365 + %9426 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9408, 6, !dbg !365 + %9427 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float } %9408, 7, !dbg !365 + %9428 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9408, 8, !dbg !365 + %9429 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9408, 9, !dbg !365 + %9430 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9408, 10, !dbg !365 + %9431 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9408, 11, !dbg !365 + %9432 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9408, 12, !dbg !365 + %9433 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9408, 13, !dbg !365 + %9434 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9408, 14, !dbg !365 + %9435 = extractvalue { float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9408, 15, !dbg !365 + %9436 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9408, 16, !dbg !365 + %9437 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9408, 17, !dbg !365 + %9438 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9408, 18, !dbg !365 + %9439 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9408, 19, !dbg !365 + %9440 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9408, 20, !dbg !365 + %9441 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9408, 21, !dbg !365 + %9442 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9408, 22, !dbg !365 + %9443 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9408, 23, !dbg !365 + %9444 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9408, 24, !dbg !365 + %9445 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9408, 25, !dbg !365 + %9446 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9408, 26, !dbg !365 + %9447 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9408, 27, !dbg !365 + %9448 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9408, 28, !dbg !365 + %9449 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float } %9408, 29, !dbg !365 + %9450 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9408, 30, !dbg !365 + %9451 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9408, 31, !dbg !365 + %9452 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %9420, float %9421, float %9422, float %9423, float %9424, float %9425, float %9426, float %9427, float %9428, float %9429, float %9430, float %9431, float %9432, float %9433, float %9434, float %9435, float %9436, float %9437, float %9438, float %9439, float %9440, float %9441, float %9442, float %9443, float %9444, float %9445, float %9446, float %9447, float %9448, float %9449, float %9450, float %9451, i64 %9414, i64 %9419, i1 true) #3, !dbg !365 + %9453 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9452, 0, !dbg !365 + %9454 = extractvalue { 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9452, 1, !dbg !365 + %9455 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9452, 2, !dbg !365 + %9456 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9452, 3, !dbg !365 + %9457 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9452, 4, !dbg !365 + %9458 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9452, 5, !dbg !365 + %9459 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9452, 6, !dbg !365 + %9460 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9452, 7, !dbg !365 + %9461 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9452, 8, !dbg !365 + %9462 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9452, 9, !dbg !365 + %9463 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9452, 10, !dbg !365 + %9464 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9452, 11, !dbg !365 + %9465 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9452, 12, !dbg !365 + %9466 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9452, 13, !dbg !365 + %9467 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9452, 14, !dbg !365 + %9468 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float } %9452, 15, !dbg !365 + %9469 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9452, 16, !dbg !365 + %9470 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9452, 17, !dbg !365 + %9471 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9452, 18, !dbg !365 + %9472 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9452, 19, !dbg !365 + %9473 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9452, 20, !dbg !365 + %9474 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9452, 21, !dbg !365 + %9475 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9452, 22, !dbg !365 + %9476 = extractvalue { float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9452, 23, !dbg !365 + %9477 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9452, 24, !dbg !365 + %9478 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9452, 25, !dbg !365 + %9479 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9452, 26, !dbg !365 + %9480 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9452, 27, !dbg !365 + %9481 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9452, 28, !dbg !365 + %9482 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9452, 29, !dbg !365 + %9483 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float } %9452, 30, !dbg !365 + %9484 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9452, 31, !dbg !365 + tail call void @llvm.nvvm.wgmma.commit_group.sync.aligned(), !dbg !365 + %9485 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } asm sideeffect "// wait for regs: $0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37\0A\09wgmma.wait_group.sync.aligned 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37"(float %9453, float %9454, float %9455, float %9456, float %9457, float %9458, float %9459, float %9460, float %9461, float %9462, float %9463, float %9464, float %9465, float %9466, float %9467, float %9468, float %9469, float %9470, float %9471, float %9472, float %9473, float %9474, float %9475, float %9476, float %9477, float %9478, float %9479, float %9480, float %9481, float %9482, float %9483, float %9484, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328), i32 0, i32 0, ptr addrspace(3) %9064, i32 0, i32 0) #3, !dbg !365 + %9486 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9485, 0, !dbg !365 + %9487 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9485, 1, !dbg !365 + %9488 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9485, 2, !dbg !365 + %9489 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9485, 3, !dbg !365 + %9490 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9485, 4, !dbg !365 + %9491 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9485, 5, !dbg !365 + %9492 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr 
addrspace(3), i32, i32 } %9485, 6, !dbg !365 + %9493 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9485, 7, !dbg !365 + %9494 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9485, 8, !dbg !365 + %9495 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9485, 9, !dbg !365 + %9496 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9485, 10, !dbg !365 + %9497 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9485, 11, !dbg !365 + %9498 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9485, 12, !dbg 
!365 + %9499 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9485, 13, !dbg !365 + %9500 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9485, 14, !dbg !365 + %9501 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9485, 15, !dbg !365 + %9502 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9485, 16, !dbg !365 + %9503 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9485, 17, !dbg !365 + %9504 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9485, 18, !dbg !365 + %9505 = extractvalue { float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9485, 19, !dbg !365 + %9506 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9485, 20, !dbg !365 + %9507 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9485, 21, !dbg !365 + %9508 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9485, 22, !dbg !365 + %9509 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9485, 23, !dbg !365 + %9510 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9485, 24, !dbg !365 + %9511 = extractvalue { float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9485, 25, !dbg !365 + %9512 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9485, 26, !dbg !365 + %9513 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9485, 27, !dbg !365 + %9514 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9485, 28, !dbg !365 + %9515 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9485, 29, !dbg !365 + %9516 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9485, 30, !dbg !365 + %9517 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9485, 31, !dbg !365 + %9518 = fmul float %9486, 0x3FB6A09E60000000, !dbg !366 + %9519 = fmul float %9487, 0x3FB6A09E60000000, !dbg !366 + %9520 = fmul float %9488, 0x3FB6A09E60000000, !dbg !366 + %9521 = fmul float %9489, 0x3FB6A09E60000000, !dbg !366 + %9522 = fmul float %9490, 0x3FB6A09E60000000, !dbg !366 + %9523 = fmul float %9491, 0x3FB6A09E60000000, !dbg !366 + %9524 = fmul float %9492, 0x3FB6A09E60000000, !dbg !366 + %9525 = fmul float %9493, 0x3FB6A09E60000000, !dbg !366 + %9526 = fmul float %9494, 0x3FB6A09E60000000, !dbg !366 + %9527 = fmul float %9495, 0x3FB6A09E60000000, !dbg !366 + %9528 = fmul float %9496, 0x3FB6A09E60000000, !dbg !366 + %9529 = fmul float %9497, 0x3FB6A09E60000000, !dbg !366 + %9530 = fmul float %9498, 0x3FB6A09E60000000, !dbg !366 + %9531 = fmul float %9499, 0x3FB6A09E60000000, !dbg !366 + %9532 = fmul float %9500, 0x3FB6A09E60000000, !dbg !366 + %9533 = fmul float %9501, 0x3FB6A09E60000000, !dbg !366 + %9534 = fmul float %9502, 0x3FB6A09E60000000, !dbg !366 + %9535 = fmul float %9503, 0x3FB6A09E60000000, !dbg !366 + %9536 = fmul float %9504, 0x3FB6A09E60000000, !dbg !366 + %9537 = fmul float %9505, 0x3FB6A09E60000000, !dbg !366 + %9538 = fmul float %9506, 0x3FB6A09E60000000, !dbg !366 + %9539 = fmul float %9507, 0x3FB6A09E60000000, !dbg !366 + %9540 = fmul float %9508, 0x3FB6A09E60000000, !dbg !366 + %9541 = fmul float %9509, 0x3FB6A09E60000000, !dbg !366 + %9542 = fmul float %9510, 0x3FB6A09E60000000, !dbg !366 + %9543 = fmul float %9511, 0x3FB6A09E60000000, !dbg !366 + %9544 = fmul float %9512, 0x3FB6A09E60000000, !dbg !366 + %9545 = fmul float %9513, 0x3FB6A09E60000000, !dbg !366 + %9546 = fmul float %9514, 0x3FB6A09E60000000, !dbg !366 + %9547 = fmul float %9515, 0x3FB6A09E60000000, !dbg !366 + %9548 = fmul float %9516, 
0x3FB6A09E60000000, !dbg !366 + %9549 = fmul float %9517, 0x3FB6A09E60000000, !dbg !366 + %9550 = fmul float %9518, 0x3FF7154760000000, !dbg !367 + %9551 = select i1 %9047, float %9550, float 0xFFF0000000000000, !dbg !368 + %9552 = fmul float %9519, 0x3FF7154760000000, !dbg !367 + %9553 = select i1 %9048, float %9552, float 0xFFF0000000000000, !dbg !368 + %9554 = fmul float %9520, 0x3FF7154760000000, !dbg !367 + %9555 = select i1 %9047, float %9554, float 0xFFF0000000000000, !dbg !368 + %9556 = fmul float %9521, 0x3FF7154760000000, !dbg !367 + %9557 = select i1 %9048, float %9556, float 0xFFF0000000000000, !dbg !368 + %9558 = fmul float %9522, 0x3FF7154760000000, !dbg !367 + %9559 = select i1 %9049, float %9558, float 0xFFF0000000000000, !dbg !368 + %9560 = fmul float %9523, 0x3FF7154760000000, !dbg !367 + %9561 = select i1 %9050, float %9560, float 0xFFF0000000000000, !dbg !368 + %9562 = fmul float %9524, 0x3FF7154760000000, !dbg !367 + %9563 = select i1 %9049, float %9562, float 0xFFF0000000000000, !dbg !368 + %9564 = fmul float %9525, 0x3FF7154760000000, !dbg !367 + %9565 = select i1 %9050, float %9564, float 0xFFF0000000000000, !dbg !368 + %9566 = fmul float %9526, 0x3FF7154760000000, !dbg !367 + %9567 = select i1 %9051, float %9566, float 0xFFF0000000000000, !dbg !368 + %9568 = fmul float %9527, 0x3FF7154760000000, !dbg !367 + %9569 = select i1 %9052, float %9568, float 0xFFF0000000000000, !dbg !368 + %9570 = fmul float %9528, 0x3FF7154760000000, !dbg !367 + %9571 = select i1 %9051, float %9570, float 0xFFF0000000000000, !dbg !368 + %9572 = fmul float %9529, 0x3FF7154760000000, !dbg !367 + %9573 = select i1 %9052, float %9572, float 0xFFF0000000000000, !dbg !368 + %9574 = fmul float %9530, 0x3FF7154760000000, !dbg !367 + %9575 = select i1 %9053, float %9574, float 0xFFF0000000000000, !dbg !368 + %9576 = fmul float %9531, 0x3FF7154760000000, !dbg !367 + %9577 = select i1 %9054, float %9576, float 0xFFF0000000000000, !dbg !368 + %9578 = fmul float %9532, 
0x3FF7154760000000, !dbg !367 + %9579 = select i1 %9053, float %9578, float 0xFFF0000000000000, !dbg !368 + %9580 = fmul float %9533, 0x3FF7154760000000, !dbg !367 + %9581 = select i1 %9054, float %9580, float 0xFFF0000000000000, !dbg !368 + %9582 = fmul float %9534, 0x3FF7154760000000, !dbg !367 + %9583 = select i1 %9055, float %9582, float 0xFFF0000000000000, !dbg !368 + %9584 = fmul float %9535, 0x3FF7154760000000, !dbg !367 + %9585 = select i1 %9056, float %9584, float 0xFFF0000000000000, !dbg !368 + %9586 = fmul float %9536, 0x3FF7154760000000, !dbg !367 + %9587 = select i1 %9055, float %9586, float 0xFFF0000000000000, !dbg !368 + %9588 = fmul float %9537, 0x3FF7154760000000, !dbg !367 + %9589 = select i1 %9056, float %9588, float 0xFFF0000000000000, !dbg !368 + %9590 = fmul float %9538, 0x3FF7154760000000, !dbg !367 + %9591 = select i1 %9057, float %9590, float 0xFFF0000000000000, !dbg !368 + %9592 = fmul float %9539, 0x3FF7154760000000, !dbg !367 + %9593 = select i1 %9058, float %9592, float 0xFFF0000000000000, !dbg !368 + %9594 = fmul float %9540, 0x3FF7154760000000, !dbg !367 + %9595 = select i1 %9057, float %9594, float 0xFFF0000000000000, !dbg !368 + %9596 = fmul float %9541, 0x3FF7154760000000, !dbg !367 + %9597 = select i1 %9058, float %9596, float 0xFFF0000000000000, !dbg !368 + %9598 = fmul float %9542, 0x3FF7154760000000, !dbg !367 + %9599 = select i1 %9059, float %9598, float 0xFFF0000000000000, !dbg !368 + %9600 = fmul float %9543, 0x3FF7154760000000, !dbg !367 + %9601 = select i1 %9060, float %9600, float 0xFFF0000000000000, !dbg !368 + %9602 = fmul float %9544, 0x3FF7154760000000, !dbg !367 + %9603 = select i1 %9059, float %9602, float 0xFFF0000000000000, !dbg !368 + %9604 = fmul float %9545, 0x3FF7154760000000, !dbg !367 + %9605 = select i1 %9060, float %9604, float 0xFFF0000000000000, !dbg !368 + %9606 = fmul float %9546, 0x3FF7154760000000, !dbg !367 + %9607 = select i1 %9061, float %9606, float 0xFFF0000000000000, !dbg !368 + %9608 = fmul 
float %9547, 0x3FF7154760000000, !dbg !367 + %9609 = select i1 %9062, float %9608, float 0xFFF0000000000000, !dbg !368 + %9610 = fmul float %9548, 0x3FF7154760000000, !dbg !367 + %9611 = select i1 %9061, float %9610, float 0xFFF0000000000000, !dbg !368 + %9612 = fmul float %9549, 0x3FF7154760000000, !dbg !367 + %9613 = select i1 %9062, float %9612, float 0xFFF0000000000000, !dbg !368 + %9614 = fsub float %9551, %9115, !dbg !369 + %9615 = fsub float %9553, %9116, !dbg !369 + %9616 = fsub float %9555, %9115, !dbg !369 + %9617 = fsub float %9557, %9116, !dbg !369 + %9618 = fsub float %9559, %9117, !dbg !369 + %9619 = fsub float %9561, %9118, !dbg !369 + %9620 = fsub float %9563, %9117, !dbg !369 + %9621 = fsub float %9565, %9118, !dbg !369 + %9622 = fsub float %9567, %9119, !dbg !369 + %9623 = fsub float %9569, %9120, !dbg !369 + %9624 = fsub float %9571, %9119, !dbg !369 + %9625 = fsub float %9573, %9120, !dbg !369 + %9626 = fsub float %9575, %9121, !dbg !369 + %9627 = fsub float %9577, %9122, !dbg !369 + %9628 = fsub float %9579, %9121, !dbg !369 + %9629 = fsub float %9581, %9122, !dbg !369 + %9630 = fsub float %9583, %9123, !dbg !369 + %9631 = fsub float %9585, %9124, !dbg !369 + %9632 = fsub float %9587, %9123, !dbg !369 + %9633 = fsub float %9589, %9124, !dbg !369 + %9634 = fsub float %9591, %9125, !dbg !369 + %9635 = fsub float %9593, %9126, !dbg !369 + %9636 = fsub float %9595, %9125, !dbg !369 + %9637 = fsub float %9597, %9126, !dbg !369 + %9638 = fsub float %9599, %9127, !dbg !369 + %9639 = fsub float %9601, %9128, !dbg !369 + %9640 = fsub float %9603, %9127, !dbg !369 + %9641 = fsub float %9605, %9128, !dbg !369 + %9642 = fsub float %9607, %9129, !dbg !369 + %9643 = fsub float %9609, %9130, !dbg !369 + %9644 = fsub float %9611, %9129, !dbg !369 + %9645 = fsub float %9613, %9130, !dbg !369 + %9646 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !370 + %.not.i = icmp eq i32 %9646, 0, !dbg !370 + br i1 %.not.i, label %9649, label %9647, !dbg !370 + 
+9647: ; preds = %.lr.ph1793 + %9648 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %9614) #3, !dbg !370 + br label %__nv_exp2f.exit, !dbg !370 + +9649: ; preds = %.lr.ph1793 + %9650 = tail call float @llvm.nvvm.ex2.approx.f(float %9614) #3, !dbg !370 + br label %__nv_exp2f.exit, !dbg !370 + +__nv_exp2f.exit: ; preds = %9647, %9649 + %.0.i = phi float [ %9648, %9647 ], [ %9650, %9649 ], !dbg !370 + %9651 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !370 + %.not.i1147 = icmp eq i32 %9651, 0, !dbg !370 + br i1 %.not.i1147, label %9654, label %9652, !dbg !370 + +9652: ; preds = %__nv_exp2f.exit + %9653 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %9615) #3, !dbg !370 + br label %__nv_exp2f.exit1149, !dbg !370 + +9654: ; preds = %__nv_exp2f.exit + %9655 = tail call float @llvm.nvvm.ex2.approx.f(float %9615) #3, !dbg !370 + br label %__nv_exp2f.exit1149, !dbg !370 + +__nv_exp2f.exit1149: ; preds = %9652, %9654 + %.0.i1148 = phi float [ %9653, %9652 ], [ %9655, %9654 ], !dbg !370 + %9656 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !370 + %.not.i1150 = icmp eq i32 %9656, 0, !dbg !370 + br i1 %.not.i1150, label %9659, label %9657, !dbg !370 + +9657: ; preds = %__nv_exp2f.exit1149 + %9658 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %9616) #3, !dbg !370 + br label %__nv_exp2f.exit1152, !dbg !370 + +9659: ; preds = %__nv_exp2f.exit1149 + %9660 = tail call float @llvm.nvvm.ex2.approx.f(float %9616) #3, !dbg !370 + br label %__nv_exp2f.exit1152, !dbg !370 + +__nv_exp2f.exit1152: ; preds = %9657, %9659 + %.0.i1151 = phi float [ %9658, %9657 ], [ %9660, %9659 ], !dbg !370 + %9661 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !370 + %.not.i1153 = icmp eq i32 %9661, 0, !dbg !370 + br i1 %.not.i1153, label %9664, label %9662, !dbg !370 + +9662: ; preds = %__nv_exp2f.exit1152 + %9663 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %9617) #3, !dbg !370 + br label %__nv_exp2f.exit1155, !dbg !370 + +9664: ; preds = 
%__nv_exp2f.exit1152 + %9665 = tail call float @llvm.nvvm.ex2.approx.f(float %9617) #3, !dbg !370 + br label %__nv_exp2f.exit1155, !dbg !370 + +__nv_exp2f.exit1155: ; preds = %9662, %9664 + %.0.i1154 = phi float [ %9663, %9662 ], [ %9665, %9664 ], !dbg !370 + %9666 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !370 + %.not.i1156 = icmp eq i32 %9666, 0, !dbg !370 + br i1 %.not.i1156, label %9669, label %9667, !dbg !370 + +9667: ; preds = %__nv_exp2f.exit1155 + %9668 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %9618) #3, !dbg !370 + br label %__nv_exp2f.exit1158, !dbg !370 + +9669: ; preds = %__nv_exp2f.exit1155 + %9670 = tail call float @llvm.nvvm.ex2.approx.f(float %9618) #3, !dbg !370 + br label %__nv_exp2f.exit1158, !dbg !370 + +__nv_exp2f.exit1158: ; preds = %9667, %9669 + %.0.i1157 = phi float [ %9668, %9667 ], [ %9670, %9669 ], !dbg !370 + %9671 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !370 + %.not.i1159 = icmp eq i32 %9671, 0, !dbg !370 + br i1 %.not.i1159, label %9674, label %9672, !dbg !370 + +9672: ; preds = %__nv_exp2f.exit1158 + %9673 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %9619) #3, !dbg !370 + br label %__nv_exp2f.exit1161, !dbg !370 + +9674: ; preds = %__nv_exp2f.exit1158 + %9675 = tail call float @llvm.nvvm.ex2.approx.f(float %9619) #3, !dbg !370 + br label %__nv_exp2f.exit1161, !dbg !370 + +__nv_exp2f.exit1161: ; preds = %9672, %9674 + %.0.i1160 = phi float [ %9673, %9672 ], [ %9675, %9674 ], !dbg !370 + %9676 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !370 + %.not.i1162 = icmp eq i32 %9676, 0, !dbg !370 + br i1 %.not.i1162, label %9679, label %9677, !dbg !370 + +9677: ; preds = %__nv_exp2f.exit1161 + %9678 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %9620) #3, !dbg !370 + br label %__nv_exp2f.exit1164, !dbg !370 + +9679: ; preds = %__nv_exp2f.exit1161 + %9680 = tail call float @llvm.nvvm.ex2.approx.f(float %9620) #3, !dbg !370 + br label %__nv_exp2f.exit1164, !dbg !370 + 
+__nv_exp2f.exit1164: ; preds = %9677, %9679 + %.0.i1163 = phi float [ %9678, %9677 ], [ %9680, %9679 ], !dbg !370 + %9681 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !370 + %.not.i1165 = icmp eq i32 %9681, 0, !dbg !370 + br i1 %.not.i1165, label %9684, label %9682, !dbg !370 + +9682: ; preds = %__nv_exp2f.exit1164 + %9683 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %9621) #3, !dbg !370 + br label %__nv_exp2f.exit1167, !dbg !370 + +9684: ; preds = %__nv_exp2f.exit1164 + %9685 = tail call float @llvm.nvvm.ex2.approx.f(float %9621) #3, !dbg !370 + br label %__nv_exp2f.exit1167, !dbg !370 + +__nv_exp2f.exit1167: ; preds = %9682, %9684 + %.0.i1166 = phi float [ %9683, %9682 ], [ %9685, %9684 ], !dbg !370 + %9686 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !370 + %.not.i1168 = icmp eq i32 %9686, 0, !dbg !370 + br i1 %.not.i1168, label %9689, label %9687, !dbg !370 + +9687: ; preds = %__nv_exp2f.exit1167 + %9688 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %9622) #3, !dbg !370 + br label %__nv_exp2f.exit1170, !dbg !370 + +9689: ; preds = %__nv_exp2f.exit1167 + %9690 = tail call float @llvm.nvvm.ex2.approx.f(float %9622) #3, !dbg !370 + br label %__nv_exp2f.exit1170, !dbg !370 + +__nv_exp2f.exit1170: ; preds = %9687, %9689 + %.0.i1169 = phi float [ %9688, %9687 ], [ %9690, %9689 ], !dbg !370 + %9691 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !370 + %.not.i1171 = icmp eq i32 %9691, 0, !dbg !370 + br i1 %.not.i1171, label %9694, label %9692, !dbg !370 + +9692: ; preds = %__nv_exp2f.exit1170 + %9693 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %9623) #3, !dbg !370 + br label %__nv_exp2f.exit1173, !dbg !370 + +9694: ; preds = %__nv_exp2f.exit1170 + %9695 = tail call float @llvm.nvvm.ex2.approx.f(float %9623) #3, !dbg !370 + br label %__nv_exp2f.exit1173, !dbg !370 + +__nv_exp2f.exit1173: ; preds = %9692, %9694 + %.0.i1172 = phi float [ %9693, %9692 ], [ %9695, %9694 ], !dbg !370 + %9696 = tail call i32 
@__nvvm_reflect(ptr nonnull @.str) #3, !dbg !370 + %.not.i1174 = icmp eq i32 %9696, 0, !dbg !370 + br i1 %.not.i1174, label %9699, label %9697, !dbg !370 + +9697: ; preds = %__nv_exp2f.exit1173 + %9698 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %9624) #3, !dbg !370 + br label %__nv_exp2f.exit1176, !dbg !370 + +9699: ; preds = %__nv_exp2f.exit1173 + %9700 = tail call float @llvm.nvvm.ex2.approx.f(float %9624) #3, !dbg !370 + br label %__nv_exp2f.exit1176, !dbg !370 + +__nv_exp2f.exit1176: ; preds = %9697, %9699 + %.0.i1175 = phi float [ %9698, %9697 ], [ %9700, %9699 ], !dbg !370 + %9701 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !370 + %.not.i1177 = icmp eq i32 %9701, 0, !dbg !370 + br i1 %.not.i1177, label %9704, label %9702, !dbg !370 + +9702: ; preds = %__nv_exp2f.exit1176 + %9703 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %9625) #3, !dbg !370 + br label %__nv_exp2f.exit1179, !dbg !370 + +9704: ; preds = %__nv_exp2f.exit1176 + %9705 = tail call float @llvm.nvvm.ex2.approx.f(float %9625) #3, !dbg !370 + br label %__nv_exp2f.exit1179, !dbg !370 + +__nv_exp2f.exit1179: ; preds = %9702, %9704 + %.0.i1178 = phi float [ %9703, %9702 ], [ %9705, %9704 ], !dbg !370 + %9706 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !370 + %.not.i1180 = icmp eq i32 %9706, 0, !dbg !370 + br i1 %.not.i1180, label %9709, label %9707, !dbg !370 + +9707: ; preds = %__nv_exp2f.exit1179 + %9708 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %9626) #3, !dbg !370 + br label %__nv_exp2f.exit1182, !dbg !370 + +9709: ; preds = %__nv_exp2f.exit1179 + %9710 = tail call float @llvm.nvvm.ex2.approx.f(float %9626) #3, !dbg !370 + br label %__nv_exp2f.exit1182, !dbg !370 + +__nv_exp2f.exit1182: ; preds = %9707, %9709 + %.0.i1181 = phi float [ %9708, %9707 ], [ %9710, %9709 ], !dbg !370 + %9711 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !370 + %.not.i1183 = icmp eq i32 %9711, 0, !dbg !370 + br i1 %.not.i1183, label %9714, label 
%9712, !dbg !370 + +9712: ; preds = %__nv_exp2f.exit1182 + %9713 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %9627) #3, !dbg !370 + br label %__nv_exp2f.exit1185, !dbg !370 + +9714: ; preds = %__nv_exp2f.exit1182 + %9715 = tail call float @llvm.nvvm.ex2.approx.f(float %9627) #3, !dbg !370 + br label %__nv_exp2f.exit1185, !dbg !370 + +__nv_exp2f.exit1185: ; preds = %9712, %9714 + %.0.i1184 = phi float [ %9713, %9712 ], [ %9715, %9714 ], !dbg !370 + %9716 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !370 + %.not.i1186 = icmp eq i32 %9716, 0, !dbg !370 + br i1 %.not.i1186, label %9719, label %9717, !dbg !370 + +9717: ; preds = %__nv_exp2f.exit1185 + %9718 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %9628) #3, !dbg !370 + br label %__nv_exp2f.exit1188, !dbg !370 + +9719: ; preds = %__nv_exp2f.exit1185 + %9720 = tail call float @llvm.nvvm.ex2.approx.f(float %9628) #3, !dbg !370 + br label %__nv_exp2f.exit1188, !dbg !370 + +__nv_exp2f.exit1188: ; preds = %9717, %9719 + %.0.i1187 = phi float [ %9718, %9717 ], [ %9720, %9719 ], !dbg !370 + %9721 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !370 + %.not.i1189 = icmp eq i32 %9721, 0, !dbg !370 + br i1 %.not.i1189, label %9724, label %9722, !dbg !370 + +9722: ; preds = %__nv_exp2f.exit1188 + %9723 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %9629) #3, !dbg !370 + br label %__nv_exp2f.exit1191, !dbg !370 + +9724: ; preds = %__nv_exp2f.exit1188 + %9725 = tail call float @llvm.nvvm.ex2.approx.f(float %9629) #3, !dbg !370 + br label %__nv_exp2f.exit1191, !dbg !370 + +__nv_exp2f.exit1191: ; preds = %9722, %9724 + %.0.i1190 = phi float [ %9723, %9722 ], [ %9725, %9724 ], !dbg !370 + %9726 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !370 + %.not.i1192 = icmp eq i32 %9726, 0, !dbg !370 + br i1 %.not.i1192, label %9729, label %9727, !dbg !370 + +9727: ; preds = %__nv_exp2f.exit1191 + %9728 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %9630) #3, !dbg !370 + br 
label %__nv_exp2f.exit1194, !dbg !370 + +9729: ; preds = %__nv_exp2f.exit1191 + %9730 = tail call float @llvm.nvvm.ex2.approx.f(float %9630) #3, !dbg !370 + br label %__nv_exp2f.exit1194, !dbg !370 + +__nv_exp2f.exit1194: ; preds = %9727, %9729 + %.0.i1193 = phi float [ %9728, %9727 ], [ %9730, %9729 ], !dbg !370 + %9731 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !370 + %.not.i1195 = icmp eq i32 %9731, 0, !dbg !370 + br i1 %.not.i1195, label %9734, label %9732, !dbg !370 + +9732: ; preds = %__nv_exp2f.exit1194 + %9733 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %9631) #3, !dbg !370 + br label %__nv_exp2f.exit1197, !dbg !370 + +9734: ; preds = %__nv_exp2f.exit1194 + %9735 = tail call float @llvm.nvvm.ex2.approx.f(float %9631) #3, !dbg !370 + br label %__nv_exp2f.exit1197, !dbg !370 + +__nv_exp2f.exit1197: ; preds = %9732, %9734 + %.0.i1196 = phi float [ %9733, %9732 ], [ %9735, %9734 ], !dbg !370 + %9736 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !370 + %.not.i1198 = icmp eq i32 %9736, 0, !dbg !370 + br i1 %.not.i1198, label %9739, label %9737, !dbg !370 + +9737: ; preds = %__nv_exp2f.exit1197 + %9738 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %9632) #3, !dbg !370 + br label %__nv_exp2f.exit1200, !dbg !370 + +9739: ; preds = %__nv_exp2f.exit1197 + %9740 = tail call float @llvm.nvvm.ex2.approx.f(float %9632) #3, !dbg !370 + br label %__nv_exp2f.exit1200, !dbg !370 + +__nv_exp2f.exit1200: ; preds = %9737, %9739 + %.0.i1199 = phi float [ %9738, %9737 ], [ %9740, %9739 ], !dbg !370 + %9741 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !370 + %.not.i1201 = icmp eq i32 %9741, 0, !dbg !370 + br i1 %.not.i1201, label %9744, label %9742, !dbg !370 + +9742: ; preds = %__nv_exp2f.exit1200 + %9743 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %9633) #3, !dbg !370 + br label %__nv_exp2f.exit1203, !dbg !370 + +9744: ; preds = %__nv_exp2f.exit1200 + %9745 = tail call float @llvm.nvvm.ex2.approx.f(float %9633) 
#3, !dbg !370 + br label %__nv_exp2f.exit1203, !dbg !370 + +__nv_exp2f.exit1203: ; preds = %9742, %9744 + %.0.i1202 = phi float [ %9743, %9742 ], [ %9745, %9744 ], !dbg !370 + %9746 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !370 + %.not.i1204 = icmp eq i32 %9746, 0, !dbg !370 + br i1 %.not.i1204, label %9749, label %9747, !dbg !370 + +9747: ; preds = %__nv_exp2f.exit1203 + %9748 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %9634) #3, !dbg !370 + br label %__nv_exp2f.exit1206, !dbg !370 + +9749: ; preds = %__nv_exp2f.exit1203 + %9750 = tail call float @llvm.nvvm.ex2.approx.f(float %9634) #3, !dbg !370 + br label %__nv_exp2f.exit1206, !dbg !370 + +__nv_exp2f.exit1206: ; preds = %9747, %9749 + %.0.i1205 = phi float [ %9748, %9747 ], [ %9750, %9749 ], !dbg !370 + %9751 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !370 + %.not.i1207 = icmp eq i32 %9751, 0, !dbg !370 + br i1 %.not.i1207, label %9754, label %9752, !dbg !370 + +9752: ; preds = %__nv_exp2f.exit1206 + %9753 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %9635) #3, !dbg !370 + br label %__nv_exp2f.exit1209, !dbg !370 + +9754: ; preds = %__nv_exp2f.exit1206 + %9755 = tail call float @llvm.nvvm.ex2.approx.f(float %9635) #3, !dbg !370 + br label %__nv_exp2f.exit1209, !dbg !370 + +__nv_exp2f.exit1209: ; preds = %9752, %9754 + %.0.i1208 = phi float [ %9753, %9752 ], [ %9755, %9754 ], !dbg !370 + %9756 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !370 + %.not.i1210 = icmp eq i32 %9756, 0, !dbg !370 + br i1 %.not.i1210, label %9759, label %9757, !dbg !370 + +9757: ; preds = %__nv_exp2f.exit1209 + %9758 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %9636) #3, !dbg !370 + br label %__nv_exp2f.exit1212, !dbg !370 + +9759: ; preds = %__nv_exp2f.exit1209 + %9760 = tail call float @llvm.nvvm.ex2.approx.f(float %9636) #3, !dbg !370 + br label %__nv_exp2f.exit1212, !dbg !370 + +__nv_exp2f.exit1212: ; preds = %9757, %9759 + %.0.i1211 = phi float [ %9758, %9757 
], [ %9760, %9759 ], !dbg !370 + %9761 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !370 + %.not.i1213 = icmp eq i32 %9761, 0, !dbg !370 + br i1 %.not.i1213, label %9764, label %9762, !dbg !370 + +9762: ; preds = %__nv_exp2f.exit1212 + %9763 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %9637) #3, !dbg !370 + br label %__nv_exp2f.exit1215, !dbg !370 + +9764: ; preds = %__nv_exp2f.exit1212 + %9765 = tail call float @llvm.nvvm.ex2.approx.f(float %9637) #3, !dbg !370 + br label %__nv_exp2f.exit1215, !dbg !370 + +__nv_exp2f.exit1215: ; preds = %9762, %9764 + %.0.i1214 = phi float [ %9763, %9762 ], [ %9765, %9764 ], !dbg !370 + %9766 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !370 + %.not.i1216 = icmp eq i32 %9766, 0, !dbg !370 + br i1 %.not.i1216, label %9769, label %9767, !dbg !370 + +9767: ; preds = %__nv_exp2f.exit1215 + %9768 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %9638) #3, !dbg !370 + br label %__nv_exp2f.exit1218, !dbg !370 + +9769: ; preds = %__nv_exp2f.exit1215 + %9770 = tail call float @llvm.nvvm.ex2.approx.f(float %9638) #3, !dbg !370 + br label %__nv_exp2f.exit1218, !dbg !370 + +__nv_exp2f.exit1218: ; preds = %9767, %9769 + %.0.i1217 = phi float [ %9768, %9767 ], [ %9770, %9769 ], !dbg !370 + %9771 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !370 + %.not.i1219 = icmp eq i32 %9771, 0, !dbg !370 + br i1 %.not.i1219, label %9774, label %9772, !dbg !370 + +9772: ; preds = %__nv_exp2f.exit1218 + %9773 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %9639) #3, !dbg !370 + br label %__nv_exp2f.exit1221, !dbg !370 + +9774: ; preds = %__nv_exp2f.exit1218 + %9775 = tail call float @llvm.nvvm.ex2.approx.f(float %9639) #3, !dbg !370 + br label %__nv_exp2f.exit1221, !dbg !370 + +__nv_exp2f.exit1221: ; preds = %9772, %9774 + %.0.i1220 = phi float [ %9773, %9772 ], [ %9775, %9774 ], !dbg !370 + %9776 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !370 + %.not.i1222 = icmp eq i32 %9776, 0, 
!dbg !370 + br i1 %.not.i1222, label %9779, label %9777, !dbg !370 + +9777: ; preds = %__nv_exp2f.exit1221 + %9778 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %9640) #3, !dbg !370 + br label %__nv_exp2f.exit1224, !dbg !370 + +9779: ; preds = %__nv_exp2f.exit1221 + %9780 = tail call float @llvm.nvvm.ex2.approx.f(float %9640) #3, !dbg !370 + br label %__nv_exp2f.exit1224, !dbg !370 + +__nv_exp2f.exit1224: ; preds = %9777, %9779 + %.0.i1223 = phi float [ %9778, %9777 ], [ %9780, %9779 ], !dbg !370 + %9781 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !370 + %.not.i1225 = icmp eq i32 %9781, 0, !dbg !370 + br i1 %.not.i1225, label %9784, label %9782, !dbg !370 + +9782: ; preds = %__nv_exp2f.exit1224 + %9783 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %9641) #3, !dbg !370 + br label %__nv_exp2f.exit1227, !dbg !370 + +9784: ; preds = %__nv_exp2f.exit1224 + %9785 = tail call float @llvm.nvvm.ex2.approx.f(float %9641) #3, !dbg !370 + br label %__nv_exp2f.exit1227, !dbg !370 + +__nv_exp2f.exit1227: ; preds = %9782, %9784 + %.0.i1226 = phi float [ %9783, %9782 ], [ %9785, %9784 ], !dbg !370 + %9786 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !370 + %.not.i1228 = icmp eq i32 %9786, 0, !dbg !370 + br i1 %.not.i1228, label %9789, label %9787, !dbg !370 + +9787: ; preds = %__nv_exp2f.exit1227 + %9788 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %9642) #3, !dbg !370 + br label %__nv_exp2f.exit1230, !dbg !370 + +9789: ; preds = %__nv_exp2f.exit1227 + %9790 = tail call float @llvm.nvvm.ex2.approx.f(float %9642) #3, !dbg !370 + br label %__nv_exp2f.exit1230, !dbg !370 + +__nv_exp2f.exit1230: ; preds = %9787, %9789 + %.0.i1229 = phi float [ %9788, %9787 ], [ %9790, %9789 ], !dbg !370 + %9791 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !370 + %.not.i1231 = icmp eq i32 %9791, 0, !dbg !370 + br i1 %.not.i1231, label %9794, label %9792, !dbg !370 + +9792: ; preds = %__nv_exp2f.exit1230 + %9793 = tail call float 
@llvm.nvvm.ex2.approx.ftz.f(float %9643) #3, !dbg !370 + br label %__nv_exp2f.exit1233, !dbg !370 + +9794: ; preds = %__nv_exp2f.exit1230 + %9795 = tail call float @llvm.nvvm.ex2.approx.f(float %9643) #3, !dbg !370 + br label %__nv_exp2f.exit1233, !dbg !370 + +__nv_exp2f.exit1233: ; preds = %9792, %9794 + %.0.i1232 = phi float [ %9793, %9792 ], [ %9795, %9794 ], !dbg !370 + %9796 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !370 + %.not.i1234 = icmp eq i32 %9796, 0, !dbg !370 + br i1 %.not.i1234, label %9799, label %9797, !dbg !370 + +9797: ; preds = %__nv_exp2f.exit1233 + %9798 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %9644) #3, !dbg !370 + br label %__nv_exp2f.exit1236, !dbg !370 + +9799: ; preds = %__nv_exp2f.exit1233 + %9800 = tail call float @llvm.nvvm.ex2.approx.f(float %9644) #3, !dbg !370 + br label %__nv_exp2f.exit1236, !dbg !370 + +__nv_exp2f.exit1236: ; preds = %9797, %9799 + %.0.i1235 = phi float [ %9798, %9797 ], [ %9800, %9799 ], !dbg !370 + %9801 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !370 + %.not.i1237 = icmp eq i32 %9801, 0, !dbg !370 + br i1 %.not.i1237, label %9804, label %9802, !dbg !370 + +9802: ; preds = %__nv_exp2f.exit1236 + %9803 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %9645) #3, !dbg !370 + br label %__nv_exp2f.exit1239, !dbg !370 + +9804: ; preds = %__nv_exp2f.exit1236 + %9805 = tail call float @llvm.nvvm.ex2.approx.f(float %9645) #3, !dbg !370 + br label %__nv_exp2f.exit1239, !dbg !370 + +__nv_exp2f.exit1239: ; preds = %9802, %9804 + %.0.i1238 = phi float [ %9803, %9802 ], [ %9805, %9804 ], !dbg !370 + %9806 = getelementptr bfloat, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %9063, !dbg !356 + %9807 = insertelement <2 x float> poison, float %.0.i, i64 0, !dbg !371 + %9808 = insertelement <2 x float> %9807, float %.0.i1148, i64 1, !dbg !371 + %9809 = fptrunc <2 x float> %9808 to <2 x bfloat>, !dbg !371 + %9810 = insertelement <2 x float> 
poison, float %.0.i1151, i64 0, !dbg !371 + %9811 = insertelement <2 x float> %9810, float %.0.i1154, i64 1, !dbg !371 + %9812 = fptrunc <2 x float> %9811 to <2 x bfloat>, !dbg !371 + %9813 = insertelement <2 x float> poison, float %.0.i1157, i64 0, !dbg !371 + %9814 = insertelement <2 x float> %9813, float %.0.i1160, i64 1, !dbg !371 + %9815 = fptrunc <2 x float> %9814 to <2 x bfloat>, !dbg !371 + %9816 = insertelement <2 x float> poison, float %.0.i1163, i64 0, !dbg !371 + %9817 = insertelement <2 x float> %9816, float %.0.i1166, i64 1, !dbg !371 + %9818 = fptrunc <2 x float> %9817 to <2 x bfloat>, !dbg !371 + %9819 = insertelement <2 x float> poison, float %.0.i1169, i64 0, !dbg !371 + %9820 = insertelement <2 x float> %9819, float %.0.i1172, i64 1, !dbg !371 + %9821 = fptrunc <2 x float> %9820 to <2 x bfloat>, !dbg !371 + %9822 = insertelement <2 x float> poison, float %.0.i1175, i64 0, !dbg !371 + %9823 = insertelement <2 x float> %9822, float %.0.i1178, i64 1, !dbg !371 + %9824 = fptrunc <2 x float> %9823 to <2 x bfloat>, !dbg !371 + %9825 = insertelement <2 x float> poison, float %.0.i1181, i64 0, !dbg !371 + %9826 = insertelement <2 x float> %9825, float %.0.i1184, i64 1, !dbg !371 + %9827 = fptrunc <2 x float> %9826 to <2 x bfloat>, !dbg !371 + %9828 = insertelement <2 x float> poison, float %.0.i1187, i64 0, !dbg !371 + %9829 = insertelement <2 x float> %9828, float %.0.i1190, i64 1, !dbg !371 + %9830 = fptrunc <2 x float> %9829 to <2 x bfloat>, !dbg !371 + %9831 = insertelement <2 x float> poison, float %.0.i1193, i64 0, !dbg !371 + %9832 = insertelement <2 x float> %9831, float %.0.i1196, i64 1, !dbg !371 + %9833 = fptrunc <2 x float> %9832 to <2 x bfloat>, !dbg !371 + %9834 = insertelement <2 x float> poison, float %.0.i1199, i64 0, !dbg !371 + %9835 = insertelement <2 x float> %9834, float %.0.i1202, i64 1, !dbg !371 + %9836 = fptrunc <2 x float> %9835 to <2 x bfloat>, !dbg !371 + %9837 = insertelement <2 x float> poison, float %.0.i1205, i64 0, !dbg 
!371 + %9838 = insertelement <2 x float> %9837, float %.0.i1208, i64 1, !dbg !371 + %9839 = fptrunc <2 x float> %9838 to <2 x bfloat>, !dbg !371 + %9840 = insertelement <2 x float> poison, float %.0.i1211, i64 0, !dbg !371 + %9841 = insertelement <2 x float> %9840, float %.0.i1214, i64 1, !dbg !371 + %9842 = fptrunc <2 x float> %9841 to <2 x bfloat>, !dbg !371 + %9843 = insertelement <2 x float> poison, float %.0.i1217, i64 0, !dbg !371 + %9844 = insertelement <2 x float> %9843, float %.0.i1220, i64 1, !dbg !371 + %9845 = fptrunc <2 x float> %9844 to <2 x bfloat>, !dbg !371 + %9846 = insertelement <2 x float> poison, float %.0.i1223, i64 0, !dbg !371 + %9847 = insertelement <2 x float> %9846, float %.0.i1226, i64 1, !dbg !371 + %9848 = fptrunc <2 x float> %9847 to <2 x bfloat>, !dbg !371 + %9849 = insertelement <2 x float> poison, float %.0.i1229, i64 0, !dbg !371 + %9850 = insertelement <2 x float> %9849, float %.0.i1232, i64 1, !dbg !371 + %9851 = fptrunc <2 x float> %9850 to <2 x bfloat>, !dbg !371 + %9852 = insertelement <2 x float> poison, float %.0.i1235, i64 0, !dbg !371 + %9853 = insertelement <2 x float> %9852, float %.0.i1238, i64 1, !dbg !371 + %9854 = fptrunc <2 x float> %9853 to <2 x bfloat>, !dbg !371 + %9855 = bitcast <2 x bfloat> %9809 to i32, !dbg !372 + %9856 = bitcast <2 x bfloat> %9812 to i32, !dbg !372 + %9857 = bitcast <2 x bfloat> %9815 to i32, !dbg !372 + %9858 = bitcast <2 x bfloat> %9818 to i32, !dbg !372 + %9859 = bitcast <2 x bfloat> %9821 to i32, !dbg !372 + %9860 = bitcast <2 x bfloat> %9824 to i32, !dbg !372 + %9861 = bitcast <2 x bfloat> %9827 to i32, !dbg !372 + %9862 = bitcast <2 x bfloat> %9830 to i32, !dbg !372 + %9863 = bitcast <2 x bfloat> %9833 to i32, !dbg !372 + %9864 = bitcast <2 x bfloat> %9836 to i32, !dbg !372 + %9865 = bitcast <2 x bfloat> %9839 to i32, !dbg !372 + %9866 = bitcast <2 x bfloat> %9842 to i32, !dbg !372 + %9867 = bitcast <2 x bfloat> %9845 to i32, !dbg !372 + %9868 = bitcast <2 x bfloat> %9848 to i32, !dbg 
!372 + %9869 = bitcast <2 x bfloat> %9851 to i32, !dbg !372 + %9870 = bitcast <2 x bfloat> %9854 to i32, !dbg !372 + tail call void @llvm.nvvm.wgmma.fence.sync.aligned(), !dbg !372 + %9871 = ptrtoint ptr addrspace(3) %9806 to i32, !dbg !372 + %9872 = lshr exact i32 %9871, 4, !dbg !372 + %9873 = and i32 %9872, 16383, !dbg !372 + %9874 = zext nneg i32 %9873 to i64, !dbg !372 + %9875 = or disjoint i64 %9874, 4611686293338849280, !dbg !372 + %9876 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %.pn4751688, float %.pn4731689, float %.pn4711690, float %.pn4691691, float %.pn4671692, float %.pn4651693, float %.pn4631694, float %.pn4611695, float %.pn4591696, float %.pn4571697, float %.pn4551698, float %.pn4531699, float %.pn4511700, float %.pn4491701, float %.pn4471702, float %.pn4451703, float %.pn4431704, float 
%.pn4411705, float %.pn4391706, float %.pn4371707, float %.pn4351708, float %.pn4331709, float %.pn4311710, float %.pn4291711, float %.pn4271712, float %.pn4251713, float %.pn4231714, float %.pn4211715, float %.pn4191716, float %.pn4171717, float %.pn4151718, float %.pn4131719, float %.pn4111720, float %.pn4091721, float %.pn4071722, float %.pn4051723, float %.pn4031724, float %.pn4011725, float %.pn3991726, float %.pn3971727, float %.pn3951728, float %.pn3931729, float %.pn3911730, float %.pn3891731, float %.pn3871732, float %.pn3851733, float %.pn3831734, float %.pn3811735, float %.pn3791736, float %.pn3771737, float %.pn3751738, float %.pn3731739, float %.pn3711740, float %.pn3691741, float %.pn3671742, float %.pn3651743, float %.pn3631744, float %.pn3611745, float %.pn3591746, float %.pn3571747, float %.pn3551748, float %.pn3531749, float %.pn3511750, float %.pn3491751, i32 %9855, i32 %9856, i32 %9857, i32 %9858, i64 %9875, i1 true) #3, !dbg !372 + %9877 = add i32 %9871, 2048, !dbg !372 + %9878 = lshr exact i32 %9877, 4, !dbg !372 + %9879 = and i32 %9878, 16383, !dbg !372 + %9880 = zext nneg i32 %9879 to i64, !dbg !372 + %9881 = or disjoint i64 %9880, 4611686293338849280, !dbg !372 + %9882 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 0, !dbg !372 + %9883 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 1, !dbg !372 + %9884 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 2, !dbg !372 + %9885 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 3, !dbg !372 + %9886 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 4, !dbg !372 + %9887 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 5, !dbg !372 + %9888 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 6, !dbg !372 + %9889 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 7, !dbg !372 + %9890 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 8, !dbg !372 + %9891 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 9, !dbg !372 + %9892 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 10, !dbg !372 + %9893 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 11, !dbg !372 + %9894 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 12, !dbg !372 + %9895 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 13, !dbg !372 + %9896 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 14, !dbg !372 + %9897 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 15, !dbg !372 + %9898 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 16, !dbg !372 + %9899 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 17, !dbg !372 + %9900 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 18, !dbg !372 + %9901 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 19, !dbg !372 + %9902 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 20, !dbg !372 + %9903 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float } %9876, 21, !dbg !372 + %9904 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 22, !dbg !372 + %9905 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 23, !dbg !372 + %9906 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 24, !dbg !372 + %9907 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float } %9876, 25, !dbg !372 + %9908 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 26, !dbg !372 + %9909 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 27, !dbg !372 + %9910 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 28, !dbg !372 + %9911 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float } %9876, 29, !dbg !372 + %9912 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 30, !dbg !372 + %9913 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 31, !dbg !372 + %9914 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 32, !dbg !372 + %9915 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float } %9876, 33, !dbg !372 + %9916 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 34, !dbg !372 + %9917 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 35, !dbg !372 + %9918 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 36, !dbg !372 + %9919 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float } %9876, 37, !dbg !372 + %9920 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 38, !dbg !372 + %9921 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 39, !dbg !372 + %9922 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 40, !dbg !372 + %9923 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float } %9876, 41, !dbg !372 + %9924 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 42, !dbg !372 + %9925 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 43, !dbg !372 + %9926 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 44, !dbg !372 + %9927 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } 
%9876, 45, !dbg !372 + %9928 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 46, !dbg !372 + %9929 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 47, !dbg !372 + %9930 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 48, !dbg !372 + %9931 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 49, !dbg 
!372 + %9932 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 50, !dbg !372 + %9933 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 51, !dbg !372 + %9934 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 52, !dbg !372 + %9935 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 53, !dbg !372 + %9936 = 
extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 54, !dbg !372 + %9937 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 55, !dbg !372 + %9938 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 56, !dbg !372 + %9939 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 57, !dbg !372 + %9940 = extractvalue { 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 58, !dbg !372 + %9941 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 59, !dbg !372 + %9942 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 60, !dbg !372 + %9943 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 61, !dbg !372 + %9944 = extractvalue { float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 62, !dbg !372 + %9945 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 63, !dbg !372 + %9946 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", 
"=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %9882, float %9883, float %9884, float %9885, float %9886, float %9887, float %9888, float %9889, float %9890, float %9891, float %9892, float %9893, float %9894, float %9895, float %9896, float %9897, float %9898, float %9899, float %9900, float %9901, float %9902, float %9903, float %9904, float %9905, float %9906, float %9907, float %9908, float %9909, float %9910, float %9911, float %9912, float %9913, float %9914, float %9915, float %9916, float %9917, float %9918, float %9919, float %9920, float %9921, float %9922, float %9923, float %9924, float %9925, float %9926, float %9927, float %9928, float %9929, float %9930, float %9931, float %9932, float %9933, float %9934, float %9935, float %9936, float %9937, float %9938, float %9939, float %9940, float %9941, float %9942, float %9943, float %9944, float %9945, i32 %9859, i32 %9860, i32 %9861, i32 %9862, i64 %9881, i1 true) #3, !dbg !372 + %9947 = add i32 %9871, 4096, !dbg !372 + %9948 = lshr exact i32 %9947, 4, !dbg !372 + %9949 = and i32 %9948, 16383, !dbg !372 + %9950 = zext nneg i32 %9949 to i64, !dbg !372 + %9951 = or disjoint i64 %9950, 4611686293338849280, !dbg !372 + %9952 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float } %9946, 0, !dbg !372 + %9953 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 1, !dbg !372 + %9954 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 2, !dbg !372 + %9955 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 3, !dbg !372 + %9956 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 
4, !dbg !372 + %9957 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 5, !dbg !372 + %9958 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 6, !dbg !372 + %9959 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 7, !dbg !372 + %9960 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 8, !dbg !372 + %9961 = 
extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 9, !dbg !372 + %9962 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 10, !dbg !372 + %9963 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 11, !dbg !372 + %9964 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 12, !dbg !372 + %9965 = extractvalue { 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 13, !dbg !372 + %9966 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 14, !dbg !372 + %9967 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 15, !dbg !372 + %9968 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 16, !dbg !372 + %9969 = extractvalue { float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 17, !dbg !372 + %9970 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 18, !dbg !372 + %9971 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 19, !dbg !372 + %9972 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 20, !dbg !372 + %9973 = extractvalue { float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 21, !dbg !372 + %9974 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 22, !dbg !372 + %9975 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 23, !dbg !372 + %9976 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 24, !dbg !372 + %9977 = extractvalue { float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 25, !dbg !372 + %9978 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 26, !dbg !372 + %9979 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 27, !dbg !372 + %9980 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 28, !dbg !372 + %9981 = extractvalue { float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 29, !dbg !372 + %9982 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 30, !dbg !372 + %9983 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 31, !dbg !372 + %9984 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 32, !dbg !372 + %9985 = extractvalue { float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 33, !dbg !372 + %9986 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 34, !dbg !372 + %9987 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 35, !dbg !372 + %9988 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 36, !dbg !372 + %9989 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 37, !dbg !372 + %9990 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 38, !dbg !372 + %9991 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 39, !dbg !372 + %9992 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 40, !dbg !372 + %9993 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 41, !dbg !372 + %9994 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 42, !dbg !372 + %9995 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 43, !dbg !372 + %9996 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 44, !dbg !372 + %9997 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 45, !dbg !372 + %9998 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 46, !dbg !372 + %9999 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 47, !dbg !372 + %10000 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 48, !dbg !372 + %10001 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 49, !dbg !372 + %10002 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 50, !dbg !372 + %10003 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 51, !dbg !372 + %10004 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 52, !dbg !372 + %10005 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 53, !dbg !372 + %10006 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 54, !dbg !372 + %10007 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 55, !dbg !372 + %10008 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 56, !dbg !372 + %10009 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 57, !dbg !372 + %10010 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 58, !dbg !372 + %10011 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 59, !dbg !372 + %10012 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 60, !dbg !372 + %10013 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 61, !dbg !372 + %10014 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 62, !dbg !372 + %10015 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 63, !dbg !372 + %10016 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 
{$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %9952, float %9953, float %9954, float %9955, float %9956, float %9957, float %9958, float %9959, float %9960, float %9961, float %9962, float %9963, float %9964, float %9965, float %9966, float %9967, float %9968, float %9969, float %9970, float %9971, float %9972, float %9973, float %9974, float %9975, float %9976, float %9977, float %9978, float %9979, float %9980, float %9981, float %9982, float %9983, float %9984, float %9985, float %9986, float %9987, float %9988, float %9989, float %9990, float %9991, float %9992, float %9993, float %9994, float %9995, float %9996, float %9997, float %9998, float %9999, float %10000, float %10001, float %10002, float %10003, float %10004, float %10005, float %10006, float %10007, float %10008, float %10009, float %10010, float %10011, float %10012, float %10013, float %10014, float %10015, i32 %9863, i32 %9864, i32 %9865, i32 %9866, i64 %9951, i1 true) #3, !dbg !372 + %10017 = add i32 %9871, 6144, !dbg !372 + %10018 = lshr exact i32 %10017, 4, !dbg !372 + %10019 = and i32 %10018, 16383, !dbg !372 + %10020 = zext nneg i32 %10019 to i64, !dbg !372 + %10021 = or disjoint i64 %10020, 4611686293338849280, !dbg !372 + %10022 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 0, !dbg !372 + %10023 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 1, !dbg !372 + %10024 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 2, !dbg !372 + %10025 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 3, !dbg !372 + %10026 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 4, !dbg !372 + %10027 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 5, !dbg !372 + %10028 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 6, !dbg !372 + %10029 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 7, !dbg !372 + %10030 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 8, !dbg !372 + %10031 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 9, !dbg !372 + %10032 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 10, !dbg !372 + %10033 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 11, !dbg !372 + %10034 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 12, !dbg !372 + %10035 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 13, !dbg !372 + %10036 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 14, !dbg !372 + %10037 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 15, !dbg !372 + %10038 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 16, !dbg !372 + %10039 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 17, !dbg !372 + %10040 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 18, !dbg !372 + %10041 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 19, !dbg !372 + %10042 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 20, !dbg !372 + %10043 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 21, !dbg !372 + %10044 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 22, !dbg !372 + %10045 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 23, !dbg !372 + %10046 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 24, !dbg !372 + %10047 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 25, !dbg !372 + %10048 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 26, !dbg !372 + %10049 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 27, !dbg !372 + %10050 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 28, !dbg !372 + %10051 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 29, !dbg !372 + %10052 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 30, !dbg !372 + %10053 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 31, !dbg !372 + %10054 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 32, !dbg !372 + %10055 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 33, !dbg !372 + %10056 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 34, !dbg !372 + %10057 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 35, !dbg !372 + %10058 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 36, !dbg !372 + %10059 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 37, !dbg !372 + %10060 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 38, !dbg !372 + %10061 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 39, !dbg !372 + %10062 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 40, !dbg !372 + %10063 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 41, !dbg !372 + %10064 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 42, !dbg !372 + %10065 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 43, !dbg !372 + %10066 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 44, !dbg !372 + %10067 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 45, !dbg !372 + %10068 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 46, !dbg !372 + %10069 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 47, !dbg !372 + %10070 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 48, !dbg !372 + %10071 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 49, !dbg !372 + %10072 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 50, !dbg !372 + %10073 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 51, !dbg !372 + %10074 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 52, !dbg !372 + %10075 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 53, !dbg !372 + %10076 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 54, !dbg !372 + %10077 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 55, !dbg !372 + %10078 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 56, !dbg !372 + %10079 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 57, !dbg !372 + %10080 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 58, !dbg !372 + %10081 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 59, !dbg !372 + %10082 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 60, !dbg !372 + %10083 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 61, !dbg !372 + %10084 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 62, !dbg !372 + %10085 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 63, !dbg !372 + %10086 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %10022, float %10023, float %10024, float %10025, float %10026, float %10027, float %10028, float %10029, float %10030, float %10031, float %10032, float %10033, float %10034, float %10035, float %10036, float %10037, float %10038, float %10039, float %10040, float %10041, float %10042, float %10043, float %10044, float %10045, float %10046, float %10047, float %10048, float %10049, float %10050, float %10051, float %10052, float %10053, float %10054, float %10055, float %10056, float %10057, float %10058, float %10059, float %10060, float %10061, float %10062, float %10063, float %10064, float %10065, float %10066, float %10067, float %10068, float %10069, float %10070, float %10071, float %10072, float %10073, float %10074, float %10075, float %10076, float %10077, float %10078, float %10079, float %10080, float %10081, float %10082, float %10083, float %10084, float %10085, i32 %9867, i32 %9868, i32 %9869, i32 %9870, i64 %10021, i1 true) #3, !dbg !372 + %10087 = extractvalue { float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 0, !dbg !372 + %10088 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 1, !dbg !372 + %10089 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 2, !dbg !372 + %10090 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 3, !dbg !372 + %10091 = extractvalue { float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 4, !dbg !372 + %10092 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 5, !dbg !372 + %10093 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 6, !dbg !372 + %10094 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 7, !dbg !372 + %10095 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 8, !dbg !372 + %10096 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 9, !dbg !372 + %10097 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 10, !dbg !372 + %10098 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 11, !dbg !372 + %10099 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 12, !dbg !372 + %10100 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 13, !dbg !372 + %10101 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 14, !dbg !372 + %10102 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 15, !dbg !372 + %10103 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 16, !dbg !372 + %10104 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 17, !dbg !372 + %10105 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 18, !dbg !372 + %10106 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 19, !dbg !372 + %10107 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 20, !dbg !372 + %10108 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 21, !dbg !372 + %10109 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 22, !dbg !372 + %10110 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 23, !dbg !372 + %10111 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 24, !dbg !372 + %10112 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 25, !dbg !372 + %10113 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 26, !dbg !372 + %10114 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 27, !dbg !372 + %10115 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 28, !dbg !372 + %10116 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 29, !dbg !372 + %10117 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 30, !dbg !372 + %10118 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 31, !dbg !372 + %10119 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 32, !dbg !372 + %10120 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 33, !dbg !372 + %10121 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 34, !dbg !372 + %10122 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 35, !dbg !372 + %10123 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 36, !dbg !372 + %10124 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 37, !dbg !372 + %10125 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 38, !dbg !372 + %10126 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 39, !dbg !372 + %10127 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 40, !dbg !372 + %10128 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 41, !dbg !372 + %10129 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 42, !dbg !372 + %10130 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 43, !dbg !372 + %10131 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 44, !dbg !372 + %10132 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 45, !dbg !372 + %10133 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 46, !dbg !372 + %10134 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 47, !dbg !372 + %10135 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 48, !dbg !372 + %10136 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 49, !dbg !372 + %10137 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 50, !dbg !372 + %10138 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 51, !dbg !372 + %10139 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 52, !dbg !372 + %10140 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 53, !dbg !372 + %10141 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 54, !dbg !372 + %10142 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 55, !dbg !372 + %10143 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 56, !dbg !372 + %10144 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 57, !dbg !372 + %10145 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 58, !dbg !372 + %10146 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 59, !dbg !372 + %10147 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 60, !dbg !372 + %10148 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 61, !dbg !372 + %10149 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 62, !dbg !372 + %10150 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 63, !dbg !372 + tail call void @llvm.nvvm.wgmma.commit_group.sync.aligned(), !dbg !372 + %10151 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98816), i32 %9065, !dbg !358 + 
%10152 = getelementptr inbounds nuw i8, ptr addrspace(3) %10151, i32 %5332, !dbg !358 + %10153 = load float, ptr addrspace(3) %10152, align 8, !dbg !358 + %10154 = getelementptr inbounds nuw i8, ptr addrspace(3) %10152, i32 4, !dbg !358 + %10155 = load float, ptr addrspace(3) %10154, align 4, !dbg !358 + %10156 = getelementptr inbounds nuw i8, ptr addrspace(3) %10151, i32 %5336, !dbg !358 + %10157 = load float, ptr addrspace(3) %10156, align 8, !dbg !358 + %10158 = getelementptr inbounds nuw i8, ptr addrspace(3) %10156, i32 4, !dbg !358 + %10159 = load float, ptr addrspace(3) %10158, align 4, !dbg !358 + %10160 = getelementptr inbounds nuw i8, ptr addrspace(3) %10151, i32 %5342, !dbg !358 + %10161 = load float, ptr addrspace(3) %10160, align 8, !dbg !358 + %10162 = getelementptr inbounds nuw i8, ptr addrspace(3) %10160, i32 4, !dbg !358 + %10163 = load float, ptr addrspace(3) %10162, align 4, !dbg !358 + %10164 = getelementptr inbounds nuw i8, ptr addrspace(3) %10151, i32 %5348, !dbg !358 + %10165 = load float, ptr addrspace(3) %10164, align 8, !dbg !358 + %10166 = getelementptr inbounds nuw i8, ptr addrspace(3) %10164, i32 4, !dbg !358 + %10167 = load float, ptr addrspace(3) %10166, align 4, !dbg !358 + %10168 = getelementptr inbounds nuw i8, ptr addrspace(3) %10151, i32 %5354, !dbg !358 + %10169 = load float, ptr addrspace(3) %10168, align 8, !dbg !358 + %10170 = getelementptr inbounds nuw i8, ptr addrspace(3) %10168, i32 4, !dbg !358 + %10171 = load float, ptr addrspace(3) %10170, align 4, !dbg !358 + %10172 = getelementptr inbounds nuw i8, ptr addrspace(3) %10151, i32 %5360, !dbg !358 + %10173 = load float, ptr addrspace(3) %10172, align 8, !dbg !358 + %10174 = getelementptr inbounds nuw i8, ptr addrspace(3) %10172, i32 4, !dbg !358 + %10175 = load float, ptr addrspace(3) %10174, align 4, !dbg !358 + %10176 = getelementptr inbounds nuw i8, ptr addrspace(3) %10151, i32 %5366, !dbg !358 + %10177 = load float, ptr addrspace(3) %10176, align 8, !dbg !358 + %10178 = 
getelementptr inbounds nuw i8, ptr addrspace(3) %10176, i32 4, !dbg !358 + %10179 = load float, ptr addrspace(3) %10178, align 4, !dbg !358 + %10180 = getelementptr inbounds nuw i8, ptr addrspace(3) %10151, i32 %5372, !dbg !358 + %10181 = load float, ptr addrspace(3) %10180, align 8, !dbg !358 + %10182 = getelementptr inbounds nuw i8, ptr addrspace(3) %10180, i32 4, !dbg !358 + %10183 = load float, ptr addrspace(3) %10182, align 4, !dbg !358 + tail call void @llvm.nvvm.wgmma.fence.sync.aligned(), !dbg !373 + %10184 = add i32 %9133, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096) to i32), !dbg !373 + %10185 = lshr exact i32 %10184, 4, !dbg !373 + %10186 = and i32 %10185, 16383, !dbg !373 + %10187 = zext nneg i32 %10186 to i64, !dbg !373 + %10188 = or disjoint i64 %10187, 4611686293372403712, !dbg !373 + %10189 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $32, $33, 0, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,l,l"(i64 %10188, i64 %9875) #3, !dbg !373 + %10190 = add i32 %9145, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096) to i32), !dbg !373 + %10191 = lshr exact i32 %10190, 4, !dbg !373 + %10192 = and i32 %10191, 16383, !dbg !373 + %10193 = zext nneg i32 %10192 to i64, !dbg !373 + %10194 = or disjoint i64 %10193, 4611686293372403712, !dbg !373 + %10195 = add i32 %9871, 32, !dbg !373 + %10196 = lshr exact i32 %10195, 4, !dbg !373 + %10197 = and i32 %10196, 16383, !dbg !373 + %10198 = zext nneg i32 %10197 to i64, !dbg !373 + %10199 = or 
disjoint i64 %10198, 4611686293338849280, !dbg !373 + %10200 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10189, 0, !dbg !373 + %10201 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10189, 1, !dbg !373 + %10202 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10189, 2, !dbg !373 + %10203 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10189, 3, !dbg !373 + %10204 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10189, 4, !dbg !373 + %10205 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10189, 5, !dbg !373 + %10206 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10189, 6, !dbg !373 + %10207 = extractvalue { float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10189, 7, !dbg !373 + %10208 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10189, 8, !dbg !373 + %10209 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10189, 9, !dbg !373 + %10210 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10189, 10, !dbg !373 + %10211 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10189, 11, !dbg !373 + %10212 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10189, 12, !dbg !373 + %10213 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10189, 13, !dbg !373 + %10214 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10189, 14, !dbg !373 + %10215 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10189, 15, !dbg !373 + %10216 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10189, 16, !dbg !373 + %10217 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10189, 17, !dbg !373 + %10218 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10189, 18, !dbg !373 + %10219 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10189, 19, !dbg !373 + %10220 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10189, 20, !dbg !373 + %10221 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float } %10189, 21, !dbg !373 + %10222 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10189, 22, !dbg !373 + %10223 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10189, 23, !dbg !373 + %10224 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10189, 24, !dbg !373 + %10225 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10189, 25, !dbg !373 + %10226 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10189, 26, !dbg !373 + %10227 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10189, 27, !dbg !373 + %10228 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10189, 28, !dbg !373 + %10229 = extractvalue { 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10189, 29, !dbg !373 + %10230 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10189, 30, !dbg !373 + %10231 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10189, 31, !dbg !373 + %10232 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %10200, float %10201, float %10202, float %10203, float %10204, float %10205, float %10206, float %10207, float %10208, float %10209, float %10210, float %10211, float %10212, float %10213, float %10214, float %10215, float %10216, float %10217, float %10218, float %10219, float %10220, float %10221, float %10222, float %10223, float %10224, float %10225, float %10226, float %10227, float %10228, float %10229, float %10230, float %10231, i64 %10194, i64 %10199, i1 true) #3, !dbg !373 + %10233 = add i32 %9189, ptrtoint (ptr addrspace(3) 
getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096) to i32), !dbg !373 + %10234 = lshr exact i32 %10233, 4, !dbg !373 + %10235 = and i32 %10234, 16383, !dbg !373 + %10236 = zext nneg i32 %10235 to i64, !dbg !373 + %10237 = or disjoint i64 %10236, 4611686293372403712, !dbg !373 + %10238 = add i32 %9871, 64, !dbg !373 + %10239 = lshr exact i32 %10238, 4, !dbg !373 + %10240 = and i32 %10239, 16383, !dbg !373 + %10241 = zext nneg i32 %10240 to i64, !dbg !373 + %10242 = or disjoint i64 %10241, 4611686293338849280, !dbg !373 + %10243 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10232, 0, !dbg !373 + %10244 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10232, 1, !dbg !373 + %10245 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10232, 2, !dbg !373 + %10246 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10232, 3, !dbg !373 + %10247 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10232, 4, !dbg !373 + %10248 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10232, 5, !dbg !373 + %10249 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10232, 6, !dbg !373 + %10250 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10232, 7, !dbg !373 + %10251 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10232, 8, !dbg !373 + %10252 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10232, 9, !dbg !373 + %10253 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10232, 10, !dbg !373 + %10254 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10232, 11, !dbg !373 + %10255 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float } %10232, 12, !dbg !373 + %10256 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10232, 13, !dbg !373 + %10257 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10232, 14, !dbg !373 + %10258 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10232, 15, !dbg !373 + %10259 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10232, 16, !dbg !373 + %10260 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10232, 17, !dbg !373 + %10261 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10232, 18, !dbg !373 + %10262 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10232, 19, !dbg !373 + %10263 
= extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10232, 20, !dbg !373 + %10264 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10232, 21, !dbg !373 + %10265 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10232, 22, !dbg !373 + %10266 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10232, 23, !dbg !373 + %10267 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10232, 24, !dbg !373 + %10268 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10232, 25, !dbg !373 + %10269 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10232, 26, !dbg !373 + %10270 = extractvalue { float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10232, 27, !dbg !373 + %10271 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10232, 28, !dbg !373 + %10272 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10232, 29, !dbg !373 + %10273 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10232, 30, !dbg !373 + %10274 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10232, 31, !dbg !373 + %10275 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %10243, float %10244, float %10245, float %10246, float %10247, 
float %10248, float %10249, float %10250, float %10251, float %10252, float %10253, float %10254, float %10255, float %10256, float %10257, float %10258, float %10259, float %10260, float %10261, float %10262, float %10263, float %10264, float %10265, float %10266, float %10267, float %10268, float %10269, float %10270, float %10271, float %10272, float %10273, float %10274, i64 %10237, i64 %10242, i1 true) #3, !dbg !373 + %10276 = add i32 %9233, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096) to i32), !dbg !373 + %10277 = lshr exact i32 %10276, 4, !dbg !373 + %10278 = and i32 %10277, 16383, !dbg !373 + %10279 = zext nneg i32 %10278 to i64, !dbg !373 + %10280 = or disjoint i64 %10279, 4611686293372403712, !dbg !373 + %10281 = add i32 %9871, 96, !dbg !373 + %10282 = lshr exact i32 %10281, 4, !dbg !373 + %10283 = and i32 %10282, 16383, !dbg !373 + %10284 = zext nneg i32 %10283 to i64, !dbg !373 + %10285 = or disjoint i64 %10284, 4611686293338849280, !dbg !373 + %10286 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10275, 0, !dbg !373 + %10287 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10275, 1, !dbg !373 + %10288 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10275, 2, !dbg !373 + %10289 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float } %10275, 3, !dbg !373 + %10290 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10275, 4, !dbg !373 + %10291 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10275, 5, !dbg !373 + %10292 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10275, 6, !dbg !373 + %10293 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10275, 7, !dbg !373 + %10294 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10275, 8, !dbg !373 + %10295 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10275, 9, !dbg !373 + %10296 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10275, 
10, !dbg !373 + %10297 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10275, 11, !dbg !373 + %10298 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10275, 12, !dbg !373 + %10299 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10275, 13, !dbg !373 + %10300 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10275, 14, !dbg !373 + %10301 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10275, 15, !dbg !373 + %10302 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10275, 16, !dbg !373 + %10303 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10275, 17, !dbg !373 + %10304 = extractvalue { float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10275, 18, !dbg !373 + %10305 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10275, 19, !dbg !373 + %10306 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10275, 20, !dbg !373 + %10307 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10275, 21, !dbg !373 + %10308 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10275, 22, !dbg !373 + %10309 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10275, 23, !dbg !373 + %10310 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10275, 24, !dbg !373 + %10311 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float } %10275, 25, !dbg !373 + %10312 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10275, 26, !dbg !373 + %10313 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10275, 27, !dbg !373 + %10314 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10275, 28, !dbg !373 + %10315 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10275, 29, !dbg !373 + %10316 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10275, 30, !dbg !373 + %10317 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10275, 31, !dbg !373 + %10318 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } 
asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %10286, float %10287, float %10288, float %10289, float %10290, float %10291, float %10292, float %10293, float %10294, float %10295, float %10296, float %10297, float %10298, float %10299, float %10300, float %10301, float %10302, float %10303, float %10304, float %10305, float %10306, float %10307, float %10308, float %10309, float %10310, float %10311, float %10312, float %10313, float %10314, float %10315, float %10316, float %10317, i64 %10280, i64 %10285, i1 true) #3, !dbg !373 + %10319 = add i32 %9277, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096) to i32), !dbg !373 + %10320 = lshr exact i32 %10319, 4, !dbg !373 + %10321 = and i32 %10320, 16383, !dbg !373 + %10322 = zext nneg i32 %10321 to i64, !dbg !373 + %10323 = or disjoint i64 %10322, 4611686293372403712, !dbg !373 + %10324 = add i32 %9871, 8192, !dbg !373 + %10325 = lshr exact i32 %10324, 4, !dbg !373 + %10326 = and i32 %10325, 16383, !dbg !373 + %10327 = zext nneg i32 %10326 to i64, !dbg !373 + %10328 = or disjoint i64 %10327, 4611686293338849280, !dbg !373 + %10329 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10318, 0, !dbg !373 + %10330 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float } %10318, 1, !dbg !373 + %10331 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10318, 2, !dbg !373 + %10332 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10318, 3, !dbg !373 + %10333 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10318, 4, !dbg !373 + %10334 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10318, 5, !dbg !373 + %10335 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10318, 6, !dbg !373 + %10336 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10318, 7, !dbg !373 + %10337 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10318, 8, !dbg !373 + %10338 = extractvalue { float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10318, 9, !dbg !373 + %10339 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10318, 10, !dbg !373 + %10340 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10318, 11, !dbg !373 + %10341 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10318, 12, !dbg !373 + %10342 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10318, 13, !dbg !373 + %10343 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10318, 14, !dbg !373 + %10344 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10318, 15, !dbg !373 + %10345 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10318, 16, !dbg !373 + %10346 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10318, 17, !dbg !373 + %10347 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10318, 18, !dbg !373 + %10348 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10318, 19, !dbg !373 + %10349 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10318, 20, !dbg !373 + %10350 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10318, 21, !dbg !373 + %10351 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10318, 22, !dbg !373 + %10352 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float } %10318, 23, !dbg !373 + %10353 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10318, 24, !dbg !373 + %10354 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10318, 25, !dbg !373 + %10355 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10318, 26, !dbg !373 + %10356 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10318, 27, !dbg !373 + %10357 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10318, 28, !dbg !373 + %10358 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10318, 29, !dbg !373 + %10359 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10318, 30, !dbg !373 + %10360 = extractvalue { 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10318, 31, !dbg !373 + %10361 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %10329, float %10330, float %10331, float %10332, float %10333, float %10334, float %10335, float %10336, float %10337, float %10338, float %10339, float %10340, float %10341, float %10342, float %10343, float %10344, float %10345, float %10346, float %10347, float %10348, float %10349, float %10350, float %10351, float %10352, float %10353, float %10354, float %10355, float %10356, float %10357, float %10358, float %10359, float %10360, i64 %10323, i64 %10328, i1 true) #3, !dbg !373 + %10362 = add i32 %9321, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096) to i32), !dbg !373 + %10363 = lshr exact i32 %10362, 4, !dbg !373 + %10364 = and i32 %10363, 16383, !dbg !373 + %10365 = zext nneg i32 %10364 to i64, !dbg !373 + %10366 = or disjoint i64 %10365, 4611686293372403712, !dbg !373 + %10367 = add i32 %9871, 8224, !dbg !373 + %10368 = lshr exact i32 %10367, 4, !dbg !373 + %10369 = and i32 %10368, 16383, !dbg !373 + %10370 = zext nneg i32 %10369 to i64, !dbg !373 + %10371 = or disjoint i64 %10370, 4611686293338849280, !dbg !373 + %10372 = 
extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10361, 0, !dbg !373 + %10373 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10361, 1, !dbg !373 + %10374 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10361, 2, !dbg !373 + %10375 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10361, 3, !dbg !373 + %10376 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10361, 4, !dbg !373 + %10377 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10361, 5, !dbg !373 + %10378 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10361, 6, !dbg !373 + %10379 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10361, 7, !dbg !373 + %10380 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10361, 8, !dbg !373 + %10381 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10361, 9, !dbg !373 + %10382 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10361, 10, !dbg !373 + %10383 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10361, 11, !dbg !373 + %10384 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10361, 12, !dbg !373 + %10385 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10361, 13, !dbg !373 + %10386 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float } %10361, 14, !dbg !373 + %10387 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10361, 15, !dbg !373 + %10388 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10361, 16, !dbg !373 + %10389 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10361, 17, !dbg !373 + %10390 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10361, 18, !dbg !373 + %10391 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10361, 19, !dbg !373 + %10392 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10361, 20, !dbg !373 + %10393 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10361, 21, !dbg !373 + %10394 
= extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10361, 22, !dbg !373 + %10395 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10361, 23, !dbg !373 + %10396 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10361, 24, !dbg !373 + %10397 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10361, 25, !dbg !373 + %10398 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10361, 26, !dbg !373 + %10399 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10361, 27, !dbg !373 + %10400 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10361, 28, !dbg !373 + %10401 = extractvalue { float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10361, 29, !dbg !373 + %10402 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10361, 30, !dbg !373 + %10403 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10361, 31, !dbg !373 + %10404 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %10372, float %10373, float %10374, float %10375, float %10376, float %10377, float %10378, float %10379, float %10380, float %10381, float %10382, float %10383, float %10384, float %10385, float %10386, float %10387, float %10388, float %10389, float %10390, float %10391, float %10392, float %10393, float %10394, float %10395, float %10396, float %10397, float %10398, float %10399, float %10400, float %10401, float %10402, float %10403, i64 %10366, i64 %10371, i1 true) #3, !dbg !373 + %10405 = add i32 %9365, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096) to i32), 
!dbg !373 + %10406 = lshr exact i32 %10405, 4, !dbg !373 + %10407 = and i32 %10406, 16383, !dbg !373 + %10408 = zext nneg i32 %10407 to i64, !dbg !373 + %10409 = or disjoint i64 %10408, 4611686293372403712, !dbg !373 + %10410 = add i32 %9871, 8256, !dbg !373 + %10411 = lshr exact i32 %10410, 4, !dbg !373 + %10412 = and i32 %10411, 16383, !dbg !373 + %10413 = zext nneg i32 %10412 to i64, !dbg !373 + %10414 = or disjoint i64 %10413, 4611686293338849280, !dbg !373 + %10415 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10404, 0, !dbg !373 + %10416 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10404, 1, !dbg !373 + %10417 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10404, 2, !dbg !373 + %10418 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10404, 3, !dbg !373 + %10419 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10404, 4, !dbg !373 + %10420 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float } %10404, 5, !dbg !373 + %10421 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10404, 6, !dbg !373 + %10422 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10404, 7, !dbg !373 + %10423 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10404, 8, !dbg !373 + %10424 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10404, 9, !dbg !373 + %10425 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10404, 10, !dbg !373 + %10426 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10404, 11, !dbg !373 + %10427 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10404, 12, 
!dbg !373 + %10428 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10404, 13, !dbg !373 + %10429 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10404, 14, !dbg !373 + %10430 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10404, 15, !dbg !373 + %10431 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10404, 16, !dbg !373 + %10432 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10404, 17, !dbg !373 + %10433 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10404, 18, !dbg !373 + %10434 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10404, 19, !dbg !373 + %10435 = extractvalue { float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10404, 20, !dbg !373 + %10436 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10404, 21, !dbg !373 + %10437 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10404, 22, !dbg !373 + %10438 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10404, 23, !dbg !373 + %10439 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10404, 24, !dbg !373 + %10440 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10404, 25, !dbg !373 + %10441 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10404, 26, !dbg !373 + %10442 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float } %10404, 27, !dbg !373 + %10443 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10404, 28, !dbg !373 + %10444 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10404, 29, !dbg !373 + %10445 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10404, 30, !dbg !373 + %10446 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10404, 31, !dbg !373 + %10447 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %10415, float %10416, float %10417, float %10418, float %10419, float %10420, float %10421, float %10422, float %10423, float 
%10424, float %10425, float %10426, float %10427, float %10428, float %10429, float %10430, float %10431, float %10432, float %10433, float %10434, float %10435, float %10436, float %10437, float %10438, float %10439, float %10440, float %10441, float %10442, float %10443, float %10444, float %10445, float %10446, i64 %10409, i64 %10414, i1 true) #3, !dbg !373 + %10448 = add i32 %9409, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096) to i32), !dbg !373 + %10449 = lshr exact i32 %10448, 4, !dbg !373 + %10450 = and i32 %10449, 16383, !dbg !373 + %10451 = zext nneg i32 %10450 to i64, !dbg !373 + %10452 = or disjoint i64 %10451, 4611686293372403712, !dbg !373 + %10453 = add i32 %9871, 8288, !dbg !373 + %10454 = lshr exact i32 %10453, 4, !dbg !373 + %10455 = and i32 %10454, 16383, !dbg !373 + %10456 = zext nneg i32 %10455 to i64, !dbg !373 + %10457 = or disjoint i64 %10456, 4611686293338849280, !dbg !373 + %10458 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10447, 0, !dbg !373 + %10459 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10447, 1, !dbg !373 + %10460 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10447, 2, !dbg !373 + %10461 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float } %10447, 3, !dbg !373 + %10462 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10447, 4, !dbg !373 + %10463 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10447, 5, !dbg !373 + %10464 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10447, 6, !dbg !373 + %10465 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10447, 7, !dbg !373 + %10466 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10447, 8, !dbg !373 + %10467 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10447, 9, !dbg !373 + %10468 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10447, 10, !dbg !373 + %10469 = extractvalue { float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10447, 11, !dbg !373 + %10470 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10447, 12, !dbg !373 + %10471 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10447, 13, !dbg !373 + %10472 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10447, 14, !dbg !373 + %10473 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10447, 15, !dbg !373 + %10474 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10447, 16, !dbg !373 + %10475 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10447, 17, !dbg !373 + %10476 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10447, 18, !dbg !373 + %10477 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10447, 19, !dbg !373 + %10478 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10447, 20, !dbg !373 + %10479 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10447, 21, !dbg !373 + %10480 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10447, 22, !dbg !373 + %10481 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10447, 23, !dbg !373 + %10482 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10447, 24, !dbg !373 + %10483 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float } %10447, 25, !dbg !373 + %10484 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10447, 26, !dbg !373 + %10485 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10447, 27, !dbg !373 + %10486 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10447, 28, !dbg !373 + %10487 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10447, 29, !dbg !373 + %10488 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10447, 30, !dbg !373 + %10489 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10447, 31, !dbg !373 + %10490 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect 
"wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %10458, float %10459, float %10460, float %10461, float %10462, float %10463, float %10464, float %10465, float %10466, float %10467, float %10468, float %10469, float %10470, float %10471, float %10472, float %10473, float %10474, float %10475, float %10476, float %10477, float %10478, float %10479, float %10480, float %10481, float %10482, float %10483, float %10484, float %10485, float %10486, float %10487, float %10488, float %10489, i64 %10452, i64 %10457, i1 true) #3, !dbg !373 + %10491 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10490, 0, !dbg !373 + %10492 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10490, 1, !dbg !373 + %10493 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10490, 2, !dbg !373 + %10494 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10490, 3, !dbg !373 + %10495 = 
extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10490, 4, !dbg !373 + %10496 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10490, 5, !dbg !373 + %10497 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10490, 6, !dbg !373 + %10498 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10490, 7, !dbg !373 + %10499 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10490, 8, !dbg !373 + %10500 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10490, 9, !dbg !373 + %10501 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10490, 10, !dbg !373 + %10502 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10490, 11, !dbg !373 + %10503 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10490, 12, !dbg !373 + %10504 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10490, 13, !dbg !373 + %10505 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10490, 14, !dbg !373 + %10506 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10490, 15, !dbg !373 + %10507 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10490, 16, !dbg !373 + %10508 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10490, 17, !dbg !373 + %10509 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float } %10490, 18, !dbg !373 + %10510 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10490, 19, !dbg !373 + %10511 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10490, 20, !dbg !373 + %10512 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10490, 21, !dbg !373 + %10513 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10490, 22, !dbg !373 + %10514 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10490, 23, !dbg !373 + %10515 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10490, 24, !dbg !373 + %10516 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10490, 25, !dbg !373 + 
%10517 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10490, 26, !dbg !373 + %10518 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10490, 27, !dbg !373 + %10519 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10490, 28, !dbg !373 + %10520 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10490, 29, !dbg !373 + %10521 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10490, 30, !dbg !373 + %10522 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10490, 31, !dbg !373 + tail call void @llvm.nvvm.wgmma.commit_group.sync.aligned(), !dbg !373 + %10523 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr 
addrspace(3), i32, i32 } asm sideeffect "// wait for regs: $0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37\0A\09wgmma.wait_group.sync.aligned 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37"(float %10491, float %10492, float %10493, float %10494, float %10495, float %10496, float %10497, float %10498, float %10499, float %10500, float %10501, float %10502, float %10503, float %10504, float %10505, float %10506, float %10507, float %10508, float %10509, float %10510, float %10511, float %10512, float %10513, float %10514, float %10515, float %10516, float %10517, float %10518, float %10519, float %10520, float %10521, float %10522, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096), i32 0, i32 0, ptr addrspace(3) %9806, i32 0, i32 0) #3, !dbg !373 + %10524 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %10523, 0, !dbg !373 + %10525 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %10523, 1, !dbg !373 + %10526 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr 
addrspace(3), i32, i32 } %10523, 2, !dbg !373 + %10527 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %10523, 3, !dbg !373 + %10528 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %10523, 4, !dbg !373 + %10529 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %10523, 5, !dbg !373 + %10530 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %10523, 6, !dbg !373 + %10531 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %10523, 7, !dbg !373 + %10532 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } 
%10523, 8, !dbg !373 + %10533 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %10523, 9, !dbg !373 + %10534 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %10523, 10, !dbg !373 + %10535 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %10523, 11, !dbg !373 + %10536 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %10523, 12, !dbg !373 + %10537 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %10523, 13, !dbg !373 + %10538 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %10523, 14, !dbg !373 + 
%10539 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %10523, 15, !dbg !373 + %10540 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %10523, 16, !dbg !373 + %10541 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %10523, 17, !dbg !373 + %10542 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %10523, 18, !dbg !373 + %10543 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %10523, 19, !dbg !373 + %10544 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %10523, 20, !dbg !373 + %10545 = extractvalue { 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %10523, 21, !dbg !373 + %10546 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %10523, 22, !dbg !373 + %10547 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %10523, 23, !dbg !373 + %10548 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %10523, 24, !dbg !373 + %10549 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %10523, 25, !dbg !373 + %10550 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %10523, 26, !dbg !373 + %10551 = extractvalue { float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %10523, 27, !dbg !373 + %10552 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %10523, 28, !dbg !373 + %10553 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %10523, 29, !dbg !373 + %10554 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %10523, 30, !dbg !373 + %10555 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %10523, 31, !dbg !373 + %10556 = fsub float %10524, %10153, !dbg !374 + %10557 = fsub float %10525, %10155, !dbg !374 + %10558 = fsub float %10526, %10153, !dbg !374 + %10559 = fsub float %10527, %10155, !dbg !374 + %10560 = fsub float %10528, %10157, !dbg !374 + %10561 = fsub float %10529, %10159, !dbg !374 + %10562 = fsub float %10530, %10157, !dbg !374 + %10563 = fsub float %10531, %10159, !dbg !374 + %10564 = fsub 
float %10532, %10161, !dbg !374 + %10565 = fsub float %10533, %10163, !dbg !374 + %10566 = fsub float %10534, %10161, !dbg !374 + %10567 = fsub float %10535, %10163, !dbg !374 + %10568 = fsub float %10536, %10165, !dbg !374 + %10569 = fsub float %10537, %10167, !dbg !374 + %10570 = fsub float %10538, %10165, !dbg !374 + %10571 = fsub float %10539, %10167, !dbg !374 + %10572 = fsub float %10540, %10169, !dbg !374 + %10573 = fsub float %10541, %10171, !dbg !374 + %10574 = fsub float %10542, %10169, !dbg !374 + %10575 = fsub float %10543, %10171, !dbg !374 + %10576 = fsub float %10544, %10173, !dbg !374 + %10577 = fsub float %10545, %10175, !dbg !374 + %10578 = fsub float %10546, %10173, !dbg !374 + %10579 = fsub float %10547, %10175, !dbg !374 + %10580 = fsub float %10548, %10177, !dbg !374 + %10581 = fsub float %10549, %10179, !dbg !374 + %10582 = fsub float %10550, %10177, !dbg !374 + %10583 = fsub float %10551, %10179, !dbg !374 + %10584 = fsub float %10552, %10181, !dbg !374 + %10585 = fsub float %10553, %10183, !dbg !374 + %10586 = fsub float %10554, %10181, !dbg !374 + %10587 = fsub float %10555, %10183, !dbg !374 + %10588 = fmul float %.0.i, %10556, !dbg !375 + %10589 = fmul float %.0.i1148, %10557, !dbg !375 + %10590 = fmul float %.0.i1151, %10558, !dbg !375 + %10591 = fmul float %.0.i1154, %10559, !dbg !375 + %10592 = fmul float %.0.i1157, %10560, !dbg !375 + %10593 = fmul float %.0.i1160, %10561, !dbg !375 + %10594 = fmul float %.0.i1163, %10562, !dbg !375 + %10595 = fmul float %.0.i1166, %10563, !dbg !375 + %10596 = fmul float %.0.i1169, %10564, !dbg !375 + %10597 = fmul float %.0.i1172, %10565, !dbg !375 + %10598 = fmul float %.0.i1175, %10566, !dbg !375 + %10599 = fmul float %.0.i1178, %10567, !dbg !375 + %10600 = fmul float %.0.i1181, %10568, !dbg !375 + %10601 = fmul float %.0.i1184, %10569, !dbg !375 + %10602 = fmul float %.0.i1187, %10570, !dbg !375 + %10603 = fmul float %.0.i1190, %10571, !dbg !375 + %10604 = fmul float %.0.i1193, %10572, !dbg !375 
+ %10605 = fmul float %.0.i1196, %10573, !dbg !375 + %10606 = fmul float %.0.i1199, %10574, !dbg !375 + %10607 = fmul float %.0.i1202, %10575, !dbg !375 + %10608 = fmul float %.0.i1205, %10576, !dbg !375 + %10609 = fmul float %.0.i1208, %10577, !dbg !375 + %10610 = fmul float %.0.i1211, %10578, !dbg !375 + %10611 = fmul float %.0.i1214, %10579, !dbg !375 + %10612 = fmul float %.0.i1217, %10580, !dbg !375 + %10613 = fmul float %.0.i1220, %10581, !dbg !375 + %10614 = fmul float %.0.i1223, %10582, !dbg !375 + %10615 = fmul float %.0.i1226, %10583, !dbg !375 + %10616 = fmul float %.0.i1229, %10584, !dbg !375 + %10617 = fmul float %.0.i1232, %10585, !dbg !375 + %10618 = fmul float %.0.i1235, %10586, !dbg !375 + %10619 = fmul float %.0.i1238, %10587, !dbg !375 + %10620 = fptrunc float %10588 to bfloat, !dbg !376 + %10621 = select i1 %9047, bfloat %10620, bfloat 0xR0000, !dbg !377 + %10622 = fptrunc float %10589 to bfloat, !dbg !376 + %10623 = select i1 %9048, bfloat %10622, bfloat 0xR0000, !dbg !377 + %10624 = fptrunc float %10590 to bfloat, !dbg !376 + %10625 = select i1 %9047, bfloat %10624, bfloat 0xR0000, !dbg !377 + %10626 = fptrunc float %10591 to bfloat, !dbg !376 + %10627 = select i1 %9048, bfloat %10626, bfloat 0xR0000, !dbg !377 + %10628 = fptrunc float %10592 to bfloat, !dbg !376 + %10629 = select i1 %9049, bfloat %10628, bfloat 0xR0000, !dbg !377 + %10630 = fptrunc float %10593 to bfloat, !dbg !376 + %10631 = select i1 %9050, bfloat %10630, bfloat 0xR0000, !dbg !377 + %10632 = fptrunc float %10594 to bfloat, !dbg !376 + %10633 = select i1 %9049, bfloat %10632, bfloat 0xR0000, !dbg !377 + %10634 = fptrunc float %10595 to bfloat, !dbg !376 + %10635 = select i1 %9050, bfloat %10634, bfloat 0xR0000, !dbg !377 + %10636 = fptrunc float %10596 to bfloat, !dbg !376 + %10637 = select i1 %9051, bfloat %10636, bfloat 0xR0000, !dbg !377 + %10638 = fptrunc float %10597 to bfloat, !dbg !376 + %10639 = select i1 %9052, bfloat %10638, bfloat 0xR0000, !dbg !377 + %10640 = 
fptrunc float %10598 to bfloat, !dbg !376 + %10641 = select i1 %9051, bfloat %10640, bfloat 0xR0000, !dbg !377 + %10642 = fptrunc float %10599 to bfloat, !dbg !376 + %10643 = select i1 %9052, bfloat %10642, bfloat 0xR0000, !dbg !377 + %10644 = fptrunc float %10600 to bfloat, !dbg !376 + %10645 = select i1 %9053, bfloat %10644, bfloat 0xR0000, !dbg !377 + %10646 = fptrunc float %10601 to bfloat, !dbg !376 + %10647 = select i1 %9054, bfloat %10646, bfloat 0xR0000, !dbg !377 + %10648 = fptrunc float %10602 to bfloat, !dbg !376 + %10649 = select i1 %9053, bfloat %10648, bfloat 0xR0000, !dbg !377 + %10650 = fptrunc float %10603 to bfloat, !dbg !376 + %10651 = select i1 %9054, bfloat %10650, bfloat 0xR0000, !dbg !377 + %10652 = fptrunc float %10604 to bfloat, !dbg !376 + %10653 = select i1 %9055, bfloat %10652, bfloat 0xR0000, !dbg !377 + %10654 = fptrunc float %10605 to bfloat, !dbg !376 + %10655 = select i1 %9056, bfloat %10654, bfloat 0xR0000, !dbg !377 + %10656 = fptrunc float %10606 to bfloat, !dbg !376 + %10657 = select i1 %9055, bfloat %10656, bfloat 0xR0000, !dbg !377 + %10658 = fptrunc float %10607 to bfloat, !dbg !376 + %10659 = select i1 %9056, bfloat %10658, bfloat 0xR0000, !dbg !377 + %10660 = fptrunc float %10608 to bfloat, !dbg !376 + %10661 = select i1 %9057, bfloat %10660, bfloat 0xR0000, !dbg !377 + %10662 = fptrunc float %10609 to bfloat, !dbg !376 + %10663 = select i1 %9058, bfloat %10662, bfloat 0xR0000, !dbg !377 + %10664 = fptrunc float %10610 to bfloat, !dbg !376 + %10665 = select i1 %9057, bfloat %10664, bfloat 0xR0000, !dbg !377 + %10666 = fptrunc float %10611 to bfloat, !dbg !376 + %10667 = select i1 %9058, bfloat %10666, bfloat 0xR0000, !dbg !377 + %10668 = fptrunc float %10612 to bfloat, !dbg !376 + %10669 = select i1 %9059, bfloat %10668, bfloat 0xR0000, !dbg !377 + %10670 = fptrunc float %10613 to bfloat, !dbg !376 + %10671 = select i1 %9060, bfloat %10670, bfloat 0xR0000, !dbg !377 + %10672 = fptrunc float %10614 to bfloat, !dbg !376 + 
%10673 = select i1 %9059, bfloat %10672, bfloat 0xR0000, !dbg !377 + %10674 = fptrunc float %10615 to bfloat, !dbg !376 + %10675 = select i1 %9060, bfloat %10674, bfloat 0xR0000, !dbg !377 + %10676 = fptrunc float %10616 to bfloat, !dbg !376 + %10677 = select i1 %9061, bfloat %10676, bfloat 0xR0000, !dbg !377 + %10678 = fptrunc float %10617 to bfloat, !dbg !376 + %10679 = select i1 %9062, bfloat %10678, bfloat 0xR0000, !dbg !377 + %10680 = fptrunc float %10618 to bfloat, !dbg !376 + %10681 = select i1 %9061, bfloat %10680, bfloat 0xR0000, !dbg !377 + %10682 = fptrunc float %10619 to bfloat, !dbg !376 + %10683 = select i1 %9062, bfloat %10682, bfloat 0xR0000, !dbg !377 + %10684 = insertelement <2 x bfloat> poison, bfloat %10621, i64 0, !dbg !378 + %10685 = insertelement <2 x bfloat> %10684, bfloat %10623, i64 1, !dbg !378 + %10686 = bitcast <2 x bfloat> %10685 to i32, !dbg !378 + %10687 = insertelement <2 x bfloat> poison, bfloat %10625, i64 0, !dbg !378 + %10688 = insertelement <2 x bfloat> %10687, bfloat %10627, i64 1, !dbg !378 + %10689 = bitcast <2 x bfloat> %10688 to i32, !dbg !378 + %10690 = insertelement <2 x bfloat> poison, bfloat %10629, i64 0, !dbg !378 + %10691 = insertelement <2 x bfloat> %10690, bfloat %10631, i64 1, !dbg !378 + %10692 = bitcast <2 x bfloat> %10691 to i32, !dbg !378 + %10693 = insertelement <2 x bfloat> poison, bfloat %10633, i64 0, !dbg !378 + %10694 = insertelement <2 x bfloat> %10693, bfloat %10635, i64 1, !dbg !378 + %10695 = bitcast <2 x bfloat> %10694 to i32, !dbg !378 + %10696 = insertelement <2 x bfloat> poison, bfloat %10637, i64 0, !dbg !378 + %10697 = insertelement <2 x bfloat> %10696, bfloat %10639, i64 1, !dbg !378 + %10698 = bitcast <2 x bfloat> %10697 to i32, !dbg !378 + %10699 = insertelement <2 x bfloat> poison, bfloat %10641, i64 0, !dbg !378 + %10700 = insertelement <2 x bfloat> %10699, bfloat %10643, i64 1, !dbg !378 + %10701 = bitcast <2 x bfloat> %10700 to i32, !dbg !378 + %10702 = insertelement <2 x bfloat> 
poison, bfloat %10645, i64 0, !dbg !378 + %10703 = insertelement <2 x bfloat> %10702, bfloat %10647, i64 1, !dbg !378 + %10704 = bitcast <2 x bfloat> %10703 to i32, !dbg !378 + %10705 = insertelement <2 x bfloat> poison, bfloat %10649, i64 0, !dbg !378 + %10706 = insertelement <2 x bfloat> %10705, bfloat %10651, i64 1, !dbg !378 + %10707 = bitcast <2 x bfloat> %10706 to i32, !dbg !378 + %10708 = insertelement <2 x bfloat> poison, bfloat %10653, i64 0, !dbg !378 + %10709 = insertelement <2 x bfloat> %10708, bfloat %10655, i64 1, !dbg !378 + %10710 = bitcast <2 x bfloat> %10709 to i32, !dbg !378 + %10711 = insertelement <2 x bfloat> poison, bfloat %10657, i64 0, !dbg !378 + %10712 = insertelement <2 x bfloat> %10711, bfloat %10659, i64 1, !dbg !378 + %10713 = bitcast <2 x bfloat> %10712 to i32, !dbg !378 + %10714 = insertelement <2 x bfloat> poison, bfloat %10661, i64 0, !dbg !378 + %10715 = insertelement <2 x bfloat> %10714, bfloat %10663, i64 1, !dbg !378 + %10716 = bitcast <2 x bfloat> %10715 to i32, !dbg !378 + %10717 = insertelement <2 x bfloat> poison, bfloat %10665, i64 0, !dbg !378 + %10718 = insertelement <2 x bfloat> %10717, bfloat %10667, i64 1, !dbg !378 + %10719 = bitcast <2 x bfloat> %10718 to i32, !dbg !378 + %10720 = insertelement <2 x bfloat> poison, bfloat %10669, i64 0, !dbg !378 + %10721 = insertelement <2 x bfloat> %10720, bfloat %10671, i64 1, !dbg !378 + %10722 = bitcast <2 x bfloat> %10721 to i32, !dbg !378 + %10723 = insertelement <2 x bfloat> poison, bfloat %10673, i64 0, !dbg !378 + %10724 = insertelement <2 x bfloat> %10723, bfloat %10675, i64 1, !dbg !378 + %10725 = bitcast <2 x bfloat> %10724 to i32, !dbg !378 + %10726 = insertelement <2 x bfloat> poison, bfloat %10677, i64 0, !dbg !378 + %10727 = insertelement <2 x bfloat> %10726, bfloat %10679, i64 1, !dbg !378 + %10728 = bitcast <2 x bfloat> %10727 to i32, !dbg !378 + %10729 = insertelement <2 x bfloat> poison, bfloat %10681, i64 0, !dbg !378 + %10730 = insertelement <2 x bfloat> 
%10729, bfloat %10683, i64 1, !dbg !378 + %10731 = bitcast <2 x bfloat> %10730 to i32, !dbg !378 + tail call void @llvm.nvvm.wgmma.fence.sync.aligned(), !dbg !378 + %10732 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %.pn3471624, float %.pn3451625, float %.pn3431626, float %.pn3411627, float %.pn3391628, float %.pn3371629, float %.pn3351630, float %.pn3331631, float %.pn3311632, float %.pn3291633, float %.pn3271634, float %.pn3251635, float %.pn3231636, float %.pn3211637, float %.pn3191638, float %.pn3171639, float %.pn3151640, float %.pn3131641, float %.pn3111642, float %.pn3091643, float %.pn3071644, float %.pn3051645, float %.pn3031646, float %.pn3011647, float %.pn2991648, float %.pn2971649, float %.pn2951650, float %.pn2931651, float %.pn2911652, float %.pn2891653, float %.pn2871654, float %.pn2851655, 
float %.pn2831656, float %.pn2811657, float %.pn2791658, float %.pn2771659, float %.pn2751660, float %.pn2731661, float %.pn2711662, float %.pn2691663, float %.pn2671664, float %.pn2651665, float %.pn2631666, float %.pn2611667, float %.pn2591668, float %.pn2571669, float %.pn2551670, float %.pn2531671, float %.pn2511672, float %.pn2491673, float %.pn2471674, float %.pn2451675, float %.pn2431676, float %.pn2411677, float %.pn2391678, float %.pn2371679, float %.pn2351680, float %.pn2331681, float %.pn2311682, float %.pn2291683, float %.pn2271684, float %.pn2251685, float %.pn2231686, float %.pn2211687, i32 %10686, i32 %10689, i32 %10692, i32 %10695, i64 %9143, i1 true) #3, !dbg !378 + %10733 = add i32 %9139, 2048, !dbg !378 + %10734 = lshr exact i32 %10733, 4, !dbg !378 + %10735 = and i32 %10734, 16383, !dbg !378 + %10736 = zext nneg i32 %10735 to i64, !dbg !378 + %10737 = or disjoint i64 %10736, 4611686293338849280, !dbg !378 + %10738 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 0, !dbg !378 + %10739 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 1, !dbg !378 + %10740 = extractvalue { float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 2, !dbg !378 + %10741 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 3, !dbg !378 + %10742 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 4, !dbg !378 + %10743 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 5, !dbg !378 + %10744 = extractvalue { float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 6, !dbg !378 + %10745 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 7, !dbg !378 + %10746 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 8, !dbg !378 + %10747 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 9, !dbg !378 + %10748 = extractvalue { float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 10, !dbg !378 + %10749 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 11, !dbg !378 + %10750 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 12, !dbg !378 + %10751 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 13, !dbg !378 + %10752 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 14, !dbg !378 + %10753 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 15, !dbg !378 + %10754 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 16, !dbg !378 + %10755 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 17, !dbg !378 + %10756 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 18, !dbg !378 + %10757 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 19, !dbg !378 + %10758 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 20, !dbg !378 + %10759 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 21, !dbg !378 + %10760 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 22, !dbg !378 + %10761 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 23, !dbg !378 + %10762 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 24, !dbg !378 + %10763 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 25, !dbg !378 + %10764 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 26, !dbg !378 + %10765 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 27, !dbg !378 + %10766 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 28, !dbg !378 + %10767 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 29, !dbg !378 + %10768 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 30, !dbg !378 + %10769 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 31, !dbg !378 + %10770 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 32, !dbg !378 + %10771 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 33, !dbg !378 + %10772 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 34, !dbg !378 + %10773 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 35, !dbg !378 + %10774 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 36, !dbg !378 + %10775 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 37, !dbg !378 + %10776 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 38, !dbg !378 + %10777 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 39, !dbg !378 + %10778 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 40, !dbg !378 + %10779 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 41, !dbg !378 + %10780 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 42, !dbg !378 + %10781 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 43, !dbg !378 + %10782 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 44, !dbg !378 + %10783 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 45, !dbg !378 + %10784 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 46, !dbg !378 + %10785 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 47, !dbg !378 + %10786 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 48, !dbg !378 + %10787 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 49, !dbg !378 + %10788 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 50, !dbg !378 + %10789 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 51, !dbg !378 + %10790 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 52, !dbg !378 + %10791 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 53, !dbg !378 + %10792 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 54, !dbg !378 + %10793 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 55, !dbg !378 + %10794 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 56, !dbg !378 + %10795 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 57, !dbg !378 + %10796 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 58, !dbg !378 + %10797 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 59, !dbg !378 + %10798 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 60, !dbg !378 + %10799 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 61, !dbg !378 + %10800 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 62, !dbg !378 + %10801 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 63, !dbg !378 + %10802 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", 
"=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %10738, float %10739, float %10740, float %10741, float %10742, float %10743, float %10744, float %10745, float %10746, float %10747, float %10748, float %10749, float %10750, float %10751, float %10752, float %10753, float %10754, float %10755, float %10756, float %10757, float %10758, float %10759, float %10760, float %10761, float %10762, float %10763, float %10764, float %10765, float %10766, float %10767, float %10768, float %10769, float %10770, float %10771, float %10772, float %10773, float %10774, float %10775, float %10776, float %10777, float %10778, float %10779, float %10780, float %10781, float %10782, float %10783, float %10784, float %10785, float %10786, float %10787, float %10788, float %10789, float %10790, float %10791, float %10792, float %10793, float %10794, float %10795, float %10796, float %10797, float %10798, float %10799, float %10800, float %10801, i32 %10698, i32 %10701, i32 %10704, i32 %10707, i64 %10737, i1 true) #3, !dbg !378 + %10803 = add i32 %9139, 4096, !dbg !378 + %10804 = lshr exact i32 %10803, 4, !dbg !378 + %10805 = and i32 %10804, 16383, !dbg !378 + %10806 = zext nneg i32 %10805 to i64, !dbg !378 + %10807 = or disjoint i64 %10806, 4611686293338849280, !dbg !378 + %10808 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 0, !dbg !378 + %10809 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 1, !dbg !378 + %10810 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 2, !dbg !378 + %10811 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 3, !dbg !378 + %10812 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float } %10802, 4, !dbg !378 + %10813 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 5, !dbg !378 + %10814 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 6, !dbg !378 + %10815 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 7, !dbg !378 + %10816 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float } %10802, 8, !dbg !378 + %10817 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 9, !dbg !378 + %10818 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 10, !dbg !378 + %10819 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 11, !dbg !378 + %10820 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float } %10802, 12, !dbg !378 + %10821 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 13, !dbg !378 + %10822 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 14, !dbg !378 + %10823 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 15, !dbg !378 + %10824 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float } %10802, 16, !dbg !378 + %10825 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 17, !dbg !378 + %10826 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 18, !dbg !378 + %10827 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 19, !dbg !378 + %10828 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float } %10802, 20, !dbg !378 + %10829 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 21, !dbg !378 + %10830 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 22, !dbg !378 + %10831 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 23, !dbg !378 + %10832 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float } %10802, 24, !dbg !378 + %10833 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 25, !dbg !378 + %10834 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 26, !dbg !378 + %10835 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 27, !dbg !378 + %10836 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float } %10802, 28, !dbg !378 + %10837 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 29, !dbg !378 + %10838 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 30, !dbg !378 + %10839 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 31, !dbg !378 + %10840 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float } %10802, 32, !dbg !378 + %10841 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 33, !dbg !378 + %10842 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 34, !dbg !378 + %10843 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 35, !dbg !378 + %10844 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float } %10802, 36, !dbg !378 + %10845 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 37, !dbg !378 + %10846 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 38, !dbg !378 + %10847 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 39, !dbg !378 + %10848 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } 
%10802, 40, !dbg !378 + %10849 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 41, !dbg !378 + %10850 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 42, !dbg !378 + %10851 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 43, !dbg !378 + %10852 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 44, 
!dbg !378 + %10853 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 45, !dbg !378 + %10854 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 46, !dbg !378 + %10855 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 47, !dbg !378 + %10856 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 48, !dbg !378 + 
%10857 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 49, !dbg !378 + %10858 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 50, !dbg !378 + %10859 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 51, !dbg !378 + %10860 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 52, !dbg !378 + %10861 = 
extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 53, !dbg !378 + %10862 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 54, !dbg !378 + %10863 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 55, !dbg !378 + %10864 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 56, !dbg !378 + %10865 = 
extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 57, !dbg !378 + %10866 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 58, !dbg !378 + %10867 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 59, !dbg !378 + %10868 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 60, !dbg !378 + %10869 = 
extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 61, !dbg !378 + %10870 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 62, !dbg !378 + %10871 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 63, !dbg !378 + %10872 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect 
"wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %10808, float %10809, float %10810, float %10811, float %10812, float %10813, float %10814, float %10815, float %10816, float %10817, float %10818, float %10819, float %10820, float %10821, float %10822, float %10823, float %10824, float %10825, float %10826, float %10827, float %10828, float %10829, float %10830, float %10831, float %10832, float %10833, float %10834, float %10835, float %10836, float %10837, float %10838, float %10839, float %10840, float %10841, float %10842, float %10843, float %10844, float %10845, float %10846, float %10847, float %10848, float %10849, float %10850, float %10851, float %10852, float %10853, float %10854, float %10855, float %10856, float %10857, float %10858, float %10859, float %10860, float %10861, float %10862, float %10863, float %10864, float %10865, float %10866, float %10867, float %10868, float %10869, float %10870, float %10871, i32 %10710, i32 %10713, i32 %10716, i32 %10719, i64 %10807, i1 true) #3, !dbg !378 + %10873 = add i32 %9139, 6144, !dbg !378 + %10874 = lshr exact i32 %10873, 4, !dbg !378 + %10875 = and i32 %10874, 16383, !dbg !378 + %10876 = zext nneg i32 %10875 to i64, !dbg !378 + %10877 = or disjoint i64 %10876, 4611686293338849280, !dbg !378 + %10878 = extractvalue { float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 0, !dbg !378 + %10879 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 1, !dbg !378 + %10880 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 2, !dbg !378 + %10881 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 3, !dbg !378 + %10882 = extractvalue { float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 4, !dbg !378 + %10883 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 5, !dbg !378 + %10884 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 6, !dbg !378 + %10885 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 7, !dbg !378 + %10886 = extractvalue { float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 8, !dbg !378 + %10887 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 9, !dbg !378 + %10888 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 10, !dbg !378 + %10889 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 11, !dbg !378 + %10890 = extractvalue { float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 12, !dbg !378 + %10891 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 13, !dbg !378 + %10892 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 14, !dbg !378 + %10893 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 15, !dbg !378 + %10894 = extractvalue { float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 16, !dbg !378 + %10895 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 17, !dbg !378 + %10896 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 18, !dbg !378 + %10897 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 19, !dbg !378 + %10898 = extractvalue { float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 20, !dbg !378 + %10899 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 21, !dbg !378 + %10900 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 22, !dbg !378 + %10901 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 23, !dbg !378 + %10902 = extractvalue { float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 24, !dbg !378 + %10903 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 25, !dbg !378 + %10904 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 26, !dbg !378 + %10905 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 27, !dbg !378 + %10906 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 28, !dbg !378 + %10907 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 29, !dbg !378 + %10908 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 30, !dbg !378 + %10909 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 31, !dbg !378 + %10910 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 32, !dbg !378 + %10911 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 33, !dbg !378 + %10912 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 34, !dbg !378 + %10913 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 35, !dbg !378 + %10914 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 36, !dbg !378 + %10915 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 37, !dbg !378 + %10916 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 38, !dbg !378 + %10917 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 39, !dbg !378 + %10918 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 40, !dbg !378 + %10919 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 41, !dbg !378 + %10920 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 42, !dbg !378 + %10921 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 43, !dbg !378 + %10922 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 44, !dbg !378 + %10923 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 45, !dbg !378 + %10924 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 46, !dbg !378 + %10925 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 47, !dbg !378 + %10926 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 48, !dbg !378 + %10927 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 49, !dbg !378 + %10928 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 50, !dbg !378 + %10929 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 51, !dbg !378 + %10930 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 52, !dbg !378 + %10931 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 53, !dbg !378 + %10932 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 54, !dbg !378 + %10933 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 55, !dbg !378 + %10934 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 56, !dbg !378 + %10935 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 57, !dbg !378 + %10936 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 58, !dbg !378 + %10937 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 59, !dbg !378 + %10938 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 60, !dbg !378 + %10939 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 61, !dbg !378 + %10940 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 62, !dbg !378 + %10941 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 63, !dbg !378 + %10942 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %10878, float %10879, float %10880, float %10881, float %10882, float %10883, float %10884, float %10885, float %10886, float %10887, float %10888, float %10889, float %10890, float %10891, float %10892, float %10893, float %10894, float %10895, float %10896, float %10897, float %10898, float %10899, float %10900, float %10901, float %10902, float %10903, float %10904, float %10905, float %10906, float %10907, float %10908, float %10909, float %10910, float %10911, float %10912, float %10913, float %10914, float %10915, float %10916, float %10917, float %10918, float %10919, float %10920, float %10921, float %10922, float %10923, float %10924, float %10925, float %10926, float %10927, float %10928, float %10929, float %10930, float %10931, float %10932, float %10933, float %10934, float %10935, float %10936, float %10937, float %10938, float %10939, float %10940, float %10941, i32 %10722, i32 %10725, i32 %10728, i32 %10731, 
i64 %10877, i1 true) #3, !dbg !378 + %10943 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 0, !dbg !378 + %10944 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 1, !dbg !378 + %10945 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 2, !dbg !378 + %10946 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } 
%10942, 3, !dbg !378 + %10947 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 4, !dbg !378 + %10948 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 5, !dbg !378 + %10949 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 6, !dbg !378 + %10950 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 7, !dbg 
!378 + %10951 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 8, !dbg !378 + %10952 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 9, !dbg !378 + %10953 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 10, !dbg !378 + %10954 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 11, !dbg !378 + %10955 
= extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 12, !dbg !378 + %10956 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 13, !dbg !378 + %10957 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 14, !dbg !378 + %10958 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 15, !dbg !378 + %10959 = 
extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 16, !dbg !378 + %10960 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 17, !dbg !378 + %10961 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 18, !dbg !378 + %10962 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 19, !dbg !378 + %10963 = 
extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 20, !dbg !378 + %10964 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 21, !dbg !378 + %10965 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 22, !dbg !378 + %10966 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 23, !dbg !378 + %10967 = 
extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 24, !dbg !378 + %10968 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 25, !dbg !378 + %10969 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 26, !dbg !378 + %10970 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 27, !dbg !378 + %10971 = 
extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 28, !dbg !378 + %10972 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 29, !dbg !378 + %10973 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 30, !dbg !378 + %10974 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 31, !dbg !378 + %10975 = 
extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 32, !dbg !378 + %10976 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 33, !dbg !378 + %10977 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 34, !dbg !378 + %10978 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 35, !dbg !378 + %10979 = 
extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 36, !dbg !378 + %10980 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 37, !dbg !378 + %10981 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 38, !dbg !378 + %10982 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 39, !dbg !378 + %10983 = 
extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 40, !dbg !378 + %10984 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 41, !dbg !378 + %10985 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 42, !dbg !378 + %10986 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 43, !dbg !378 + %10987 = 
extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 44, !dbg !378 + %10988 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 45, !dbg !378 + %10989 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 46, !dbg !378 + %10990 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 47, !dbg !378 + %10991 = 
extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 48, !dbg !378 + %10992 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 49, !dbg !378 + %10993 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 50, !dbg !378 + %10994 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 51, !dbg !378 + %10995 = 
extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 52, !dbg !378 + %10996 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 53, !dbg !378 + %10997 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 54, !dbg !378 + %10998 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 55, !dbg !378 + %10999 = 
extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 56, !dbg !378 + %11000 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 57, !dbg !378 + %11001 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 58, !dbg !378 + %11002 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 59, !dbg !378 + %11003 = 
extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 60, !dbg !378 + %11004 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 61, !dbg !378 + %11005 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 62, !dbg !378 + %11006 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 63, !dbg !378 + tail call void 
@llvm.nvvm.wgmma.commit_group.sync.aligned(), !dbg !378 + %11007 = add nuw nsw i32 %9038, 1, !dbg !361 + %11008 = lshr i32 %11007, 1, !dbg !379 + %11009 = zext nneg i32 %11008 to i64, !dbg !380 + %11010 = getelementptr i32, ptr addrspace(1) %5200, i64 %11009, !dbg !380 + %11011 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #3, !dbg !381 + %11012 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %11010, i64 %11011, i1 %9040) #3, !dbg !381 + %11013 = add nuw nsw i32 %11008, 1, !dbg !382 + %11014 = icmp slt i32 %11013, %5204, !dbg !383 + %11015 = getelementptr i8, ptr addrspace(1) %11010, i64 4, !dbg !384 + %11016 = and i1 %9040, %11014, !dbg !361 + %11017 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #3, !dbg !385 + %11018 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %11015, i64 %11017, i1 %11016) #3, !dbg !385 + %11019 = and i32 %9038, 1, !dbg !386 + %11020 = sub i32 %11018, %11012, !dbg !387 + %11021 = shl i32 %11020, 7, !dbg !388 + %11022 = add i32 %11021, -64, !dbg !389 + %11023 = xor i32 %11019, 1, !dbg !390 + %11024 = mul nuw nsw i32 %11022, %11023, !dbg !390 + %11025 = shl nuw nsw i32 %11019, 6, !dbg !391 + %11026 = add i32 %11024, %11025, !dbg !392 + %11027 = shl i32 %11026, 12, !dbg !393 + %11028 = sext i32 %11027 to i64, !dbg !359 + %11029 = getelementptr bfloat, ptr addrspace(1) %.pn5391752, i64 %11028, !dbg !359 + %11030 = getelementptr bfloat, ptr addrspace(1) %.pn5231753, i64 %11028, !dbg !359 + %11031 = getelementptr bfloat, ptr addrspace(1) %.pn5071754, i64 %11028, !dbg !359 + %11032 = getelementptr bfloat, ptr addrspace(1) %.pn4911755, i64 %11028, !dbg !359 + %11033 = shl i32 %11026, 7, !dbg !394 + 
%11034 = sext i32 %11033 to i64, !dbg !360 + %11035 = getelementptr bfloat, ptr addrspace(1) %.pn6031756, i64 %11034, !dbg !360 + %11036 = getelementptr bfloat, ptr addrspace(1) %.pn5871757, i64 %11034, !dbg !360 + %11037 = getelementptr bfloat, ptr addrspace(1) %.pn5711758, i64 %11034, !dbg !360 + %11038 = getelementptr bfloat, ptr addrspace(1) %.pn5551759, i64 %11034, !dbg !360 + %11039 = add i32 %11026, %.pn6351760, !dbg !395 + %11040 = add i32 %11026, %.pn6331761, !dbg !395 + %11041 = add i32 %11026, %.pn6311762, !dbg !395 + %11042 = add i32 %11026, %.pn6291763, !dbg !395 + %11043 = add i32 %11026, %.pn6271764, !dbg !395 + %11044 = add i32 %11026, %.pn6251765, !dbg !395 + %11045 = add i32 %11026, %.pn6231766, !dbg !395 + %11046 = add i32 %11026, %.pn6211767, !dbg !395 + %11047 = add i32 %11026, %.pn6191768, !dbg !395 + %11048 = add i32 %11026, %.pn6171769, !dbg !395 + %11049 = add i32 %11026, %.pn6151770, !dbg !395 + %11050 = add i32 %11026, %.pn6131771, !dbg !395 + %11051 = add i32 %11026, %.pn6111772, !dbg !395 + %11052 = add i32 %11026, %.pn6091773, !dbg !395 + %11053 = add i32 %11026, %.pn6071774, !dbg !395 + %11054 = add i32 %11026, %.pn6051775, !dbg !395 + %11055 = add i32 %11026, %9034, !dbg !395 + %11056 = add i32 %11026, %9035, !dbg !395 + %11057 = add i32 %11026, %9036, !dbg !395 + %11058 = add i32 %11026, %9037, !dbg !395 + %11059 = add i32 %11026, %9030, !dbg !395 + %11060 = add i32 %11026, %9031, !dbg !395 + %11061 = add i32 %11026, %9032, !dbg !395 + %11062 = add i32 %11026, %9033, !dbg !395 + %11063 = add i32 %9027, 1, !dbg !361 + %11064 = icmp sgt i32 %11063, 1, !dbg !361 + %11065 = select i1 %11064, i32 0, i32 %11063, !dbg !361 + %11066 = add i32 %9029, 1, !dbg !361 + %11067 = icmp sgt i32 %11066, 2, !dbg !361 + %11068 = select i1 %11067, i32 0, i32 %11066, !dbg !361 + %11069 = icmp slt i32 %11055, %18, !dbg !362 + %11070 = icmp slt i32 %11056, %18, !dbg !362 + %11071 = icmp slt i32 %11057, %18, !dbg !362 + %11072 = icmp slt i32 %11058, %18, 
!dbg !362 + %11073 = shl i32 %11068, 13, !dbg !353 + %11074 = getelementptr bfloat, ptr addrspace(3) @global_smem, i32 %11073, !dbg !353 + %11075 = and i1 %9039, %11069, !dbg !361 + %11076 = and i1 %9039, %11070, !dbg !361 + %11077 = and i1 %9039, %11071, !dbg !361 + %11078 = and i1 %9039, %11072, !dbg !361 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !353 + %11079 = getelementptr inbounds nuw i8, ptr addrspace(3) %11074, i32 %5264, !dbg !353 + %11080 = select i1 %11075, i32 16, i32 0, !dbg !353 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %11079, ptr addrspace(1) %11029, i32 %11080) #3, !dbg !353 + %11081 = getelementptr inbounds nuw i8, ptr addrspace(3) %11074, i32 %5267, !dbg !353 + %11082 = select i1 %11076, i32 16, i32 0, !dbg !353 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %11081, ptr addrspace(1) %11030, i32 %11082) #3, !dbg !353 + %11083 = getelementptr inbounds nuw i8, ptr addrspace(3) %11074, i32 %5270, !dbg !353 + %11084 = select i1 %11077, i32 16, i32 0, !dbg !353 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %11083, ptr addrspace(1) %11031, i32 %11084) #3, !dbg !353 + %11085 = getelementptr inbounds nuw i8, ptr addrspace(3) %11074, i32 %5273, !dbg !353 + %11086 = select i1 %11078, i32 16, i32 0, !dbg !353 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %11085, ptr addrspace(1) %11032, i32 %11086) #3, !dbg !353 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !353 + %11087 = icmp slt i32 %11039, %18, !dbg !396 + %11088 = icmp slt i32 %11040, %18, !dbg !396 + %11089 = icmp slt i32 %11041, %18, !dbg !396 + %11090 = icmp slt i32 %11042, %18, !dbg !396 + %11091 = icmp slt i32 %11043, %18, !dbg !396 + %11092 
= icmp slt i32 %11044, %18, !dbg !396 + %11093 = icmp slt i32 %11045, %18, !dbg !396 + %11094 = icmp slt i32 %11046, %18, !dbg !396 + %11095 = icmp slt i32 %11047, %18, !dbg !396 + %11096 = icmp slt i32 %11048, %18, !dbg !396 + %11097 = icmp slt i32 %11049, %18, !dbg !396 + %11098 = icmp slt i32 %11050, %18, !dbg !396 + %11099 = icmp slt i32 %11051, %18, !dbg !396 + %11100 = icmp slt i32 %11052, %18, !dbg !396 + %11101 = icmp slt i32 %11053, %18, !dbg !396 + %11102 = icmp slt i32 %11054, %18, !dbg !396 + %11103 = sext i32 %11039 to i64, !dbg !354 + %11104 = getelementptr float, ptr addrspace(1) %5904, i64 %11103, !dbg !354 + %11105 = sext i32 %11040 to i64, !dbg !354 + %11106 = getelementptr float, ptr addrspace(1) %5904, i64 %11105, !dbg !354 + %11107 = sext i32 %11041 to i64, !dbg !354 + %11108 = getelementptr float, ptr addrspace(1) %5904, i64 %11107, !dbg !354 + %11109 = sext i32 %11042 to i64, !dbg !354 + %11110 = getelementptr float, ptr addrspace(1) %5904, i64 %11109, !dbg !354 + %11111 = sext i32 %11043 to i64, !dbg !354 + %11112 = getelementptr float, ptr addrspace(1) %5904, i64 %11111, !dbg !354 + %11113 = sext i32 %11044 to i64, !dbg !354 + %11114 = getelementptr float, ptr addrspace(1) %5904, i64 %11113, !dbg !354 + %11115 = sext i32 %11045 to i64, !dbg !354 + %11116 = getelementptr float, ptr addrspace(1) %5904, i64 %11115, !dbg !354 + %11117 = sext i32 %11046 to i64, !dbg !354 + %11118 = getelementptr float, ptr addrspace(1) %5904, i64 %11117, !dbg !354 + %11119 = sext i32 %11047 to i64, !dbg !354 + %11120 = getelementptr float, ptr addrspace(1) %5904, i64 %11119, !dbg !354 + %11121 = sext i32 %11048 to i64, !dbg !354 + %11122 = getelementptr float, ptr addrspace(1) %5904, i64 %11121, !dbg !354 + %11123 = sext i32 %11049 to i64, !dbg !354 + %11124 = getelementptr float, ptr addrspace(1) %5904, i64 %11123, !dbg !354 + %11125 = sext i32 %11050 to i64, !dbg !354 + %11126 = getelementptr float, ptr addrspace(1) %5904, i64 %11125, !dbg !354 + %11127 = sext 
i32 %11051 to i64, !dbg !354 + %11128 = getelementptr float, ptr addrspace(1) %5904, i64 %11127, !dbg !354 + %11129 = sext i32 %11052 to i64, !dbg !354 + %11130 = getelementptr float, ptr addrspace(1) %5904, i64 %11129, !dbg !354 + %11131 = sext i32 %11053 to i64, !dbg !354 + %11132 = getelementptr float, ptr addrspace(1) %5904, i64 %11131, !dbg !354 + %11133 = sext i32 %11054 to i64, !dbg !354 + %11134 = getelementptr float, ptr addrspace(1) %5904, i64 %11133, !dbg !354 + %11135 = shl i32 %11065, 6, !dbg !355 + %11136 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %11135, !dbg !355 + %11137 = and i1 %9039, %11087, !dbg !361 + %11138 = and i1 %9039, %11088, !dbg !361 + %11139 = and i1 %9039, %11089, !dbg !361 + %11140 = and i1 %9039, %11090, !dbg !361 + %11141 = and i1 %9039, %11091, !dbg !361 + %11142 = and i1 %9039, %11092, !dbg !361 + %11143 = and i1 %9039, %11093, !dbg !361 + %11144 = and i1 %9039, %11094, !dbg !361 + %11145 = and i1 %9039, %11095, !dbg !361 + %11146 = and i1 %9039, %11096, !dbg !361 + %11147 = and i1 %9039, %11097, !dbg !361 + %11148 = and i1 %9039, %11098, !dbg !361 + %11149 = and i1 %9039, %11099, !dbg !361 + %11150 = and i1 %9039, %11100, !dbg !361 + %11151 = and i1 %9039, %11101, !dbg !361 + %11152 = and i1 %9039, %11102, !dbg !361 + %11153 = getelementptr inbounds nuw i8, ptr addrspace(3) %11136, i32 %5332, !dbg !355 + %11154 = select i1 %11137, i32 4, i32 0, !dbg !355 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) %11153, ptr addrspace(1) %11104, i32 %11154, i1 %5331) #3, !dbg !355 + %11155 = getelementptr inbounds nuw i8, ptr addrspace(3) %11136, i32 %5334, !dbg !355 + %11156 = select i1 %11138, i32 4, i32 0, !dbg !355 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %11155, ptr addrspace(1) %11106, i32 %11156, i1 
%5331) #3, !dbg !355 + %11157 = getelementptr inbounds nuw i8, ptr addrspace(3) %11136, i32 %5336, !dbg !355 + %11158 = select i1 %11139, i32 4, i32 0, !dbg !355 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %11157, ptr addrspace(1) %11108, i32 %11158, i1 %5331) #3, !dbg !355 + %11159 = getelementptr inbounds nuw i8, ptr addrspace(3) %11136, i32 %5339, !dbg !355 + %11160 = select i1 %11140, i32 4, i32 0, !dbg !355 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %11159, ptr addrspace(1) %11110, i32 %11160, i1 %5331) #3, !dbg !355 + %11161 = getelementptr inbounds nuw i8, ptr addrspace(3) %11136, i32 %5342, !dbg !355 + %11162 = select i1 %11141, i32 4, i32 0, !dbg !355 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %11161, ptr addrspace(1) %11112, i32 %11162, i1 %5331) #3, !dbg !355 + %11163 = getelementptr inbounds nuw i8, ptr addrspace(3) %11136, i32 %5345, !dbg !355 + %11164 = select i1 %11142, i32 4, i32 0, !dbg !355 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %11163, ptr addrspace(1) %11114, i32 %11164, i1 %5331) #3, !dbg !355 + %11165 = getelementptr inbounds nuw i8, ptr addrspace(3) %11136, i32 %5348, !dbg !355 + %11166 = select i1 %11143, i32 4, i32 0, !dbg !355 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %11165, ptr addrspace(1) %11116, i32 %11166, i1 %5331) #3, !dbg !355 + %11167 = getelementptr inbounds nuw i8, ptr addrspace(3) %11136, i32 %5351, !dbg !355 + %11168 = select i1 %11144, i32 4, i32 0, !dbg !355 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr 
addrspace(3) nonnull %11167, ptr addrspace(1) %11118, i32 %11168, i1 %5331) #3, !dbg !355 + %11169 = getelementptr inbounds nuw i8, ptr addrspace(3) %11136, i32 %5354, !dbg !355 + %11170 = select i1 %11145, i32 4, i32 0, !dbg !355 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %11169, ptr addrspace(1) %11120, i32 %11170, i1 %5331) #3, !dbg !355 + %11171 = getelementptr inbounds nuw i8, ptr addrspace(3) %11136, i32 %5357, !dbg !355 + %11172 = select i1 %11146, i32 4, i32 0, !dbg !355 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %11171, ptr addrspace(1) %11122, i32 %11172, i1 %5331) #3, !dbg !355 + %11173 = getelementptr inbounds nuw i8, ptr addrspace(3) %11136, i32 %5360, !dbg !355 + %11174 = select i1 %11147, i32 4, i32 0, !dbg !355 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %11173, ptr addrspace(1) %11124, i32 %11174, i1 %5331) #3, !dbg !355 + %11175 = getelementptr inbounds nuw i8, ptr addrspace(3) %11136, i32 %5363, !dbg !355 + %11176 = select i1 %11148, i32 4, i32 0, !dbg !355 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %11175, ptr addrspace(1) %11126, i32 %11176, i1 %5331) #3, !dbg !355 + %11177 = getelementptr inbounds nuw i8, ptr addrspace(3) %11136, i32 %5366, !dbg !355 + %11178 = select i1 %11149, i32 4, i32 0, !dbg !355 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %11177, ptr addrspace(1) %11128, i32 %11178, i1 %5331) #3, !dbg !355 + %11179 = getelementptr inbounds nuw i8, ptr addrspace(3) %11136, i32 %5369, !dbg !355 + %11180 = select i1 %11150, i32 4, i32 0, !dbg !355 + tail call void asm sideeffect "@$3 
cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %11179, ptr addrspace(1) %11130, i32 %11180, i1 %5331) #3, !dbg !355 + %11181 = getelementptr inbounds nuw i8, ptr addrspace(3) %11136, i32 %5372, !dbg !355 + %11182 = select i1 %11151, i32 4, i32 0, !dbg !355 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %11181, ptr addrspace(1) %11132, i32 %11182, i1 %5331) #3, !dbg !355 + %11183 = getelementptr inbounds nuw i8, ptr addrspace(3) %11136, i32 %5375, !dbg !355 + %11184 = select i1 %11152, i32 4, i32 0, !dbg !355 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %11183, ptr addrspace(1) %11134, i32 %11184, i1 %5331) #3, !dbg !355 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !355 + %11185 = icmp slt i32 %11059, %18, !dbg !397 + %11186 = icmp slt i32 %11060, %18, !dbg !397 + %11187 = icmp slt i32 %11061, %18, !dbg !397 + %11188 = icmp slt i32 %11062, %18, !dbg !397 + %11189 = getelementptr bfloat, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %11073, !dbg !356 + %11190 = and i1 %9039, %11185, !dbg !361 + %11191 = and i1 %9039, %11186, !dbg !361 + %11192 = and i1 %9039, %11187, !dbg !361 + %11193 = and i1 %9039, %11188, !dbg !361 + %11194 = getelementptr inbounds nuw i8, ptr addrspace(3) %11189, i32 %5264, !dbg !356 + %11195 = select i1 %11190, i32 16, i32 0, !dbg !356 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %11194, ptr addrspace(1) %11035, i32 %11195) #3, !dbg !356 + %11196 = getelementptr inbounds nuw i8, ptr addrspace(3) %11189, i32 %5267, !dbg !356 + %11197 = select i1 %11191, i32 16, i32 0, !dbg !356 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull 
%11196, ptr addrspace(1) %11036, i32 %11197) #3, !dbg !356 + %11198 = getelementptr inbounds nuw i8, ptr addrspace(3) %11189, i32 %5270, !dbg !356 + %11199 = select i1 %11192, i32 16, i32 0, !dbg !356 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %11198, ptr addrspace(1) %11037, i32 %11199) #3, !dbg !356 + %11200 = getelementptr inbounds nuw i8, ptr addrspace(3) %11189, i32 %5273, !dbg !356 + %11201 = select i1 %11193, i32 16, i32 0, !dbg !356 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %11200, ptr addrspace(1) %11038, i32 %11201) #3, !dbg !356 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !356 + %11202 = getelementptr float, ptr addrspace(1) %5905, i64 %11103, !dbg !357 + %11203 = getelementptr float, ptr addrspace(1) %5905, i64 %11105, !dbg !357 + %11204 = getelementptr float, ptr addrspace(1) %5905, i64 %11107, !dbg !357 + %11205 = getelementptr float, ptr addrspace(1) %5905, i64 %11109, !dbg !357 + %11206 = getelementptr float, ptr addrspace(1) %5905, i64 %11111, !dbg !357 + %11207 = getelementptr float, ptr addrspace(1) %5905, i64 %11113, !dbg !357 + %11208 = getelementptr float, ptr addrspace(1) %5905, i64 %11115, !dbg !357 + %11209 = getelementptr float, ptr addrspace(1) %5905, i64 %11117, !dbg !357 + %11210 = getelementptr float, ptr addrspace(1) %5905, i64 %11119, !dbg !357 + %11211 = getelementptr float, ptr addrspace(1) %5905, i64 %11121, !dbg !357 + %11212 = getelementptr float, ptr addrspace(1) %5905, i64 %11123, !dbg !357 + %11213 = getelementptr float, ptr addrspace(1) %5905, i64 %11125, !dbg !357 + %11214 = getelementptr float, ptr addrspace(1) %5905, i64 %11127, !dbg !357 + %11215 = getelementptr float, ptr addrspace(1) %5905, i64 %11129, !dbg !357 + %11216 = getelementptr float, ptr addrspace(1) %5905, i64 %11131, !dbg !357 + %11217 = getelementptr float, ptr addrspace(1) 
%5905, i64 %11133, !dbg !357 + %11218 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98816), i32 %11135, !dbg !358 + %11219 = getelementptr inbounds nuw i8, ptr addrspace(3) %11218, i32 %5332, !dbg !358 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) %11219, ptr addrspace(1) %11202, i32 %11154, i1 %5331) #3, !dbg !358 + %11220 = getelementptr inbounds nuw i8, ptr addrspace(3) %11218, i32 %5334, !dbg !358 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %11220, ptr addrspace(1) %11203, i32 %11156, i1 %5331) #3, !dbg !358 + %11221 = getelementptr inbounds nuw i8, ptr addrspace(3) %11218, i32 %5336, !dbg !358 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %11221, ptr addrspace(1) %11204, i32 %11158, i1 %5331) #3, !dbg !358 + %11222 = getelementptr inbounds nuw i8, ptr addrspace(3) %11218, i32 %5339, !dbg !358 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %11222, ptr addrspace(1) %11205, i32 %11160, i1 %5331) #3, !dbg !358 + %11223 = getelementptr inbounds nuw i8, ptr addrspace(3) %11218, i32 %5342, !dbg !358 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %11223, ptr addrspace(1) %11206, i32 %11162, i1 %5331) #3, !dbg !358 + %11224 = getelementptr inbounds nuw i8, ptr addrspace(3) %11218, i32 %5345, !dbg !358 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %11224, ptr addrspace(1) %11207, i32 %11164, i1 %5331) #3, !dbg !358 + %11225 = getelementptr inbounds nuw i8, ptr addrspace(3) %11218, i32 %5348, !dbg !358 + tail call void 
asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %11225, ptr addrspace(1) %11208, i32 %11166, i1 %5331) #3, !dbg !358 + %11226 = getelementptr inbounds nuw i8, ptr addrspace(3) %11218, i32 %5351, !dbg !358 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %11226, ptr addrspace(1) %11209, i32 %11168, i1 %5331) #3, !dbg !358 + %11227 = getelementptr inbounds nuw i8, ptr addrspace(3) %11218, i32 %5354, !dbg !358 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %11227, ptr addrspace(1) %11210, i32 %11170, i1 %5331) #3, !dbg !358 + %11228 = getelementptr inbounds nuw i8, ptr addrspace(3) %11218, i32 %5357, !dbg !358 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %11228, ptr addrspace(1) %11211, i32 %11172, i1 %5331) #3, !dbg !358 + %11229 = getelementptr inbounds nuw i8, ptr addrspace(3) %11218, i32 %5360, !dbg !358 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %11229, ptr addrspace(1) %11212, i32 %11174, i1 %5331) #3, !dbg !358 + %11230 = getelementptr inbounds nuw i8, ptr addrspace(3) %11218, i32 %5363, !dbg !358 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %11230, ptr addrspace(1) %11213, i32 %11176, i1 %5331) #3, !dbg !358 + %11231 = getelementptr inbounds nuw i8, ptr addrspace(3) %11218, i32 %5366, !dbg !358 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %11231, ptr addrspace(1) %11214, i32 %11178, i1 %5331) #3, !dbg !358 + %11232 = getelementptr inbounds nuw i8, ptr addrspace(3) %11218, i32 
%5369, !dbg !358 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %11232, ptr addrspace(1) %11215, i32 %11180, i1 %5331) #3, !dbg !358 + %11233 = getelementptr inbounds nuw i8, ptr addrspace(3) %11218, i32 %5372, !dbg !358 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %11233, ptr addrspace(1) %11216, i32 %11182, i1 %5331) #3, !dbg !358 + %11234 = getelementptr inbounds nuw i8, ptr addrspace(3) %11218, i32 %5375, !dbg !358 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %11234, ptr addrspace(1) %11217, i32 %11184, i1 %5331) #3, !dbg !358 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !358 + %exitcond2188.not = icmp eq i32 %11007, %smax2187, !dbg !361 + br i1 %exitcond2188.not, label %._crit_edge1794, label %.lr.ph1793, !dbg !361 + +._crit_edge1794: ; preds = %__nv_exp2f.exit1239, %._crit_edge1621 + %.pn347.lcssa = phi float [ %8874, %._crit_edge1621 ], [ %10943, %__nv_exp2f.exit1239 ] + %.pn345.lcssa = phi float [ %8875, %._crit_edge1621 ], [ %10944, %__nv_exp2f.exit1239 ] + %.pn343.lcssa = phi float [ %8876, %._crit_edge1621 ], [ %10945, %__nv_exp2f.exit1239 ] + %.pn341.lcssa = phi float [ %8877, %._crit_edge1621 ], [ %10946, %__nv_exp2f.exit1239 ] + %.pn339.lcssa = phi float [ %8878, %._crit_edge1621 ], [ %10947, %__nv_exp2f.exit1239 ] + %.pn337.lcssa = phi float [ %8879, %._crit_edge1621 ], [ %10948, %__nv_exp2f.exit1239 ] + %.pn335.lcssa = phi float [ %8880, %._crit_edge1621 ], [ %10949, %__nv_exp2f.exit1239 ] + %.pn333.lcssa = phi float [ %8881, %._crit_edge1621 ], [ %10950, %__nv_exp2f.exit1239 ] + %.pn331.lcssa = phi float [ %8882, %._crit_edge1621 ], [ %10951, %__nv_exp2f.exit1239 ] + %.pn329.lcssa = phi float [ %8883, %._crit_edge1621 ], [ %10952, %__nv_exp2f.exit1239 ] + %.pn327.lcssa = phi 
float [ %8884, %._crit_edge1621 ], [ %10953, %__nv_exp2f.exit1239 ] + %.pn325.lcssa = phi float [ %8885, %._crit_edge1621 ], [ %10954, %__nv_exp2f.exit1239 ] + %.pn323.lcssa = phi float [ %8886, %._crit_edge1621 ], [ %10955, %__nv_exp2f.exit1239 ] + %.pn321.lcssa = phi float [ %8887, %._crit_edge1621 ], [ %10956, %__nv_exp2f.exit1239 ] + %.pn319.lcssa = phi float [ %8888, %._crit_edge1621 ], [ %10957, %__nv_exp2f.exit1239 ] + %.pn317.lcssa = phi float [ %8889, %._crit_edge1621 ], [ %10958, %__nv_exp2f.exit1239 ] + %.pn315.lcssa = phi float [ %8890, %._crit_edge1621 ], [ %10959, %__nv_exp2f.exit1239 ] + %.pn313.lcssa = phi float [ %8891, %._crit_edge1621 ], [ %10960, %__nv_exp2f.exit1239 ] + %.pn311.lcssa = phi float [ %8892, %._crit_edge1621 ], [ %10961, %__nv_exp2f.exit1239 ] + %.pn309.lcssa = phi float [ %8893, %._crit_edge1621 ], [ %10962, %__nv_exp2f.exit1239 ] + %.pn307.lcssa = phi float [ %8894, %._crit_edge1621 ], [ %10963, %__nv_exp2f.exit1239 ] + %.pn305.lcssa = phi float [ %8895, %._crit_edge1621 ], [ %10964, %__nv_exp2f.exit1239 ] + %.pn303.lcssa = phi float [ %8896, %._crit_edge1621 ], [ %10965, %__nv_exp2f.exit1239 ] + %.pn301.lcssa = phi float [ %8897, %._crit_edge1621 ], [ %10966, %__nv_exp2f.exit1239 ] + %.pn299.lcssa = phi float [ %8898, %._crit_edge1621 ], [ %10967, %__nv_exp2f.exit1239 ] + %.pn297.lcssa = phi float [ %8899, %._crit_edge1621 ], [ %10968, %__nv_exp2f.exit1239 ] + %.pn295.lcssa = phi float [ %8900, %._crit_edge1621 ], [ %10969, %__nv_exp2f.exit1239 ] + %.pn293.lcssa = phi float [ %8901, %._crit_edge1621 ], [ %10970, %__nv_exp2f.exit1239 ] + %.pn291.lcssa = phi float [ %8902, %._crit_edge1621 ], [ %10971, %__nv_exp2f.exit1239 ] + %.pn289.lcssa = phi float [ %8903, %._crit_edge1621 ], [ %10972, %__nv_exp2f.exit1239 ] + %.pn287.lcssa = phi float [ %8904, %._crit_edge1621 ], [ %10973, %__nv_exp2f.exit1239 ] + %.pn285.lcssa = phi float [ %8905, %._crit_edge1621 ], [ %10974, %__nv_exp2f.exit1239 ] + %.pn283.lcssa = phi float [ %8906, 
%._crit_edge1621 ], [ %10975, %__nv_exp2f.exit1239 ] + %.pn281.lcssa = phi float [ %8907, %._crit_edge1621 ], [ %10976, %__nv_exp2f.exit1239 ] + %.pn279.lcssa = phi float [ %8908, %._crit_edge1621 ], [ %10977, %__nv_exp2f.exit1239 ] + %.pn277.lcssa = phi float [ %8909, %._crit_edge1621 ], [ %10978, %__nv_exp2f.exit1239 ] + %.pn275.lcssa = phi float [ %8910, %._crit_edge1621 ], [ %10979, %__nv_exp2f.exit1239 ] + %.pn273.lcssa = phi float [ %8911, %._crit_edge1621 ], [ %10980, %__nv_exp2f.exit1239 ] + %.pn271.lcssa = phi float [ %8912, %._crit_edge1621 ], [ %10981, %__nv_exp2f.exit1239 ] + %.pn269.lcssa = phi float [ %8913, %._crit_edge1621 ], [ %10982, %__nv_exp2f.exit1239 ] + %.pn267.lcssa = phi float [ %8914, %._crit_edge1621 ], [ %10983, %__nv_exp2f.exit1239 ] + %.pn265.lcssa = phi float [ %8915, %._crit_edge1621 ], [ %10984, %__nv_exp2f.exit1239 ] + %.pn263.lcssa = phi float [ %8916, %._crit_edge1621 ], [ %10985, %__nv_exp2f.exit1239 ] + %.pn261.lcssa = phi float [ %8917, %._crit_edge1621 ], [ %10986, %__nv_exp2f.exit1239 ] + %.pn259.lcssa = phi float [ %8918, %._crit_edge1621 ], [ %10987, %__nv_exp2f.exit1239 ] + %.pn257.lcssa = phi float [ %8919, %._crit_edge1621 ], [ %10988, %__nv_exp2f.exit1239 ] + %.pn255.lcssa = phi float [ %8920, %._crit_edge1621 ], [ %10989, %__nv_exp2f.exit1239 ] + %.pn253.lcssa = phi float [ %8921, %._crit_edge1621 ], [ %10990, %__nv_exp2f.exit1239 ] + %.pn251.lcssa = phi float [ %8922, %._crit_edge1621 ], [ %10991, %__nv_exp2f.exit1239 ] + %.pn249.lcssa = phi float [ %8923, %._crit_edge1621 ], [ %10992, %__nv_exp2f.exit1239 ] + %.pn247.lcssa = phi float [ %8924, %._crit_edge1621 ], [ %10993, %__nv_exp2f.exit1239 ] + %.pn245.lcssa = phi float [ %8925, %._crit_edge1621 ], [ %10994, %__nv_exp2f.exit1239 ] + %.pn243.lcssa = phi float [ %8926, %._crit_edge1621 ], [ %10995, %__nv_exp2f.exit1239 ] + %.pn241.lcssa = phi float [ %8927, %._crit_edge1621 ], [ %10996, %__nv_exp2f.exit1239 ] + %.pn239.lcssa = phi float [ %8928, %._crit_edge1621 ], 
[ %10997, %__nv_exp2f.exit1239 ] + %.pn237.lcssa = phi float [ %8929, %._crit_edge1621 ], [ %10998, %__nv_exp2f.exit1239 ] + %.pn235.lcssa = phi float [ %8930, %._crit_edge1621 ], [ %10999, %__nv_exp2f.exit1239 ] + %.pn233.lcssa = phi float [ %8931, %._crit_edge1621 ], [ %11000, %__nv_exp2f.exit1239 ] + %.pn231.lcssa = phi float [ %8932, %._crit_edge1621 ], [ %11001, %__nv_exp2f.exit1239 ] + %.pn229.lcssa = phi float [ %8933, %._crit_edge1621 ], [ %11002, %__nv_exp2f.exit1239 ] + %.pn227.lcssa = phi float [ %8934, %._crit_edge1621 ], [ %11003, %__nv_exp2f.exit1239 ] + %.pn225.lcssa = phi float [ %8935, %._crit_edge1621 ], [ %11004, %__nv_exp2f.exit1239 ] + %.pn223.lcssa = phi float [ %8936, %._crit_edge1621 ], [ %11005, %__nv_exp2f.exit1239 ] + %.pn221.lcssa = phi float [ %8937, %._crit_edge1621 ], [ %11006, %__nv_exp2f.exit1239 ] + %.pn475.lcssa = phi float [ %8810, %._crit_edge1621 ], [ %10087, %__nv_exp2f.exit1239 ] + %.pn473.lcssa = phi float [ %8811, %._crit_edge1621 ], [ %10088, %__nv_exp2f.exit1239 ] + %.pn471.lcssa = phi float [ %8812, %._crit_edge1621 ], [ %10089, %__nv_exp2f.exit1239 ] + %.pn469.lcssa = phi float [ %8813, %._crit_edge1621 ], [ %10090, %__nv_exp2f.exit1239 ] + %.pn467.lcssa = phi float [ %8814, %._crit_edge1621 ], [ %10091, %__nv_exp2f.exit1239 ] + %.pn465.lcssa = phi float [ %8815, %._crit_edge1621 ], [ %10092, %__nv_exp2f.exit1239 ] + %.pn463.lcssa = phi float [ %8816, %._crit_edge1621 ], [ %10093, %__nv_exp2f.exit1239 ] + %.pn461.lcssa = phi float [ %8817, %._crit_edge1621 ], [ %10094, %__nv_exp2f.exit1239 ] + %.pn459.lcssa = phi float [ %8818, %._crit_edge1621 ], [ %10095, %__nv_exp2f.exit1239 ] + %.pn457.lcssa = phi float [ %8819, %._crit_edge1621 ], [ %10096, %__nv_exp2f.exit1239 ] + %.pn455.lcssa = phi float [ %8820, %._crit_edge1621 ], [ %10097, %__nv_exp2f.exit1239 ] + %.pn453.lcssa = phi float [ %8821, %._crit_edge1621 ], [ %10098, %__nv_exp2f.exit1239 ] + %.pn451.lcssa = phi float [ %8822, %._crit_edge1621 ], [ %10099, 
%__nv_exp2f.exit1239 ] + %.pn449.lcssa = phi float [ %8823, %._crit_edge1621 ], [ %10100, %__nv_exp2f.exit1239 ] + %.pn447.lcssa = phi float [ %8824, %._crit_edge1621 ], [ %10101, %__nv_exp2f.exit1239 ] + %.pn445.lcssa = phi float [ %8825, %._crit_edge1621 ], [ %10102, %__nv_exp2f.exit1239 ] + %.pn443.lcssa = phi float [ %8826, %._crit_edge1621 ], [ %10103, %__nv_exp2f.exit1239 ] + %.pn441.lcssa = phi float [ %8827, %._crit_edge1621 ], [ %10104, %__nv_exp2f.exit1239 ] + %.pn439.lcssa = phi float [ %8828, %._crit_edge1621 ], [ %10105, %__nv_exp2f.exit1239 ] + %.pn437.lcssa = phi float [ %8829, %._crit_edge1621 ], [ %10106, %__nv_exp2f.exit1239 ] + %.pn435.lcssa = phi float [ %8830, %._crit_edge1621 ], [ %10107, %__nv_exp2f.exit1239 ] + %.pn433.lcssa = phi float [ %8831, %._crit_edge1621 ], [ %10108, %__nv_exp2f.exit1239 ] + %.pn431.lcssa = phi float [ %8832, %._crit_edge1621 ], [ %10109, %__nv_exp2f.exit1239 ] + %.pn429.lcssa = phi float [ %8833, %._crit_edge1621 ], [ %10110, %__nv_exp2f.exit1239 ] + %.pn427.lcssa = phi float [ %8834, %._crit_edge1621 ], [ %10111, %__nv_exp2f.exit1239 ] + %.pn425.lcssa = phi float [ %8835, %._crit_edge1621 ], [ %10112, %__nv_exp2f.exit1239 ] + %.pn423.lcssa = phi float [ %8836, %._crit_edge1621 ], [ %10113, %__nv_exp2f.exit1239 ] + %.pn421.lcssa = phi float [ %8837, %._crit_edge1621 ], [ %10114, %__nv_exp2f.exit1239 ] + %.pn419.lcssa = phi float [ %8838, %._crit_edge1621 ], [ %10115, %__nv_exp2f.exit1239 ] + %.pn417.lcssa = phi float [ %8839, %._crit_edge1621 ], [ %10116, %__nv_exp2f.exit1239 ] + %.pn415.lcssa = phi float [ %8840, %._crit_edge1621 ], [ %10117, %__nv_exp2f.exit1239 ] + %.pn413.lcssa = phi float [ %8841, %._crit_edge1621 ], [ %10118, %__nv_exp2f.exit1239 ] + %.pn411.lcssa = phi float [ %8842, %._crit_edge1621 ], [ %10119, %__nv_exp2f.exit1239 ] + %.pn409.lcssa = phi float [ %8843, %._crit_edge1621 ], [ %10120, %__nv_exp2f.exit1239 ] + %.pn407.lcssa = phi float [ %8844, %._crit_edge1621 ], [ %10121, 
%__nv_exp2f.exit1239 ] + %.pn405.lcssa = phi float [ %8845, %._crit_edge1621 ], [ %10122, %__nv_exp2f.exit1239 ] + %.pn403.lcssa = phi float [ %8846, %._crit_edge1621 ], [ %10123, %__nv_exp2f.exit1239 ] + %.pn401.lcssa = phi float [ %8847, %._crit_edge1621 ], [ %10124, %__nv_exp2f.exit1239 ] + %.pn399.lcssa = phi float [ %8848, %._crit_edge1621 ], [ %10125, %__nv_exp2f.exit1239 ] + %.pn397.lcssa = phi float [ %8849, %._crit_edge1621 ], [ %10126, %__nv_exp2f.exit1239 ] + %.pn395.lcssa = phi float [ %8850, %._crit_edge1621 ], [ %10127, %__nv_exp2f.exit1239 ] + %.pn393.lcssa = phi float [ %8851, %._crit_edge1621 ], [ %10128, %__nv_exp2f.exit1239 ] + %.pn391.lcssa = phi float [ %8852, %._crit_edge1621 ], [ %10129, %__nv_exp2f.exit1239 ] + %.pn389.lcssa = phi float [ %8853, %._crit_edge1621 ], [ %10130, %__nv_exp2f.exit1239 ] + %.pn387.lcssa = phi float [ %8854, %._crit_edge1621 ], [ %10131, %__nv_exp2f.exit1239 ] + %.pn385.lcssa = phi float [ %8855, %._crit_edge1621 ], [ %10132, %__nv_exp2f.exit1239 ] + %.pn383.lcssa = phi float [ %8856, %._crit_edge1621 ], [ %10133, %__nv_exp2f.exit1239 ] + %.pn381.lcssa = phi float [ %8857, %._crit_edge1621 ], [ %10134, %__nv_exp2f.exit1239 ] + %.pn379.lcssa = phi float [ %8858, %._crit_edge1621 ], [ %10135, %__nv_exp2f.exit1239 ] + %.pn377.lcssa = phi float [ %8859, %._crit_edge1621 ], [ %10136, %__nv_exp2f.exit1239 ] + %.pn375.lcssa = phi float [ %8860, %._crit_edge1621 ], [ %10137, %__nv_exp2f.exit1239 ] + %.pn373.lcssa = phi float [ %8861, %._crit_edge1621 ], [ %10138, %__nv_exp2f.exit1239 ] + %.pn371.lcssa = phi float [ %8862, %._crit_edge1621 ], [ %10139, %__nv_exp2f.exit1239 ] + %.pn369.lcssa = phi float [ %8863, %._crit_edge1621 ], [ %10140, %__nv_exp2f.exit1239 ] + %.pn367.lcssa = phi float [ %8864, %._crit_edge1621 ], [ %10141, %__nv_exp2f.exit1239 ] + %.pn365.lcssa = phi float [ %8865, %._crit_edge1621 ], [ %10142, %__nv_exp2f.exit1239 ] + %.pn363.lcssa = phi float [ %8866, %._crit_edge1621 ], [ %10143, 
%__nv_exp2f.exit1239 ] + %.pn361.lcssa = phi float [ %8867, %._crit_edge1621 ], [ %10144, %__nv_exp2f.exit1239 ] + %.pn359.lcssa = phi float [ %8868, %._crit_edge1621 ], [ %10145, %__nv_exp2f.exit1239 ] + %.pn357.lcssa = phi float [ %8869, %._crit_edge1621 ], [ %10146, %__nv_exp2f.exit1239 ] + %.pn355.lcssa = phi float [ %8870, %._crit_edge1621 ], [ %10147, %__nv_exp2f.exit1239 ] + %.pn353.lcssa = phi float [ %8871, %._crit_edge1621 ], [ %10148, %__nv_exp2f.exit1239 ] + %.pn351.lcssa = phi float [ %8872, %._crit_edge1621 ], [ %10149, %__nv_exp2f.exit1239 ] + %.pn349.lcssa = phi float [ %8873, %._crit_edge1621 ], [ %10150, %__nv_exp2f.exit1239 ] + %11235 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "// wait for regs: 
$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63,$64,$65,$66,$67,$68,$69,$70,$71,$72,$73,$74,$75,$76,$77,$78,$79,$80,$81,$82,$83,$84,$85,$86,$87,$88,$89,$90,$91,$92,$93,$94,$95,$96,$97,$98,$99,$100,$101,$102,$103,$104,$105,$106,$107,$108,$109,$110,$111,$112,$113,$114,$115,$116,$117,$118,$119,$120,$121,$122,$123,$124,$125,$126,$127\0A\09wgmma.wait_group.sync.aligned 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127"(float %.pn475.lcssa, float %.pn473.lcssa, float %.pn471.lcssa, float %.pn469.lcssa, float %.pn467.lcssa, float %.pn465.lcssa, float %.pn463.lcssa, float %.pn461.lcssa, float %.pn459.lcssa, float %.pn457.lcssa, float %.pn455.lcssa, float %.pn453.lcssa, float %.pn451.lcssa, float %.pn449.lcssa, float %.pn447.lcssa, float %.pn445.lcssa, float %.pn443.lcssa, float %.pn441.lcssa, float %.pn439.lcssa, float %.pn437.lcssa, float %.pn435.lcssa, float %.pn433.lcssa, float %.pn431.lcssa, float %.pn429.lcssa, float %.pn427.lcssa, float %.pn425.lcssa, float %.pn423.lcssa, float %.pn421.lcssa, float %.pn419.lcssa, float %.pn417.lcssa, float 
%.pn415.lcssa, float %.pn413.lcssa, float %.pn411.lcssa, float %.pn409.lcssa, float %.pn407.lcssa, float %.pn405.lcssa, float %.pn403.lcssa, float %.pn401.lcssa, float %.pn399.lcssa, float %.pn397.lcssa, float %.pn395.lcssa, float %.pn393.lcssa, float %.pn391.lcssa, float %.pn389.lcssa, float %.pn387.lcssa, float %.pn385.lcssa, float %.pn383.lcssa, float %.pn381.lcssa, float %.pn379.lcssa, float %.pn377.lcssa, float %.pn375.lcssa, float %.pn373.lcssa, float %.pn371.lcssa, float %.pn369.lcssa, float %.pn367.lcssa, float %.pn365.lcssa, float %.pn363.lcssa, float %.pn361.lcssa, float %.pn359.lcssa, float %.pn357.lcssa, float %.pn355.lcssa, float %.pn353.lcssa, float %.pn351.lcssa, float %.pn349.lcssa, float %.pn347.lcssa, float %.pn345.lcssa, float %.pn343.lcssa, float %.pn341.lcssa, float %.pn339.lcssa, float %.pn337.lcssa, float %.pn335.lcssa, float %.pn333.lcssa, float %.pn331.lcssa, float %.pn329.lcssa, float %.pn327.lcssa, float %.pn325.lcssa, float %.pn323.lcssa, float %.pn321.lcssa, float %.pn319.lcssa, float %.pn317.lcssa, float %.pn315.lcssa, float %.pn313.lcssa, float %.pn311.lcssa, float %.pn309.lcssa, float %.pn307.lcssa, float %.pn305.lcssa, float %.pn303.lcssa, float %.pn301.lcssa, float %.pn299.lcssa, float %.pn297.lcssa, float %.pn295.lcssa, float %.pn293.lcssa, float %.pn291.lcssa, float %.pn289.lcssa, float %.pn287.lcssa, float %.pn285.lcssa, float %.pn283.lcssa, float %.pn281.lcssa, float %.pn279.lcssa, float %.pn277.lcssa, float %.pn275.lcssa, float %.pn273.lcssa, float %.pn271.lcssa, float %.pn269.lcssa, float %.pn267.lcssa, float %.pn265.lcssa, float %.pn263.lcssa, float %.pn261.lcssa, float %.pn259.lcssa, float %.pn257.lcssa, float %.pn255.lcssa, float %.pn253.lcssa, float %.pn251.lcssa, float %.pn249.lcssa, float %.pn247.lcssa, float %.pn245.lcssa, float %.pn243.lcssa, float %.pn241.lcssa, float %.pn239.lcssa, float %.pn237.lcssa, float %.pn235.lcssa, float %.pn233.lcssa, float %.pn231.lcssa, float %.pn229.lcssa, float %.pn227.lcssa, float 
%.pn225.lcssa, float %.pn223.lcssa, float %.pn221.lcssa) #3, !dbg !361 + %11236 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 0, !dbg !361 + %11237 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 1, !dbg !361 + %11238 = extractvalue { float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 2, !dbg !361 + %11239 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 3, !dbg !361 + %11240 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 4, !dbg !361 + %11241 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 5, !dbg !361 + %11242 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 6, !dbg !361 + %11243 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 7, !dbg !361 + %11244 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 8, !dbg !361 + %11245 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 9, !dbg !361 + %11246 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 10, !dbg !361 + %11247 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 11, !dbg !361 + %11248 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 12, !dbg !361 + %11249 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 13, !dbg !361 + %11250 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 14, !dbg !361 + %11251 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 15, !dbg !361 + %11252 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 16, !dbg !361 + %11253 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 17, !dbg !361 + %11254 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float } %11235, 18, !dbg !361 + %11255 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 19, !dbg !361 + %11256 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 20, !dbg !361 + %11257 = extractvalue { float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 21, !dbg !361 + %11258 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 22, !dbg !361 + %11259 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 23, !dbg !361 + %11260 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 24, !dbg !361 + %11261 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 25, !dbg !361 + %11262 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 26, !dbg !361 + %11263 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 27, !dbg !361 + %11264 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 28, !dbg !361 + %11265 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 29, !dbg !361 + %11266 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 30, !dbg !361 + %11267 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 31, !dbg !361 + %11268 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 32, !dbg !361 + %11269 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 33, !dbg !361 + %11270 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 34, !dbg !361 + %11271 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 35, !dbg !361 + %11272 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 36, !dbg !361 + %11273 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float } %11235, 37, !dbg !361 + %11274 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 38, !dbg !361 + %11275 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 39, !dbg !361 + %11276 = extractvalue { float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 40, !dbg !361 + %11277 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 41, !dbg !361 + %11278 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 42, !dbg !361 + %11279 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 43, !dbg !361 + %11280 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 44, !dbg !361 + %11281 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 45, !dbg !361 + %11282 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 46, !dbg !361 + %11283 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 47, !dbg !361 + %11284 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 48, !dbg !361 + %11285 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 49, !dbg !361 + %11286 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 50, !dbg !361 + %11287 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 51, !dbg !361 + %11288 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 52, !dbg !361 + %11289 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 53, !dbg !361 + %11290 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 54, !dbg !361 + %11291 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 55, !dbg !361 + %11292 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float } %11235, 56, !dbg !361 + %11293 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 57, !dbg !361 + %11294 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 58, !dbg !361 + %11295 = extractvalue { float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 59, !dbg !361 + %11296 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 60, !dbg !361 + %11297 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 61, !dbg !361 + %11298 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 62, !dbg !361 + %11299 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 63, !dbg !361 + %11300 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 64, !dbg !361 + %11301 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 65, !dbg !361 + %11302 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 66, !dbg !361 + %11303 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 67, !dbg !361 + %11304 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 68, !dbg !361 + %11305 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 69, !dbg !361 + %11306 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 70, !dbg !361 + %11307 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 71, !dbg !361 + %11308 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 72, !dbg !361 + %11309 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 73, !dbg !361 + %11310 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 74, !dbg !361 + %11311 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float } %11235, 75, !dbg !361 + %11312 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 76, !dbg !361 + %11313 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 77, !dbg !361 + %11314 = extractvalue { float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 78, !dbg !361 + %11315 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 79, !dbg !361 + %11316 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 80, !dbg !361 + %11317 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 81, !dbg !361 + %11318 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 82, !dbg !361 + %11319 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 83, !dbg !361 + %11320 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 84, !dbg !361 + %11321 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 85, !dbg !361 + %11322 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 86, !dbg !361 + %11323 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 87, !dbg !361 + %11324 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 88, !dbg !361 + %11325 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 89, !dbg !361 + %11326 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 90, !dbg !361 + %11327 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 91, !dbg !361 + %11328 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 92, !dbg !361 + %11329 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 93, !dbg !361 + %11330 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float } %11235, 94, !dbg !361 + %11331 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 95, !dbg !361 + %11332 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 96, !dbg !361 + %11333 = extractvalue { float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 97, !dbg !361 + %11334 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 98, !dbg !361 + %11335 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 99, !dbg !361 + %11336 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 100, !dbg !361 + %11337 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 101, !dbg !361 + %11338 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 102, !dbg !361 + %11339 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 103, !dbg !361 + %11340 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 104, !dbg !361 + %11341 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 105, !dbg !361 + %11342 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 106, !dbg !361 + %11343 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 107, !dbg !361 + %11344 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 108, !dbg !361 + %11345 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 109, !dbg !361 + %11346 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 110, !dbg !361 + %11347 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 111, !dbg !361 + %11348 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 112, !dbg !361 + %11349 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float } %11235, 113, !dbg !361 + %11350 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 114, !dbg !361 + %11351 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 115, !dbg !361 + %11352 = extractvalue { float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 116, !dbg !361 + %11353 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 117, !dbg !361 + %11354 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 118, !dbg !361 + %11355 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 119, !dbg !361 + %11356 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 120, !dbg !361 + %11357 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 121, !dbg !361 + %11358 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 122, !dbg !361 + %11359 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 123, !dbg !361 + %11360 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 124, !dbg !361 + %11361 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 125, !dbg !361 + %11362 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 126, !dbg !361 + %11363 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 127, !dbg !361 + tail call void @llvm.nvvm.cp.async.wait.group(i32 0), !dbg !361 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !361 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !276 + %exitcond2189.not = icmp eq i64 %indvars.iv.next, 4, !dbg !276 + br i1 %exitcond2189.not, label %11364, label %5761, !dbg !276 + +11364: ; preds = %._crit_edge1794 + %11365 = getelementptr bfloat, ptr addrspace(1) %55, i64 %4893, !dbg !398 + %11366 = getelementptr bfloat, ptr addrspace(1) %55, i64 %4895, !dbg !398 + %11367 = getelementptr bfloat, ptr addrspace(1) %55, i64 %4897, !dbg !398 + %11368 = getelementptr bfloat, ptr addrspace(1) %55, i64 %4899, !dbg !398 + %11369 = getelementptr 
bfloat, ptr addrspace(1) %55, i64 %4901, !dbg !398 + %11370 = getelementptr bfloat, ptr addrspace(1) %55, i64 %4903, !dbg !398 + %11371 = getelementptr bfloat, ptr addrspace(1) %55, i64 %4905, !dbg !398 + %11372 = getelementptr bfloat, ptr addrspace(1) %55, i64 %4907, !dbg !398 + %11373 = getelementptr bfloat, ptr addrspace(1) %11365, i64 %4911, !dbg !399 + %11374 = getelementptr bfloat, ptr addrspace(1) %11366, i64 %4911, !dbg !399 + %11375 = getelementptr bfloat, ptr addrspace(1) %11367, i64 %4911, !dbg !399 + %11376 = getelementptr bfloat, ptr addrspace(1) %11368, i64 %4911, !dbg !399 + %11377 = getelementptr bfloat, ptr addrspace(1) %11369, i64 %4911, !dbg !399 + %11378 = getelementptr bfloat, ptr addrspace(1) %11370, i64 %4911, !dbg !399 + %11379 = getelementptr bfloat, ptr addrspace(1) %11371, i64 %4911, !dbg !399 + %11380 = getelementptr bfloat, ptr addrspace(1) %11372, i64 %4911, !dbg !399 + %11381 = insertelement <2 x float> poison, float %11236, i64 0, !dbg !400 + %11382 = insertelement <2 x float> %11381, float %11237, i64 1, !dbg !400 + %11383 = fptrunc <2 x float> %11382 to <2 x bfloat>, !dbg !400 + %11384 = insertelement <2 x float> poison, float %11238, i64 0, !dbg !400 + %11385 = insertelement <2 x float> %11384, float %11239, i64 1, !dbg !400 + %11386 = fptrunc <2 x float> %11385 to <2 x bfloat>, !dbg !400 + %11387 = insertelement <2 x float> poison, float %11240, i64 0, !dbg !400 + %11388 = insertelement <2 x float> %11387, float %11241, i64 1, !dbg !400 + %11389 = fptrunc <2 x float> %11388 to <2 x bfloat>, !dbg !400 + %11390 = insertelement <2 x float> poison, float %11242, i64 0, !dbg !400 + %11391 = insertelement <2 x float> %11390, float %11243, i64 1, !dbg !400 + %11392 = fptrunc <2 x float> %11391 to <2 x bfloat>, !dbg !400 + %11393 = insertelement <2 x float> poison, float %11244, i64 0, !dbg !400 + %11394 = insertelement <2 x float> %11393, float %11245, i64 1, !dbg !400 + %11395 = fptrunc <2 x float> %11394 to <2 x bfloat>, !dbg !400 + 
%11396 = insertelement <2 x float> poison, float %11246, i64 0, !dbg !400 + %11397 = insertelement <2 x float> %11396, float %11247, i64 1, !dbg !400 + %11398 = fptrunc <2 x float> %11397 to <2 x bfloat>, !dbg !400 + %11399 = insertelement <2 x float> poison, float %11248, i64 0, !dbg !400 + %11400 = insertelement <2 x float> %11399, float %11249, i64 1, !dbg !400 + %11401 = fptrunc <2 x float> %11400 to <2 x bfloat>, !dbg !400 + %11402 = insertelement <2 x float> poison, float %11250, i64 0, !dbg !400 + %11403 = insertelement <2 x float> %11402, float %11251, i64 1, !dbg !400 + %11404 = fptrunc <2 x float> %11403 to <2 x bfloat>, !dbg !400 + %11405 = insertelement <2 x float> poison, float %11252, i64 0, !dbg !400 + %11406 = insertelement <2 x float> %11405, float %11253, i64 1, !dbg !400 + %11407 = fptrunc <2 x float> %11406 to <2 x bfloat>, !dbg !400 + %11408 = insertelement <2 x float> poison, float %11254, i64 0, !dbg !400 + %11409 = insertelement <2 x float> %11408, float %11255, i64 1, !dbg !400 + %11410 = fptrunc <2 x float> %11409 to <2 x bfloat>, !dbg !400 + %11411 = insertelement <2 x float> poison, float %11256, i64 0, !dbg !400 + %11412 = insertelement <2 x float> %11411, float %11257, i64 1, !dbg !400 + %11413 = fptrunc <2 x float> %11412 to <2 x bfloat>, !dbg !400 + %11414 = insertelement <2 x float> poison, float %11258, i64 0, !dbg !400 + %11415 = insertelement <2 x float> %11414, float %11259, i64 1, !dbg !400 + %11416 = fptrunc <2 x float> %11415 to <2 x bfloat>, !dbg !400 + %11417 = insertelement <2 x float> poison, float %11260, i64 0, !dbg !400 + %11418 = insertelement <2 x float> %11417, float %11261, i64 1, !dbg !400 + %11419 = fptrunc <2 x float> %11418 to <2 x bfloat>, !dbg !400 + %11420 = insertelement <2 x float> poison, float %11262, i64 0, !dbg !400 + %11421 = insertelement <2 x float> %11420, float %11263, i64 1, !dbg !400 + %11422 = fptrunc <2 x float> %11421 to <2 x bfloat>, !dbg !400 + %11423 = insertelement <2 x float> poison, 
float %11264, i64 0, !dbg !400 + %11424 = insertelement <2 x float> %11423, float %11265, i64 1, !dbg !400 + %11425 = fptrunc <2 x float> %11424 to <2 x bfloat>, !dbg !400 + %11426 = insertelement <2 x float> poison, float %11266, i64 0, !dbg !400 + %11427 = insertelement <2 x float> %11426, float %11267, i64 1, !dbg !400 + %11428 = fptrunc <2 x float> %11427 to <2 x bfloat>, !dbg !400 + %11429 = insertelement <2 x float> poison, float %11268, i64 0, !dbg !400 + %11430 = insertelement <2 x float> %11429, float %11269, i64 1, !dbg !400 + %11431 = fptrunc <2 x float> %11430 to <2 x bfloat>, !dbg !400 + %11432 = insertelement <2 x float> poison, float %11270, i64 0, !dbg !400 + %11433 = insertelement <2 x float> %11432, float %11271, i64 1, !dbg !400 + %11434 = fptrunc <2 x float> %11433 to <2 x bfloat>, !dbg !400 + %11435 = insertelement <2 x float> poison, float %11272, i64 0, !dbg !400 + %11436 = insertelement <2 x float> %11435, float %11273, i64 1, !dbg !400 + %11437 = fptrunc <2 x float> %11436 to <2 x bfloat>, !dbg !400 + %11438 = insertelement <2 x float> poison, float %11274, i64 0, !dbg !400 + %11439 = insertelement <2 x float> %11438, float %11275, i64 1, !dbg !400 + %11440 = fptrunc <2 x float> %11439 to <2 x bfloat>, !dbg !400 + %11441 = insertelement <2 x float> poison, float %11276, i64 0, !dbg !400 + %11442 = insertelement <2 x float> %11441, float %11277, i64 1, !dbg !400 + %11443 = fptrunc <2 x float> %11442 to <2 x bfloat>, !dbg !400 + %11444 = insertelement <2 x float> poison, float %11278, i64 0, !dbg !400 + %11445 = insertelement <2 x float> %11444, float %11279, i64 1, !dbg !400 + %11446 = fptrunc <2 x float> %11445 to <2 x bfloat>, !dbg !400 + %11447 = insertelement <2 x float> poison, float %11280, i64 0, !dbg !400 + %11448 = insertelement <2 x float> %11447, float %11281, i64 1, !dbg !400 + %11449 = fptrunc <2 x float> %11448 to <2 x bfloat>, !dbg !400 + %11450 = insertelement <2 x float> poison, float %11282, i64 0, !dbg !400 + %11451 = 
insertelement <2 x float> %11450, float %11283, i64 1, !dbg !400 + %11452 = fptrunc <2 x float> %11451 to <2 x bfloat>, !dbg !400 + %11453 = insertelement <2 x float> poison, float %11284, i64 0, !dbg !400 + %11454 = insertelement <2 x float> %11453, float %11285, i64 1, !dbg !400 + %11455 = fptrunc <2 x float> %11454 to <2 x bfloat>, !dbg !400 + %11456 = insertelement <2 x float> poison, float %11286, i64 0, !dbg !400 + %11457 = insertelement <2 x float> %11456, float %11287, i64 1, !dbg !400 + %11458 = fptrunc <2 x float> %11457 to <2 x bfloat>, !dbg !400 + %11459 = insertelement <2 x float> poison, float %11288, i64 0, !dbg !400 + %11460 = insertelement <2 x float> %11459, float %11289, i64 1, !dbg !400 + %11461 = fptrunc <2 x float> %11460 to <2 x bfloat>, !dbg !400 + %11462 = insertelement <2 x float> poison, float %11290, i64 0, !dbg !400 + %11463 = insertelement <2 x float> %11462, float %11291, i64 1, !dbg !400 + %11464 = fptrunc <2 x float> %11463 to <2 x bfloat>, !dbg !400 + %11465 = insertelement <2 x float> poison, float %11292, i64 0, !dbg !400 + %11466 = insertelement <2 x float> %11465, float %11293, i64 1, !dbg !400 + %11467 = fptrunc <2 x float> %11466 to <2 x bfloat>, !dbg !400 + %11468 = insertelement <2 x float> poison, float %11294, i64 0, !dbg !400 + %11469 = insertelement <2 x float> %11468, float %11295, i64 1, !dbg !400 + %11470 = fptrunc <2 x float> %11469 to <2 x bfloat>, !dbg !400 + %11471 = insertelement <2 x float> poison, float %11296, i64 0, !dbg !400 + %11472 = insertelement <2 x float> %11471, float %11297, i64 1, !dbg !400 + %11473 = fptrunc <2 x float> %11472 to <2 x bfloat>, !dbg !400 + %11474 = insertelement <2 x float> poison, float %11298, i64 0, !dbg !400 + %11475 = insertelement <2 x float> %11474, float %11299, i64 1, !dbg !400 + %11476 = fptrunc <2 x float> %11475 to <2 x bfloat>, !dbg !400 + %11477 = shl nuw nsw i32 %5134, 13, !dbg !400 + %11478 = shl nuw nsw i32 %56, 5, !dbg !400 + %11479 = and i32 %11478, 7264, !dbg 
!400 + %11480 = and i32 %56, 24, !dbg !400 + %11481 = shl nuw nsw i32 %11480, 4, !dbg !400 + %11482 = shl nuw nsw i32 %56, 2, !dbg !400 + %11483 = and i32 %11482, 16, !dbg !400 + %11484 = or disjoint i32 %11477, %11483, !dbg !400 + %11485 = or disjoint i32 %11479, %11481, !dbg !400 + %11486 = or disjoint i32 %11484, %11485, !dbg !400 + %11487 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %11486, !dbg !400 + %11488 = bitcast <2 x bfloat> %11383 to i32, !dbg !400 + %11489 = bitcast <2 x bfloat> %11389 to i32, !dbg !400 + %11490 = bitcast <2 x bfloat> %11395 to i32, !dbg !400 + %11491 = bitcast <2 x bfloat> %11401 to i32, !dbg !400 + %11492 = insertelement <4 x i32> poison, i32 %11488, i64 0, !dbg !400 + %11493 = insertelement <4 x i32> %11492, i32 %11489, i64 1, !dbg !400 + %11494 = insertelement <4 x i32> %11493, i32 %11490, i64 2, !dbg !400 + %11495 = insertelement <4 x i32> %11494, i32 %11491, i64 3, !dbg !400 + store <4 x i32> %11495, ptr addrspace(3) %11487, align 16, !dbg !400 + %11496 = getelementptr inbounds nuw i8, ptr addrspace(3) %11487, i32 512, !dbg !400 + %11497 = bitcast <2 x bfloat> %11386 to i32, !dbg !400 + %11498 = bitcast <2 x bfloat> %11392 to i32, !dbg !400 + %11499 = bitcast <2 x bfloat> %11398 to i32, !dbg !400 + %11500 = bitcast <2 x bfloat> %11404 to i32, !dbg !400 + %11501 = insertelement <4 x i32> poison, i32 %11497, i64 0, !dbg !400 + %11502 = insertelement <4 x i32> %11501, i32 %11498, i64 1, !dbg !400 + %11503 = insertelement <4 x i32> %11502, i32 %11499, i64 2, !dbg !400 + %11504 = insertelement <4 x i32> %11503, i32 %11500, i64 3, !dbg !400 + store <4 x i32> %11504, ptr addrspace(3) %11496, align 16, !dbg !400 + %11505 = xor i32 %11486, 32, !dbg !400 + %11506 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %11505, !dbg !400 + %11507 = bitcast <2 x bfloat> %11407 to i32, !dbg !400 + %11508 = bitcast <2 x bfloat> %11413 to i32, !dbg !400 + %11509 = bitcast <2 x bfloat> %11419 to i32, !dbg !400 + 
%11510 = bitcast <2 x bfloat> %11425 to i32, !dbg !400 + %11511 = insertelement <4 x i32> poison, i32 %11507, i64 0, !dbg !400 + %11512 = insertelement <4 x i32> %11511, i32 %11508, i64 1, !dbg !400 + %11513 = insertelement <4 x i32> %11512, i32 %11509, i64 2, !dbg !400 + %11514 = insertelement <4 x i32> %11513, i32 %11510, i64 3, !dbg !400 + store <4 x i32> %11514, ptr addrspace(3) %11506, align 16, !dbg !400 + %11515 = getelementptr inbounds nuw i8, ptr addrspace(3) %11506, i32 512, !dbg !400 + %11516 = bitcast <2 x bfloat> %11410 to i32, !dbg !400 + %11517 = bitcast <2 x bfloat> %11416 to i32, !dbg !400 + %11518 = bitcast <2 x bfloat> %11422 to i32, !dbg !400 + %11519 = bitcast <2 x bfloat> %11428 to i32, !dbg !400 + %11520 = insertelement <4 x i32> poison, i32 %11516, i64 0, !dbg !400 + %11521 = insertelement <4 x i32> %11520, i32 %11517, i64 1, !dbg !400 + %11522 = insertelement <4 x i32> %11521, i32 %11518, i64 2, !dbg !400 + %11523 = insertelement <4 x i32> %11522, i32 %11519, i64 3, !dbg !400 + store <4 x i32> %11523, ptr addrspace(3) %11515, align 16, !dbg !400 + %11524 = xor i32 %11486, 64, !dbg !400 + %11525 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %11524, !dbg !400 + %11526 = bitcast <2 x bfloat> %11431 to i32, !dbg !400 + %11527 = bitcast <2 x bfloat> %11437 to i32, !dbg !400 + %11528 = bitcast <2 x bfloat> %11443 to i32, !dbg !400 + %11529 = bitcast <2 x bfloat> %11449 to i32, !dbg !400 + %11530 = insertelement <4 x i32> poison, i32 %11526, i64 0, !dbg !400 + %11531 = insertelement <4 x i32> %11530, i32 %11527, i64 1, !dbg !400 + %11532 = insertelement <4 x i32> %11531, i32 %11528, i64 2, !dbg !400 + %11533 = insertelement <4 x i32> %11532, i32 %11529, i64 3, !dbg !400 + store <4 x i32> %11533, ptr addrspace(3) %11525, align 16, !dbg !400 + %11534 = getelementptr inbounds nuw i8, ptr addrspace(3) %11525, i32 512, !dbg !400 + %11535 = bitcast <2 x bfloat> %11434 to i32, !dbg !400 + %11536 = bitcast <2 x bfloat> %11440 to i32, 
!dbg !400 + %11537 = bitcast <2 x bfloat> %11446 to i32, !dbg !400 + %11538 = bitcast <2 x bfloat> %11452 to i32, !dbg !400 + %11539 = insertelement <4 x i32> poison, i32 %11535, i64 0, !dbg !400 + %11540 = insertelement <4 x i32> %11539, i32 %11536, i64 1, !dbg !400 + %11541 = insertelement <4 x i32> %11540, i32 %11537, i64 2, !dbg !400 + %11542 = insertelement <4 x i32> %11541, i32 %11538, i64 3, !dbg !400 + store <4 x i32> %11542, ptr addrspace(3) %11534, align 16, !dbg !400 + %11543 = xor i32 %11486, 96, !dbg !400 + %11544 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %11543, !dbg !400 + %11545 = bitcast <2 x bfloat> %11455 to i32, !dbg !400 + %11546 = bitcast <2 x bfloat> %11461 to i32, !dbg !400 + %11547 = bitcast <2 x bfloat> %11467 to i32, !dbg !400 + %11548 = bitcast <2 x bfloat> %11473 to i32, !dbg !400 + %11549 = insertelement <4 x i32> poison, i32 %11545, i64 0, !dbg !400 + %11550 = insertelement <4 x i32> %11549, i32 %11546, i64 1, !dbg !400 + %11551 = insertelement <4 x i32> %11550, i32 %11547, i64 2, !dbg !400 + %11552 = insertelement <4 x i32> %11551, i32 %11548, i64 3, !dbg !400 + store <4 x i32> %11552, ptr addrspace(3) %11544, align 16, !dbg !400 + %11553 = getelementptr inbounds nuw i8, ptr addrspace(3) %11544, i32 512, !dbg !400 + %11554 = bitcast <2 x bfloat> %11458 to i32, !dbg !400 + %11555 = bitcast <2 x bfloat> %11464 to i32, !dbg !400 + %11556 = bitcast <2 x bfloat> %11470 to i32, !dbg !400 + %11557 = bitcast <2 x bfloat> %11476 to i32, !dbg !400 + %11558 = insertelement <4 x i32> poison, i32 %11554, i64 0, !dbg !400 + %11559 = insertelement <4 x i32> %11558, i32 %11555, i64 1, !dbg !400 + %11560 = insertelement <4 x i32> %11559, i32 %11556, i64 2, !dbg !400 + %11561 = insertelement <4 x i32> %11560, i32 %11557, i64 3, !dbg !400 + store <4 x i32> %11561, ptr addrspace(3) %11553, align 16, !dbg !400 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !400 + %11562 = shl nuw nsw i32 %11480, 10, !dbg 
!400 + %11563 = shl nuw nsw i32 %5134, 5, !dbg !400 + %11564 = and i32 %11482, 1008, !dbg !400 + %11565 = or disjoint i32 %11562, %11563, !dbg !400 + %11566 = xor i32 %11565, %11564, !dbg !400 + %11567 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %11566, !dbg !400 + %11568 = ptrtoint ptr addrspace(3) %11567 to i32, !dbg !400 + %11569 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %11568) #3, !dbg !400 + %11570 = extractvalue { i32, i32, i32, i32 } %11569, 0, !dbg !400 + %11571 = extractvalue { i32, i32, i32, i32 } %11569, 1, !dbg !400 + %11572 = extractvalue { i32, i32, i32, i32 } %11569, 2, !dbg !400 + %11573 = extractvalue { i32, i32, i32, i32 } %11569, 3, !dbg !400 + %11574 = getelementptr inbounds nuw i8, ptr addrspace(3) %11567, i32 1024, !dbg !400 + %11575 = ptrtoint ptr addrspace(3) %11574 to i32, !dbg !400 + %11576 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %11575) #3, !dbg !400 + %11577 = extractvalue { i32, i32, i32, i32 } %11576, 0, !dbg !400 + %11578 = extractvalue { i32, i32, i32, i32 } %11576, 1, !dbg !400 + %11579 = extractvalue { i32, i32, i32, i32 } %11576, 2, !dbg !400 + %11580 = extractvalue { i32, i32, i32, i32 } %11576, 3, !dbg !400 + %11581 = getelementptr inbounds nuw i8, ptr addrspace(3) %11567, i32 2048, !dbg !400 + %11582 = ptrtoint ptr addrspace(3) %11581 to i32, !dbg !400 + %11583 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %11582) #3, !dbg !400 + %11584 = extractvalue { i32, i32, i32, i32 } %11583, 0, !dbg !400 + %11585 = extractvalue { i32, i32, i32, i32 } %11583, 1, !dbg !400 + %11586 = extractvalue { i32, i32, i32, i32 } %11583, 2, !dbg !400 + %11587 = extractvalue { i32, i32, i32, i32 } %11583, 3, !dbg !400 + %11588 = getelementptr 
inbounds nuw i8, ptr addrspace(3) %11567, i32 3072, !dbg !400 + %11589 = ptrtoint ptr addrspace(3) %11588 to i32, !dbg !400 + %11590 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %11589) #3, !dbg !400 + %11591 = extractvalue { i32, i32, i32, i32 } %11590, 0, !dbg !400 + %11592 = extractvalue { i32, i32, i32, i32 } %11590, 1, !dbg !400 + %11593 = extractvalue { i32, i32, i32, i32 } %11590, 2, !dbg !400 + %11594 = extractvalue { i32, i32, i32, i32 } %11590, 3, !dbg !400 + %11595 = getelementptr inbounds nuw i8, ptr addrspace(3) %11567, i32 4096, !dbg !400 + %11596 = ptrtoint ptr addrspace(3) %11595 to i32, !dbg !400 + %11597 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %11596) #3, !dbg !400 + %11598 = extractvalue { i32, i32, i32, i32 } %11597, 0, !dbg !400 + %11599 = extractvalue { i32, i32, i32, i32 } %11597, 1, !dbg !400 + %11600 = extractvalue { i32, i32, i32, i32 } %11597, 2, !dbg !400 + %11601 = extractvalue { i32, i32, i32, i32 } %11597, 3, !dbg !400 + %11602 = getelementptr inbounds nuw i8, ptr addrspace(3) %11567, i32 5120, !dbg !400 + %11603 = ptrtoint ptr addrspace(3) %11602 to i32, !dbg !400 + %11604 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %11603) #3, !dbg !400 + %11605 = extractvalue { i32, i32, i32, i32 } %11604, 0, !dbg !400 + %11606 = extractvalue { i32, i32, i32, i32 } %11604, 1, !dbg !400 + %11607 = extractvalue { i32, i32, i32, i32 } %11604, 2, !dbg !400 + %11608 = extractvalue { i32, i32, i32, i32 } %11604, 3, !dbg !400 + %11609 = getelementptr inbounds nuw i8, ptr addrspace(3) %11567, i32 6144, !dbg !400 + %11610 = ptrtoint ptr addrspace(3) %11609 to i32, !dbg !400 + %11611 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, 
$1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %11610) #3, !dbg !400 + %11612 = extractvalue { i32, i32, i32, i32 } %11611, 0, !dbg !400 + %11613 = extractvalue { i32, i32, i32, i32 } %11611, 1, !dbg !400 + %11614 = extractvalue { i32, i32, i32, i32 } %11611, 2, !dbg !400 + %11615 = extractvalue { i32, i32, i32, i32 } %11611, 3, !dbg !400 + %11616 = getelementptr inbounds nuw i8, ptr addrspace(3) %11567, i32 7168, !dbg !400 + %11617 = ptrtoint ptr addrspace(3) %11616 to i32, !dbg !400 + %11618 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %11617) #3, !dbg !400 + %11619 = extractvalue { i32, i32, i32, i32 } %11618, 0, !dbg !400 + %11620 = extractvalue { i32, i32, i32, i32 } %11618, 1, !dbg !400 + %11621 = extractvalue { i32, i32, i32, i32 } %11618, 2, !dbg !400 + %11622 = extractvalue { i32, i32, i32, i32 } %11618, 3, !dbg !400 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %11570, i32 %11571, i32 %11572, i32 %11573, ptr addrspace(1) %11373, i1 %4920) #3, !dbg !400 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %11577, i32 %11578, i32 %11579, i32 %11580, ptr addrspace(1) %11374, i1 %4921) #3, !dbg !400 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %11584, i32 %11585, i32 %11586, i32 %11587, ptr addrspace(1) %11375, i1 %4922) #3, !dbg !400 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %11591, i32 %11592, i32 %11593, i32 %11594, ptr addrspace(1) %11376, i1 %4923) #3, !dbg !400 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %11598, i32 %11599, i32 %11600, i32 %11601, ptr addrspace(1) %11377, i1 %4924) #3, !dbg !400 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, 
$3 };", "r,r,r,r,l,b"(i32 %11605, i32 %11606, i32 %11607, i32 %11608, ptr addrspace(1) %11378, i1 %4925) #3, !dbg !400 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %11612, i32 %11613, i32 %11614, i32 %11615, ptr addrspace(1) %11379, i1 %4926) #3, !dbg !400 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %11619, i32 %11620, i32 %11621, i32 %11622, ptr addrspace(1) %11380, i1 %4927) #3, !dbg !400 + %11623 = insertelement <2 x float> poison, float %11300, i64 0, !dbg !401 + %11624 = insertelement <2 x float> %11623, float %11301, i64 1, !dbg !401 + %11625 = fmul <2 x float> %11624, splat (float 0x3FB6A09E60000000), !dbg !401 + %11626 = insertelement <2 x float> poison, float %11302, i64 0, !dbg !401 + %11627 = insertelement <2 x float> %11626, float %11303, i64 1, !dbg !401 + %11628 = fmul <2 x float> %11627, splat (float 0x3FB6A09E60000000), !dbg !401 + %11629 = insertelement <2 x float> poison, float %11304, i64 0, !dbg !401 + %11630 = insertelement <2 x float> %11629, float %11305, i64 1, !dbg !401 + %11631 = fmul <2 x float> %11630, splat (float 0x3FB6A09E60000000), !dbg !401 + %11632 = insertelement <2 x float> poison, float %11306, i64 0, !dbg !401 + %11633 = insertelement <2 x float> %11632, float %11307, i64 1, !dbg !401 + %11634 = fmul <2 x float> %11633, splat (float 0x3FB6A09E60000000), !dbg !401 + %11635 = insertelement <2 x float> poison, float %11308, i64 0, !dbg !401 + %11636 = insertelement <2 x float> %11635, float %11309, i64 1, !dbg !401 + %11637 = fmul <2 x float> %11636, splat (float 0x3FB6A09E60000000), !dbg !401 + %11638 = insertelement <2 x float> poison, float %11310, i64 0, !dbg !401 + %11639 = insertelement <2 x float> %11638, float %11311, i64 1, !dbg !401 + %11640 = fmul <2 x float> %11639, splat (float 0x3FB6A09E60000000), !dbg !401 + %11641 = insertelement <2 x float> poison, float %11312, i64 0, !dbg !401 + %11642 = 
insertelement <2 x float> %11641, float %11313, i64 1, !dbg !401 + %11643 = fmul <2 x float> %11642, splat (float 0x3FB6A09E60000000), !dbg !401 + %11644 = insertelement <2 x float> poison, float %11314, i64 0, !dbg !401 + %11645 = insertelement <2 x float> %11644, float %11315, i64 1, !dbg !401 + %11646 = fmul <2 x float> %11645, splat (float 0x3FB6A09E60000000), !dbg !401 + %11647 = insertelement <2 x float> poison, float %11316, i64 0, !dbg !401 + %11648 = insertelement <2 x float> %11647, float %11317, i64 1, !dbg !401 + %11649 = fmul <2 x float> %11648, splat (float 0x3FB6A09E60000000), !dbg !401 + %11650 = insertelement <2 x float> poison, float %11318, i64 0, !dbg !401 + %11651 = insertelement <2 x float> %11650, float %11319, i64 1, !dbg !401 + %11652 = fmul <2 x float> %11651, splat (float 0x3FB6A09E60000000), !dbg !401 + %11653 = insertelement <2 x float> poison, float %11320, i64 0, !dbg !401 + %11654 = insertelement <2 x float> %11653, float %11321, i64 1, !dbg !401 + %11655 = fmul <2 x float> %11654, splat (float 0x3FB6A09E60000000), !dbg !401 + %11656 = insertelement <2 x float> poison, float %11322, i64 0, !dbg !401 + %11657 = insertelement <2 x float> %11656, float %11323, i64 1, !dbg !401 + %11658 = fmul <2 x float> %11657, splat (float 0x3FB6A09E60000000), !dbg !401 + %11659 = insertelement <2 x float> poison, float %11324, i64 0, !dbg !401 + %11660 = insertelement <2 x float> %11659, float %11325, i64 1, !dbg !401 + %11661 = fmul <2 x float> %11660, splat (float 0x3FB6A09E60000000), !dbg !401 + %11662 = insertelement <2 x float> poison, float %11326, i64 0, !dbg !401 + %11663 = insertelement <2 x float> %11662, float %11327, i64 1, !dbg !401 + %11664 = fmul <2 x float> %11663, splat (float 0x3FB6A09E60000000), !dbg !401 + %11665 = insertelement <2 x float> poison, float %11328, i64 0, !dbg !401 + %11666 = insertelement <2 x float> %11665, float %11329, i64 1, !dbg !401 + %11667 = fmul <2 x float> %11666, splat (float 0x3FB6A09E60000000), !dbg 
!401 + %11668 = insertelement <2 x float> poison, float %11330, i64 0, !dbg !401 + %11669 = insertelement <2 x float> %11668, float %11331, i64 1, !dbg !401 + %11670 = fmul <2 x float> %11669, splat (float 0x3FB6A09E60000000), !dbg !401 + %11671 = insertelement <2 x float> poison, float %11332, i64 0, !dbg !401 + %11672 = insertelement <2 x float> %11671, float %11333, i64 1, !dbg !401 + %11673 = fmul <2 x float> %11672, splat (float 0x3FB6A09E60000000), !dbg !401 + %11674 = insertelement <2 x float> poison, float %11334, i64 0, !dbg !401 + %11675 = insertelement <2 x float> %11674, float %11335, i64 1, !dbg !401 + %11676 = fmul <2 x float> %11675, splat (float 0x3FB6A09E60000000), !dbg !401 + %11677 = insertelement <2 x float> poison, float %11336, i64 0, !dbg !401 + %11678 = insertelement <2 x float> %11677, float %11337, i64 1, !dbg !401 + %11679 = fmul <2 x float> %11678, splat (float 0x3FB6A09E60000000), !dbg !401 + %11680 = insertelement <2 x float> poison, float %11338, i64 0, !dbg !401 + %11681 = insertelement <2 x float> %11680, float %11339, i64 1, !dbg !401 + %11682 = fmul <2 x float> %11681, splat (float 0x3FB6A09E60000000), !dbg !401 + %11683 = insertelement <2 x float> poison, float %11340, i64 0, !dbg !401 + %11684 = insertelement <2 x float> %11683, float %11341, i64 1, !dbg !401 + %11685 = fmul <2 x float> %11684, splat (float 0x3FB6A09E60000000), !dbg !401 + %11686 = insertelement <2 x float> poison, float %11342, i64 0, !dbg !401 + %11687 = insertelement <2 x float> %11686, float %11343, i64 1, !dbg !401 + %11688 = fmul <2 x float> %11687, splat (float 0x3FB6A09E60000000), !dbg !401 + %11689 = insertelement <2 x float> poison, float %11344, i64 0, !dbg !401 + %11690 = insertelement <2 x float> %11689, float %11345, i64 1, !dbg !401 + %11691 = fmul <2 x float> %11690, splat (float 0x3FB6A09E60000000), !dbg !401 + %11692 = insertelement <2 x float> poison, float %11346, i64 0, !dbg !401 + %11693 = insertelement <2 x float> %11692, float %11347, i64 
1, !dbg !401 + %11694 = fmul <2 x float> %11693, splat (float 0x3FB6A09E60000000), !dbg !401 + %11695 = insertelement <2 x float> poison, float %11348, i64 0, !dbg !401 + %11696 = insertelement <2 x float> %11695, float %11349, i64 1, !dbg !401 + %11697 = fmul <2 x float> %11696, splat (float 0x3FB6A09E60000000), !dbg !401 + %11698 = insertelement <2 x float> poison, float %11350, i64 0, !dbg !401 + %11699 = insertelement <2 x float> %11698, float %11351, i64 1, !dbg !401 + %11700 = fmul <2 x float> %11699, splat (float 0x3FB6A09E60000000), !dbg !401 + %11701 = insertelement <2 x float> poison, float %11352, i64 0, !dbg !401 + %11702 = insertelement <2 x float> %11701, float %11353, i64 1, !dbg !401 + %11703 = fmul <2 x float> %11702, splat (float 0x3FB6A09E60000000), !dbg !401 + %11704 = insertelement <2 x float> poison, float %11354, i64 0, !dbg !401 + %11705 = insertelement <2 x float> %11704, float %11355, i64 1, !dbg !401 + %11706 = fmul <2 x float> %11705, splat (float 0x3FB6A09E60000000), !dbg !401 + %11707 = insertelement <2 x float> poison, float %11356, i64 0, !dbg !401 + %11708 = insertelement <2 x float> %11707, float %11357, i64 1, !dbg !401 + %11709 = fmul <2 x float> %11708, splat (float 0x3FB6A09E60000000), !dbg !401 + %11710 = insertelement <2 x float> poison, float %11358, i64 0, !dbg !401 + %11711 = insertelement <2 x float> %11710, float %11359, i64 1, !dbg !401 + %11712 = fmul <2 x float> %11711, splat (float 0x3FB6A09E60000000), !dbg !401 + %11713 = insertelement <2 x float> poison, float %11360, i64 0, !dbg !401 + %11714 = insertelement <2 x float> %11713, float %11361, i64 1, !dbg !401 + %11715 = fmul <2 x float> %11714, splat (float 0x3FB6A09E60000000), !dbg !401 + %11716 = insertelement <2 x float> poison, float %11362, i64 0, !dbg !401 + %11717 = insertelement <2 x float> %11716, float %11363, i64 1, !dbg !401 + %11718 = fmul <2 x float> %11717, splat (float 0x3FB6A09E60000000), !dbg !401 + %11719 = or disjoint i32 %4885, %4910, !dbg !402 
+ %11720 = or disjoint i32 %4886, %4910, !dbg !402 + %11721 = or disjoint i32 %4887, %4910, !dbg !402 + %11722 = or disjoint i32 %4888, %4910, !dbg !402 + %11723 = or disjoint i32 %4889, %4910, !dbg !402 + %11724 = or disjoint i32 %4890, %4910, !dbg !402 + %11725 = or disjoint i32 %4891, %4910, !dbg !402 + %11726 = or disjoint i32 %4892, %4910, !dbg !402 + %11727 = shl nuw nsw i32 %44, 7, !dbg !403 + %11728 = mul i32 %19, %11727, !dbg !404 + %11729 = add i32 %11719, %11728, !dbg !405 + %11730 = add i32 %11720, %11728, !dbg !405 + %11731 = add i32 %11721, %11728, !dbg !405 + %11732 = add i32 %11722, %11728, !dbg !405 + %11733 = add i32 %11723, %11728, !dbg !405 + %11734 = add i32 %11724, %11728, !dbg !405 + %11735 = add i32 %11725, %11728, !dbg !405 + %11736 = add i32 %11726, %11728, !dbg !405 + %11737 = shl nuw nsw i32 %43, 10, !dbg !406 + %11738 = mul i32 %19, %11737, !dbg !407 + %11739 = add i32 %11729, %11738, !dbg !408 + %11740 = add i32 %11730, %11738, !dbg !408 + %11741 = add i32 %11731, %11738, !dbg !408 + %11742 = add i32 %11732, %11738, !dbg !408 + %11743 = add i32 %11733, %11738, !dbg !408 + %11744 = add i32 %11734, %11738, !dbg !408 + %11745 = add i32 %11735, %11738, !dbg !408 + %11746 = add i32 %11736, %11738, !dbg !408 + %11747 = sext i32 %11739 to i64, !dbg !409 + %11748 = getelementptr bfloat, ptr addrspace(1) %17, i64 %11747, !dbg !409 + %11749 = sext i32 %11740 to i64, !dbg !409 + %11750 = getelementptr bfloat, ptr addrspace(1) %17, i64 %11749, !dbg !409 + %11751 = sext i32 %11741 to i64, !dbg !409 + %11752 = getelementptr bfloat, ptr addrspace(1) %17, i64 %11751, !dbg !409 + %11753 = sext i32 %11742 to i64, !dbg !409 + %11754 = getelementptr bfloat, ptr addrspace(1) %17, i64 %11753, !dbg !409 + %11755 = sext i32 %11743 to i64, !dbg !409 + %11756 = getelementptr bfloat, ptr addrspace(1) %17, i64 %11755, !dbg !409 + %11757 = sext i32 %11744 to i64, !dbg !409 + %11758 = getelementptr bfloat, ptr addrspace(1) %17, i64 %11757, !dbg !409 + %11759 = sext 
i32 %11745 to i64, !dbg !409 + %11760 = getelementptr bfloat, ptr addrspace(1) %17, i64 %11759, !dbg !409 + %11761 = sext i32 %11746 to i64, !dbg !409 + %11762 = getelementptr bfloat, ptr addrspace(1) %17, i64 %11761, !dbg !409 + %11763 = fptrunc <2 x float> %11625 to <2 x bfloat>, !dbg !410 + %11764 = fptrunc <2 x float> %11628 to <2 x bfloat>, !dbg !410 + %11765 = fptrunc <2 x float> %11631 to <2 x bfloat>, !dbg !410 + %11766 = fptrunc <2 x float> %11634 to <2 x bfloat>, !dbg !410 + %11767 = fptrunc <2 x float> %11637 to <2 x bfloat>, !dbg !410 + %11768 = fptrunc <2 x float> %11640 to <2 x bfloat>, !dbg !410 + %11769 = fptrunc <2 x float> %11643 to <2 x bfloat>, !dbg !410 + %11770 = fptrunc <2 x float> %11646 to <2 x bfloat>, !dbg !410 + %11771 = fptrunc <2 x float> %11649 to <2 x bfloat>, !dbg !410 + %11772 = fptrunc <2 x float> %11652 to <2 x bfloat>, !dbg !410 + %11773 = fptrunc <2 x float> %11655 to <2 x bfloat>, !dbg !410 + %11774 = fptrunc <2 x float> %11658 to <2 x bfloat>, !dbg !410 + %11775 = fptrunc <2 x float> %11661 to <2 x bfloat>, !dbg !410 + %11776 = fptrunc <2 x float> %11664 to <2 x bfloat>, !dbg !410 + %11777 = fptrunc <2 x float> %11667 to <2 x bfloat>, !dbg !410 + %11778 = fptrunc <2 x float> %11670 to <2 x bfloat>, !dbg !410 + %11779 = fptrunc <2 x float> %11673 to <2 x bfloat>, !dbg !410 + %11780 = fptrunc <2 x float> %11676 to <2 x bfloat>, !dbg !410 + %11781 = fptrunc <2 x float> %11679 to <2 x bfloat>, !dbg !410 + %11782 = fptrunc <2 x float> %11682 to <2 x bfloat>, !dbg !410 + %11783 = fptrunc <2 x float> %11685 to <2 x bfloat>, !dbg !410 + %11784 = fptrunc <2 x float> %11688 to <2 x bfloat>, !dbg !410 + %11785 = fptrunc <2 x float> %11691 to <2 x bfloat>, !dbg !410 + %11786 = fptrunc <2 x float> %11694 to <2 x bfloat>, !dbg !410 + %11787 = fptrunc <2 x float> %11697 to <2 x bfloat>, !dbg !410 + %11788 = fptrunc <2 x float> %11700 to <2 x bfloat>, !dbg !410 + %11789 = fptrunc <2 x float> %11703 to <2 x bfloat>, !dbg !410 + %11790 = 
fptrunc <2 x float> %11706 to <2 x bfloat>, !dbg !410 + %11791 = fptrunc <2 x float> %11709 to <2 x bfloat>, !dbg !410 + %11792 = fptrunc <2 x float> %11712 to <2 x bfloat>, !dbg !410 + %11793 = fptrunc <2 x float> %11715 to <2 x bfloat>, !dbg !410 + %11794 = fptrunc <2 x float> %11718 to <2 x bfloat>, !dbg !410 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !410 + %11795 = bitcast <2 x bfloat> %11763 to i32, !dbg !410 + %11796 = bitcast <2 x bfloat> %11765 to i32, !dbg !410 + %11797 = bitcast <2 x bfloat> %11767 to i32, !dbg !410 + %11798 = bitcast <2 x bfloat> %11769 to i32, !dbg !410 + %11799 = insertelement <4 x i32> poison, i32 %11795, i64 0, !dbg !410 + %11800 = insertelement <4 x i32> %11799, i32 %11796, i64 1, !dbg !410 + %11801 = insertelement <4 x i32> %11800, i32 %11797, i64 2, !dbg !410 + %11802 = insertelement <4 x i32> %11801, i32 %11798, i64 3, !dbg !410 + store <4 x i32> %11802, ptr addrspace(3) %11487, align 16, !dbg !410 + %11803 = bitcast <2 x bfloat> %11764 to i32, !dbg !410 + %11804 = bitcast <2 x bfloat> %11766 to i32, !dbg !410 + %11805 = bitcast <2 x bfloat> %11768 to i32, !dbg !410 + %11806 = bitcast <2 x bfloat> %11770 to i32, !dbg !410 + %11807 = insertelement <4 x i32> poison, i32 %11803, i64 0, !dbg !410 + %11808 = insertelement <4 x i32> %11807, i32 %11804, i64 1, !dbg !410 + %11809 = insertelement <4 x i32> %11808, i32 %11805, i64 2, !dbg !410 + %11810 = insertelement <4 x i32> %11809, i32 %11806, i64 3, !dbg !410 + store <4 x i32> %11810, ptr addrspace(3) %11496, align 16, !dbg !410 + %11811 = bitcast <2 x bfloat> %11771 to i32, !dbg !410 + %11812 = bitcast <2 x bfloat> %11773 to i32, !dbg !410 + %11813 = bitcast <2 x bfloat> %11775 to i32, !dbg !410 + %11814 = bitcast <2 x bfloat> %11777 to i32, !dbg !410 + %11815 = insertelement <4 x i32> poison, i32 %11811, i64 0, !dbg !410 + %11816 = insertelement <4 x i32> %11815, i32 %11812, i64 1, !dbg !410 + %11817 = insertelement <4 x i32> %11816, i32 %11813, i64 2, 
!dbg !410 + %11818 = insertelement <4 x i32> %11817, i32 %11814, i64 3, !dbg !410 + store <4 x i32> %11818, ptr addrspace(3) %11506, align 16, !dbg !410 + %11819 = bitcast <2 x bfloat> %11772 to i32, !dbg !410 + %11820 = bitcast <2 x bfloat> %11774 to i32, !dbg !410 + %11821 = bitcast <2 x bfloat> %11776 to i32, !dbg !410 + %11822 = bitcast <2 x bfloat> %11778 to i32, !dbg !410 + %11823 = insertelement <4 x i32> poison, i32 %11819, i64 0, !dbg !410 + %11824 = insertelement <4 x i32> %11823, i32 %11820, i64 1, !dbg !410 + %11825 = insertelement <4 x i32> %11824, i32 %11821, i64 2, !dbg !410 + %11826 = insertelement <4 x i32> %11825, i32 %11822, i64 3, !dbg !410 + store <4 x i32> %11826, ptr addrspace(3) %11515, align 16, !dbg !410 + %11827 = bitcast <2 x bfloat> %11779 to i32, !dbg !410 + %11828 = bitcast <2 x bfloat> %11781 to i32, !dbg !410 + %11829 = bitcast <2 x bfloat> %11783 to i32, !dbg !410 + %11830 = bitcast <2 x bfloat> %11785 to i32, !dbg !410 + %11831 = insertelement <4 x i32> poison, i32 %11827, i64 0, !dbg !410 + %11832 = insertelement <4 x i32> %11831, i32 %11828, i64 1, !dbg !410 + %11833 = insertelement <4 x i32> %11832, i32 %11829, i64 2, !dbg !410 + %11834 = insertelement <4 x i32> %11833, i32 %11830, i64 3, !dbg !410 + store <4 x i32> %11834, ptr addrspace(3) %11525, align 16, !dbg !410 + %11835 = bitcast <2 x bfloat> %11780 to i32, !dbg !410 + %11836 = bitcast <2 x bfloat> %11782 to i32, !dbg !410 + %11837 = bitcast <2 x bfloat> %11784 to i32, !dbg !410 + %11838 = bitcast <2 x bfloat> %11786 to i32, !dbg !410 + %11839 = insertelement <4 x i32> poison, i32 %11835, i64 0, !dbg !410 + %11840 = insertelement <4 x i32> %11839, i32 %11836, i64 1, !dbg !410 + %11841 = insertelement <4 x i32> %11840, i32 %11837, i64 2, !dbg !410 + %11842 = insertelement <4 x i32> %11841, i32 %11838, i64 3, !dbg !410 + store <4 x i32> %11842, ptr addrspace(3) %11534, align 16, !dbg !410 + %11843 = bitcast <2 x bfloat> %11787 to i32, !dbg !410 + %11844 = bitcast <2 x 
bfloat> %11789 to i32, !dbg !410 + %11845 = bitcast <2 x bfloat> %11791 to i32, !dbg !410 + %11846 = bitcast <2 x bfloat> %11793 to i32, !dbg !410 + %11847 = insertelement <4 x i32> poison, i32 %11843, i64 0, !dbg !410 + %11848 = insertelement <4 x i32> %11847, i32 %11844, i64 1, !dbg !410 + %11849 = insertelement <4 x i32> %11848, i32 %11845, i64 2, !dbg !410 + %11850 = insertelement <4 x i32> %11849, i32 %11846, i64 3, !dbg !410 + store <4 x i32> %11850, ptr addrspace(3) %11544, align 16, !dbg !410 + %11851 = bitcast <2 x bfloat> %11788 to i32, !dbg !410 + %11852 = bitcast <2 x bfloat> %11790 to i32, !dbg !410 + %11853 = bitcast <2 x bfloat> %11792 to i32, !dbg !410 + %11854 = bitcast <2 x bfloat> %11794 to i32, !dbg !410 + %11855 = insertelement <4 x i32> poison, i32 %11851, i64 0, !dbg !410 + %11856 = insertelement <4 x i32> %11855, i32 %11852, i64 1, !dbg !410 + %11857 = insertelement <4 x i32> %11856, i32 %11853, i64 2, !dbg !410 + %11858 = insertelement <4 x i32> %11857, i32 %11854, i64 3, !dbg !410 + store <4 x i32> %11858, ptr addrspace(3) %11553, align 16, !dbg !410 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !410 + %11859 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %11568) #3, !dbg !410 + %11860 = extractvalue { i32, i32, i32, i32 } %11859, 0, !dbg !410 + %11861 = extractvalue { i32, i32, i32, i32 } %11859, 1, !dbg !410 + %11862 = extractvalue { i32, i32, i32, i32 } %11859, 2, !dbg !410 + %11863 = extractvalue { i32, i32, i32, i32 } %11859, 3, !dbg !410 + %11864 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %11575) #3, !dbg !410 + %11865 = extractvalue { i32, i32, i32, i32 } %11864, 0, !dbg !410 + %11866 = extractvalue { i32, i32, i32, i32 } %11864, 1, !dbg !410 + %11867 = extractvalue { i32, i32, i32, i32 } %11864, 2, !dbg !410 + %11868 = 
extractvalue { i32, i32, i32, i32 } %11864, 3, !dbg !410 + %11869 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %11582) #3, !dbg !410 + %11870 = extractvalue { i32, i32, i32, i32 } %11869, 0, !dbg !410 + %11871 = extractvalue { i32, i32, i32, i32 } %11869, 1, !dbg !410 + %11872 = extractvalue { i32, i32, i32, i32 } %11869, 2, !dbg !410 + %11873 = extractvalue { i32, i32, i32, i32 } %11869, 3, !dbg !410 + %11874 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %11589) #3, !dbg !410 + %11875 = extractvalue { i32, i32, i32, i32 } %11874, 0, !dbg !410 + %11876 = extractvalue { i32, i32, i32, i32 } %11874, 1, !dbg !410 + %11877 = extractvalue { i32, i32, i32, i32 } %11874, 2, !dbg !410 + %11878 = extractvalue { i32, i32, i32, i32 } %11874, 3, !dbg !410 + %11879 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %11596) #3, !dbg !410 + %11880 = extractvalue { i32, i32, i32, i32 } %11879, 0, !dbg !410 + %11881 = extractvalue { i32, i32, i32, i32 } %11879, 1, !dbg !410 + %11882 = extractvalue { i32, i32, i32, i32 } %11879, 2, !dbg !410 + %11883 = extractvalue { i32, i32, i32, i32 } %11879, 3, !dbg !410 + %11884 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %11603) #3, !dbg !410 + %11885 = extractvalue { i32, i32, i32, i32 } %11884, 0, !dbg !410 + %11886 = extractvalue { i32, i32, i32, i32 } %11884, 1, !dbg !410 + %11887 = extractvalue { i32, i32, i32, i32 } %11884, 2, !dbg !410 + %11888 = extractvalue { i32, i32, i32, i32 } %11884, 3, !dbg !410 + %11889 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %11610) #3, !dbg !410 + %11890 
= extractvalue { i32, i32, i32, i32 } %11889, 0, !dbg !410 + %11891 = extractvalue { i32, i32, i32, i32 } %11889, 1, !dbg !410 + %11892 = extractvalue { i32, i32, i32, i32 } %11889, 2, !dbg !410 + %11893 = extractvalue { i32, i32, i32, i32 } %11889, 3, !dbg !410 + %11894 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %11617) #3, !dbg !410 + %11895 = extractvalue { i32, i32, i32, i32 } %11894, 0, !dbg !410 + %11896 = extractvalue { i32, i32, i32, i32 } %11894, 1, !dbg !410 + %11897 = extractvalue { i32, i32, i32, i32 } %11894, 2, !dbg !410 + %11898 = extractvalue { i32, i32, i32, i32 } %11894, 3, !dbg !410 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %11860, i32 %11861, i32 %11862, i32 %11863, ptr addrspace(1) %11748, i1 %4920) #3, !dbg !410 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %11865, i32 %11866, i32 %11867, i32 %11868, ptr addrspace(1) %11750, i1 %4921) #3, !dbg !410 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %11870, i32 %11871, i32 %11872, i32 %11873, ptr addrspace(1) %11752, i1 %4922) #3, !dbg !410 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %11875, i32 %11876, i32 %11877, i32 %11878, ptr addrspace(1) %11754, i1 %4923) #3, !dbg !410 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %11880, i32 %11881, i32 %11882, i32 %11883, ptr addrspace(1) %11756, i1 %4924) #3, !dbg !410 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %11885, i32 %11886, i32 %11887, i32 %11888, ptr addrspace(1) %11758, i1 %4925) #3, !dbg !410 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", 
"r,r,r,r,l,b"(i32 %11890, i32 %11891, i32 %11892, i32 %11893, ptr addrspace(1) %11760, i1 %4926) #3, !dbg !410 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %11895, i32 %11896, i32 %11897, i32 %11898, ptr addrspace(1) %11762, i1 %4927) #3, !dbg !410 + br label %11899, !dbg !39 + +11899: ; preds = %._crit_edge1593, %11364 + ret void, !dbg !411 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 65535) i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 65535) i32 @llvm.nvvm.read.ptx.sreg.ctaid.z() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare i32 @llvm.smax.i32(i32, i32) #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare i32 @llvm.smin.i32(i32, i32) #1 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #2 + +; Function Attrs: nounwind +declare void @llvm.nvvm.cp.async.commit.group() #3 + +; Function Attrs: nounwind +declare void @llvm.nvvm.cp.async.wait.group(i32 immarg) #3 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.idx.i32(i32, i32, i32, i32) #4 + +; Function Attrs: convergent nounwind +declare void @llvm.nvvm.wgmma.fence.sync.aligned() #5 + +; Function Attrs: 
convergent nounwind +declare void @llvm.nvvm.wgmma.commit_group.sync.aligned() #5 + +declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #6 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.ex2.approx.ftz.f(float) #7 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.ex2.approx.f(float) #7 + +attributes #0 = { nounwind "nvvm.reqntid"="256" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind } +attributes #3 = { nounwind } +attributes #4 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #5 = { convergent nounwind } +attributes #6 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #7 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} +!llvm.ident = !{!4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!5 = distinct !DISubprogram(name: "triton_tem_fused_zeros_1", linkageName: "triton_tem_fused_zeros_1", scope: !1, file: !1, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!6 = !DISubroutineType(cc: DW_CC_normal, types: !7) +!7 = !{} +!8 = !DILocation(line: 94, column: 54, scope: !5) +!9 = 
!DILocation(line: 95, column: 54, scope: !5) +!10 = !DILocation(line: 95, column: 63, scope: !5) +!11 = !DILocation(line: 97, column: 74, scope: !5) +!12 = !DILocation(line: 97, column: 66, scope: !5) +!13 = !DILocation(line: 97, column: 100, scope: !5) +!14 = !DILocation(line: 97, column: 91, scope: !5) +!15 = !DILocation(line: 97, column: 82, scope: !5) +!16 = !DILocation(line: 97, column: 59, scope: !5) +!17 = !DILocation(line: 97, column: 111, scope: !5) +!18 = !DILocation(line: 111, column: 24, scope: !5) +!19 = !DILocation(line: 41, column: 22, scope: !20, inlinedAt: !22) +!20 = distinct !DILexicalBlockFile(scope: !5, file: !21, discriminator: 0) +!21 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language") +!22 = !DILocation(line: 112, column: 36, scope: !5) +!23 = !DILocation(line: 41, column: 28, scope: !20, inlinedAt: !22) +!24 = !DILocation(line: 115, column: 27, scope: !5) +!25 = !DILocation(line: 116, column: 28, scope: !5) +!26 = !DILocation(line: 117, column: 23, scope: !5) +!27 = !DILocation(line: 124, column: 25, scope: !5) +!28 = !DILocation(line: 124, column: 47, scope: !5) +!29 = !DILocation(line: 124, column: 35, scope: !5) +!30 = !DILocation(line: 124, column: 59, scope: !5) +!31 = !DILocation(line: 128, column: 50, scope: !5) +!32 = !DILocation(line: 128, column: 37, scope: !5) +!33 = !DILocation(line: 128, column: 61, scope: !5) +!34 = !DILocation(line: 131, column: 9, scope: !5) +!35 = !DILocation(line: 132, column: 9, scope: !5) +!36 = !DILocation(line: 133, column: 10, scope: !5) +!37 = !DILocation(line: 136, column: 26, scope: !5) +!38 = !DILocation(line: 139, column: 14, scope: !5) +!39 = !DILocation(line: 139, column: 7, scope: !5) +!40 = !DILocation(line: 41, column: 22, scope: !20, inlinedAt: !41) +!41 = !DILocation(line: 113, column: 34, scope: !5) +!42 = !DILocation(line: 41, column: 28, scope: !20, inlinedAt: !41) +!43 = !DILocation(line: 140, column: 24, scope: !5) +!44 = 
!DILocation(line: 144, column: 29, scope: !5) +!45 = !DILocation(line: 144, column: 54, scope: !5) +!46 = !DILocation(line: 144, column: 44, scope: !5) +!47 = !DILocation(line: 145, column: 35, scope: !5) +!48 = !DILocation(line: 154, column: 55, scope: !5) +!49 = !DILocation(line: 154, column: 78, scope: !5) +!50 = !DILocation(line: 155, column: 50, scope: !5) +!51 = !DILocation(line: 155, column: 68, scope: !5) +!52 = !DILocation(line: 158, column: 30, scope: !5) +!53 = !DILocation(line: 158, column: 52, scope: !5) +!54 = !DILocation(line: 158, column: 40, scope: !5) +!55 = !DILocation(line: 158, column: 63, scope: !5) +!56 = !DILocation(line: 159, column: 32, scope: !5) +!57 = !DILocation(line: 159, column: 55, scope: !5) +!58 = !DILocation(line: 159, column: 42, scope: !5) +!59 = !DILocation(line: 159, column: 66, scope: !5) +!60 = !DILocation(line: 161, column: 30, scope: !5) +!61 = !DILocation(line: 161, column: 35, scope: !5) +!62 = !DILocation(line: 161, column: 46, scope: !5) +!63 = !DILocation(line: 161, column: 56, scope: !5) +!64 = !DILocation(line: 163, column: 17, scope: !5) +!65 = !DILocation(line: 164, column: 19, scope: !5) +!66 = !DILocation(line: 167, column: 19, scope: !5) +!67 = !DILocation(line: 168, column: 21, scope: !5) +!68 = !DILocation(line: 169, column: 25, scope: !5) +!69 = !DILocation(line: 174, column: 36, scope: !5) +!70 = !DILocation(line: 175, column: 29, scope: !5) +!71 = !DILocation(line: 825, column: 38, scope: !72, inlinedAt: !73) +!72 = distinct !DILexicalBlockFile(scope: !5, file: !1, discriminator: 0) +!73 = !DILocation(line: 178, column: 107, scope: !5) +!74 = !DILocation(line: 825, column: 20, scope: !72, inlinedAt: !73) +!75 = !DILocation(line: 825, column: 56, scope: !72, inlinedAt: !73) +!76 = !DILocation(line: 825, column: 49, scope: !72, inlinedAt: !73) +!77 = !DILocation(line: 833, column: 52, scope: !72, inlinedAt: !73) +!78 = !DILocation(line: 833, column: 23, scope: !72, inlinedAt: !73) +!79 = !DILocation(line: 
825, column: 38, scope: !72, inlinedAt: !80) +!80 = !DILocation(line: 179, column: 111, scope: !5) +!81 = !DILocation(line: 825, column: 20, scope: !72, inlinedAt: !80) +!82 = !DILocation(line: 825, column: 49, scope: !72, inlinedAt: !80) +!83 = !DILocation(line: 833, column: 23, scope: !72, inlinedAt: !80) +!84 = !DILocation(line: 188, column: 58, scope: !5) +!85 = !DILocation(line: 188, column: 34, scope: !5) +!86 = !DILocation(line: 188, column: 25, scope: !5) +!87 = !DILocation(line: 189, column: 33, scope: !5) +!88 = !DILocation(line: 189, column: 26, scope: !5) +!89 = !DILocation(line: 190, column: 30, scope: !5) +!90 = !DILocation(line: 190, column: 50, scope: !5) +!91 = !DILocation(line: 195, column: 30, scope: !5) +!92 = !DILocation(line: 196, column: 27, scope: !5) +!93 = !DILocation(line: 196, column: 41, scope: !5) +!94 = !DILocation(line: 197, column: 53, scope: !5) +!95 = !DILocation(line: 197, column: 39, scope: !5) +!96 = !DILocation(line: 199, column: 42, scope: !5) +!97 = !DILocation(line: 199, column: 29, scope: !5) +!98 = !DILocation(line: 390, column: 37, scope: !72, inlinedAt: !99) +!99 = !DILocation(line: 207, column: 12, scope: !5) +!100 = !DILocation(line: 390, column: 18, scope: !72, inlinedAt: !99) +!101 = !DILocation(line: 390, column: 49, scope: !72, inlinedAt: !99) +!102 = !DILocation(line: 391, column: 18, scope: !72, inlinedAt: !99) +!103 = !DILocation(line: 391, column: 49, scope: !72, inlinedAt: !99) +!104 = !DILocation(line: 395, column: 43, scope: !72, inlinedAt: !99) +!105 = !DILocation(line: 41, column: 22, scope: !20, inlinedAt: !99) +!106 = !DILocation(line: 41, column: 28, scope: !20, inlinedAt: !99) +!107 = !DILocation(line: 395, column: 101, scope: !72, inlinedAt: !99) +!108 = !DILocation(line: 395, column: 63, scope: !72, inlinedAt: !99) +!109 = !DILocation(line: 485, column: 34, scope: !72, inlinedAt: !99) +!110 = !DILocation(line: 397, column: 28, scope: !72, inlinedAt: !99) +!111 = !DILocation(line: 485, column: 23, 
scope: !72, inlinedAt: !99) +!112 = !DILocation(line: 831, column: 52, scope: !72, inlinedAt: !99) +!113 = !DILocation(line: 831, column: 23, scope: !72, inlinedAt: !99) +!114 = !DILocation(line: 414, column: 19, scope: !72, inlinedAt: !99) +!115 = !DILocation(line: 415, column: 19, scope: !72, inlinedAt: !99) +!116 = !DILocation(line: 417, column: 19, scope: !72, inlinedAt: !99) +!117 = !DILocation(line: 459, column: 19, scope: !72, inlinedAt: !99) +!118 = !DILocation(line: 798, column: 21, scope: !72, inlinedAt: !99) +!119 = !DILocation(line: 487, column: 23, scope: !72, inlinedAt: !99) +!120 = !DILocation(line: 488, column: 23, scope: !72, inlinedAt: !99) +!121 = !DILocation(line: 490, column: 23, scope: !72, inlinedAt: !99) +!122 = !DILocation(line: 482, column: 23, scope: !72, inlinedAt: !99) +!123 = !DILocation(line: 493, column: 24, scope: !72, inlinedAt: !99) +!124 = !DILocation(line: 504, column: 24, scope: !72, inlinedAt: !99) +!125 = !DILocation(line: 531, column: 19, scope: !72, inlinedAt: !99) +!126 = !DILocation(line: 461, column: 14, scope: !72, inlinedAt: !99) +!127 = !DILocation(line: 494, column: 24, scope: !72, inlinedAt: !99) +!128 = !DILocation(line: 496, column: 25, scope: !72, inlinedAt: !99) +!129 = !DILocation(line: 499, column: 25, scope: !72, inlinedAt: !99) +!130 = !DILocation(line: 500, column: 24, scope: !72, inlinedAt: !99) +!131 = !DILocation(line: 502, column: 39, scope: !72, inlinedAt: !99) +!132 = !DILocation(line: 505, column: 24, scope: !72, inlinedAt: !99) +!133 = !DILocation(line: 506, column: 23, scope: !72, inlinedAt: !99) +!134 = !DILocation(line: 507, column: 25, scope: !72, inlinedAt: !99) +!135 = !DILocation(line: 508, column: 25, scope: !72, inlinedAt: !99) +!136 = !DILocation(line: 510, column: 25, scope: !72, inlinedAt: !99) +!137 = !DILocation(line: 511, column: 24, scope: !72, inlinedAt: !99) +!138 = !DILocation(line: 513, column: 39, scope: !72, inlinedAt: !99) +!139 = !DILocation(line: 514, column: 25, scope: !72, 
inlinedAt: !99) +!140 = !DILocation(line: 515, column: 24, scope: !72, inlinedAt: !99) +!141 = !DILocation(line: 516, column: 24, scope: !72, inlinedAt: !99) +!142 = !DILocation(line: 521, column: 69, scope: !72, inlinedAt: !99) +!143 = !DILocation(line: 524, column: 27, scope: !72, inlinedAt: !99) +!144 = !DILocation(line: 525, column: 39, scope: !72, inlinedAt: !99) +!145 = !DILocation(line: 525, column: 21, scope: !72, inlinedAt: !99) +!146 = !DILocation(line: 530, column: 20, scope: !72, inlinedAt: !99) +!147 = !DILocation(line: 531, column: 14, scope: !72, inlinedAt: !99) +!148 = !DILocation(line: 551, column: 15, scope: !72, inlinedAt: !99) +!149 = !DILocation(line: 549, column: 43, scope: !72, inlinedAt: !99) +!150 = !DILocation(line: 553, column: 21, scope: !72, inlinedAt: !99) +!151 = !DILocation(line: 788, column: 33, scope: !72, inlinedAt: !99) +!152 = !DILocation(line: 789, column: 38, scope: !72, inlinedAt: !99) +!153 = !DILocation(line: 789, column: 24, scope: !72, inlinedAt: !99) +!154 = !DILocation(line: 790, column: 109, scope: !72, inlinedAt: !99) +!155 = !DILocation(line: 790, column: 113, scope: !72, inlinedAt: !99) +!156 = !DILocation(line: 790, column: 55, scope: !72, inlinedAt: !99) +!157 = !DILocation(line: 790, column: 25, scope: !72, inlinedAt: !99) +!158 = !DILocation(line: 791, column: 35, scope: !72, inlinedAt: !99) +!159 = !DILocation(line: 792, column: 34, scope: !72, inlinedAt: !99) +!160 = !DILocation(line: 792, column: 48, scope: !72, inlinedAt: !99) +!161 = !DILocation(line: 792, column: 63, scope: !72, inlinedAt: !99) +!162 = !DILocation(line: 793, column: 29, scope: !72, inlinedAt: !99) +!163 = !DILocation(line: 793, column: 61, scope: !72, inlinedAt: !99) +!164 = !DILocation(line: 793, column: 42, scope: !72, inlinedAt: !99) +!165 = !DILocation(line: 414, column: 28, scope: !72, inlinedAt: !99) +!166 = !DILocation(line: 214, column: 39, scope: !5) +!167 = !DILocation(line: 215, column: 31, scope: !5) +!168 = !DILocation(line: 
215, column: 45, scope: !5) +!169 = !DILocation(line: 216, column: 62, scope: !5) +!170 = !DILocation(line: 216, column: 43, scope: !5) +!171 = !DILocation(line: 218, column: 33, scope: !5) +!172 = !DILocation(line: 390, column: 37, scope: !72, inlinedAt: !173) +!173 = !DILocation(line: 226, column: 16, scope: !5) +!174 = !DILocation(line: 390, column: 18, scope: !72, inlinedAt: !173) +!175 = !DILocation(line: 390, column: 49, scope: !72, inlinedAt: !173) +!176 = !DILocation(line: 391, column: 18, scope: !72, inlinedAt: !173) +!177 = !DILocation(line: 391, column: 49, scope: !72, inlinedAt: !173) +!178 = !DILocation(line: 395, column: 43, scope: !72, inlinedAt: !173) +!179 = !DILocation(line: 395, column: 63, scope: !72, inlinedAt: !173) +!180 = !DILocation(line: 397, column: 28, scope: !72, inlinedAt: !173) +!181 = !DILocation(line: 831, column: 52, scope: !72, inlinedAt: !173) +!182 = !DILocation(line: 831, column: 23, scope: !72, inlinedAt: !173) +!183 = !DILocation(line: 414, column: 19, scope: !72, inlinedAt: !173) +!184 = !DILocation(line: 415, column: 19, scope: !72, inlinedAt: !173) +!185 = !DILocation(line: 417, column: 19, scope: !72, inlinedAt: !173) +!186 = !DILocation(line: 459, column: 19, scope: !72, inlinedAt: !173) +!187 = !DILocation(line: 461, column: 14, scope: !72, inlinedAt: !173) +!188 = !DILocation(line: 524, column: 27, scope: !72, inlinedAt: !173) +!189 = !DILocation(line: 476, column: 79, scope: !72, inlinedAt: !173) +!190 = !DILocation(line: 525, column: 39, scope: !72, inlinedAt: !173) +!191 = !DILocation(line: 525, column: 21, scope: !72, inlinedAt: !173) +!192 = !DILocation(line: 530, column: 20, scope: !72, inlinedAt: !173) +!193 = !DILocation(line: 531, column: 19, scope: !72, inlinedAt: !173) +!194 = !DILocation(line: 531, column: 14, scope: !72, inlinedAt: !173) +!195 = !DILocation(line: 551, column: 15, scope: !72, inlinedAt: !173) +!196 = !DILocation(line: 538, column: 71, scope: !72, inlinedAt: !173) +!197 = !DILocation(line: 
553, column: 21, scope: !72, inlinedAt: !173) +!198 = !DILocation(line: 788, column: 33, scope: !72, inlinedAt: !173) +!199 = !DILocation(line: 789, column: 38, scope: !72, inlinedAt: !173) +!200 = !DILocation(line: 789, column: 24, scope: !72, inlinedAt: !173) +!201 = !DILocation(line: 790, column: 109, scope: !72, inlinedAt: !173) +!202 = !DILocation(line: 790, column: 113, scope: !72, inlinedAt: !173) +!203 = !DILocation(line: 790, column: 55, scope: !72, inlinedAt: !173) +!204 = !DILocation(line: 790, column: 25, scope: !72, inlinedAt: !173) +!205 = !DILocation(line: 791, column: 35, scope: !72, inlinedAt: !173) +!206 = !DILocation(line: 792, column: 34, scope: !72, inlinedAt: !173) +!207 = !DILocation(line: 792, column: 48, scope: !72, inlinedAt: !173) +!208 = !DILocation(line: 792, column: 63, scope: !72, inlinedAt: !173) +!209 = !DILocation(line: 793, column: 29, scope: !72, inlinedAt: !173) +!210 = !DILocation(line: 793, column: 61, scope: !72, inlinedAt: !173) +!211 = !DILocation(line: 793, column: 42, scope: !72, inlinedAt: !173) +!212 = !DILocation(line: 414, column: 28, scope: !72, inlinedAt: !173) +!213 = !DILocation(line: 231, column: 24, scope: !5) +!214 = !DILocation(line: 231, column: 56, scope: !5) +!215 = !DILocation(line: 232, column: 14, scope: !5) +!216 = !DILocation(line: 236, column: 30, scope: !5) +!217 = !DILocation(line: 252, column: 25, scope: !5) +!218 = !DILocation(line: 253, column: 29, scope: !5) +!219 = !DILocation(line: 825, column: 38, scope: !72, inlinedAt: !220) +!220 = !DILocation(line: 256, column: 107, scope: !5) +!221 = !DILocation(line: 825, column: 20, scope: !72, inlinedAt: !220) +!222 = !DILocation(line: 825, column: 56, scope: !72, inlinedAt: !220) +!223 = !DILocation(line: 825, column: 49, scope: !72, inlinedAt: !220) +!224 = !DILocation(line: 833, column: 52, scope: !72, inlinedAt: !220) +!225 = !DILocation(line: 833, column: 23, scope: !72, inlinedAt: !220) +!226 = !DILocation(line: 825, column: 20, scope: !72, 
inlinedAt: !227) +!227 = !DILocation(line: 257, column: 107, scope: !5) +!228 = !DILocation(line: 825, column: 49, scope: !72, inlinedAt: !227) +!229 = !DILocation(line: 833, column: 23, scope: !72, inlinedAt: !227) +!230 = !DILocation(line: 263, column: 32, scope: !5) +!231 = !DILocation(line: 266, column: 56, scope: !5) +!232 = !DILocation(line: 267, column: 59, scope: !5) +!233 = !DILocation(line: 269, column: 34, scope: !5) +!234 = !DILocation(line: 281, column: 58, scope: !5) +!235 = !DILocation(line: 281, column: 80, scope: !5) +!236 = !DILocation(line: 282, column: 53, scope: !5) +!237 = !DILocation(line: 282, column: 70, scope: !5) +!238 = !DILocation(line: 286, column: 32, scope: !5) +!239 = !DILocation(line: 287, column: 30, scope: !5) +!240 = !DILocation(line: 287, column: 43, scope: !5) +!241 = !DILocation(line: 288, column: 55, scope: !5) +!242 = !DILocation(line: 288, column: 42, scope: !5) +!243 = !DILocation(line: 290, column: 45, scope: !5) +!244 = !DILocation(line: 290, column: 32, scope: !5) +!245 = !DILocation(line: 601, column: 37, scope: !72, inlinedAt: !246) +!246 = !DILocation(line: 298, column: 16, scope: !5) +!247 = !DILocation(line: 602, column: 38, scope: !72, inlinedAt: !246) +!248 = !DILocation(line: 608, column: 42, scope: !72, inlinedAt: !246) +!249 = !DILocation(line: 41, column: 22, scope: !20, inlinedAt: !246) +!250 = !DILocation(line: 41, column: 28, scope: !20, inlinedAt: !246) +!251 = !DILocation(line: 608, column: 98, scope: !72, inlinedAt: !246) +!252 = !DILocation(line: 608, column: 61, scope: !72, inlinedAt: !246) +!253 = !DILocation(line: 798, column: 21, scope: !72, inlinedAt: !246) +!254 = !DILocation(line: 701, column: 35, scope: !72, inlinedAt: !246) +!255 = !DILocation(line: 610, column: 28, scope: !72, inlinedAt: !246) +!256 = !DILocation(line: 701, column: 24, scope: !72, inlinedAt: !246) +!257 = !DILocation(line: 710, column: 25, scope: !72, inlinedAt: !246) +!258 = !DILocation(line: 709, column: 25, scope: !72, 
inlinedAt: !246) +!259 = !DILocation(line: 712, column: 25, scope: !72, inlinedAt: !246) +!260 = !DILocation(line: 718, column: 39, scope: !72, inlinedAt: !246) +!261 = !DILocation(line: 719, column: 25, scope: !72, inlinedAt: !246) +!262 = !DILocation(line: 720, column: 24, scope: !72, inlinedAt: !246) +!263 = !DILocation(line: 721, column: 24, scope: !72, inlinedAt: !246) +!264 = !DILocation(line: 306, column: 41, scope: !5) +!265 = !DILocation(line: 307, column: 34, scope: !5) +!266 = !DILocation(line: 307, column: 47, scope: !5) +!267 = !DILocation(line: 308, column: 64, scope: !5) +!268 = !DILocation(line: 308, column: 46, scope: !5) +!269 = !DILocation(line: 310, column: 36, scope: !5) +!270 = !DILocation(line: 601, column: 37, scope: !72, inlinedAt: !271) +!271 = !DILocation(line: 318, column: 20, scope: !5) +!272 = !DILocation(line: 602, column: 38, scope: !72, inlinedAt: !271) +!273 = !DILocation(line: 608, column: 42, scope: !72, inlinedAt: !271) +!274 = !DILocation(line: 608, column: 61, scope: !72, inlinedAt: !271) +!275 = !DILocation(line: 676, column: 20, scope: !72, inlinedAt: !246) +!276 = !DILocation(line: 262, column: 30, scope: !5) +!277 = !DILocation(line: 263, column: 51, scope: !5) +!278 = !DILocation(line: 266, column: 44, scope: !5) +!279 = !DILocation(line: 266, column: 67, scope: !5) +!280 = !DILocation(line: 267, column: 36, scope: !5) +!281 = !DILocation(line: 267, column: 46, scope: !5) +!282 = !DILocation(line: 267, column: 70, scope: !5) +!283 = !DILocation(line: 269, column: 50, scope: !5) +!284 = !DILocation(line: 269, column: 60, scope: !5) +!285 = !DILocation(line: 271, column: 21, scope: !5) +!286 = !DILocation(line: 272, column: 23, scope: !5) +!287 = !DILocation(line: 275, column: 25, scope: !5) +!288 = !DILocation(line: 276, column: 29, scope: !5) +!289 = !DILocation(line: 601, column: 18, scope: !72, inlinedAt: !246) +!290 = !DILocation(line: 601, column: 49, scope: !72, inlinedAt: !246) +!291 = !DILocation(line: 602, column: 
19, scope: !72, inlinedAt: !246) +!292 = !DILocation(line: 602, column: 51, scope: !72, inlinedAt: !246) +!293 = !DILocation(line: 831, column: 23, scope: !72, inlinedAt: !246) +!294 = !DILocation(line: 674, column: 28, scope: !72, inlinedAt: !246) +!295 = !DILocation(line: 674, column: 22, scope: !72, inlinedAt: !246) +!296 = !DILocation(line: 833, column: 23, scope: !72, inlinedAt: !246) +!297 = !DILocation(line: 748, column: 29, scope: !72, inlinedAt: !246) +!298 = !DILocation(line: 748, column: 21, scope: !72, inlinedAt: !246) +!299 = !DILocation(line: 626, column: 19, scope: !72, inlinedAt: !246) +!300 = !DILocation(line: 627, column: 19, scope: !72, inlinedAt: !246) +!301 = !DILocation(line: 831, column: 52, scope: !72, inlinedAt: !246) +!302 = !DILocation(line: 675, column: 26, scope: !72, inlinedAt: !246) +!303 = !DILocation(line: 675, column: 46, scope: !72, inlinedAt: !246) +!304 = !DILocation(line: 678, column: 15, scope: !72, inlinedAt: !246) +!305 = !DILocation(line: 698, column: 25, scope: !72, inlinedAt: !246) +!306 = !DILocation(line: 703, column: 25, scope: !72, inlinedAt: !246) +!307 = !DILocation(line: 704, column: 24, scope: !72, inlinedAt: !246) +!308 = !DILocation(line: 706, column: 24, scope: !72, inlinedAt: !246) +!309 = !DILocation(line: 722, column: 24, scope: !72, inlinedAt: !246) +!310 = !DILocation(line: 723, column: 25, scope: !72, inlinedAt: !246) +!311 = !DILocation(line: 724, column: 25, scope: !72, inlinedAt: !246) +!312 = !DILocation(line: 726, column: 25, scope: !72, inlinedAt: !246) +!313 = !DILocation(line: 730, column: 25, scope: !72, inlinedAt: !246) +!314 = !DILocation(line: 727, column: 24, scope: !72, inlinedAt: !246) +!315 = !DILocation(line: 729, column: 39, scope: !72, inlinedAt: !246) +!316 = !DILocation(line: 731, column: 24, scope: !72, inlinedAt: !246) +!317 = !DILocation(line: 732, column: 24, scope: !72, inlinedAt: !246) +!318 = !DILocation(line: 736, column: 69, scope: !72, inlinedAt: !246) +!319 = 
!DILocation(line: 739, column: 27, scope: !72, inlinedAt: !246) +!320 = !DILocation(line: 740, column: 40, scope: !72, inlinedAt: !246) +!321 = !DILocation(line: 740, column: 22, scope: !72, inlinedAt: !246) +!322 = !DILocation(line: 744, column: 24, scope: !72, inlinedAt: !246) +!323 = !DILocation(line: 744, column: 43, scope: !72, inlinedAt: !246) +!324 = !DILocation(line: 750, column: 20, scope: !72, inlinedAt: !246) +!325 = !DILocation(line: 775, column: 43, scope: !72, inlinedAt: !246) +!326 = !DILocation(line: 628, column: 19, scope: !72, inlinedAt: !246) +!327 = !DILocation(line: 788, column: 33, scope: !72, inlinedAt: !246) +!328 = !DILocation(line: 789, column: 38, scope: !72, inlinedAt: !246) +!329 = !DILocation(line: 790, column: 109, scope: !72, inlinedAt: !246) +!330 = !DILocation(line: 790, column: 113, scope: !72, inlinedAt: !246) +!331 = !DILocation(line: 790, column: 55, scope: !72, inlinedAt: !246) +!332 = !DILocation(line: 791, column: 35, scope: !72, inlinedAt: !246) +!333 = !DILocation(line: 793, column: 29, scope: !72, inlinedAt: !246) +!334 = !DILocation(line: 793, column: 61, scope: !72, inlinedAt: !246) +!335 = !DILocation(line: 751, column: 22, scope: !72, inlinedAt: !246) +!336 = !DILocation(line: 751, column: 16, scope: !72, inlinedAt: !246) +!337 = !DILocation(line: 775, column: 24, scope: !72, inlinedAt: !246) +!338 = !DILocation(line: 773, column: 45, scope: !72, inlinedAt: !246) +!339 = !DILocation(line: 789, column: 24, scope: !72, inlinedAt: !246) +!340 = !DILocation(line: 790, column: 25, scope: !72, inlinedAt: !246) +!341 = !DILocation(line: 792, column: 34, scope: !72, inlinedAt: !246) +!342 = !DILocation(line: 792, column: 48, scope: !72, inlinedAt: !246) +!343 = !DILocation(line: 792, column: 63, scope: !72, inlinedAt: !246) +!344 = !DILocation(line: 793, column: 42, scope: !72, inlinedAt: !246) +!345 = !DILocation(line: 626, column: 28, scope: !72, inlinedAt: !246) +!346 = !DILocation(line: 627, column: 28, scope: !72, 
inlinedAt: !246) +!347 = !DILocation(line: 674, column: 52, scope: !72, inlinedAt: !246) +!348 = !DILocation(line: 833, column: 52, scope: !72, inlinedAt: !246) +!349 = !DILocation(line: 601, column: 18, scope: !72, inlinedAt: !271) +!350 = !DILocation(line: 601, column: 49, scope: !72, inlinedAt: !271) +!351 = !DILocation(line: 602, column: 19, scope: !72, inlinedAt: !271) +!352 = !DILocation(line: 602, column: 51, scope: !72, inlinedAt: !271) +!353 = !DILocation(line: 831, column: 23, scope: !72, inlinedAt: !271) +!354 = !DILocation(line: 674, column: 28, scope: !72, inlinedAt: !271) +!355 = !DILocation(line: 674, column: 22, scope: !72, inlinedAt: !271) +!356 = !DILocation(line: 833, column: 23, scope: !72, inlinedAt: !271) +!357 = !DILocation(line: 748, column: 29, scope: !72, inlinedAt: !271) +!358 = !DILocation(line: 748, column: 21, scope: !72, inlinedAt: !271) +!359 = !DILocation(line: 626, column: 19, scope: !72, inlinedAt: !271) +!360 = !DILocation(line: 627, column: 19, scope: !72, inlinedAt: !271) +!361 = !DILocation(line: 610, column: 28, scope: !72, inlinedAt: !271) +!362 = !DILocation(line: 831, column: 52, scope: !72, inlinedAt: !271) +!363 = !DILocation(line: 675, column: 26, scope: !72, inlinedAt: !271) +!364 = !DILocation(line: 675, column: 46, scope: !72, inlinedAt: !271) +!365 = !DILocation(line: 676, column: 20, scope: !72, inlinedAt: !271) +!366 = !DILocation(line: 678, column: 15, scope: !72, inlinedAt: !271) +!367 = !DILocation(line: 739, column: 27, scope: !72, inlinedAt: !271) +!368 = !DILocation(line: 692, column: 78, scope: !72, inlinedAt: !271) +!369 = !DILocation(line: 740, column: 40, scope: !72, inlinedAt: !271) +!370 = !DILocation(line: 740, column: 22, scope: !72, inlinedAt: !271) +!371 = !DILocation(line: 744, column: 24, scope: !72, inlinedAt: !271) +!372 = !DILocation(line: 744, column: 43, scope: !72, inlinedAt: !271) +!373 = !DILocation(line: 750, column: 20, scope: !72, inlinedAt: !271) +!374 = !DILocation(line: 751, column: 
22, scope: !72, inlinedAt: !271) +!375 = !DILocation(line: 751, column: 16, scope: !72, inlinedAt: !271) +!376 = !DILocation(line: 775, column: 24, scope: !72, inlinedAt: !271) +!377 = !DILocation(line: 759, column: 70, scope: !72, inlinedAt: !271) +!378 = !DILocation(line: 775, column: 43, scope: !72, inlinedAt: !271) +!379 = !DILocation(line: 788, column: 33, scope: !72, inlinedAt: !271) +!380 = !DILocation(line: 789, column: 38, scope: !72, inlinedAt: !271) +!381 = !DILocation(line: 789, column: 24, scope: !72, inlinedAt: !271) +!382 = !DILocation(line: 790, column: 109, scope: !72, inlinedAt: !271) +!383 = !DILocation(line: 790, column: 113, scope: !72, inlinedAt: !271) +!384 = !DILocation(line: 790, column: 55, scope: !72, inlinedAt: !271) +!385 = !DILocation(line: 790, column: 25, scope: !72, inlinedAt: !271) +!386 = !DILocation(line: 791, column: 35, scope: !72, inlinedAt: !271) +!387 = !DILocation(line: 792, column: 34, scope: !72, inlinedAt: !271) +!388 = !DILocation(line: 792, column: 48, scope: !72, inlinedAt: !271) +!389 = !DILocation(line: 792, column: 63, scope: !72, inlinedAt: !271) +!390 = !DILocation(line: 793, column: 29, scope: !72, inlinedAt: !271) +!391 = !DILocation(line: 793, column: 61, scope: !72, inlinedAt: !271) +!392 = !DILocation(line: 793, column: 42, scope: !72, inlinedAt: !271) +!393 = !DILocation(line: 626, column: 28, scope: !72, inlinedAt: !271) +!394 = !DILocation(line: 627, column: 28, scope: !72, inlinedAt: !271) +!395 = !DILocation(line: 628, column: 19, scope: !72, inlinedAt: !271) +!396 = !DILocation(line: 674, column: 52, scope: !72, inlinedAt: !271) +!397 = !DILocation(line: 833, column: 52, scope: !72, inlinedAt: !271) +!398 = !DILocation(line: 323, column: 23, scope: !5) +!399 = !DILocation(line: 323, column: 55, scope: !5) +!400 = !DILocation(line: 332, column: 30, scope: !5) +!401 = !DILocation(line: 334, column: 14, scope: !5) +!402 = !DILocation(line: 344, column: 27, scope: !5) +!403 = !DILocation(line: 344, column: 
45, scope: !5) +!404 = !DILocation(line: 344, column: 53, scope: !5) +!405 = !DILocation(line: 344, column: 41, scope: !5) +!406 = !DILocation(line: 344, column: 64, scope: !5) +!407 = !DILocation(line: 344, column: 71, scope: !5) +!408 = !DILocation(line: 344, column: 59, scope: !5) +!409 = !DILocation(line: 345, column: 29, scope: !5) +!410 = !DILocation(line: 345, column: 69, scope: !5) +!411 = !DILocation(line: 139, column: 4, scope: !5) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/ISV24HMA4J4LKKVO22GYN5K43WAJJLKWYPR3LOEREI3KLJOVDZ5Q/triton_tem_fused_zeros_1.ptx b/SpecForge-ext/cache/compiled_kernels/triton/7/ISV24HMA4J4LKKVO22GYN5K43WAJJLKWYPR3LOEREI3KLJOVDZ5Q/triton_tem_fused_zeros_1.ptx new file mode 100644 index 0000000000000000000000000000000000000000..1d7bf124d053af14cc84089518b171b7b143b359 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/ISV24HMA4J4LKKVO22GYN5K43WAJJLKWYPR3LOEREI3KLJOVDZ5Q/triton_tem_fused_zeros_1.ptx @@ -0,0 +1,9592 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_tem_fused_zeros_1 // -- Begin function triton_tem_fused_zeros_1 +.extern .shared .align 16 .b8 global_smem[]; +.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90}; + // @triton_tem_fused_zeros_1 +.visible .entry triton_tem_fused_zeros_1( + .param .u64 .ptr .global .align 1 triton_tem_fused_zeros_1_param_0, + .param .u64 .ptr .global .align 1 triton_tem_fused_zeros_1_param_1, + .param .u64 .ptr .global .align 1 triton_tem_fused_zeros_1_param_2, + .param .u64 .ptr .global .align 1 triton_tem_fused_zeros_1_param_3, + .param .u64 .ptr .global .align 1 triton_tem_fused_zeros_1_param_4, + .param .u64 .ptr .global .align 1 triton_tem_fused_zeros_1_param_5, + .param .u64 .ptr .global .align 1 triton_tem_fused_zeros_1_param_6, + .param .u64 .ptr .global .align 1 triton_tem_fused_zeros_1_param_7, + .param .u64 .ptr .global .align 1 
triton_tem_fused_zeros_1_param_8, + .param .u64 .ptr .global .align 1 triton_tem_fused_zeros_1_param_9, + .param .u64 .ptr .global .align 1 triton_tem_fused_zeros_1_param_10, + .param .u64 .ptr .global .align 1 triton_tem_fused_zeros_1_param_11, + .param .u64 .ptr .global .align 1 triton_tem_fused_zeros_1_param_12, + .param .u64 .ptr .global .align 1 triton_tem_fused_zeros_1_param_13, + .param .u64 .ptr .global .align 1 triton_tem_fused_zeros_1_param_14, + .param .u64 .ptr .global .align 1 triton_tem_fused_zeros_1_param_15, + .param .u64 .ptr .global .align 1 triton_tem_fused_zeros_1_param_16, + .param .u64 .ptr .global .align 1 triton_tem_fused_zeros_1_param_17, + .param .u32 triton_tem_fused_zeros_1_param_18, + .param .u32 triton_tem_fused_zeros_1_param_19, + .param .u32 triton_tem_fused_zeros_1_param_20, + .param .u32 triton_tem_fused_zeros_1_param_21, + .param .u32 triton_tem_fused_zeros_1_param_22, + .param .u32 triton_tem_fused_zeros_1_param_23, + .param .u32 triton_tem_fused_zeros_1_param_24, + .param .u32 triton_tem_fused_zeros_1_param_25, + .param .u32 triton_tem_fused_zeros_1_param_26, + .param .u64 .ptr .global .align 1 triton_tem_fused_zeros_1_param_27, + .param .u64 .ptr .global .align 1 triton_tem_fused_zeros_1_param_28 +) +.reqntid 256 +{ + .reg .pred %p<1179>; + .reg .b16 %rs<257>; + .reg .b32 %r<15500>; + .reg .b64 %rd<1210>; + .loc 1 18 0 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:18:0 +$L__func_begin0: + .loc 1 18 0 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:18:0 + +// %bb.0: + ld.param.b32 %r2366, [triton_tem_fused_zeros_1_param_26]; + ld.param.b32 %r2359, [triton_tem_fused_zeros_1_param_19]; + ld.param.b32 %r2358, [triton_tem_fused_zeros_1_param_18]; + ld.param.b64 %rd207, [triton_tem_fused_zeros_1_param_16]; + ld.param.b64 %rd197, [triton_tem_fused_zeros_1_param_5]; + ld.param.b64 %rd196, [triton_tem_fused_zeros_1_param_4]; + ld.param.b64 %rd195, [triton_tem_fused_zeros_1_param_3]; + ld.param.b64 %rd194, 
[triton_tem_fused_zeros_1_param_0]; +$L__tmp0: + .loc 1 94 54 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:94:54 + shl.b32 %r1, %r2358, 12; + ld.param.b64 %rd209, [triton_tem_fused_zeros_1_param_1]; + .loc 1 95 54 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:95:54 + shl.b32 %r2367, %r2359, 10; + ld.param.b64 %rd210, [triton_tem_fused_zeros_1_param_2]; + .loc 1 97 74 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:97:74 + setp.lt.s32 %p11, %r2358, 2; + .loc 1 97 66 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:97:66 + selp.b32 %r2368, 1, 0, %p11; + .loc 1 97 100 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:97:100 + setp.gt.s32 %p12, %r2358, 1; + .loc 1 97 91 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:97:91 + selp.b32 %r2369, %r2358, 0, %p12; + .loc 1 97 82 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:97:82 + add.s32 %r2370, %r2369, %r2368; + .loc 1 97 59 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:97:59 + shl.b32 %r2, %r2370, 12; + .loc 1 97 111 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:97:111 + shl.b32 %r3, %r2370, 7; + .loc 1 111 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:111:24 + mov.u32 %r4, %ctaid.x; +$L__tmp1: + .loc 2 41 22 // standard.py:41:22 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:112:36 ] + add.s32 %r2371, %r2359, 127; + .loc 2 41 28 // standard.py:41:28 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:112:36 ] + shr.s32 %r2372, %r2371, 31; + shr.u32 %r2373, %r2372, 25; + add.s32 %r2374, %r2371, %r2373; + shr.s32 %r5, %r2374, 7; +$L__tmp2: + .loc 1 115 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:115:27 + mov.u32 %r6, %ctaid.y; + .loc 1 116 28 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:116:28 + mov.u32 %r7, %ctaid.z; + .loc 1 117 23 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:117:23 + and.b32 %r8, %r6, 7; + .loc 1 124 25 // 
c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:124:25 + mul.lo.s32 %r2375, %r2359, %r7; + shl.b32 %r2376, %r2375, 7; + .loc 1 124 35 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:124:35 + mad.lo.s32 %r2377, %r2367, %r8, %r2376; + .loc 1 131 9 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:131:9 + mul.wide.s32 %rd212, %r2377, 2; + add.s64 %rd1, %rd209, %rd212; + .loc 1 132 9 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:132:9 + add.s64 %rd2, %rd210, %rd212; + .loc 1 136 26 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:136:26 + mov.u32 %r10, %tid.x; + shr.u32 %r11, %r10, 5; + and.b32 %r12, %r10, 240; + bfe.u32 %r13, %r10, 4, 4; + or.b32 %r14, %r13, 16; + or.b32 %r15, %r13, 32; + or.b32 %r16, %r13, 48; + or.b32 %r17, %r13, 64; + or.b32 %r18, %r13, 80; + or.b32 %r19, %r13, 96; + or.b32 %r20, %r13, 112; + shr.u32 %r2378, %r10, 1; + and.b32 %r2379, %r2378, 112; + bfe.u32 %r2380, %r10, 2, 3; + or.b32 %r21, %r2379, %r2380; + or.b32 %r22, %r21, 8; + .loc 1 139 14 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:139:14 + setp.lt.s32 %p13, %r4, %r5; + .loc 1 139 7 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:139:7 + @%p13 bra $L__BB0_8; + bra.uni $L__BB0_1; +$L__BB0_8: + .loc 1 0 7 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:0:7 + ld.param.b32 %r2365, [triton_tem_fused_zeros_1_param_25]; + ld.param.b32 %r2364, [triton_tem_fused_zeros_1_param_24]; + ld.param.b32 %r2363, [triton_tem_fused_zeros_1_param_23]; + ld.param.b64 %rd208, [triton_tem_fused_zeros_1_param_17]; + ld.param.b64 %rd206, [triton_tem_fused_zeros_1_param_15]; + ld.param.b64 %rd205, [triton_tem_fused_zeros_1_param_14]; + ld.param.b64 %rd202, [triton_tem_fused_zeros_1_param_11]; + ld.param.b64 %rd201, [triton_tem_fused_zeros_1_param_10]; + ld.param.b64 %rd211, [triton_tem_fused_zeros_1_param_7]; + mad.lo.s32 %r9, %r2367, %r6, %r2376; + mad.wide.s32 %rd3, %r9, 2, %rd211; + .loc 1 252 25 // 
c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:252:25 + shl.b32 %r7565, %r4, 7; + .loc 1 253 29 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:253:29 + or.b32 %r678, %r13, %r7565; + or.b32 %r679, %r14, %r7565; + or.b32 %r680, %r15, %r7565; + or.b32 %r681, %r16, %r7565; + or.b32 %r682, %r17, %r7565; + or.b32 %r683, %r18, %r7565; + or.b32 %r684, %r19, %r7565; + or.b32 %r685, %r20, %r7565; +$L__tmp3: + .loc 1 825 38 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:825:38 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:256:107 ] + shl.b32 %r7566, %r678, 7; + shl.b32 %r7567, %r679, 7; + shl.b32 %r7568, %r680, 7; + shl.b32 %r7569, %r681, 7; + shl.b32 %r7570, %r682, 7; + shl.b32 %r7571, %r683, 7; + shl.b32 %r7572, %r684, 7; + shl.b32 %r7573, %r685, 7; + .loc 1 825 20 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:825:20 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:256:107 ] + cvt.s64.s32 %rd71, %r7566; + mul.wide.s32 %rd566, %r7566, 2; + add.s64 %rd567, %rd1, %rd566; + cvt.s64.s32 %rd72, %r7567; + mul.wide.s32 %rd568, %r7567, 2; + add.s64 %rd569, %rd1, %rd568; + cvt.s64.s32 %rd73, %r7568; + mul.wide.s32 %rd570, %r7568, 2; + add.s64 %rd571, %rd1, %rd570; + cvt.s64.s32 %rd74, %r7569; + mul.wide.s32 %rd572, %r7569, 2; + add.s64 %rd573, %rd1, %rd572; + cvt.s64.s32 %rd75, %r7570; + mul.wide.s32 %rd574, %r7570, 2; + add.s64 %rd575, %rd1, %rd574; + cvt.s64.s32 %rd76, %r7571; + mul.wide.s32 %rd576, %r7571, 2; + add.s64 %rd577, %rd1, %rd576; + cvt.s64.s32 %rd77, %r7572; + mul.wide.s32 %rd578, %r7572, 2; + add.s64 %rd579, %rd1, %rd578; + cvt.s64.s32 %rd78, %r7573; + mul.wide.s32 %rd580, %r7573, 2; + add.s64 %rd581, %rd1, %rd580; + .loc 1 825 56 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:825:56 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:256:107 ] + shl.b32 %r7574, %r10, 3; + and.b32 %r7575, %r7574, 120; + .loc 1 825 49 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:825:49 
@[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:256:107 ] + cvt.u64.u32 %rd79, %r7575; + mul.wide.u32 %rd582, %r7575, 2; + add.s64 %rd543, %rd567, %rd582; + add.s64 %rd544, %rd569, %rd582; + add.s64 %rd545, %rd571, %rd582; + add.s64 %rd546, %rd573, %rd582; + add.s64 %rd547, %rd575, %rd582; + add.s64 %rd548, %rd577, %rd582; + add.s64 %rd549, %rd579, %rd582; + add.s64 %rd550, %rd581, %rd582; + .loc 1 833 52 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:833:52 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:256:107 ] + setp.lt.s32 %p1163, %r678, %r2359; + setp.lt.s32 %p1164, %r679, %r2359; + setp.lt.s32 %p1165, %r680, %r2359; + setp.lt.s32 %p1166, %r681, %r2359; + setp.lt.s32 %p1167, %r682, %r2359; + setp.lt.s32 %p1168, %r683, %r2359; + setp.lt.s32 %p1169, %r684, %r2359; + setp.lt.s32 %p1170, %r685, %r2359; + mov.b32 %r7436, 0; + .loc 1 833 23 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:833:23 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:256:107 ] + // begin inline asm + mov.u32 %r7432, %r7436; + mov.u32 %r7433, %r7436; + mov.u32 %r7434, %r7436; + mov.u32 %r7435, %r7436; + @%p1163 ld.global.v4.b32 { %r7432, %r7433, %r7434, %r7435 }, [ %rd543 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r7440, %r7436; + mov.u32 %r7441, %r7436; + mov.u32 %r7442, %r7436; + mov.u32 %r7443, %r7436; + @%p1164 ld.global.v4.b32 { %r7440, %r7441, %r7442, %r7443 }, [ %rd544 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r7448, %r7436; + mov.u32 %r7449, %r7436; + mov.u32 %r7450, %r7436; + mov.u32 %r7451, %r7436; + @%p1165 ld.global.v4.b32 { %r7448, %r7449, %r7450, %r7451 }, [ %rd545 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r7456, %r7436; + mov.u32 %r7457, %r7436; + mov.u32 %r7458, %r7436; + mov.u32 %r7459, %r7436; + @%p1166 ld.global.v4.b32 { %r7456, %r7457, %r7458, %r7459 }, [ %rd546 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r7464, %r7436; + mov.u32 %r7465, %r7436; + 
mov.u32 %r7466, %r7436; + mov.u32 %r7467, %r7436; + @%p1167 ld.global.v4.b32 { %r7464, %r7465, %r7466, %r7467 }, [ %rd547 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r7472, %r7436; + mov.u32 %r7473, %r7436; + mov.u32 %r7474, %r7436; + mov.u32 %r7475, %r7436; + @%p1168 ld.global.v4.b32 { %r7472, %r7473, %r7474, %r7475 }, [ %rd548 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r7480, %r7436; + mov.u32 %r7481, %r7436; + mov.u32 %r7482, %r7436; + mov.u32 %r7483, %r7436; + @%p1169 ld.global.v4.b32 { %r7480, %r7481, %r7482, %r7483 }, [ %rd549 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r7488, %r7436; + mov.u32 %r7489, %r7436; + mov.u32 %r7490, %r7436; + mov.u32 %r7491, %r7436; + @%p1170 ld.global.v4.b32 { %r7488, %r7489, %r7490, %r7491 }, [ %rd550 + 0 ]; + // end inline asm + shl.b32 %r7576, %r10, 4; + and.b32 %r7577, %r7576, 112; + shl.b32 %r7578, %r12, 3; + and.b32 %r7579, %r10, 112; + and.b32 %r7580, %r10, 8; + shl.b32 %r7581, %r7580, 11; + or.b32 %r7582, %r7577, %r7578; + xor.b32 %r7583, %r7582, %r7579; + or.b32 %r7584, %r7583, %r7581; + mov.b32 %r7585, global_smem; + add.s32 %r7586, %r7585, %r7584; + st.shared.v4.b32 [%r7586+99328], {%r7432, %r7433, %r7434, %r7435}; + st.shared.v4.b32 [%r7586+101376], {%r7440, %r7441, %r7442, %r7443}; + st.shared.v4.b32 [%r7586+103424], {%r7448, %r7449, %r7450, %r7451}; + st.shared.v4.b32 [%r7586+105472], {%r7456, %r7457, %r7458, %r7459}; + st.shared.v4.b32 [%r7586+107520], {%r7464, %r7465, %r7466, %r7467}; + st.shared.v4.b32 [%r7586+109568], {%r7472, %r7473, %r7474, %r7475}; + st.shared.v4.b32 [%r7586+111616], {%r7480, %r7481, %r7482, %r7483}; + st.shared.v4.b32 [%r7586+113664], {%r7488, %r7489, %r7490, %r7491}; +$L__tmp4: + .loc 1 825 20 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:825:20 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:257:107 ] + add.s64 %rd583, %rd2, %rd566; + add.s64 %rd584, %rd2, %rd568; + add.s64 %rd585, %rd2, %rd570; + add.s64 %rd586, 
%rd2, %rd572; + add.s64 %rd587, %rd2, %rd574; + add.s64 %rd588, %rd2, %rd576; + add.s64 %rd589, %rd2, %rd578; + add.s64 %rd590, %rd2, %rd580; + .loc 1 825 49 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:825:49 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:257:107 ] + add.s64 %rd551, %rd583, %rd582; + add.s64 %rd552, %rd584, %rd582; + add.s64 %rd553, %rd585, %rd582; + add.s64 %rd554, %rd586, %rd582; + add.s64 %rd555, %rd587, %rd582; + add.s64 %rd556, %rd588, %rd582; + add.s64 %rd557, %rd589, %rd582; + add.s64 %rd558, %rd590, %rd582; + .loc 1 833 23 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:833:23 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:257:107 ] + // begin inline asm + mov.u32 %r7496, %r7436; + mov.u32 %r7497, %r7436; + mov.u32 %r7498, %r7436; + mov.u32 %r7499, %r7436; + @%p1163 ld.global.v4.b32 { %r7496, %r7497, %r7498, %r7499 }, [ %rd551 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r7504, %r7436; + mov.u32 %r7505, %r7436; + mov.u32 %r7506, %r7436; + mov.u32 %r7507, %r7436; + @%p1164 ld.global.v4.b32 { %r7504, %r7505, %r7506, %r7507 }, [ %rd552 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r7512, %r7436; + mov.u32 %r7513, %r7436; + mov.u32 %r7514, %r7436; + mov.u32 %r7515, %r7436; + @%p1165 ld.global.v4.b32 { %r7512, %r7513, %r7514, %r7515 }, [ %rd553 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r7520, %r7436; + mov.u32 %r7521, %r7436; + mov.u32 %r7522, %r7436; + mov.u32 %r7523, %r7436; + @%p1166 ld.global.v4.b32 { %r7520, %r7521, %r7522, %r7523 }, [ %rd554 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r7528, %r7436; + mov.u32 %r7529, %r7436; + mov.u32 %r7530, %r7436; + mov.u32 %r7531, %r7436; + @%p1167 ld.global.v4.b32 { %r7528, %r7529, %r7530, %r7531 }, [ %rd555 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r7536, %r7436; + mov.u32 %r7537, %r7436; + mov.u32 %r7538, %r7436; + mov.u32 %r7539, %r7436; + @%p1168 ld.global.v4.b32 { 
%r7536, %r7537, %r7538, %r7539 }, [ %rd556 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r7544, %r7436; + mov.u32 %r7545, %r7436; + mov.u32 %r7546, %r7436; + mov.u32 %r7547, %r7436; + @%p1169 ld.global.v4.b32 { %r7544, %r7545, %r7546, %r7547 }, [ %rd557 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r7552, %r7436; + mov.u32 %r7553, %r7436; + mov.u32 %r7554, %r7436; + mov.u32 %r7555, %r7436; + @%p1170 ld.global.v4.b32 { %r7552, %r7553, %r7554, %r7555 }, [ %rd558 + 0 ]; + // end inline asm + st.shared.v4.b32 [%r7586+132096], {%r7496, %r7497, %r7498, %r7499}; + st.shared.v4.b32 [%r7586+134144], {%r7504, %r7505, %r7506, %r7507}; + st.shared.v4.b32 [%r7586+136192], {%r7512, %r7513, %r7514, %r7515}; + st.shared.v4.b32 [%r7586+138240], {%r7520, %r7521, %r7522, %r7523}; + st.shared.v4.b32 [%r7586+140288], {%r7528, %r7529, %r7530, %r7531}; + st.shared.v4.b32 [%r7586+142336], {%r7536, %r7537, %r7538, %r7539}; + st.shared.v4.b32 [%r7586+144384], {%r7544, %r7545, %r7546, %r7547}; + st.shared.v4.b32 [%r7586+146432], {%r7552, %r7553, %r7554, %r7555}; +$L__tmp5: + .loc 1 266 56 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:266:56 + mul.lo.s32 %r686, %r1, %r6; + .loc 1 267 59 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:267:59 + mul.lo.s32 %r687, %r2, %r6; + .loc 1 269 34 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:269:34 + shl.b32 %r688, %r6, 5; + .loc 1 281 80 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:281:80 + mad.lo.s32 %r7587, %r2363, %r8, %r4; + .loc 1 282 70 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:282:70 + mad.lo.s32 %r7588, %r2365, %r8, %r4; + mul.lo.s32 %r7589, %r7588, %r2364; + .loc 1 286 32 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:286:32 + mul.wide.s32 %rd591, %r7589, 4; + add.s64 %rd559, %rd202, %rd591; + mov.pred %p460, -1; + .loc 1 287 30 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:287:30 + // begin inline asm + mov.u32 %r7560, 0x0; 
+ @%p460 ld.global.b32 { %r7560 }, [ %rd559 + 0 ]; + // end inline asm + .loc 1 287 43 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:287:43 + shl.b32 %r689, %r7560, 7; + .loc 1 288 55 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:288:55 + mul.wide.s32 %rd592, %r7587, 4; + add.s64 %rd560, %rd201, %rd592; + .loc 1 288 42 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:288:42 + // begin inline asm + mov.u32 %r7561, 0x0; + @%p460 ld.global.b32 { %r7561 }, [ %rd560 + 0 ]; + // end inline asm + .loc 1 290 45 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:290:45 + and.b32 %r691, %r10, 3; + shl.b32 %r692, %r691, 1; + or.b32 %r7590, %r692, 1; + or.b32 %r7591, %r692, 8; + or.b32 %r7592, %r692, 9; + .loc 1 290 32 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:290:32 + or.b32 %r693, %r689, %r7591; + or.b32 %r694, %r689, %r7592; + .loc 1 290 45 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:290:45 + or.b32 %r7593, %r692, 16; + or.b32 %r7594, %r692, 17; + or.b32 %r7595, %r692, 24; + or.b32 %r7596, %r692, 25; + .loc 1 290 32 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:290:32 + or.b32 %r757, %r689, %r7596; + or.b32 %r756, %r689, %r7595; + or.b32 %r755, %r689, %r7594; + or.b32 %r754, %r689, %r7593; + or.b32 %r7597, %r689, %r13; + or.b32 %r7598, %r689, %r14; + or.b32 %r7599, %r689, %r15; + or.b32 %r7600, %r689, %r16; +$L__tmp6: + .loc 1 601 37 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:601:37 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + shl.b32 %r7601, %r7597, 12; + shl.b32 %r7602, %r7598, 12; + shl.b32 %r7603, %r7599, 12; + shl.b32 %r7604, %r7600, 12; + .loc 1 602 38 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:602:38 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + shl.b32 %r7605, %r7597, 7; + shl.b32 %r7606, %r7598, 7; + shl.b32 %r7607, %r7599, 7; + shl.b32 %r7608, %r7600, 7; + .loc 1 608 42 // 
c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:608:42 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + shl.b32 %r699, %r7561, 1; + .loc 2 41 22 // standard.py:41:22 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + add.s32 %r7609, %r2358, 63; + .loc 2 41 28 // standard.py:41:28 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + shr.s32 %r7610, %r7609, 31; + shr.u32 %r7611, %r7610, 26; + add.s32 %r7612, %r7609, %r7611; + shr.s32 %r7613, %r7612, 6; + .loc 1 608 98 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:608:98 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + max.s32 %r7614, %r7613, 1; + .loc 1 608 61 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:608:61 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + min.s32 %r7615, %r699, %r7614; +$L__tmp7: + .loc 1 253 29 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:253:29 + or.b32 %r7616, %r21, %r7565; + or.b32 %r7617, %r22, %r7565; + .loc 1 290 45 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:290:45 + or.b32 %r7618, %r692, 32; + or.b32 %r7619, %r692, 33; + or.b32 %r7620, %r692, 40; + or.b32 %r7621, %r692, 41; + or.b32 %r7622, %r692, 48; + or.b32 %r7623, %r692, 49; + or.b32 %r7624, %r692, 56; + or.b32 %r7625, %r692, 57; + .loc 1 290 32 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:290:32 + or.b32 %r765, %r689, %r7625; + or.b32 %r764, %r689, %r7624; + or.b32 %r763, %r689, %r7623; + or.b32 %r762, %r689, %r7622; + or.b32 %r761, %r689, %r7621; + or.b32 %r760, %r689, %r7620; + or.b32 %r759, %r689, %r7619; + or.b32 %r758, %r689, %r7618; +$L__tmp8: + .loc 1 798 21 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:798:21 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + rem.s32 %r725, %r7617, %r2359; + rem.s32 %r980, %r7616, %r2359; + .loc 1 701 35 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:701:35 @[ 
c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + mad.wide.u32 %rd562, %r6, 8, %rd207; + .loc 1 610 28 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:610:28 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + setp.gt.s32 %p462, %r699, 0; + .loc 1 701 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:701:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + // begin inline asm + mov.u64 %rd561, 0x0; + @%p462 ld.global.b64 { %rd561 }, [ %rd562 + 0 ]; + // end inline asm + .loc 1 709 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:709:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + setp.ge.s32 %p465, %r980, %r2366; + setp.ge.s32 %p466, %r725, %r2366; + .loc 1 710 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:710:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + rem.s32 %r7626, %r980, %r2366; + rem.s32 %r7627, %r725, %r2366; + .loc 1 712 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:712:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + setp.eq.b32 %p467, %r7626, 0; + setp.eq.b32 %p468, %r7627, 0; + .loc 1 718 39 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:718:39 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + min.s32 %r7628, %r2366, 0; + selp.b32 %r7629, 0, %r7628, %p467; + add.s32 %r7630, %r7629, %r7626; + selp.b32 %r7631, 0, %r7628, %p468; + add.s32 %r7632, %r7631, %r7627; + .loc 1 719 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:719:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + cvt.s64.s32 %rd593, %r7630; + cvt.s64.s32 %rd594, %r7632; + .loc 1 720 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:720:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + setp.gt.s64 %p469, %rd561, %rd593; + setp.gt.s64 %p470, %rd561, %rd594; + .loc 1 721 24 // 
c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:721:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + and.pred %p9, %p465, %p469; + and.pred %p7, %p466, %p470; +$L__tmp9: + .loc 1 306 41 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:306:41 + add.s64 %rd563, %rd206, %rd591; + .loc 1 307 34 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:307:34 + // begin inline asm + mov.u32 %r7562, 0x0; + @%p460 ld.global.b32 { %r7562 }, [ %rd563 + 0 ]; + // end inline asm + .loc 1 307 47 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:307:47 + shl.b32 %r726, %r7562, 7; + .loc 1 308 64 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:308:64 + add.s64 %rd564, %rd205, %rd592; + .loc 1 308 46 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:308:46 + // begin inline asm + mov.u32 %r7563, 0x0; + @%p460 ld.global.b32 { %r7563 }, [ %rd564 + 0 ]; + // end inline asm + .loc 1 310 36 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:310:36 + or.b32 %r728, %r726, %r692; + or.b32 %r729, %r726, %r7590; + or.b32 %r730, %r726, %r7591; + or.b32 %r731, %r726, %r7592; + or.b32 %r732, %r726, %r7593; + or.b32 %r733, %r726, %r7594; + or.b32 %r734, %r726, %r7595; + or.b32 %r735, %r726, %r7596; + or.b32 %r736, %r726, %r7618; + or.b32 %r737, %r726, %r7619; + or.b32 %r738, %r726, %r7620; + or.b32 %r739, %r726, %r7621; + or.b32 %r740, %r726, %r7622; + or.b32 %r741, %r726, %r7623; + or.b32 %r742, %r726, %r7624; + or.b32 %r743, %r726, %r7625; + or.b32 %r7633, %r726, %r13; + or.b32 %r7634, %r726, %r14; + or.b32 %r7635, %r726, %r15; + or.b32 %r7636, %r726, %r16; +$L__tmp10: + .loc 1 601 37 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:601:37 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + shl.b32 %r7637, %r7633, 12; + shl.b32 %r7638, %r7634, 12; + shl.b32 %r7639, %r7635, 12; + shl.b32 %r7640, %r7636, 12; + .loc 1 602 38 // 
c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:602:38 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + shl.b32 %r7641, %r7633, 7; + shl.b32 %r7642, %r7634, 7; + shl.b32 %r7643, %r7635, 7; + shl.b32 %r7644, %r7636, 7; + .loc 1 608 42 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:608:42 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + shl.b32 %r744, %r7563, 1; + .loc 1 608 61 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:608:61 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + min.s32 %r7645, %r744, %r7614; +$L__tmp11: + .loc 1 676 20 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:676:20 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + // begin inline asm + fence.proxy.async.shared::cta; + // end inline asm + cvt.s64.s32 %rd83, %r7601; + cvt.s64.s32 %rd84, %r7602; + cvt.s64.s32 %rd85, %r7603; + cvt.s64.s32 %rd86, %r7604; + cvt.s64.s32 %rd87, %r7605; + cvt.s64.s32 %rd88, %r7606; + cvt.s64.s32 %rd89, %r7607; + cvt.s64.s32 %rd90, %r7608; + setp.lt.s32 %p471, %r7597, %r2358; + setp.lt.s32 %p472, %r7598, %r2358; + setp.lt.s32 %p473, %r7599, %r2358; + setp.lt.s32 %p474, %r7600, %r2358; + shl.b32 %r7646, %r7580, 10; + or.b32 %r745, %r7583, %r7646; + add.s32 %r11058, %r7585, %r745; + selp.b32 %r7647, 16, 0, %p471; + selp.b32 %r7729, %r7647, 0, %p462; + add.s32 %r11060, %r11058, 2048; + selp.b32 %r7648, 16, 0, %p472; + selp.b32 %r7731, %r7648, 0, %p462; + add.s32 %r11062, %r11058, 4096; + selp.b32 %r7649, 16, 0, %p473; + selp.b32 %r7733, %r7649, 0, %p462; + add.s32 %r11064, %r11058, 6144; + selp.b32 %r7650, 16, 0, %p474; + selp.b32 %r7735, %r7650, 0, %p462; + setp.lt.s32 %p475, %r693, %r2358; + setp.lt.s32 %p476, %r694, %r2358; + setp.lt.s32 %p477, %r754, %r2358; + setp.lt.s32 %p478, %r755, %r2358; + setp.lt.s32 %p479, %r756, %r2358; + setp.lt.s32 %p480, %r757, %r2358; + setp.lt.s32 %p481, %r758, %r2358; + setp.lt.s32 %p482, %r759, %r2358; + 
setp.lt.s32 %p483, %r760, %r2358; + setp.lt.s32 %p484, %r761, %r2358; + setp.lt.s32 %p485, %r762, %r2358; + setp.lt.s32 %p486, %r763, %r2358; + setp.lt.s32 %p487, %r764, %r2358; + setp.lt.s32 %p488, %r765, %r2358; + cvt.s64.s32 %rd91, %r754; + cvt.s64.s32 %rd92, %r755; + cvt.s64.s32 %rd93, %r756; + cvt.s64.s32 %rd94, %r757; + cvt.s64.s32 %rd95, %r758; + cvt.s64.s32 %rd96, %r759; + cvt.s64.s32 %rd97, %r760; + cvt.s64.s32 %rd98, %r761; + cvt.s64.s32 %rd99, %r762; + cvt.s64.s32 %rd100, %r763; + cvt.s64.s32 %rd101, %r764; + cvt.s64.s32 %rd102, %r765; + and.b32 %r766, %r10, 252; + shl.b32 %r767, %r691, 3; + add.s32 %r7651, %r7585, %r767; + add.s32 %r11066, %r7651, 98304; + add.s32 %r11068, %r7651, 98308; + add.s32 %r11070, %r7651, 98336; + selp.b32 %r7652, 4, 0, %p475; + selp.b32 %r7741, %r7652, 0, %p462; + add.s32 %r11072, %r7651, 98340; + selp.b32 %r7653, 4, 0, %p476; + selp.b32 %r7743, %r7653, 0, %p462; + add.s32 %r11074, %r7651, 98368; + selp.b32 %r7654, 4, 0, %p477; + selp.b32 %r7745, %r7654, 0, %p462; + add.s32 %r11076, %r7651, 98372; + selp.b32 %r7655, 4, 0, %p478; + selp.b32 %r7747, %r7655, 0, %p462; + add.s32 %r11078, %r7651, 98400; + selp.b32 %r7656, 4, 0, %p479; + selp.b32 %r7749, %r7656, 0, %p462; + add.s32 %r11080, %r7651, 98404; + selp.b32 %r7657, 4, 0, %p480; + selp.b32 %r7751, %r7657, 0, %p462; + add.s32 %r11082, %r7651, 98432; + selp.b32 %r7658, 4, 0, %p481; + selp.b32 %r7753, %r7658, 0, %p462; + add.s32 %r11084, %r7651, 98436; + selp.b32 %r7659, 4, 0, %p482; + selp.b32 %r7755, %r7659, 0, %p462; + add.s32 %r11086, %r7651, 98464; + selp.b32 %r7660, 4, 0, %p483; + selp.b32 %r7757, %r7660, 0, %p462; + add.s32 %r11088, %r7651, 98468; + selp.b32 %r7661, 4, 0, %p484; + selp.b32 %r7759, %r7661, 0, %p462; + add.s32 %r11090, %r7651, 98496; + selp.b32 %r7662, 4, 0, %p485; + selp.b32 %r7761, %r7662, 0, %p462; + add.s32 %r11092, %r7651, 98500; + selp.b32 %r7663, 4, 0, %p486; + selp.b32 %r7763, %r7663, 0, %p462; + add.s32 %r11094, %r7651, 98528; + selp.b32 %r7664, 
4, 0, %p487; + selp.b32 %r7765, %r7664, 0, %p462; + add.s32 %r11096, %r7651, 98532; + selp.b32 %r7665, 4, 0, %p488; + selp.b32 %r7767, %r7665, 0, %p462; + add.s32 %r11098, %r11058, 49152; + add.s32 %r11100, %r11058, 51200; + add.s32 %r11102, %r11058, 53248; + add.s32 %r11104, %r11058, 55296; + add.s32 %r11106, %r7651, 98816; + add.s32 %r11108, %r7651, 98820; + add.s32 %r11110, %r7651, 98848; + add.s32 %r11112, %r7651, 98852; + add.s32 %r11114, %r7651, 98880; + add.s32 %r11116, %r7651, 98884; + add.s32 %r11118, %r7651, 98912; + add.s32 %r11120, %r7651, 98916; + add.s32 %r11122, %r7651, 98944; + add.s32 %r11124, %r7651, 98948; + add.s32 %r11126, %r7651, 98976; + add.s32 %r11128, %r7651, 98980; + add.s32 %r11130, %r7651, 99008; + add.s32 %r11132, %r7651, 99012; + add.s32 %r11134, %r7651, 99040; + add.s32 %r11136, %r7651, 99044; + setp.gt.s32 %p489, %r7615, 1; +$L__tmp12: + .loc 1 290 32 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:290:32 + or.b32 %r819, %r689, %r7590; + or.b32 %r818, %r689, %r692; + setp.lt.s32 %p490, %r818, %r2358; + setp.lt.s32 %p491, %r819, %r2358; + cvt.s64.s32 %rd103, %r818; + cvt.s64.s32 %rd104, %r819; + selp.b32 %r7666, 4, 0, %p490; + selp.b32 %r7737, %r7666, 0, %p462; + selp.b32 %r7667, 4, 0, %p491; + selp.b32 %r7739, %r7667, 0, %p462; + or.b32 %r823, %r819, 64; + or.b32 %r822, %r818, 64; + or.b32 %r824, %r693, 64; + or.b32 %r825, %r694, 64; + or.b32 %r826, %r754, 64; + or.b32 %r827, %r755, 64; + or.b32 %r828, %r756, 64; + or.b32 %r829, %r757, 64; + or.b32 %r830, %r758, 64; + or.b32 %r831, %r759, 64; + or.b32 %r832, %r760, 64; + or.b32 %r833, %r761, 64; + or.b32 %r834, %r762, 64; + or.b32 %r835, %r763, 64; + or.b32 %r836, %r764, 64; + or.b32 %r837, %r765, 64; + or.b32 %r838, %r7597, 64; + or.b32 %r839, %r7598, 64; + or.b32 %r840, %r7599, 64; + or.b32 %r841, %r7600, 64; + setp.lt.s32 %p492, %r838, %r2358; + setp.lt.s32 %p493, %r839, %r2358; + setp.lt.s32 %p494, %r840, %r2358; + setp.lt.s32 %p495, %r841, %r2358; + add.s32 %r11138, 
%r11058, 16384; + selp.b32 %r7668, 16, 0, %p492; + selp.b32 %r7809, %r7668, 0, %p489; + add.s32 %r11140, %r11058, 18432; + selp.b32 %r7669, 16, 0, %p493; + selp.b32 %r7811, %r7669, 0, %p489; + add.s32 %r11142, %r11058, 20480; + selp.b32 %r7670, 16, 0, %p494; + selp.b32 %r7813, %r7670, 0, %p489; + add.s32 %r11144, %r11058, 22528; + selp.b32 %r7671, 16, 0, %p495; + selp.b32 %r7815, %r7671, 0, %p489; + setp.lt.s32 %p496, %r822, %r2358; + setp.lt.s32 %p497, %r823, %r2358; + setp.lt.s32 %p498, %r824, %r2358; + setp.lt.s32 %p499, %r825, %r2358; + setp.lt.s32 %p500, %r826, %r2358; + setp.lt.s32 %p501, %r827, %r2358; + setp.lt.s32 %p502, %r828, %r2358; + setp.lt.s32 %p503, %r829, %r2358; + setp.lt.s32 %p504, %r830, %r2358; + setp.lt.s32 %p505, %r831, %r2358; + setp.lt.s32 %p506, %r832, %r2358; + setp.lt.s32 %p507, %r833, %r2358; + setp.lt.s32 %p508, %r834, %r2358; + setp.lt.s32 %p509, %r835, %r2358; + setp.lt.s32 %p510, %r836, %r2358; + setp.lt.s32 %p511, %r837, %r2358; + cvt.s64.s32 %rd105, %r822; + cvt.s64.s32 %rd106, %r823; + add.s32 %r11146, %r7651, 98560; + selp.b32 %r7672, 4, 0, %p496; + selp.b32 %r7817, %r7672, 0, %p489; + add.s32 %r11148, %r7651, 98564; + selp.b32 %r7673, 4, 0, %p497; + selp.b32 %r7819, %r7673, 0, %p489; + add.s32 %r11150, %r7651, 98592; + selp.b32 %r7674, 4, 0, %p498; + selp.b32 %r7821, %r7674, 0, %p489; + add.s32 %r11152, %r7651, 98596; + selp.b32 %r7675, 4, 0, %p499; + selp.b32 %r7823, %r7675, 0, %p489; + add.s32 %r11154, %r7651, 98624; + selp.b32 %r7676, 4, 0, %p500; + selp.b32 %r7825, %r7676, 0, %p489; + add.s32 %r11156, %r7651, 98628; + selp.b32 %r7677, 4, 0, %p501; + selp.b32 %r7827, %r7677, 0, %p489; + add.s32 %r11158, %r7651, 98656; + selp.b32 %r7678, 4, 0, %p502; + selp.b32 %r7829, %r7678, 0, %p489; + add.s32 %r11160, %r7651, 98660; + selp.b32 %r7679, 4, 0, %p503; + selp.b32 %r7831, %r7679, 0, %p489; + add.s32 %r11162, %r7651, 98688; + selp.b32 %r7680, 4, 0, %p504; + selp.b32 %r7833, %r7680, 0, %p489; + add.s32 %r11164, %r7651, 98692; + 
selp.b32 %r7681, 4, 0, %p505; + selp.b32 %r7835, %r7681, 0, %p489; + add.s32 %r11166, %r7651, 98720; + selp.b32 %r7682, 4, 0, %p506; + selp.b32 %r7837, %r7682, 0, %p489; + add.s32 %r11168, %r7651, 98724; + selp.b32 %r7683, 4, 0, %p507; + selp.b32 %r7839, %r7683, 0, %p489; + add.s32 %r11170, %r7651, 98752; + selp.b32 %r7684, 4, 0, %p508; + selp.b32 %r7841, %r7684, 0, %p489; + add.s32 %r11172, %r7651, 98756; + selp.b32 %r7685, 4, 0, %p509; + selp.b32 %r7843, %r7685, 0, %p489; + add.s32 %r11174, %r7651, 98784; + selp.b32 %r7686, 4, 0, %p510; + selp.b32 %r7845, %r7686, 0, %p489; + add.s32 %r11176, %r7651, 98788; + selp.b32 %r7687, 4, 0, %p511; + selp.b32 %r7847, %r7687, 0, %p489; + add.s32 %r11178, %r11058, 65536; + add.s32 %r11180, %r11058, 67584; + add.s32 %r11182, %r11058, 69632; + add.s32 %r11184, %r11058, 71680; + add.s32 %r11186, %r7651, 99072; + add.s32 %r11188, %r7651, 99076; + add.s32 %r11190, %r7651, 99104; + add.s32 %r11192, %r7651, 99108; + add.s32 %r11194, %r7651, 99136; + add.s32 %r11196, %r7651, 99140; + add.s32 %r11198, %r7651, 99168; + add.s32 %r11200, %r7651, 99172; + add.s32 %r11202, %r7651, 99200; + add.s32 %r11204, %r7651, 99204; + add.s32 %r11206, %r7651, 99232; + add.s32 %r11208, %r7651, 99236; + add.s32 %r11210, %r7651, 99264; + add.s32 %r11212, %r7651, 99268; + add.s32 %r11214, %r7651, 99296; + add.s32 %r11216, %r7651, 99300; + add.s32 %r902, %r7615, -2; + add.s32 %r903, %r7615, -1; + cvt.s64.s32 %rd107, %r7637; + cvt.s64.s32 %rd108, %r7638; + cvt.s64.s32 %rd109, %r7639; + cvt.s64.s32 %rd110, %r7640; + cvt.s64.s32 %rd111, %r7641; + cvt.s64.s32 %rd112, %r7642; + cvt.s64.s32 %rd113, %r7643; + cvt.s64.s32 %rd114, %r7644; + setp.gt.s32 %p512, %r744, 0; + setp.lt.s32 %p513, %r7633, %r2358; + setp.lt.s32 %p514, %r7634, %r2358; + setp.lt.s32 %p515, %r7635, %r2358; + setp.lt.s32 %p516, %r7636, %r2358; + selp.b32 %r7688, 16, 0, %p513; + selp.b32 %r11059, %r7688, 0, %p512; + selp.b32 %r7689, 16, 0, %p514; + selp.b32 %r11061, %r7689, 0, %p512; + selp.b32 
%r7690, 16, 0, %p515; + selp.b32 %r11063, %r7690, 0, %p512; + selp.b32 %r7691, 16, 0, %p516; + selp.b32 %r11065, %r7691, 0, %p512; + setp.lt.s32 %p517, %r728, %r2358; + setp.lt.s32 %p518, %r729, %r2358; + setp.lt.s32 %p519, %r730, %r2358; + setp.lt.s32 %p520, %r731, %r2358; + setp.lt.s32 %p521, %r732, %r2358; + setp.lt.s32 %p522, %r733, %r2358; + setp.lt.s32 %p523, %r734, %r2358; + setp.lt.s32 %p524, %r735, %r2358; + setp.lt.s32 %p525, %r736, %r2358; + setp.lt.s32 %p526, %r737, %r2358; + setp.lt.s32 %p527, %r738, %r2358; + setp.lt.s32 %p528, %r739, %r2358; + setp.lt.s32 %p529, %r740, %r2358; + setp.lt.s32 %p530, %r741, %r2358; + setp.lt.s32 %p531, %r742, %r2358; + setp.lt.s32 %p532, %r743, %r2358; + cvt.s64.s32 %rd115, %r728; + cvt.s64.s32 %rd116, %r732; + cvt.s64.s32 %rd117, %r733; + cvt.s64.s32 %rd118, %r734; + cvt.s64.s32 %rd119, %r735; + cvt.s64.s32 %rd120, %r736; + cvt.s64.s32 %rd121, %r737; + cvt.s64.s32 %rd122, %r738; + cvt.s64.s32 %rd123, %r739; + cvt.s64.s32 %rd124, %r740; + cvt.s64.s32 %rd125, %r741; + cvt.s64.s32 %rd126, %r742; + cvt.s64.s32 %rd127, %r743; + selp.b32 %r7692, 4, 0, %p517; + selp.b32 %r11067, %r7692, 0, %p512; + selp.b32 %r7693, 4, 0, %p518; + selp.b32 %r11069, %r7693, 0, %p512; + selp.b32 %r7694, 4, 0, %p519; + selp.b32 %r11071, %r7694, 0, %p512; + selp.b32 %r7695, 4, 0, %p520; + selp.b32 %r11073, %r7695, 0, %p512; + selp.b32 %r7696, 4, 0, %p521; + selp.b32 %r11075, %r7696, 0, %p512; + selp.b32 %r7697, 4, 0, %p522; + selp.b32 %r11077, %r7697, 0, %p512; + selp.b32 %r7698, 4, 0, %p523; + selp.b32 %r11079, %r7698, 0, %p512; + selp.b32 %r7699, 4, 0, %p524; + selp.b32 %r11081, %r7699, 0, %p512; + selp.b32 %r7700, 4, 0, %p525; + selp.b32 %r11083, %r7700, 0, %p512; + selp.b32 %r7701, 4, 0, %p526; + selp.b32 %r11085, %r7701, 0, %p512; + selp.b32 %r7702, 4, 0, %p527; + selp.b32 %r11087, %r7702, 0, %p512; + selp.b32 %r7703, 4, 0, %p528; + selp.b32 %r11089, %r7703, 0, %p512; + selp.b32 %r7704, 4, 0, %p529; + selp.b32 %r11091, %r7704, 0, %p512; + 
selp.b32 %r7705, 4, 0, %p530; + selp.b32 %r11093, %r7705, 0, %p512; + selp.b32 %r7706, 4, 0, %p531; + selp.b32 %r11095, %r7706, 0, %p512; + selp.b32 %r7707, 4, 0, %p532; + selp.b32 %r11097, %r7707, 0, %p512; + setp.gt.s32 %p533, %r7645, 1; + or.b32 %r924, %r728, 64; + or.b32 %r925, %r729, 64; + or.b32 %r926, %r730, 64; + or.b32 %r927, %r731, 64; + or.b32 %r928, %r732, 64; + or.b32 %r929, %r733, 64; + or.b32 %r930, %r734, 64; + or.b32 %r931, %r735, 64; + or.b32 %r932, %r736, 64; + or.b32 %r933, %r737, 64; + or.b32 %r934, %r738, 64; + or.b32 %r935, %r739, 64; + or.b32 %r936, %r740, 64; + or.b32 %r937, %r741, 64; + or.b32 %r938, %r742, 64; + or.b32 %r939, %r743, 64; + or.b32 %r940, %r7633, 64; + or.b32 %r941, %r7634, 64; + or.b32 %r942, %r7635, 64; + or.b32 %r943, %r7636, 64; + setp.lt.s32 %p534, %r940, %r2358; + setp.lt.s32 %p535, %r941, %r2358; + setp.lt.s32 %p536, %r942, %r2358; + setp.lt.s32 %p537, %r943, %r2358; + selp.b32 %r7708, 16, 0, %p534; + selp.b32 %r11139, %r7708, 0, %p533; + selp.b32 %r7709, 16, 0, %p535; + selp.b32 %r11141, %r7709, 0, %p533; + selp.b32 %r7710, 16, 0, %p536; + selp.b32 %r11143, %r7710, 0, %p533; + selp.b32 %r7711, 16, 0, %p537; + selp.b32 %r11145, %r7711, 0, %p533; + setp.lt.s32 %p538, %r924, %r2358; + setp.lt.s32 %p539, %r925, %r2358; + setp.lt.s32 %p540, %r926, %r2358; + setp.lt.s32 %p541, %r927, %r2358; + setp.lt.s32 %p542, %r928, %r2358; + setp.lt.s32 %p543, %r929, %r2358; + setp.lt.s32 %p544, %r930, %r2358; + setp.lt.s32 %p545, %r931, %r2358; + setp.lt.s32 %p546, %r932, %r2358; + setp.lt.s32 %p547, %r933, %r2358; + setp.lt.s32 %p548, %r934, %r2358; + setp.lt.s32 %p549, %r935, %r2358; + setp.lt.s32 %p550, %r936, %r2358; + setp.lt.s32 %p551, %r937, %r2358; + setp.lt.s32 %p552, %r938, %r2358; + setp.lt.s32 %p553, %r939, %r2358; + selp.b32 %r7712, 4, 0, %p538; + selp.b32 %r11147, %r7712, 0, %p533; + selp.b32 %r7713, 4, 0, %p539; + selp.b32 %r11149, %r7713, 0, %p533; + selp.b32 %r7714, 4, 0, %p540; + selp.b32 %r11151, %r7714, 0, %p533; + 
selp.b32 %r7715, 4, 0, %p541; + selp.b32 %r11153, %r7715, 0, %p533; + selp.b32 %r7716, 4, 0, %p542; + selp.b32 %r11155, %r7716, 0, %p533; + selp.b32 %r7717, 4, 0, %p543; + selp.b32 %r11157, %r7717, 0, %p533; + selp.b32 %r7718, 4, 0, %p544; + selp.b32 %r11159, %r7718, 0, %p533; + selp.b32 %r7719, 4, 0, %p545; + selp.b32 %r11161, %r7719, 0, %p533; + selp.b32 %r7720, 4, 0, %p546; + selp.b32 %r11163, %r7720, 0, %p533; + selp.b32 %r7721, 4, 0, %p547; + selp.b32 %r11165, %r7721, 0, %p533; + selp.b32 %r7722, 4, 0, %p548; + selp.b32 %r11167, %r7722, 0, %p533; + selp.b32 %r7723, 4, 0, %p549; + selp.b32 %r11169, %r7723, 0, %p533; + selp.b32 %r7724, 4, 0, %p550; + selp.b32 %r11171, %r7724, 0, %p533; + selp.b32 %r7725, 4, 0, %p551; + selp.b32 %r11173, %r7725, 0, %p533; + selp.b32 %r7726, 4, 0, %p552; + selp.b32 %r11175, %r7726, 0, %p533; + selp.b32 %r7727, 4, 0, %p553; + selp.b32 %r11177, %r7727, 0, %p533; + add.s32 %r964, %r7645, -2; + add.s32 %r965, %r7645, -1; + .loc 1 262 30 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:262:30 + max.s32 %r966, %r7615, 1; + max.s32 %r967, %r7645, 1; + mul.wide.u32 %rd128, %r7, 4; + mov.b32 %r14988, 0f00000000; + mov.b64 %rd1193, 0; + shl.b64 %rd677, %rd83, 1; + shl.b64 %rd679, %rd84, 1; + shl.b64 %rd681, %rd85, 1; + shl.b64 %rd683, %rd86, 1; + shl.b64 %rd686, %rd87, 1; + shl.b64 %rd688, %rd88, 1; + shl.b64 %rd690, %rd89, 1; + shl.b64 %rd692, %rd90, 1; + shl.b64 %rd694, %rd103, 2; + shl.b64 %rd695, %rd104, 2; + shl.b64 %rd700, %rd91, 2; + shl.b64 %rd701, %rd92, 2; + shl.b64 %rd702, %rd93, 2; + shl.b64 %rd703, %rd94, 2; + shl.b64 %rd704, %rd95, 2; + shl.b64 %rd705, %rd96, 2; + shl.b64 %rd706, %rd97, 2; + shl.b64 %rd707, %rd98, 2; + shl.b64 %rd708, %rd99, 2; + shl.b64 %rd709, %rd100, 2; + shl.b64 %rd710, %rd101, 2; + shl.b64 %rd711, %rd102, 2; + shl.b64 %rd713, %rd105, 2; + shl.b64 %rd714, %rd106, 2; + shl.b64 %rd956, %rd107, 1; + shl.b64 %rd958, %rd108, 1; + shl.b64 %rd960, %rd109, 1; + shl.b64 %rd962, %rd110, 1; + shl.b64 
%rd965, %rd111, 1; + shl.b64 %rd967, %rd112, 1; + shl.b64 %rd969, %rd113, 1; + shl.b64 %rd971, %rd114, 1; + shl.b64 %rd973, %rd115, 2; + shl.b64 %rd978, %rd116, 2; + shl.b64 %rd979, %rd117, 2; + shl.b64 %rd980, %rd118, 2; + shl.b64 %rd981, %rd119, 2; + shl.b64 %rd982, %rd120, 2; + shl.b64 %rd983, %rd121, 2; + shl.b64 %rd984, %rd122, 2; + shl.b64 %rd985, %rd123, 2; + shl.b64 %rd986, %rd124, 2; + shl.b64 %rd987, %rd125, 2; + shl.b64 %rd988, %rd126, 2; + shl.b64 %rd989, %rd127, 2; + mov.b32 %r14989, %r14988; + mov.b32 %r14990, %r14988; + mov.b32 %r14991, %r14988; + mov.b32 %r14992, %r14988; + mov.b32 %r14993, %r14988; + mov.b32 %r14994, %r14988; + mov.b32 %r14995, %r14988; + mov.b32 %r14996, %r14988; + mov.b32 %r14997, %r14988; + mov.b32 %r14998, %r14988; + mov.b32 %r14999, %r14988; + mov.b32 %r15000, %r14988; + mov.b32 %r15001, %r14988; + mov.b32 %r15002, %r14988; + mov.b32 %r15003, %r14988; + mov.b32 %r15004, %r14988; + mov.b32 %r15005, %r14988; + mov.b32 %r15006, %r14988; + mov.b32 %r15007, %r14988; + mov.b32 %r15008, %r14988; + mov.b32 %r15009, %r14988; + mov.b32 %r15010, %r14988; + mov.b32 %r15011, %r14988; + mov.b32 %r15012, %r14988; + mov.b32 %r15013, %r14988; + mov.b32 %r15014, %r14988; + mov.b32 %r15015, %r14988; + mov.b32 %r15016, %r14988; + mov.b32 %r15017, %r14988; + mov.b32 %r15018, %r14988; + mov.b32 %r15019, %r14988; + mov.b32 %r15020, %r14988; + mov.b32 %r15021, %r14988; + mov.b32 %r15022, %r14988; + mov.b32 %r15023, %r14988; + mov.b32 %r15024, %r14988; + mov.b32 %r15025, %r14988; + mov.b32 %r15026, %r14988; + mov.b32 %r15027, %r14988; + mov.b32 %r15028, %r14988; + mov.b32 %r15029, %r14988; + mov.b32 %r15030, %r14988; + mov.b32 %r15031, %r14988; + mov.b32 %r15032, %r14988; + mov.b32 %r15033, %r14988; + mov.b32 %r15034, %r14988; + mov.b32 %r15035, %r14988; + mov.b32 %r15036, %r14988; + mov.b32 %r15037, %r14988; + mov.b32 %r15038, %r14988; + mov.b32 %r15039, %r14988; + mov.b32 %r15040, %r14988; + mov.b32 %r15041, %r14988; + mov.b32 %r15042, %r14988; + 
mov.b32 %r15043, %r14988; + mov.b32 %r15044, %r14988; + mov.b32 %r15045, %r14988; + mov.b32 %r15046, %r14988; + mov.b32 %r15047, %r14988; + mov.b32 %r15048, %r14988; + mov.b32 %r15049, %r14988; + mov.b32 %r15050, %r14988; + mov.b32 %r15051, %r14988; + mov.b32 %r14924, %r14988; + mov.b32 %r14925, %r14988; + mov.b32 %r14926, %r14988; + mov.b32 %r14927, %r14988; + mov.b32 %r14928, %r14988; + mov.b32 %r14929, %r14988; + mov.b32 %r14930, %r14988; + mov.b32 %r14931, %r14988; + mov.b32 %r14932, %r14988; + mov.b32 %r14933, %r14988; + mov.b32 %r14934, %r14988; + mov.b32 %r14935, %r14988; + mov.b32 %r14936, %r14988; + mov.b32 %r14937, %r14988; + mov.b32 %r14938, %r14988; + mov.b32 %r14939, %r14988; + mov.b32 %r14940, %r14988; + mov.b32 %r14941, %r14988; + mov.b32 %r14942, %r14988; + mov.b32 %r14943, %r14988; + mov.b32 %r14944, %r14988; + mov.b32 %r14945, %r14988; + mov.b32 %r14946, %r14988; + mov.b32 %r14947, %r14988; + mov.b32 %r14948, %r14988; + mov.b32 %r14949, %r14988; + mov.b32 %r14950, %r14988; + mov.b32 %r14951, %r14988; + mov.b32 %r14952, %r14988; + mov.b32 %r14953, %r14988; + mov.b32 %r14954, %r14988; + mov.b32 %r14955, %r14988; + mov.b32 %r14956, %r14988; + mov.b32 %r14957, %r14988; + mov.b32 %r14958, %r14988; + mov.b32 %r14959, %r14988; + mov.b32 %r14960, %r14988; + mov.b32 %r14961, %r14988; + mov.b32 %r14962, %r14988; + mov.b32 %r14963, %r14988; + mov.b32 %r14964, %r14988; + mov.b32 %r14965, %r14988; + mov.b32 %r14966, %r14988; + mov.b32 %r14967, %r14988; + mov.b32 %r14968, %r14988; + mov.b32 %r14969, %r14988; + mov.b32 %r14970, %r14988; + mov.b32 %r14971, %r14988; + mov.b32 %r14972, %r14988; + mov.b32 %r14973, %r14988; + mov.b32 %r14974, %r14988; + mov.b32 %r14975, %r14988; + mov.b32 %r14976, %r14988; + mov.b32 %r14977, %r14988; + mov.b32 %r14978, %r14988; + mov.b32 %r14979, %r14988; + mov.b32 %r14980, %r14988; + mov.b32 %r14981, %r14988; + mov.b32 %r14982, %r14988; + mov.b32 %r14983, %r14988; + mov.b32 %r14984, %r14988; + mov.b32 %r14985, %r14988; + mov.b32 
%r14986, %r14988; + mov.b32 %r14987, %r14988; + bra.uni $L__BB0_9; +$L__BB0_15: // %._crit_edge1794 + // in Loop: Header=BB0_9 Depth=1 +$L__tmp13: + .loc 1 610 28 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:610:28 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + // begin inline asm + // wait for regs: %r14924,%r14925,%r14926,%r14927,%r14928,%r14929,%r14930,%r14931,%r14932,%r14933,%r14934,%r14935,%r14936,%r14937,%r14938,%r14939,%r14940,%r14941,%r14942,%r14943,%r14944,%r14945,%r14946,%r14947,%r14948,%r14949,%r14950,%r14951,%r14952,%r14953,%r14954,%r14955,%r14956,%r14957,%r14958,%r14959,%r14960,%r14961,%r14962,%r14963,%r14964,%r14965,%r14966,%r14967,%r14968,%r14969,%r14970,%r14971,%r14972,%r14973,%r14974,%r14975,%r14976,%r14977,%r14978,%r14979,%r14980,%r14981,%r14982,%r14983,%r14984,%r14985,%r14986,%r14987,%r14988,%r14989,%r14990,%r14991,%r14992,%r14993,%r14994,%r14995,%r14996,%r14997,%r14998,%r14999,%r15000,%r15001,%r15002,%r15003,%r15004,%r15005,%r15006,%r15007,%r15008,%r15009,%r15010,%r15011,%r15012,%r15013,%r15014,%r15015,%r15016,%r15017,%r15018,%r15019,%r15020,%r15021,%r15022,%r15023,%r15024,%r15025,%r15026,%r15027,%r15028,%r15029,%r15030,%r15031,%r15032,%r15033,%r15034,%r15035,%r15036,%r15037,%r15038,%r15039,%r15040,%r15041,%r15042,%r15043,%r15044,%r15045,%r15046,%r15047,%r15048,%r15049,%r15050,%r15051 + wgmma.wait_group.sync.aligned 0; + // end inline asm + cp.async.wait_group 0; + bar.sync 0; +$L__tmp14: + .loc 1 262 30 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:262:30 + add.s64 %rd1193, %rd1193, 1; + setp.ne.b64 %p1162, %rd1193, 4; + @%p1162 bra $L__BB0_9; + bra.uni $L__BB0_16; +$L__BB0_9: // =>This Loop Header: Depth=1 + // Child Loop BB0_11 Depth 2 + // Child Loop BB0_14 Depth 2 + .loc 1 0 30 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:0:30 + setp.eq.b32 %p978, %r766, 0; +$L__tmp15: + .loc 1 610 28 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:610:28 @[ 
c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + setp.lt.s32 %p618, %r699, 1; +$L__tmp16: + .loc 1 263 51 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:263:51 + add.s64 %rd675, %rd1193, %rd128; + .loc 1 266 44 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:266:44 + cvt.u32.u64 %r7888, %rd675; + shl.b32 %r7889, %r7888, 7; + add.s32 %r7890, %r7889, %r686; + .loc 1 267 46 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:267:46 + mad.lo.s32 %r7891, %r3, %r7888, %r687; + .loc 1 269 50 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:269:50 + add.s32 %r7892, %r688, %r7888; + mul.lo.s32 %r7893, %r7892, %r2358; + .loc 1 271 21 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:271:21 + mad.wide.s32 %rd140, %r7890, 2, %rd194; + .loc 1 272 23 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:272:23 + mad.wide.s32 %rd141, %r7891, 2, %rd197; + .loc 1 275 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:275:25 + mul.wide.s32 %rd676, %r7893, 4; + add.s64 %rd142, %rd195, %rd676; + .loc 1 276 29 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:276:29 + add.s64 %rd143, %rd196, %rd676; +$L__tmp17: + .loc 1 601 18 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:601:18 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + add.s64 %rd678, %rd140, %rd677; + add.s64 %rd680, %rd140, %rd679; + add.s64 %rd682, %rd140, %rd681; + add.s64 %rd684, %rd140, %rd683; + .loc 1 601 49 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:601:49 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + shl.b64 %rd685, %rd79, 1; + add.s64 %rd595, %rd678, %rd685; + add.s64 %rd596, %rd680, %rd685; + add.s64 %rd597, %rd682, %rd685; + add.s64 %rd598, %rd684, %rd685; + .loc 1 602 19 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:602:19 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + add.s64 %rd687, %rd141, %rd686; + add.s64 
%rd689, %rd141, %rd688; + add.s64 %rd691, %rd141, %rd690; + add.s64 %rd693, %rd141, %rd692; + .loc 1 602 51 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:602:51 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + add.s64 %rd615, %rd687, %rd685; + add.s64 %rd616, %rd689, %rd685; + add.s64 %rd617, %rd691, %rd685; + add.s64 %rd618, %rd693, %rd685; + .loc 1 831 23 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:831:23 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + // begin inline asm + cp.async.cg.shared.global [ %r11058 + 0 ], [ %rd595 + 0 ], 0x10, %r7729; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r11060 + 0 ], [ %rd596 + 0 ], 0x10, %r7731; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r11062 + 0 ], [ %rd597 + 0 ], 0x10, %r7733; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r11064 + 0 ], [ %rd598 + 0 ], 0x10, %r7735; + // end inline asm + cp.async.commit_group; + .loc 1 674 28 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:674:28 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + add.s64 %rd599, %rd142, %rd694; + add.s64 %rd600, %rd142, %rd695; + cvt.s64.s32 %rd696, %r689; + cvt.u64.u32 %rd144, %r692; + add.s64 %rd697, %rd696, %rd144; + shl.b64 %rd698, %rd697, 2; + add.s64 %rd699, %rd142, %rd698; + add.s64 %rd601, %rd699, 32; + add.s64 %rd602, %rd699, 36; + add.s64 %rd603, %rd142, %rd700; + add.s64 %rd604, %rd142, %rd701; + add.s64 %rd605, %rd142, %rd702; + add.s64 %rd606, %rd142, %rd703; + add.s64 %rd607, %rd142, %rd704; + add.s64 %rd608, %rd142, %rd705; + add.s64 %rd609, %rd142, %rd706; + add.s64 %rd610, %rd142, %rd707; + add.s64 %rd611, %rd142, %rd708; + add.s64 %rd612, %rd142, %rd709; + add.s64 %rd613, %rd142, %rd710; + add.s64 %rd614, %rd142, %rd711; + .loc 1 674 22 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:674:22 @[ 
c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11066 + 0 ], [ %rd599 + 0 ], 0x4, %r7737; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11068 + 0 ], [ %rd600 + 0 ], 0x4, %r7739; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11070 + 0 ], [ %rd601 + 0 ], 0x4, %r7741; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11072 + 0 ], [ %rd602 + 0 ], 0x4, %r7743; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11074 + 0 ], [ %rd603 + 0 ], 0x4, %r7745; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11076 + 0 ], [ %rd604 + 0 ], 0x4, %r7747; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11078 + 0 ], [ %rd605 + 0 ], 0x4, %r7749; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11080 + 0 ], [ %rd606 + 0 ], 0x4, %r7751; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11082 + 0 ], [ %rd607 + 0 ], 0x4, %r7753; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11084 + 0 ], [ %rd608 + 0 ], 0x4, %r7755; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11086 + 0 ], [ %rd609 + 0 ], 0x4, %r7757; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11088 + 0 ], [ %rd610 + 0 ], 0x4, %r7759; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11090 + 0 ], [ %rd611 + 0 ], 0x4, %r7761; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11092 + 0 ], [ %rd612 + 0 ], 0x4, %r7763; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11094 + 0 ], [ %rd613 + 0 ], 0x4, %r7765; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11096 + 0 ], [ %rd614 + 0 ], 
0x4, %r7767; + // end inline asm + cp.async.commit_group; + .loc 1 833 23 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:833:23 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + // begin inline asm + cp.async.cg.shared.global [ %r11098 + 0 ], [ %rd615 + 0 ], 0x10, %r7729; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r11100 + 0 ], [ %rd616 + 0 ], 0x10, %r7731; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r11102 + 0 ], [ %rd617 + 0 ], 0x10, %r7733; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r11104 + 0 ], [ %rd618 + 0 ], 0x10, %r7735; + // end inline asm + cp.async.commit_group; + .loc 1 748 29 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:748:29 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + add.s64 %rd619, %rd143, %rd694; + add.s64 %rd620, %rd143, %rd695; + add.s64 %rd712, %rd143, %rd698; + add.s64 %rd621, %rd712, 32; + add.s64 %rd622, %rd712, 36; + add.s64 %rd623, %rd143, %rd700; + add.s64 %rd624, %rd143, %rd701; + add.s64 %rd625, %rd143, %rd702; + add.s64 %rd626, %rd143, %rd703; + add.s64 %rd627, %rd143, %rd704; + add.s64 %rd628, %rd143, %rd705; + add.s64 %rd629, %rd143, %rd706; + add.s64 %rd630, %rd143, %rd707; + add.s64 %rd631, %rd143, %rd708; + add.s64 %rd632, %rd143, %rd709; + add.s64 %rd633, %rd143, %rd710; + add.s64 %rd634, %rd143, %rd711; + .loc 1 748 21 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:748:21 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11106 + 0 ], [ %rd619 + 0 ], 0x4, %r7737; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11108 + 0 ], [ %rd620 + 0 ], 0x4, %r7739; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11110 + 0 ], [ %rd621 + 0 ], 0x4, %r7741; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ 
%r11112 + 0 ], [ %rd622 + 0 ], 0x4, %r7743; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11114 + 0 ], [ %rd623 + 0 ], 0x4, %r7745; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11116 + 0 ], [ %rd624 + 0 ], 0x4, %r7747; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11118 + 0 ], [ %rd625 + 0 ], 0x4, %r7749; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11120 + 0 ], [ %rd626 + 0 ], 0x4, %r7751; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11122 + 0 ], [ %rd627 + 0 ], 0x4, %r7753; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11124 + 0 ], [ %rd628 + 0 ], 0x4, %r7755; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11126 + 0 ], [ %rd629 + 0 ], 0x4, %r7757; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11128 + 0 ], [ %rd630 + 0 ], 0x4, %r7759; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11130 + 0 ], [ %rd631 + 0 ], 0x4, %r7761; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11132 + 0 ], [ %rd632 + 0 ], 0x4, %r7763; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11134 + 0 ], [ %rd633 + 0 ], 0x4, %r7765; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11136 + 0 ], [ %rd634 + 0 ], 0x4, %r7767; + // end inline asm + cp.async.commit_group; + .loc 1 626 19 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:626:19 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + add.s64 %rd1201, %rd595, 524288; + add.s64 %rd1200, %rd596, 524288; + add.s64 %rd1199, %rd597, 524288; + add.s64 %rd1198, %rd598, 524288; + .loc 1 627 19 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:627:19 @[ 
c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + add.s64 %rd1197, %rd615, 16384; + add.s64 %rd1196, %rd616, 16384; + add.s64 %rd1195, %rd617, 16384; + add.s64 %rd1194, %rd618, 16384; + .loc 1 831 23 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:831:23 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + bar.sync 0; + // begin inline asm + cp.async.cg.shared.global [ %r11138 + 0 ], [ %rd1201 + 0 ], 0x10, %r7809; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r11140 + 0 ], [ %rd1200 + 0 ], 0x10, %r7811; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r11142 + 0 ], [ %rd1199 + 0 ], 0x10, %r7813; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r11144 + 0 ], [ %rd1198 + 0 ], 0x10, %r7815; + // end inline asm + cp.async.commit_group; + .loc 1 674 28 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:674:28 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + add.s64 %rd639, %rd142, %rd713; + add.s64 %rd640, %rd142, %rd714; + add.s64 %rd715, %rd698, 256; + add.s64 %rd716, %rd142, %rd715; + add.s64 %rd641, %rd716, 32; + add.s64 %rd642, %rd716, 36; + add.s64 %rd643, %rd603, 256; + add.s64 %rd644, %rd604, 256; + add.s64 %rd645, %rd605, 256; + add.s64 %rd646, %rd606, 256; + add.s64 %rd647, %rd607, 256; + add.s64 %rd648, %rd608, 256; + add.s64 %rd649, %rd609, 256; + add.s64 %rd650, %rd610, 256; + add.s64 %rd651, %rd611, 256; + add.s64 %rd652, %rd612, 256; + add.s64 %rd653, %rd613, 256; + add.s64 %rd654, %rd614, 256; + .loc 1 674 22 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:674:22 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11146 + 0 ], [ %rd639 + 0 ], 0x4, %r7817; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11148 + 0 ], [ %rd640 + 0 ], 0x4, %r7819; + // end inline asm + // begin inline asm + 
@%p978 cp.async.ca.shared.global [ %r11150 + 0 ], [ %rd641 + 0 ], 0x4, %r7821; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11152 + 0 ], [ %rd642 + 0 ], 0x4, %r7823; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11154 + 0 ], [ %rd643 + 0 ], 0x4, %r7825; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11156 + 0 ], [ %rd644 + 0 ], 0x4, %r7827; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11158 + 0 ], [ %rd645 + 0 ], 0x4, %r7829; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11160 + 0 ], [ %rd646 + 0 ], 0x4, %r7831; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11162 + 0 ], [ %rd647 + 0 ], 0x4, %r7833; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11164 + 0 ], [ %rd648 + 0 ], 0x4, %r7835; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11166 + 0 ], [ %rd649 + 0 ], 0x4, %r7837; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11168 + 0 ], [ %rd650 + 0 ], 0x4, %r7839; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11170 + 0 ], [ %rd651 + 0 ], 0x4, %r7841; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11172 + 0 ], [ %rd652 + 0 ], 0x4, %r7843; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11174 + 0 ], [ %rd653 + 0 ], 0x4, %r7845; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11176 + 0 ], [ %rd654 + 0 ], 0x4, %r7847; + // end inline asm + cp.async.commit_group; + .loc 1 833 23 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:833:23 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + // begin inline asm + cp.async.cg.shared.global [ %r11178 + 0 ], [ %rd1197 + 0 ], 0x10, %r7809; + // end inline asm + // 
begin inline asm + cp.async.cg.shared.global [ %r11180 + 0 ], [ %rd1196 + 0 ], 0x10, %r7811; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r11182 + 0 ], [ %rd1195 + 0 ], 0x10, %r7813; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r11184 + 0 ], [ %rd1194 + 0 ], 0x10, %r7815; + // end inline asm + cp.async.commit_group; + .loc 1 748 29 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:748:29 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + add.s64 %rd659, %rd143, %rd713; + add.s64 %rd660, %rd143, %rd714; + add.s64 %rd717, %rd143, %rd715; + add.s64 %rd661, %rd717, 32; + add.s64 %rd662, %rd717, 36; + add.s64 %rd663, %rd623, 256; + add.s64 %rd664, %rd624, 256; + add.s64 %rd665, %rd625, 256; + add.s64 %rd666, %rd626, 256; + add.s64 %rd667, %rd627, 256; + add.s64 %rd668, %rd628, 256; + add.s64 %rd669, %rd629, 256; + add.s64 %rd670, %rd630, 256; + add.s64 %rd671, %rd631, 256; + add.s64 %rd672, %rd632, 256; + add.s64 %rd673, %rd633, 256; + add.s64 %rd674, %rd634, 256; + .loc 1 748 21 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:748:21 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11186 + 0 ], [ %rd659 + 0 ], 0x4, %r7817; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11188 + 0 ], [ %rd660 + 0 ], 0x4, %r7819; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11190 + 0 ], [ %rd661 + 0 ], 0x4, %r7821; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11192 + 0 ], [ %rd662 + 0 ], 0x4, %r7823; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11194 + 0 ], [ %rd663 + 0 ], 0x4, %r7825; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11196 + 0 ], [ %rd664 + 0 ], 0x4, %r7827; + // end inline asm + // begin inline asm + @%p978 
cp.async.ca.shared.global [ %r11198 + 0 ], [ %rd665 + 0 ], 0x4, %r7829; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11200 + 0 ], [ %rd666 + 0 ], 0x4, %r7831; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11202 + 0 ], [ %rd667 + 0 ], 0x4, %r7833; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11204 + 0 ], [ %rd668 + 0 ], 0x4, %r7835; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11206 + 0 ], [ %rd669 + 0 ], 0x4, %r7837; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11208 + 0 ], [ %rd670 + 0 ], 0x4, %r7839; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11210 + 0 ], [ %rd671 + 0 ], 0x4, %r7841; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11212 + 0 ], [ %rd672 + 0 ], 0x4, %r7843; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11214 + 0 ], [ %rd673 + 0 ], 0x4, %r7845; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11216 + 0 ], [ %rd674 + 0 ], 0x4, %r7847; + // end inline asm + cp.async.commit_group; + .loc 1 610 28 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:610:28 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + @%p618 bra $L__BB0_12; +// %bb.10: // %.lr.ph1620.preheader + // in Loop: Header=BB0_9 Depth=1 + .loc 1 0 28 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:0:28 + mov.b32 %r8449, 0; + mov.b32 %r14899, 1; + mov.b32 %r14898, -1; + mov.b32 %r14883, 64; + mov.b32 %r14881, %r693; + mov.b32 %r14882, %r694; + mov.b32 %r14884, %r693; + mov.b32 %r14885, %r694; + mov.b32 %r14886, %r754; + mov.b32 %r14887, %r755; + mov.b32 %r14888, %r756; + mov.b32 %r14889, %r757; + mov.b32 %r14890, %r758; + mov.b32 %r14891, %r759; + mov.b32 %r14892, %r760; + mov.b32 %r14893, %r761; + mov.b32 %r14894, %r762; + mov.b32 %r14895, 
%r763; + mov.b32 %r14896, %r764; + mov.b32 %r14897, %r765; + mov.b32 %r14900, %r14898; + mov.b32 %r14901, %r14899; + mov.b32 %r14902, %r837; + mov.b32 %r14903, %r836; + mov.b32 %r14904, %r835; + mov.b32 %r14905, %r834; + mov.b32 %r14906, %r833; + mov.b32 %r14907, %r832; + mov.b32 %r14908, %r831; + mov.b32 %r14909, %r830; + mov.b32 %r14910, %r829; + mov.b32 %r14911, %r828; + mov.b32 %r14912, %r827; + mov.b32 %r14913, %r826; + mov.b32 %r14914, %r825; + mov.b32 %r14915, %r824; + mov.b32 %r14916, %r838; + mov.b32 %r14917, %r839; + mov.b32 %r14918, %r840; + mov.b32 %r14919, %r841; + mov.b32 %r14920, %r838; + mov.b32 %r14921, %r839; + mov.b32 %r14922, %r840; + mov.b32 %r14923, %r841; + mov.b32 %r15052, %r8449; + mov.b32 %r15053, %r818; + mov.b32 %r15054, %r819; + mov.b32 %r15055, %r818; + mov.b32 %r15056, %r819; + mov.b32 %r15057, %r822; + mov.b32 %r15058, %r823; + mov.b32 %r15059, %r765; + mov.b32 %r15060, %r764; + mov.b32 %r15061, %r763; + mov.b32 %r15062, %r762; + mov.b32 %r15063, %r761; + mov.b32 %r15064, %r760; + mov.b32 %r15065, %r759; + mov.b32 %r15066, %r758; + mov.b32 %r15067, %r757; + mov.b32 %r15068, %r756; + mov.b32 %r15069, %r755; + mov.b32 %r15070, %r754; +$L__BB0_11: // %.lr.ph1620 + // Parent Loop BB0_9 Depth=1 + // => This Inner Loop Header: Depth=2 + .loc 1 610 28 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:610:28 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + setp.lt.s32 %p675, %r15052, %r902; + setp.lt.s32 %p641, %r15052, %r903; + add.s32 %r10148, %r14898, 1; + setp.gt.s32 %p676, %r10148, 1; + selp.b32 %r14898, 0, %r10148, %p676; + add.s32 %r10149, %r14900, 1; + setp.gt.s32 %p677, %r10149, 2; + selp.b32 %r14900, 0, %r10149, %p677; + .loc 1 831 52 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:831:52 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + setp.lt.s32 %p678, %r15055, %r2358; + setp.lt.s32 %p679, %r15056, %r2358; + setp.lt.s32 %p680, %r14884, %r2358; + setp.lt.s32 %p681, 
%r14885, %r2358; + setp.lt.s32 %p682, %r14886, %r2358; + setp.lt.s32 %p683, %r14887, %r2358; + setp.lt.s32 %p684, %r14888, %r2358; + setp.lt.s32 %p685, %r14889, %r2358; + setp.lt.s32 %p686, %r14890, %r2358; + setp.lt.s32 %p687, %r14891, %r2358; + setp.lt.s32 %p688, %r14892, %r2358; + setp.lt.s32 %p689, %r14893, %r2358; + setp.lt.s32 %p690, %r14894, %r2358; + setp.lt.s32 %p691, %r14895, %r2358; + setp.lt.s32 %p692, %r14896, %r2358; + setp.lt.s32 %p693, %r14897, %r2358; + .loc 1 831 23 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:831:23 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + cp.async.wait_group 4; + bar.sync 0; + shl.b32 %r10150, %r14900, 14; + add.s32 %r8451, %r7585, %r10150; + .loc 1 674 22 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:674:22 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + shl.b32 %r10152, %r14898, 8; + add.s32 %r10153, %r7585, 98304; + add.s32 %r10154, %r10153, %r10152; + add.s32 %r10155, %r10154, %r767; + ld.shared.v2.b32 {%r10156, %r10157}, [%r10155]; + ld.shared.v2.b32 {%r10158, %r10159}, [%r10155+32]; + ld.shared.v2.b32 {%r10160, %r10161}, [%r10155+64]; + ld.shared.v2.b32 {%r10162, %r10163}, [%r10155+96]; + ld.shared.v2.b32 {%r10164, %r10165}, [%r10155+128]; + ld.shared.v2.b32 {%r10166, %r10167}, [%r10155+160]; + ld.shared.v2.b32 {%r10168, %r10169}, [%r10155+192]; + ld.shared.v2.b32 {%r10170, %r10171}, [%r10155+224]; + .loc 1 675 26 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:675:26 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + setp.eq.f32 %p694, %r10156, 0fFF800000; + setp.eq.f32 %p695, %r10157, 0fFF800000; + setp.eq.f32 %p696, %r10158, 0fFF800000; + setp.eq.f32 %p697, %r10159, 0fFF800000; + setp.eq.f32 %p698, %r10160, 0fFF800000; + setp.eq.f32 %p699, %r10161, 0fFF800000; + setp.eq.f32 %p700, %r10162, 0fFF800000; + setp.eq.f32 %p701, %r10163, 0fFF800000; + setp.eq.f32 %p702, %r10164, 0fFF800000; + setp.eq.f32 %p703, %r10165, 
0fFF800000; + setp.eq.f32 %p704, %r10166, 0fFF800000; + setp.eq.f32 %p705, %r10167, 0fFF800000; + setp.eq.f32 %p706, %r10168, 0fFF800000; + setp.eq.f32 %p707, %r10169, 0fFF800000; + setp.eq.f32 %p708, %r10170, 0fFF800000; + setp.eq.f32 %p709, %r10171, 0fFF800000; + .loc 1 675 46 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:675:46 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + selp.f32 %r10172, 0f00000000, %r10156, %p694; + selp.f32 %r10173, 0f00000000, %r10157, %p695; + selp.f32 %r10174, 0f00000000, %r10158, %p696; + selp.f32 %r10175, 0f00000000, %r10159, %p697; + selp.f32 %r10176, 0f00000000, %r10160, %p698; + selp.f32 %r10177, 0f00000000, %r10161, %p699; + selp.f32 %r10178, 0f00000000, %r10162, %p700; + selp.f32 %r10179, 0f00000000, %r10163, %p701; + selp.f32 %r10180, 0f00000000, %r10164, %p702; + selp.f32 %r10181, 0f00000000, %r10165, %p703; + selp.f32 %r10182, 0f00000000, %r10166, %p704; + selp.f32 %r10183, 0f00000000, %r10167, %p705; + selp.f32 %r10184, 0f00000000, %r10168, %p706; + selp.f32 %r10185, 0f00000000, %r10169, %p707; + selp.f32 %r10186, 0f00000000, %r10170, %p708; + selp.f32 %r10187, 0f00000000, %r10171, %p709; + .loc 1 676 20 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:676:20 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + shfl.sync.idx.b32 %r10188, %r11, 0, 31, -1; + wgmma.fence.sync.aligned; + shl.b32 %r10189, %r10188, 11; + and.b32 %r10190, %r10189, 8192; + add.s32 %r8410, %r7585, 99328; + add.s32 %r10191, %r10190, %r8410; + bfe.u32 %r10192, %r10191, 4, 14; + cvt.u64.u32 %rd804, %r10192; + or.b64 %rd718, %rd804, 4611686293372403712; + bfe.u32 %r10193, %r8451, 4, 14; + cvt.u64.u32 %rd805, %r10193; + or.b64 %rd719, %rd805, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 
{%r7994,%r7995,%r7996,%r7997,%r7998,%r7999,%r8000,%r8001,%r8002,%r8003,%r8004,%r8005,%r8006,%r8007,%r8008,%r8009,%r8010,%r8011,%r8012,%r8013,%r8014,%r8015,%r8016,%r8017,%r8018,%r8019,%r8020,%r8021,%r8022,%r8023,%r8024,%r8025}, %rd718, %rd719, 0, 1, 1, 0, 0; + // end inline asm + or.b32 %r10194, %r10190, 32; + add.s32 %r10195, %r10194, %r8410; + bfe.u32 %r10196, %r10195, 4, 14; + cvt.u64.u32 %rd806, %r10196; + or.b64 %rd720, %rd806, 4611686293372403712; + add.s32 %r10197, %r8451, 32; + bfe.u32 %r10198, %r10197, 4, 14; + cvt.u64.u32 %rd807, %r10198; + or.b64 %rd721, %rd807, 4611686293338849280; + mov.pred %p619, -1; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r7994,%r7995,%r7996,%r7997,%r7998,%r7999,%r8000,%r8001,%r8002,%r8003,%r8004,%r8005,%r8006,%r8007,%r8008,%r8009,%r8010,%r8011,%r8012,%r8013,%r8014,%r8015,%r8016,%r8017,%r8018,%r8019,%r8020,%r8021,%r8022,%r8023,%r8024,%r8025}, %rd720, %rd721, %p619, 1, 1, 0, 0; + // end inline asm + or.b32 %r10199, %r10190, 64; + add.s32 %r10200, %r10199, %r8410; + bfe.u32 %r10201, %r10200, 4, 14; + cvt.u64.u32 %rd808, %r10201; + or.b64 %rd722, %rd808, 4611686293372403712; + add.s32 %r10202, %r8451, 64; + bfe.u32 %r10203, %r10202, 4, 14; + cvt.u64.u32 %rd809, %r10203; + or.b64 %rd723, %rd809, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r7994,%r7995,%r7996,%r7997,%r7998,%r7999,%r8000,%r8001,%r8002,%r8003,%r8004,%r8005,%r8006,%r8007,%r8008,%r8009,%r8010,%r8011,%r8012,%r8013,%r8014,%r8015,%r8016,%r8017,%r8018,%r8019,%r8020,%r8021,%r8022,%r8023,%r8024,%r8025}, %rd722, %rd723, %p619, 1, 1, 0, 0; + // end inline asm + or.b32 %r10204, %r10190, 96; + add.s32 %r10205, %r10204, %r8410; + bfe.u32 %r10206, %r10205, 4, 14; + cvt.u64.u32 %rd810, %r10206; + or.b64 %rd724, %rd810, 4611686293372403712; + add.s32 %r10207, %r8451, 96; + bfe.u32 %r10208, %r10207, 4, 14; + cvt.u64.u32 %rd811, %r10208; + or.b64 %rd725, %rd811, 4611686293338849280; + // begin 
inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r7994,%r7995,%r7996,%r7997,%r7998,%r7999,%r8000,%r8001,%r8002,%r8003,%r8004,%r8005,%r8006,%r8007,%r8008,%r8009,%r8010,%r8011,%r8012,%r8013,%r8014,%r8015,%r8016,%r8017,%r8018,%r8019,%r8020,%r8021,%r8022,%r8023,%r8024,%r8025}, %rd724, %rd725, %p619, 1, 1, 0, 0; + // end inline asm + or.b32 %r10209, %r10190, 16384; + add.s32 %r10210, %r10209, %r8410; + bfe.u32 %r10211, %r10210, 4, 14; + cvt.u64.u32 %rd812, %r10211; + or.b64 %rd726, %rd812, 4611686293372403712; + add.s32 %r10212, %r8451, 8192; + bfe.u32 %r10213, %r10212, 4, 14; + cvt.u64.u32 %rd813, %r10213; + or.b64 %rd727, %rd813, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r7994,%r7995,%r7996,%r7997,%r7998,%r7999,%r8000,%r8001,%r8002,%r8003,%r8004,%r8005,%r8006,%r8007,%r8008,%r8009,%r8010,%r8011,%r8012,%r8013,%r8014,%r8015,%r8016,%r8017,%r8018,%r8019,%r8020,%r8021,%r8022,%r8023,%r8024,%r8025}, %rd726, %rd727, %p619, 1, 1, 0, 0; + // end inline asm + or.b32 %r10214, %r10190, 16416; + add.s32 %r10215, %r10214, %r8410; + bfe.u32 %r10216, %r10215, 4, 14; + cvt.u64.u32 %rd814, %r10216; + or.b64 %rd728, %rd814, 4611686293372403712; + add.s32 %r10217, %r8451, 8224; + bfe.u32 %r10218, %r10217, 4, 14; + cvt.u64.u32 %rd815, %r10218; + or.b64 %rd729, %rd815, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r7994,%r7995,%r7996,%r7997,%r7998,%r7999,%r8000,%r8001,%r8002,%r8003,%r8004,%r8005,%r8006,%r8007,%r8008,%r8009,%r8010,%r8011,%r8012,%r8013,%r8014,%r8015,%r8016,%r8017,%r8018,%r8019,%r8020,%r8021,%r8022,%r8023,%r8024,%r8025}, %rd728, %rd729, %p619, 1, 1, 0, 0; + // end inline asm + or.b32 %r10219, %r10190, 16448; + add.s32 %r10220, %r10219, %r8410; + bfe.u32 %r10221, %r10220, 4, 14; + cvt.u64.u32 %rd816, %r10221; + or.b64 %rd730, %rd816, 4611686293372403712; + add.s32 %r10222, %r8451, 8256; + bfe.u32 %r10223, %r10222, 4, 14; + cvt.u64.u32 %rd817, 
%r10223; + or.b64 %rd731, %rd817, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r7994,%r7995,%r7996,%r7997,%r7998,%r7999,%r8000,%r8001,%r8002,%r8003,%r8004,%r8005,%r8006,%r8007,%r8008,%r8009,%r8010,%r8011,%r8012,%r8013,%r8014,%r8015,%r8016,%r8017,%r8018,%r8019,%r8020,%r8021,%r8022,%r8023,%r8024,%r8025}, %rd730, %rd731, %p619, 1, 1, 0, 0; + // end inline asm + or.b32 %r10224, %r10190, 16480; + add.s32 %r10225, %r10224, %r8410; + bfe.u32 %r10226, %r10225, 4, 14; + cvt.u64.u32 %rd818, %r10226; + or.b64 %rd732, %rd818, 4611686293372403712; + add.s32 %r10227, %r8451, 8288; + bfe.u32 %r10228, %r10227, 4, 14; + cvt.u64.u32 %rd819, %r10228; + or.b64 %rd733, %rd819, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r7994,%r7995,%r7996,%r7997,%r7998,%r7999,%r8000,%r8001,%r8002,%r8003,%r8004,%r8005,%r8006,%r8007,%r8008,%r8009,%r8010,%r8011,%r8012,%r8013,%r8014,%r8015,%r8016,%r8017,%r8018,%r8019,%r8020,%r8021,%r8022,%r8023,%r8024,%r8025}, %rd732, %rd733, %p619, 1, 1, 0, 0; + // end inline asm + wgmma.commit_group.sync.aligned; + mov.b32 %r8413, %r8451; + mov.b32 %r8411, %r8449; + mov.b32 %r8412, %r8449; + mov.b32 %r8414, %r8449; + mov.b32 %r8415, %r8449; + // begin inline asm + // wait for regs: %r7994,%r7995,%r7996,%r7997,%r7998,%r7999,%r8000,%r8001,%r8002,%r8003,%r8004,%r8005,%r8006,%r8007,%r8008,%r8009,%r8010,%r8011,%r8012,%r8013,%r8014,%r8015,%r8016,%r8017,%r8018,%r8019,%r8020,%r8021,%r8022,%r8023,%r8024,%r8025,%r8410,%r8411,%r8412,%r8413,%r8414,%r8415 + wgmma.wait_group.sync.aligned 0; + // end inline asm + .loc 1 678 15 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:678:15 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + mul.f32 %r10229, %r7994, 0f3DB504F3; + mul.f32 %r10230, %r7995, 0f3DB504F3; + mul.f32 %r10231, %r7996, 0f3DB504F3; + mul.f32 %r10232, %r7997, 0f3DB504F3; + mul.f32 %r10233, %r7998, 0f3DB504F3; + mul.f32 %r10234, %r7999, 
0f3DB504F3; + mul.f32 %r10235, %r8000, 0f3DB504F3; + mul.f32 %r10236, %r8001, 0f3DB504F3; + mul.f32 %r10237, %r8002, 0f3DB504F3; + mul.f32 %r10238, %r8003, 0f3DB504F3; + mul.f32 %r10239, %r8004, 0f3DB504F3; + mul.f32 %r10240, %r8005, 0f3DB504F3; + mul.f32 %r10241, %r8006, 0f3DB504F3; + mul.f32 %r10242, %r8007, 0f3DB504F3; + mul.f32 %r10243, %r8008, 0f3DB504F3; + mul.f32 %r10244, %r8009, 0f3DB504F3; + mul.f32 %r10245, %r8010, 0f3DB504F3; + mul.f32 %r10246, %r8011, 0f3DB504F3; + mul.f32 %r10247, %r8012, 0f3DB504F3; + mul.f32 %r10248, %r8013, 0f3DB504F3; + mul.f32 %r10249, %r8014, 0f3DB504F3; + mul.f32 %r10250, %r8015, 0f3DB504F3; + mul.f32 %r10251, %r8016, 0f3DB504F3; + mul.f32 %r10252, %r8017, 0f3DB504F3; + mul.f32 %r10253, %r8018, 0f3DB504F3; + mul.f32 %r10254, %r8019, 0f3DB504F3; + mul.f32 %r10255, %r8020, 0f3DB504F3; + mul.f32 %r10256, %r8021, 0f3DB504F3; + mul.f32 %r10257, %r8022, 0f3DB504F3; + mul.f32 %r10258, %r8023, 0f3DB504F3; + mul.f32 %r10259, %r8024, 0f3DB504F3; + mul.f32 %r10260, %r8025, 0f3DB504F3; + .loc 1 798 21 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:798:21 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + rem.s32 %r10261, %r14881, %r2358; + rem.s32 %r10262, %r14882, %r2358; + .loc 1 698 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:698:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + setp.le.s32 %p710, %r980, %r10261; + setp.le.s32 %p711, %r980, %r10262; + setp.le.s32 %p712, %r725, %r10261; + setp.le.s32 %p713, %r725, %r10262; + .loc 1 703 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:703:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + cvt.s64.s32 %rd820, %r10261; + cvt.s64.s32 %rd821, %r10262; + .loc 1 704 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:704:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + setp.gt.s64 %p714, %rd561, %rd820; + setp.gt.s64 %p715, %rd561, %rd821; + .loc 1 706 24 
// c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:706:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + and.pred %p716, %p710, %p714; + and.pred %p717, %p711, %p715; + and.pred %p718, %p712, %p714; + and.pred %p719, %p713, %p715; + .loc 1 722 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:722:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + sub.s32 %r10263, %r980, %r10261; + sub.s32 %r10264, %r980, %r10262; + sub.s32 %r10265, %r725, %r10261; + sub.s32 %r10266, %r725, %r10262; + .loc 1 723 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:723:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + rem.s32 %r10267, %r10263, %r2366; + rem.s32 %r10268, %r10264, %r2366; + rem.s32 %r10269, %r10265, %r2366; + rem.s32 %r10270, %r10266, %r2366; + .loc 1 724 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:724:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + setp.ne.b32 %p720, %r10267, 0; + setp.ne.b32 %p721, %r10268, 0; + setp.ne.b32 %p722, %r10269, 0; + setp.ne.b32 %p723, %r10270, 0; + .loc 1 798 21 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:798:21 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + rem.s32 %r10271, %r15070, %r2358; + rem.s32 %r10272, %r15069, %r2358; + rem.s32 %r10273, %r15068, %r2358; + rem.s32 %r10274, %r15067, %r2358; + .loc 1 698 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:698:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + setp.ge.s32 %p724, %r10271, %r980; + setp.ge.s32 %p725, %r10272, %r980; + setp.ge.s32 %p726, %r10271, %r725; + setp.ge.s32 %p727, %r10272, %r725; + setp.ge.s32 %p728, %r10273, %r980; + setp.ge.s32 %p729, %r10274, %r980; + setp.ge.s32 %p730, %r10273, %r725; + setp.ge.s32 %p731, %r10274, %r725; + .loc 1 703 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:703:25 @[ 
c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + cvt.s64.s32 %rd822, %r10271; + cvt.s64.s32 %rd823, %r10272; + cvt.s64.s32 %rd824, %r10273; + cvt.s64.s32 %rd825, %r10274; + .loc 1 704 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:704:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + setp.gt.s64 %p732, %rd561, %rd822; + setp.gt.s64 %p733, %rd561, %rd823; + setp.gt.s64 %p734, %rd561, %rd824; + setp.gt.s64 %p735, %rd561, %rd825; + .loc 1 706 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:706:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + and.pred %p736, %p724, %p732; + and.pred %p737, %p725, %p733; + and.pred %p738, %p726, %p732; + and.pred %p739, %p727, %p733; + and.pred %p740, %p728, %p734; + and.pred %p741, %p729, %p735; + and.pred %p742, %p730, %p734; + and.pred %p743, %p731, %p735; + .loc 1 722 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:722:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + sub.s32 %r10279, %r725, %r10274; + sub.s32 %r10280, %r725, %r10273; + sub.s32 %r10281, %r980, %r10274; + sub.s32 %r10282, %r980, %r10273; + sub.s32 %r10283, %r725, %r10272; + sub.s32 %r10284, %r725, %r10271; + sub.s32 %r10285, %r980, %r10272; + sub.s32 %r10286, %r980, %r10271; + .loc 1 723 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:723:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + rem.s32 %r10287, %r10286, %r2366; + rem.s32 %r10288, %r10285, %r2366; + rem.s32 %r10289, %r10284, %r2366; + rem.s32 %r10290, %r10283, %r2366; + rem.s32 %r10291, %r10282, %r2366; + rem.s32 %r10292, %r10281, %r2366; + rem.s32 %r10293, %r10280, %r2366; + rem.s32 %r10294, %r10279, %r2366; + .loc 1 724 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:724:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + setp.ne.b32 %p744, %r10294, 0; + setp.ne.b32 %p745, %r10293, 0; + setp.ne.b32 %p746, 
%r10292, 0; + setp.ne.b32 %p747, %r10291, 0; + setp.ne.b32 %p748, %r10290, 0; + setp.ne.b32 %p749, %r10289, 0; + setp.ne.b32 %p750, %r10288, 0; + setp.ne.b32 %p751, %r10287, 0; + .loc 1 726 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:726:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + xor.b32 %r10295, %r10267, %r2366; + xor.b32 %r10296, %r10268, %r2366; + xor.b32 %r10297, %r10269, %r2366; + xor.b32 %r10298, %r10293, %r2366; + xor.b32 %r10299, %r10292, %r2366; + xor.b32 %r10300, %r10291, %r2366; + xor.b32 %r10301, %r10290, %r2366; + xor.b32 %r10302, %r10289, %r2366; + xor.b32 %r10303, %r10288, %r2366; + xor.b32 %r10304, %r10287, %r2366; + xor.b32 %r10305, %r10270, %r2366; + xor.b32 %r10306, %r10294, %r2366; + .loc 1 729 39 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:729:39 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + shr.s32 %r10307, %r10295, 31; + and.b32 %r10308, %r10307, %r2366; + selp.b32 %r10309, %r10308, 0, %p720; + shr.s32 %r10310, %r10296, 31; + and.b32 %r10311, %r10310, %r2366; + selp.b32 %r10312, %r10311, 0, %p721; + shr.s32 %r10313, %r10297, 31; + and.b32 %r10314, %r10313, %r2366; + selp.b32 %r10315, %r10314, 0, %p722; + shr.s32 %r10316, %r10305, 31; + and.b32 %r10317, %r10316, %r2366; + selp.b32 %r10318, %r10317, 0, %p723; + shr.s32 %r10319, %r10304, 31; + and.b32 %r10320, %r10319, %r2366; + selp.b32 %r10321, %r10320, 0, %p751; + shr.s32 %r10322, %r10303, 31; + and.b32 %r10323, %r10322, %r2366; + selp.b32 %r10324, %r10323, 0, %p750; + shr.s32 %r10325, %r10302, 31; + and.b32 %r10326, %r10325, %r2366; + selp.b32 %r10327, %r10326, 0, %p749; + shr.s32 %r10328, %r10301, 31; + and.b32 %r10329, %r10328, %r2366; + selp.b32 %r10330, %r10329, 0, %p748; + shr.s32 %r10331, %r10300, 31; + and.b32 %r10332, %r10331, %r2366; + selp.b32 %r10333, %r10332, 0, %p747; + shr.s32 %r10334, %r10299, 31; + and.b32 %r10335, %r10334, %r2366; + selp.b32 %r10336, %r10335, 0, %p746; + shr.s32 %r10337, 
%r10298, 31; + and.b32 %r10338, %r10337, %r2366; + selp.b32 %r10339, %r10338, 0, %p745; + shr.s32 %r10340, %r10306, 31; + and.b32 %r10341, %r10340, %r2366; + selp.b32 %r10342, %r10341, 0, %p744; + .loc 1 730 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:730:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + neg.s32 %r10343, %r10309; + setp.eq.b32 %p752, %r10267, %r10343; + neg.s32 %r10344, %r10312; + setp.eq.b32 %p753, %r10268, %r10344; + neg.s32 %r10345, %r10315; + setp.eq.b32 %p754, %r10269, %r10345; + neg.s32 %r10346, %r10318; + setp.eq.b32 %p755, %r10270, %r10346; + neg.s32 %r10347, %r10321; + setp.eq.b32 %p756, %r10287, %r10347; + neg.s32 %r10348, %r10324; + setp.eq.b32 %p757, %r10288, %r10348; + neg.s32 %r10349, %r10327; + setp.eq.b32 %p758, %r10289, %r10349; + neg.s32 %r10350, %r10330; + setp.eq.b32 %p759, %r10290, %r10350; + neg.s32 %r10351, %r10333; + setp.eq.b32 %p760, %r10291, %r10351; + neg.s32 %r10352, %r10336; + setp.eq.b32 %p761, %r10292, %r10352; + neg.s32 %r10353, %r10339; + setp.eq.b32 %p762, %r10293, %r10353; + neg.s32 %r10354, %r10342; + setp.eq.b32 %p763, %r10294, %r10354; + .loc 1 798 21 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:798:21 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + rem.s32 %r10355, %r15059, %r2358; + rem.s32 %r10356, %r15060, %r2358; + rem.s32 %r10357, %r15061, %r2358; + rem.s32 %r10358, %r15062, %r2358; + rem.s32 %r10359, %r15063, %r2358; + rem.s32 %r10360, %r15064, %r2358; + rem.s32 %r10361, %r15065, %r2358; + rem.s32 %r10362, %r15066, %r2358; + .loc 1 698 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:698:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + setp.ge.s32 %p764, %r10362, %r980; + setp.ge.s32 %p765, %r10361, %r980; + setp.ge.s32 %p766, %r10362, %r725; + setp.ge.s32 %p767, %r10361, %r725; + setp.ge.s32 %p768, %r10360, %r980; + setp.ge.s32 %p769, %r10359, %r980; + setp.ge.s32 %p770, %r10360, %r725; + 
setp.ge.s32 %p771, %r10359, %r725; + setp.ge.s32 %p772, %r10358, %r980; + setp.ge.s32 %p773, %r10357, %r980; + setp.ge.s32 %p774, %r10358, %r725; + setp.ge.s32 %p775, %r10357, %r725; + setp.ge.s32 %p776, %r10356, %r980; + setp.ge.s32 %p777, %r10355, %r980; + setp.ge.s32 %p778, %r10356, %r725; + setp.ge.s32 %p779, %r10355, %r725; + .loc 1 704 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:704:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + cvt.s64.s32 %rd826, %r10362; + cvt.s64.s32 %rd827, %r10361; + cvt.s64.s32 %rd828, %r10360; + cvt.s64.s32 %rd829, %r10359; + cvt.s64.s32 %rd830, %r10358; + cvt.s64.s32 %rd831, %r10357; + cvt.s64.s32 %rd832, %r10356; + cvt.s64.s32 %rd833, %r10355; + setp.gt.s64 %p780, %rd561, %rd833; + setp.gt.s64 %p781, %rd561, %rd832; + setp.gt.s64 %p782, %rd561, %rd831; + setp.gt.s64 %p783, %rd561, %rd830; + setp.gt.s64 %p784, %rd561, %rd829; + setp.gt.s64 %p785, %rd561, %rd828; + setp.gt.s64 %p786, %rd561, %rd827; + setp.gt.s64 %p787, %rd561, %rd826; + .loc 1 706 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:706:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + and.pred %p788, %p764, %p787; + and.pred %p789, %p765, %p786; + and.pred %p790, %p766, %p787; + and.pred %p791, %p767, %p786; + and.pred %p792, %p768, %p785; + and.pred %p793, %p769, %p784; + and.pred %p794, %p770, %p785; + and.pred %p795, %p771, %p784; + and.pred %p796, %p772, %p783; + and.pred %p797, %p773, %p782; + and.pred %p798, %p774, %p783; + and.pred %p799, %p775, %p782; + and.pred %p800, %p776, %p781; + and.pred %p801, %p777, %p780; + and.pred %p802, %p778, %p781; + and.pred %p803, %p779, %p780; + .loc 1 722 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:722:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + sub.s32 %r10363, %r980, %r10362; + sub.s32 %r10364, %r980, %r10361; + sub.s32 %r10365, %r725, %r10362; + sub.s32 %r10366, %r725, %r10361; + sub.s32 %r10367, 
%r980, %r10360; + sub.s32 %r10368, %r980, %r10359; + sub.s32 %r10369, %r725, %r10360; + sub.s32 %r10370, %r725, %r10359; + sub.s32 %r10371, %r980, %r10358; + sub.s32 %r10372, %r980, %r10357; + sub.s32 %r10373, %r725, %r10358; + sub.s32 %r10374, %r725, %r10357; + sub.s32 %r10375, %r980, %r10356; + sub.s32 %r10376, %r980, %r10355; + sub.s32 %r10377, %r725, %r10356; + sub.s32 %r10378, %r725, %r10355; + .loc 1 723 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:723:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + rem.s32 %r10379, %r10378, %r2366; + rem.s32 %r10380, %r10377, %r2366; + rem.s32 %r10381, %r10376, %r2366; + rem.s32 %r10382, %r10375, %r2366; + rem.s32 %r10383, %r10374, %r2366; + rem.s32 %r10384, %r10373, %r2366; + rem.s32 %r10385, %r10372, %r2366; + rem.s32 %r10386, %r10371, %r2366; + rem.s32 %r10387, %r10370, %r2366; + rem.s32 %r10388, %r10369, %r2366; + rem.s32 %r10389, %r10368, %r2366; + rem.s32 %r10390, %r10367, %r2366; + rem.s32 %r10391, %r10366, %r2366; + rem.s32 %r10392, %r10365, %r2366; + rem.s32 %r10393, %r10364, %r2366; + rem.s32 %r10394, %r10363, %r2366; + .loc 1 724 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:724:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + setp.ne.b32 %p804, %r10394, 0; + setp.ne.b32 %p805, %r10393, 0; + setp.ne.b32 %p806, %r10392, 0; + setp.ne.b32 %p807, %r10391, 0; + setp.ne.b32 %p808, %r10390, 0; + setp.ne.b32 %p809, %r10389, 0; + setp.ne.b32 %p810, %r10388, 0; + setp.ne.b32 %p811, %r10387, 0; + setp.ne.b32 %p812, %r10386, 0; + setp.ne.b32 %p813, %r10385, 0; + setp.ne.b32 %p814, %r10384, 0; + setp.ne.b32 %p815, %r10383, 0; + setp.ne.b32 %p816, %r10382, 0; + setp.ne.b32 %p817, %r10381, 0; + setp.ne.b32 %p818, %r10380, 0; + setp.ne.b32 %p819, %r10379, 0; + .loc 1 726 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:726:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + xor.b32 %r10395, %r10394, %r2366; + 
xor.b32 %r10396, %r10393, %r2366; + xor.b32 %r10397, %r10392, %r2366; + xor.b32 %r10398, %r10391, %r2366; + xor.b32 %r10399, %r10390, %r2366; + xor.b32 %r10400, %r10389, %r2366; + xor.b32 %r10401, %r10388, %r2366; + xor.b32 %r10402, %r10387, %r2366; + xor.b32 %r10403, %r10386, %r2366; + xor.b32 %r10404, %r10385, %r2366; + xor.b32 %r10405, %r10384, %r2366; + xor.b32 %r10406, %r10383, %r2366; + xor.b32 %r10407, %r10382, %r2366; + xor.b32 %r10408, %r10381, %r2366; + xor.b32 %r10409, %r10380, %r2366; + xor.b32 %r10410, %r10379, %r2366; + .loc 1 729 39 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:729:39 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + shr.s32 %r10411, %r10410, 31; + and.b32 %r10412, %r10411, %r2366; + selp.b32 %r10413, %r10412, 0, %p819; + shr.s32 %r10414, %r10409, 31; + and.b32 %r10415, %r10414, %r2366; + selp.b32 %r10416, %r10415, 0, %p818; + shr.s32 %r10417, %r10408, 31; + and.b32 %r10418, %r10417, %r2366; + selp.b32 %r10419, %r10418, 0, %p817; + shr.s32 %r10420, %r10407, 31; + and.b32 %r10421, %r10420, %r2366; + selp.b32 %r10422, %r10421, 0, %p816; + shr.s32 %r10423, %r10406, 31; + and.b32 %r10424, %r10423, %r2366; + selp.b32 %r10425, %r10424, 0, %p815; + shr.s32 %r10426, %r10405, 31; + and.b32 %r10427, %r10426, %r2366; + selp.b32 %r10428, %r10427, 0, %p814; + shr.s32 %r10429, %r10404, 31; + and.b32 %r10430, %r10429, %r2366; + selp.b32 %r10431, %r10430, 0, %p813; + shr.s32 %r10432, %r10403, 31; + and.b32 %r10433, %r10432, %r2366; + selp.b32 %r10434, %r10433, 0, %p812; + shr.s32 %r10435, %r10402, 31; + and.b32 %r10436, %r10435, %r2366; + selp.b32 %r10437, %r10436, 0, %p811; + shr.s32 %r10438, %r10401, 31; + and.b32 %r10439, %r10438, %r2366; + selp.b32 %r10440, %r10439, 0, %p810; + shr.s32 %r10441, %r10400, 31; + and.b32 %r10442, %r10441, %r2366; + selp.b32 %r10443, %r10442, 0, %p809; + shr.s32 %r10444, %r10399, 31; + and.b32 %r10445, %r10444, %r2366; + selp.b32 %r10446, %r10445, 0, %p808; + shr.s32 %r10447, %r10398, 
31; + and.b32 %r10448, %r10447, %r2366; + selp.b32 %r10449, %r10448, 0, %p807; + shr.s32 %r10450, %r10397, 31; + and.b32 %r10451, %r10450, %r2366; + selp.b32 %r10452, %r10451, 0, %p806; + shr.s32 %r10453, %r10396, 31; + and.b32 %r10454, %r10453, %r2366; + selp.b32 %r10455, %r10454, 0, %p805; + shr.s32 %r10456, %r10395, 31; + and.b32 %r10457, %r10456, %r2366; + selp.b32 %r10458, %r10457, 0, %p804; + .loc 1 730 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:730:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + neg.s32 %r10459, %r10458; + neg.s32 %r10460, %r10455; + neg.s32 %r10461, %r10452; + neg.s32 %r10462, %r10449; + neg.s32 %r10463, %r10446; + neg.s32 %r10464, %r10443; + neg.s32 %r10465, %r10440; + neg.s32 %r10466, %r10437; + neg.s32 %r10467, %r10434; + neg.s32 %r10468, %r10431; + neg.s32 %r10469, %r10428; + neg.s32 %r10470, %r10425; + neg.s32 %r10471, %r10422; + neg.s32 %r10472, %r10419; + neg.s32 %r10473, %r10416; + neg.s32 %r10474, %r10413; + setp.eq.b32 %p820, %r10379, %r10474; + setp.eq.b32 %p821, %r10380, %r10473; + setp.eq.b32 %p822, %r10381, %r10472; + setp.eq.b32 %p823, %r10382, %r10471; + setp.eq.b32 %p824, %r10383, %r10470; + setp.eq.b32 %p825, %r10384, %r10469; + setp.eq.b32 %p826, %r10385, %r10468; + setp.eq.b32 %p827, %r10386, %r10467; + setp.eq.b32 %p828, %r10387, %r10466; + setp.eq.b32 %p829, %r10388, %r10465; + setp.eq.b32 %p830, %r10389, %r10464; + setp.eq.b32 %p831, %r10390, %r10463; + setp.eq.b32 %p832, %r10391, %r10462; + setp.eq.b32 %p833, %r10392, %r10461; + setp.eq.b32 %p834, %r10393, %r10460; + setp.eq.b32 %p835, %r10394, %r10459; + .loc 1 731 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:731:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + and.pred %p836, %p9, %p752; + and.pred %p837, %p9, %p753; + and.pred %p838, %p7, %p754; + and.pred %p839, %p7, %p755; + and.pred %p840, %p9, %p756; + and.pred %p841, %p9, %p757; + and.pred %p842, %p7, %p758; + and.pred %p843, 
%p7, %p759; + and.pred %p844, %p9, %p760; + and.pred %p845, %p9, %p761; + and.pred %p846, %p7, %p762; + and.pred %p847, %p7, %p763; + and.pred %p848, %p9, %p835; + and.pred %p849, %p9, %p834; + and.pred %p850, %p7, %p833; + and.pred %p851, %p7, %p832; + and.pred %p852, %p9, %p831; + and.pred %p853, %p9, %p830; + and.pred %p854, %p7, %p829; + and.pred %p855, %p7, %p828; + and.pred %p856, %p9, %p827; + and.pred %p857, %p9, %p826; + and.pred %p858, %p7, %p825; + and.pred %p859, %p7, %p824; + and.pred %p860, %p9, %p823; + and.pred %p861, %p9, %p822; + and.pred %p862, %p7, %p821; + and.pred %p863, %p7, %p820; + .loc 1 732 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:732:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + or.pred %p864, %p716, %p836; + or.pred %p865, %p717, %p837; + or.pred %p866, %p718, %p838; + or.pred %p867, %p719, %p839; + or.pred %p868, %p736, %p840; + or.pred %p869, %p737, %p841; + or.pred %p870, %p738, %p842; + or.pred %p871, %p739, %p843; + or.pred %p872, %p740, %p844; + or.pred %p873, %p741, %p845; + or.pred %p874, %p742, %p846; + or.pred %p875, %p743, %p847; + or.pred %p876, %p788, %p848; + or.pred %p877, %p789, %p849; + or.pred %p878, %p790, %p850; + or.pred %p879, %p791, %p851; + or.pred %p880, %p792, %p852; + or.pred %p881, %p793, %p853; + or.pred %p882, %p794, %p854; + or.pred %p883, %p795, %p855; + or.pred %p884, %p796, %p856; + or.pred %p885, %p797, %p857; + or.pred %p886, %p798, %p858; + or.pred %p887, %p799, %p859; + or.pred %p888, %p800, %p860; + or.pred %p889, %p801, %p861; + or.pred %p890, %p802, %p862; + or.pred %p891, %p803, %p863; + .loc 1 798 21 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:798:21 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + rem.s32 %r10475, %r15054, %r2358; + rem.s32 %r10476, %r15053, %r2358; + .loc 1 698 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:698:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 
] + setp.le.s32 %p892, %r725, %r10476; + setp.le.s32 %p893, %r725, %r10475; + .loc 1 704 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:704:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + cvt.s64.s32 %rd834, %r10475; + cvt.s64.s32 %rd835, %r10476; + setp.gt.s64 %p894, %rd561, %rd835; + setp.gt.s64 %p895, %rd561, %rd834; + .loc 1 706 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:706:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + and.pred %p896, %p893, %p895; + and.pred %p897, %p892, %p894; + .loc 1 722 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:722:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + sub.s32 %r10477, %r725, %r10475; + sub.s32 %r10478, %r725, %r10476; + .loc 1 723 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:723:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + rem.s32 %r10479, %r10478, %r2366; + rem.s32 %r10480, %r10477, %r2366; + .loc 1 724 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:724:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + setp.ne.b32 %p898, %r10480, 0; + setp.ne.b32 %p899, %r10479, 0; + .loc 1 726 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:726:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + xor.b32 %r10481, %r10480, %r2366; + xor.b32 %r10482, %r10479, %r2366; + .loc 1 729 39 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:729:39 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + shr.s32 %r10483, %r10482, 31; + and.b32 %r10484, %r10483, %r2366; + selp.b32 %r10485, %r10484, 0, %p899; + shr.s32 %r10486, %r10481, 31; + and.b32 %r10487, %r10486, %r2366; + selp.b32 %r10488, %r10487, 0, %p898; + .loc 1 730 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:730:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + neg.s32 %r10489, %r10488; + 
neg.s32 %r10490, %r10485; + setp.eq.b32 %p900, %r10479, %r10490; + setp.eq.b32 %p901, %r10480, %r10489; + .loc 1 731 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:731:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + and.pred %p902, %p7, %p901; + and.pred %p903, %p7, %p900; + .loc 1 732 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:732:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + or.pred %p904, %p897, %p903; + or.pred %p905, %p896, %p902; + .loc 1 698 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:698:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + setp.le.s32 %p906, %r980, %r10476; + setp.le.s32 %p907, %r980, %r10475; + .loc 1 706 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:706:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + and.pred %p908, %p907, %p895; + and.pred %p909, %p906, %p894; + .loc 1 722 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:722:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + sub.s32 %r10491, %r980, %r10475; + sub.s32 %r10492, %r980, %r10476; + .loc 1 723 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:723:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + rem.s32 %r10493, %r10492, %r2366; + rem.s32 %r10494, %r10491, %r2366; + .loc 1 724 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:724:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + setp.ne.b32 %p910, %r10494, 0; + setp.ne.b32 %p911, %r10493, 0; + .loc 1 726 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:726:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + xor.b32 %r10495, %r10494, %r2366; + xor.b32 %r10496, %r10493, %r2366; + .loc 1 729 39 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:729:39 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + 
shr.s32 %r10497, %r10496, 31; + and.b32 %r10498, %r10497, %r2366; + selp.b32 %r10499, %r10498, 0, %p911; + shr.s32 %r10500, %r10495, 31; + and.b32 %r10501, %r10500, %r2366; + selp.b32 %r10502, %r10501, 0, %p910; + .loc 1 730 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:730:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + neg.s32 %r10503, %r10502; + neg.s32 %r10504, %r10499; + setp.eq.b32 %p912, %r10493, %r10504; + setp.eq.b32 %p913, %r10494, %r10503; + .loc 1 731 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:731:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + and.pred %p914, %p9, %p913; + and.pred %p915, %p9, %p912; + .loc 1 732 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:732:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + or.pred %p916, %p909, %p915; + or.pred %p917, %p908, %p914; + .loc 1 736 69 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:736:69 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + and.pred %p918, %p917, %p679; + and.pred %p919, %p916, %p678; + and.pred %p920, %p905, %p679; + and.pred %p921, %p904, %p678; + and.pred %p922, %p864, %p680; + and.pred %p923, %p865, %p681; + and.pred %p924, %p866, %p680; + and.pred %p925, %p867, %p681; + and.pred %p926, %p868, %p682; + and.pred %p927, %p869, %p683; + and.pred %p928, %p870, %p682; + and.pred %p929, %p871, %p683; + and.pred %p930, %p872, %p684; + and.pred %p931, %p873, %p685; + and.pred %p932, %p874, %p684; + and.pred %p933, %p875, %p685; + and.pred %p934, %p876, %p686; + and.pred %p935, %p877, %p687; + and.pred %p936, %p878, %p686; + and.pred %p937, %p879, %p687; + and.pred %p938, %p880, %p688; + and.pred %p939, %p881, %p689; + and.pred %p940, %p882, %p688; + and.pred %p941, %p883, %p689; + and.pred %p942, %p884, %p690; + and.pred %p943, %p885, %p691; + and.pred %p944, %p886, %p690; + and.pred %p945, %p887, %p691; + and.pred %p946, %p888, %p692; + 
and.pred %p947, %p889, %p693; + and.pred %p948, %p890, %p692; + and.pred %p949, %p891, %p693; + .loc 1 739 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:739:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + mul.f32 %r10505, %r10229, 0f3FB8AA3B; + .loc 1 736 69 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:736:69 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + selp.f32 %r10506, %r10505, 0fFF800000, %p919; + .loc 1 739 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:739:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + mul.f32 %r10507, %r10230, 0f3FB8AA3B; + .loc 1 736 69 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:736:69 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + selp.f32 %r10508, %r10507, 0fFF800000, %p918; + .loc 1 739 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:739:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + mul.f32 %r10509, %r10231, 0f3FB8AA3B; + .loc 1 736 69 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:736:69 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + selp.f32 %r10510, %r10509, 0fFF800000, %p921; + .loc 1 739 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:739:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + mul.f32 %r10511, %r10232, 0f3FB8AA3B; + .loc 1 736 69 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:736:69 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + selp.f32 %r10512, %r10511, 0fFF800000, %p920; + .loc 1 739 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:739:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + mul.f32 %r10513, %r10233, 0f3FB8AA3B; + .loc 1 736 69 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:736:69 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + selp.f32 
%r10514, %r10513, 0fFF800000, %p922; + .loc 1 739 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:739:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + mul.f32 %r10515, %r10234, 0f3FB8AA3B; + .loc 1 736 69 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:736:69 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + selp.f32 %r10516, %r10515, 0fFF800000, %p923; + .loc 1 739 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:739:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + mul.f32 %r10517, %r10235, 0f3FB8AA3B; + .loc 1 736 69 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:736:69 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + selp.f32 %r10518, %r10517, 0fFF800000, %p924; + .loc 1 739 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:739:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + mul.f32 %r10519, %r10236, 0f3FB8AA3B; + .loc 1 736 69 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:736:69 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + selp.f32 %r10520, %r10519, 0fFF800000, %p925; + .loc 1 739 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:739:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + mul.f32 %r10521, %r10237, 0f3FB8AA3B; + .loc 1 736 69 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:736:69 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + selp.f32 %r10522, %r10521, 0fFF800000, %p926; + .loc 1 739 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:739:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + mul.f32 %r10523, %r10238, 0f3FB8AA3B; + .loc 1 736 69 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:736:69 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + selp.f32 %r10524, %r10523, 0fFF800000, %p927; + .loc 1 739 27 // 
c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:739:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + mul.f32 %r10525, %r10239, 0f3FB8AA3B; + .loc 1 736 69 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:736:69 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + selp.f32 %r10526, %r10525, 0fFF800000, %p928; + .loc 1 739 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:739:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + mul.f32 %r10527, %r10240, 0f3FB8AA3B; + .loc 1 736 69 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:736:69 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + selp.f32 %r10528, %r10527, 0fFF800000, %p929; + .loc 1 739 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:739:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + mul.f32 %r10529, %r10241, 0f3FB8AA3B; + .loc 1 736 69 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:736:69 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + selp.f32 %r10530, %r10529, 0fFF800000, %p930; + .loc 1 739 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:739:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + mul.f32 %r10531, %r10242, 0f3FB8AA3B; + .loc 1 736 69 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:736:69 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + selp.f32 %r10532, %r10531, 0fFF800000, %p931; + .loc 1 739 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:739:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + mul.f32 %r10533, %r10243, 0f3FB8AA3B; + .loc 1 736 69 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:736:69 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + selp.f32 %r10534, %r10533, 0fFF800000, %p932; + .loc 1 739 27 // 
c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:739:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + mul.f32 %r10535, %r10244, 0f3FB8AA3B; + .loc 1 736 69 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:736:69 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + selp.f32 %r10536, %r10535, 0fFF800000, %p933; + .loc 1 739 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:739:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + mul.f32 %r10537, %r10245, 0f3FB8AA3B; + .loc 1 736 69 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:736:69 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + selp.f32 %r10538, %r10537, 0fFF800000, %p934; + .loc 1 739 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:739:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + mul.f32 %r10539, %r10246, 0f3FB8AA3B; + .loc 1 736 69 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:736:69 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + selp.f32 %r10540, %r10539, 0fFF800000, %p935; + .loc 1 739 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:739:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + mul.f32 %r10541, %r10247, 0f3FB8AA3B; + .loc 1 736 69 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:736:69 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + selp.f32 %r10542, %r10541, 0fFF800000, %p936; + .loc 1 739 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:739:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + mul.f32 %r10543, %r10248, 0f3FB8AA3B; + .loc 1 736 69 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:736:69 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + selp.f32 %r10544, %r10543, 0fFF800000, %p937; + .loc 1 739 27 // 
c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:739:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + mul.f32 %r10545, %r10249, 0f3FB8AA3B; + .loc 1 736 69 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:736:69 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + selp.f32 %r10546, %r10545, 0fFF800000, %p938; + .loc 1 739 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:739:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + mul.f32 %r10547, %r10250, 0f3FB8AA3B; + .loc 1 736 69 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:736:69 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + selp.f32 %r10548, %r10547, 0fFF800000, %p939; + .loc 1 739 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:739:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + mul.f32 %r10549, %r10251, 0f3FB8AA3B; + .loc 1 736 69 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:736:69 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + selp.f32 %r10550, %r10549, 0fFF800000, %p940; + .loc 1 739 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:739:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + mul.f32 %r10551, %r10252, 0f3FB8AA3B; + .loc 1 736 69 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:736:69 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + selp.f32 %r10552, %r10551, 0fFF800000, %p941; + .loc 1 739 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:739:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + mul.f32 %r10553, %r10253, 0f3FB8AA3B; + .loc 1 736 69 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:736:69 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + selp.f32 %r10554, %r10553, 0fFF800000, %p942; + .loc 1 739 27 // 
c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:739:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + mul.f32 %r10555, %r10254, 0f3FB8AA3B; + .loc 1 736 69 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:736:69 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + selp.f32 %r10556, %r10555, 0fFF800000, %p943; + .loc 1 739 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:739:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + mul.f32 %r10557, %r10255, 0f3FB8AA3B; + .loc 1 736 69 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:736:69 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + selp.f32 %r10558, %r10557, 0fFF800000, %p944; + .loc 1 739 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:739:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + mul.f32 %r10559, %r10256, 0f3FB8AA3B; + .loc 1 736 69 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:736:69 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + selp.f32 %r10560, %r10559, 0fFF800000, %p945; + .loc 1 739 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:739:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + mul.f32 %r10561, %r10257, 0f3FB8AA3B; + .loc 1 736 69 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:736:69 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + selp.f32 %r10562, %r10561, 0fFF800000, %p946; + .loc 1 739 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:739:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + mul.f32 %r10563, %r10258, 0f3FB8AA3B; + .loc 1 736 69 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:736:69 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + selp.f32 %r10564, %r10563, 0fFF800000, %p947; + .loc 1 739 27 // 
c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:739:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + mul.f32 %r10565, %r10259, 0f3FB8AA3B; + .loc 1 736 69 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:736:69 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + selp.f32 %r10566, %r10565, 0fFF800000, %p948; + .loc 1 739 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:739:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + mul.f32 %r10567, %r10260, 0f3FB8AA3B; + .loc 1 736 69 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:736:69 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + selp.f32 %r10568, %r10567, 0fFF800000, %p949; + .loc 1 740 40 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:740:40 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + sub.f32 %r10569, %r10506, %r10172; + sub.f32 %r10570, %r10508, %r10173; + sub.f32 %r10571, %r10510, %r10172; + sub.f32 %r10572, %r10512, %r10173; + sub.f32 %r10573, %r10514, %r10174; + sub.f32 %r10574, %r10516, %r10175; + sub.f32 %r10575, %r10518, %r10174; + sub.f32 %r10576, %r10520, %r10175; + sub.f32 %r10577, %r10522, %r10176; + sub.f32 %r10578, %r10524, %r10177; + sub.f32 %r10579, %r10526, %r10176; + sub.f32 %r10580, %r10528, %r10177; + sub.f32 %r10581, %r10530, %r10178; + sub.f32 %r10582, %r10532, %r10179; + sub.f32 %r10583, %r10534, %r10178; + sub.f32 %r10584, %r10536, %r10179; + sub.f32 %r10585, %r10538, %r10180; + sub.f32 %r10586, %r10540, %r10181; + sub.f32 %r10587, %r10542, %r10180; + sub.f32 %r10588, %r10544, %r10181; + sub.f32 %r10589, %r10546, %r10182; + sub.f32 %r10590, %r10548, %r10183; + sub.f32 %r10591, %r10550, %r10182; + sub.f32 %r10592, %r10552, %r10183; + sub.f32 %r10593, %r10554, %r10184; + sub.f32 %r10594, %r10556, %r10185; + sub.f32 %r10595, %r10558, %r10184; + sub.f32 %r10596, %r10560, %r10185; + sub.f32 %r10597, %r10562, %r10186; + sub.f32 %r10598, 
%r10564, %r10187; + sub.f32 %r10599, %r10566, %r10186; + sub.f32 %r10600, %r10568, %r10187; + .loc 1 740 22 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:740:22 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + ex2.approx.ftz.f32 %r10601, %r10569; + ex2.approx.ftz.f32 %r10602, %r10570; + ex2.approx.ftz.f32 %r10603, %r10571; + ex2.approx.ftz.f32 %r10604, %r10572; + ex2.approx.ftz.f32 %r10605, %r10573; + ex2.approx.ftz.f32 %r10606, %r10574; + ex2.approx.ftz.f32 %r10607, %r10575; + ex2.approx.ftz.f32 %r10608, %r10576; + ex2.approx.ftz.f32 %r10609, %r10577; + ex2.approx.ftz.f32 %r10610, %r10578; + ex2.approx.ftz.f32 %r10611, %r10579; + ex2.approx.ftz.f32 %r10612, %r10580; + ex2.approx.ftz.f32 %r10613, %r10581; + ex2.approx.ftz.f32 %r10614, %r10582; + ex2.approx.ftz.f32 %r10615, %r10583; + ex2.approx.ftz.f32 %r10616, %r10584; + ex2.approx.ftz.f32 %r10617, %r10585; + ex2.approx.ftz.f32 %r10618, %r10586; + ex2.approx.ftz.f32 %r10619, %r10587; + ex2.approx.ftz.f32 %r10620, %r10588; + ex2.approx.ftz.f32 %r10621, %r10589; + ex2.approx.ftz.f32 %r10622, %r10590; + ex2.approx.ftz.f32 %r10623, %r10591; + ex2.approx.ftz.f32 %r10624, %r10592; + ex2.approx.ftz.f32 %r10625, %r10593; + ex2.approx.ftz.f32 %r10626, %r10594; + ex2.approx.ftz.f32 %r10627, %r10595; + ex2.approx.ftz.f32 %r10628, %r10596; + ex2.approx.ftz.f32 %r10629, %r10597; + ex2.approx.ftz.f32 %r10630, %r10598; + ex2.approx.ftz.f32 %r10631, %r10599; + ex2.approx.ftz.f32 %r10632, %r10600; + .loc 1 833 23 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:833:23 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + add.s32 %r10633, %r7585, 49152; + add.s32 %r9497, %r10633, %r10150; + .loc 1 744 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:744:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + cvt.rn.bf16x2.f32 %r8582, %r10602, %r10601; + cvt.rn.bf16x2.f32 %r8583, %r10604, %r10603; + cvt.rn.bf16x2.f32 %r8584, %r10606, %r10605; + 
cvt.rn.bf16x2.f32 %r8585, %r10608, %r10607; + cvt.rn.bf16x2.f32 %r8714, %r10610, %r10609; + cvt.rn.bf16x2.f32 %r8715, %r10612, %r10611; + cvt.rn.bf16x2.f32 %r8716, %r10614, %r10613; + cvt.rn.bf16x2.f32 %r8717, %r10616, %r10615; + cvt.rn.bf16x2.f32 %r8846, %r10618, %r10617; + cvt.rn.bf16x2.f32 %r8847, %r10620, %r10619; + cvt.rn.bf16x2.f32 %r8848, %r10622, %r10621; + cvt.rn.bf16x2.f32 %r8849, %r10624, %r10623; + cvt.rn.bf16x2.f32 %r8978, %r10626, %r10625; + cvt.rn.bf16x2.f32 %r8979, %r10628, %r10627; + cvt.rn.bf16x2.f32 %r8980, %r10630, %r10629; + cvt.rn.bf16x2.f32 %r8981, %r10632, %r10631; + .loc 1 744 43 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:744:43 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + wgmma.fence.sync.aligned; + bfe.u32 %r10634, %r9497, 4, 14; + cvt.u64.u32 %rd836, %r10634; + or.b64 %rd734, %rd836, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r14924,%r14925,%r14926,%r14927,%r14928,%r14929,%r14930,%r14931,%r14932,%r14933,%r14934,%r14935,%r14936,%r14937,%r14938,%r14939,%r14940,%r14941,%r14942,%r14943,%r14944,%r14945,%r14946,%r14947,%r14948,%r14949,%r14950,%r14951,%r14952,%r14953,%r14954,%r14955,%r14956,%r14957,%r14958,%r14959,%r14960,%r14961,%r14962,%r14963,%r14964,%r14965,%r14966,%r14967,%r14968,%r14969,%r14970,%r14971,%r14972,%r14973,%r14974,%r14975,%r14976,%r14977,%r14978,%r14979,%r14980,%r14981,%r14982,%r14983,%r14984,%r14985,%r14986,%r14987}, {%r8582,%r8583,%r8584,%r8585}, %rd734, %p619, 1, 1, 1; + // end inline asm + add.s32 %r10635, %r9497, 2048; + bfe.u32 %r10636, %r10635, 4, 14; + cvt.u64.u32 %rd837, %r10636; + or.b64 %rd735, %rd837, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 
{%r14924,%r14925,%r14926,%r14927,%r14928,%r14929,%r14930,%r14931,%r14932,%r14933,%r14934,%r14935,%r14936,%r14937,%r14938,%r14939,%r14940,%r14941,%r14942,%r14943,%r14944,%r14945,%r14946,%r14947,%r14948,%r14949,%r14950,%r14951,%r14952,%r14953,%r14954,%r14955,%r14956,%r14957,%r14958,%r14959,%r14960,%r14961,%r14962,%r14963,%r14964,%r14965,%r14966,%r14967,%r14968,%r14969,%r14970,%r14971,%r14972,%r14973,%r14974,%r14975,%r14976,%r14977,%r14978,%r14979,%r14980,%r14981,%r14982,%r14983,%r14984,%r14985,%r14986,%r14987}, {%r8714,%r8715,%r8716,%r8717}, %rd735, %p619, 1, 1, 1; + // end inline asm + add.s32 %r10637, %r9497, 4096; + bfe.u32 %r10638, %r10637, 4, 14; + cvt.u64.u32 %rd838, %r10638; + or.b64 %rd736, %rd838, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r14924,%r14925,%r14926,%r14927,%r14928,%r14929,%r14930,%r14931,%r14932,%r14933,%r14934,%r14935,%r14936,%r14937,%r14938,%r14939,%r14940,%r14941,%r14942,%r14943,%r14944,%r14945,%r14946,%r14947,%r14948,%r14949,%r14950,%r14951,%r14952,%r14953,%r14954,%r14955,%r14956,%r14957,%r14958,%r14959,%r14960,%r14961,%r14962,%r14963,%r14964,%r14965,%r14966,%r14967,%r14968,%r14969,%r14970,%r14971,%r14972,%r14973,%r14974,%r14975,%r14976,%r14977,%r14978,%r14979,%r14980,%r14981,%r14982,%r14983,%r14984,%r14985,%r14986,%r14987}, {%r8846,%r8847,%r8848,%r8849}, %rd736, %p619, 1, 1, 1; + // end inline asm + add.s32 %r10639, %r9497, 6144; + bfe.u32 %r10640, %r10639, 4, 14; + cvt.u64.u32 %rd839, %r10640; + or.b64 %rd737, %rd839, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 
{%r14924,%r14925,%r14926,%r14927,%r14928,%r14929,%r14930,%r14931,%r14932,%r14933,%r14934,%r14935,%r14936,%r14937,%r14938,%r14939,%r14940,%r14941,%r14942,%r14943,%r14944,%r14945,%r14946,%r14947,%r14948,%r14949,%r14950,%r14951,%r14952,%r14953,%r14954,%r14955,%r14956,%r14957,%r14958,%r14959,%r14960,%r14961,%r14962,%r14963,%r14964,%r14965,%r14966,%r14967,%r14968,%r14969,%r14970,%r14971,%r14972,%r14973,%r14974,%r14975,%r14976,%r14977,%r14978,%r14979,%r14980,%r14981,%r14982,%r14983,%r14984,%r14985,%r14986,%r14987}, {%r8978,%r8979,%r8980,%r8981}, %rd737, %p619, 1, 1, 1; + // end inline asm + wgmma.commit_group.sync.aligned; + .loc 1 748 21 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:748:21 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + add.s32 %r10641, %r7585, 98816; + add.s32 %r10642, %r10641, %r10152; + add.s32 %r10643, %r10642, %r767; + ld.shared.v2.b32 {%r10644, %r10645}, [%r10643+32]; + ld.shared.v2.b32 {%r10646, %r10647}, [%r10643+64]; + ld.shared.v2.b32 {%r10648, %r10649}, [%r10643+96]; + ld.shared.v2.b32 {%r10650, %r10651}, [%r10643+128]; + ld.shared.v2.b32 {%r10652, %r10653}, [%r10643+160]; + ld.shared.v2.b32 {%r10654, %r10655}, [%r10643+192]; + ld.shared.v2.b32 {%r10656, %r10657}, [%r10643+224]; + .loc 1 750 20 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:750:20 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + add.s32 %r9494, %r7585, 132096; + add.s32 %r10658, %r10190, %r9494; + bfe.u32 %r10659, %r10658, 4, 14; + cvt.u64.u32 %rd840, %r10659; + or.b64 %rd738, %rd840, 4611686293372403712; + add.s32 %r10660, %r10194, %r9494; + bfe.u32 %r10661, %r10660, 4, 14; + cvt.u64.u32 %rd841, %r10661; + or.b64 %rd740, %rd841, 4611686293372403712; + add.s32 %r10662, %r9497, 32; + bfe.u32 %r10663, %r10662, 4, 14; + cvt.u64.u32 %rd842, %r10663; + or.b64 %rd741, %rd842, 4611686293338849280; + add.s32 %r10664, %r10199, %r9494; + bfe.u32 %r10665, %r10664, 4, 14; + cvt.u64.u32 %rd843, %r10665; + or.b64 %rd742, 
%rd843, 4611686293372403712; + add.s32 %r10666, %r9497, 64; + bfe.u32 %r10667, %r10666, 4, 14; + cvt.u64.u32 %rd844, %r10667; + or.b64 %rd743, %rd844, 4611686293338849280; + add.s32 %r10668, %r10204, %r9494; + bfe.u32 %r10669, %r10668, 4, 14; + cvt.u64.u32 %rd845, %r10669; + or.b64 %rd744, %rd845, 4611686293372403712; + add.s32 %r10670, %r9497, 96; + bfe.u32 %r10671, %r10670, 4, 14; + cvt.u64.u32 %rd846, %r10671; + or.b64 %rd745, %rd846, 4611686293338849280; + add.s32 %r10672, %r10209, %r9494; + bfe.u32 %r10673, %r10672, 4, 14; + cvt.u64.u32 %rd847, %r10673; + or.b64 %rd746, %rd847, 4611686293372403712; + add.s32 %r10674, %r9497, 8192; + bfe.u32 %r10675, %r10674, 4, 14; + cvt.u64.u32 %rd848, %r10675; + or.b64 %rd747, %rd848, 4611686293338849280; + add.s32 %r10676, %r10214, %r9494; + bfe.u32 %r10677, %r10676, 4, 14; + cvt.u64.u32 %rd849, %r10677; + or.b64 %rd748, %rd849, 4611686293372403712; + add.s32 %r10678, %r9497, 8224; + bfe.u32 %r10679, %r10678, 4, 14; + cvt.u64.u32 %rd850, %r10679; + or.b64 %rd749, %rd850, 4611686293338849280; + add.s32 %r10680, %r10219, %r9494; + bfe.u32 %r10681, %r10680, 4, 14; + cvt.u64.u32 %rd851, %r10681; + or.b64 %rd750, %rd851, 4611686293372403712; + add.s32 %r10682, %r9497, 8256; + bfe.u32 %r10683, %r10682, 4, 14; + cvt.u64.u32 %rd852, %r10683; + or.b64 %rd751, %rd852, 4611686293338849280; + add.s32 %r10684, %r10224, %r9494; + bfe.u32 %r10685, %r10684, 4, 14; + cvt.u64.u32 %rd853, %r10685; + or.b64 %rd752, %rd853, 4611686293372403712; + add.s32 %r10686, %r9497, 8288; + bfe.u32 %r10687, %r10686, 4, 14; + cvt.u64.u32 %rd854, %r10687; + or.b64 %rd753, %rd854, 4611686293338849280; + .loc 1 775 43 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:775:43 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + add.s32 %r10688, %r8451, 2048; + bfe.u32 %r10689, %r10688, 4, 14; + cvt.u64.u32 %rd855, %r10689; + or.b64 %rd755, %rd855, 4611686293338849280; + add.s32 %r10690, %r8451, 4096; + bfe.u32 %r10691, %r10690, 4, 14; + 
cvt.u64.u32 %rd856, %r10691; + or.b64 %rd756, %rd856, 4611686293338849280; + add.s32 %r10692, %r8451, 6144; + bfe.u32 %r10693, %r10692, 4, 14; + cvt.u64.u32 %rd857, %r10693; + or.b64 %rd757, %rd857, 4611686293338849280; + .loc 1 628 19 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:628:19 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + add.s32 %r15053, %r15053, %r14883; + add.s32 %r15054, %r15054, %r14883; + add.s32 %r14881, %r14881, %r14883; + add.s32 %r14882, %r14882, %r14883; + add.s32 %r15067, %r15067, %r14883; + add.s32 %r15068, %r15068, %r14883; + add.s32 %r15069, %r15069, %r14883; + add.s32 %r15070, %r15070, %r14883; + add.s32 %r15059, %r15059, %r14883; + add.s32 %r15060, %r15060, %r14883; + add.s32 %r15061, %r15061, %r14883; + add.s32 %r15062, %r15062, %r14883; + add.s32 %r15063, %r15063, %r14883; + add.s32 %r15064, %r15064, %r14883; + add.s32 %r15065, %r15065, %r14883; + add.s32 %r15066, %r15066, %r14883; + .loc 1 610 28 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:610:28 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + add.s32 %r1424, %r15052, 1; + .loc 1 788 33 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:788:33 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + shr.u32 %r10694, %r1424, 1; + .loc 1 789 38 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:789:38 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + mad.wide.u32 %rd759, %r10694, 4, %rd559; + .loc 1 790 109 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:790:109 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + add.s32 %r10695, %r10694, 1; + .loc 1 790 113 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:790:113 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + setp.lt.s32 %p950, %r10695, %r7561; + .loc 1 790 55 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:790:55 @[ 
c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + add.s64 %rd762, %rd759, 4; + .loc 1 610 28 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:610:28 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + and.pred %p642, %p641, %p950; + .loc 1 791 35 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:791:35 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + and.b32 %r10696, %r15052, 1; + .loc 1 793 29 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:793:29 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + xor.b32 %r10697, %r10696, 1; + .loc 1 793 61 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:793:61 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + shl.b32 %r10698, %r10696, 6; + .loc 1 748 21 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:748:21 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + ld.shared.v2.b32 {%r10699, %r10700}, [%r10643]; + .loc 1 750 20 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:750:20 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + wgmma.fence.sync.aligned; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r9078,%r9079,%r9080,%r9081,%r9082,%r9083,%r9084,%r9085,%r9086,%r9087,%r9088,%r9089,%r9090,%r9091,%r9092,%r9093,%r9094,%r9095,%r9096,%r9097,%r9098,%r9099,%r9100,%r9101,%r9102,%r9103,%r9104,%r9105,%r9106,%r9107,%r9108,%r9109}, %rd738, %rd734, 0, 1, 1, 0, 0; + // end inline asm + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r9078,%r9079,%r9080,%r9081,%r9082,%r9083,%r9084,%r9085,%r9086,%r9087,%r9088,%r9089,%r9090,%r9091,%r9092,%r9093,%r9094,%r9095,%r9096,%r9097,%r9098,%r9099,%r9100,%r9101,%r9102,%r9103,%r9104,%r9105,%r9106,%r9107,%r9108,%r9109}, %rd740, %rd741, %p619, 1, 1, 0, 0; + // end inline asm + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 
{%r9078,%r9079,%r9080,%r9081,%r9082,%r9083,%r9084,%r9085,%r9086,%r9087,%r9088,%r9089,%r9090,%r9091,%r9092,%r9093,%r9094,%r9095,%r9096,%r9097,%r9098,%r9099,%r9100,%r9101,%r9102,%r9103,%r9104,%r9105,%r9106,%r9107,%r9108,%r9109}, %rd742, %rd743, %p619, 1, 1, 0, 0; + // end inline asm + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r9078,%r9079,%r9080,%r9081,%r9082,%r9083,%r9084,%r9085,%r9086,%r9087,%r9088,%r9089,%r9090,%r9091,%r9092,%r9093,%r9094,%r9095,%r9096,%r9097,%r9098,%r9099,%r9100,%r9101,%r9102,%r9103,%r9104,%r9105,%r9106,%r9107,%r9108,%r9109}, %rd744, %rd745, %p619, 1, 1, 0, 0; + // end inline asm + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r9078,%r9079,%r9080,%r9081,%r9082,%r9083,%r9084,%r9085,%r9086,%r9087,%r9088,%r9089,%r9090,%r9091,%r9092,%r9093,%r9094,%r9095,%r9096,%r9097,%r9098,%r9099,%r9100,%r9101,%r9102,%r9103,%r9104,%r9105,%r9106,%r9107,%r9108,%r9109}, %rd746, %rd747, %p619, 1, 1, 0, 0; + // end inline asm + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r9078,%r9079,%r9080,%r9081,%r9082,%r9083,%r9084,%r9085,%r9086,%r9087,%r9088,%r9089,%r9090,%r9091,%r9092,%r9093,%r9094,%r9095,%r9096,%r9097,%r9098,%r9099,%r9100,%r9101,%r9102,%r9103,%r9104,%r9105,%r9106,%r9107,%r9108,%r9109}, %rd748, %rd749, %p619, 1, 1, 0, 0; + // end inline asm + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r9078,%r9079,%r9080,%r9081,%r9082,%r9083,%r9084,%r9085,%r9086,%r9087,%r9088,%r9089,%r9090,%r9091,%r9092,%r9093,%r9094,%r9095,%r9096,%r9097,%r9098,%r9099,%r9100,%r9101,%r9102,%r9103,%r9104,%r9105,%r9106,%r9107,%r9108,%r9109}, %rd750, %rd751, %p619, 1, 1, 0, 0; + // end inline asm + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 
{%r9078,%r9079,%r9080,%r9081,%r9082,%r9083,%r9084,%r9085,%r9086,%r9087,%r9088,%r9089,%r9090,%r9091,%r9092,%r9093,%r9094,%r9095,%r9096,%r9097,%r9098,%r9099,%r9100,%r9101,%r9102,%r9103,%r9104,%r9105,%r9106,%r9107,%r9108,%r9109}, %rd752, %rd753, %p619, 1, 1, 0, 0; + // end inline asm + wgmma.commit_group.sync.aligned; + mov.b32 %r9499, %r8449; + mov.b32 %r9495, %r8449; + mov.b32 %r9496, %r8449; + mov.b32 %r9498, %r8449; + // begin inline asm + // wait for regs: %r9078,%r9079,%r9080,%r9081,%r9082,%r9083,%r9084,%r9085,%r9086,%r9087,%r9088,%r9089,%r9090,%r9091,%r9092,%r9093,%r9094,%r9095,%r9096,%r9097,%r9098,%r9099,%r9100,%r9101,%r9102,%r9103,%r9104,%r9105,%r9106,%r9107,%r9108,%r9109,%r9494,%r9495,%r9496,%r9497,%r9498,%r9499 + wgmma.wait_group.sync.aligned 0; + // end inline asm + .loc 1 751 22 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:751:22 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + sub.f32 %r10701, %r9080, %r10699; + sub.f32 %r10702, %r9081, %r10700; + sub.f32 %r10703, %r9082, %r10644; + sub.f32 %r10704, %r9083, %r10645; + sub.f32 %r10705, %r9084, %r10644; + sub.f32 %r10706, %r9085, %r10645; + sub.f32 %r10707, %r9086, %r10646; + sub.f32 %r10708, %r9087, %r10647; + sub.f32 %r10709, %r9088, %r10646; + sub.f32 %r10710, %r9089, %r10647; + sub.f32 %r10711, %r9090, %r10648; + sub.f32 %r10712, %r9091, %r10649; + sub.f32 %r10713, %r9092, %r10648; + sub.f32 %r10714, %r9093, %r10649; + sub.f32 %r10715, %r9094, %r10650; + sub.f32 %r10716, %r9095, %r10651; + sub.f32 %r10717, %r9096, %r10650; + sub.f32 %r10718, %r9097, %r10651; + sub.f32 %r10719, %r9098, %r10652; + sub.f32 %r10720, %r9099, %r10653; + sub.f32 %r10721, %r9100, %r10652; + sub.f32 %r10722, %r9101, %r10653; + sub.f32 %r10723, %r9102, %r10654; + sub.f32 %r10724, %r9103, %r10655; + sub.f32 %r10725, %r9104, %r10654; + sub.f32 %r10726, %r9105, %r10655; + sub.f32 %r10727, %r9106, %r10656; + sub.f32 %r10728, %r9107, %r10657; + sub.f32 %r10729, %r9108, %r10656; + sub.f32 %r10730, 
%r9109, %r10657; + .loc 1 751 16 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:751:16 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + mul.f32 %r10731, %r10604, %r10702; + mul.f32 %r10732, %r10603, %r10701; + mul.f32 %r10733, %r10605, %r10703; + mul.f32 %r10734, %r10606, %r10704; + mul.f32 %r10735, %r10607, %r10705; + mul.f32 %r10736, %r10608, %r10706; + mul.f32 %r10737, %r10609, %r10707; + mul.f32 %r10738, %r10610, %r10708; + mul.f32 %r10739, %r10611, %r10709; + mul.f32 %r10740, %r10612, %r10710; + mul.f32 %r10741, %r10613, %r10711; + mul.f32 %r10742, %r10614, %r10712; + mul.f32 %r10743, %r10615, %r10713; + mul.f32 %r10744, %r10616, %r10714; + mul.f32 %r10745, %r10617, %r10715; + mul.f32 %r10746, %r10618, %r10716; + mul.f32 %r10747, %r10619, %r10717; + mul.f32 %r10748, %r10620, %r10718; + mul.f32 %r10749, %r10621, %r10719; + mul.f32 %r10750, %r10622, %r10720; + mul.f32 %r10751, %r10623, %r10721; + mul.f32 %r10752, %r10624, %r10722; + mul.f32 %r10753, %r10625, %r10723; + mul.f32 %r10754, %r10626, %r10724; + mul.f32 %r10755, %r10627, %r10725; + mul.f32 %r10756, %r10628, %r10726; + mul.f32 %r10757, %r10629, %r10727; + mul.f32 %r10758, %r10630, %r10728; + mul.f32 %r10759, %r10631, %r10729; + mul.f32 %r10760, %r10632, %r10730; + .loc 1 751 22 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:751:22 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + sub.f32 %r10761, %r9078, %r10699; + sub.f32 %r10762, %r9079, %r10700; + .loc 1 751 16 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:751:16 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + mul.f32 %r10763, %r10602, %r10762; + mul.f32 %r10764, %r10601, %r10761; + .loc 1 775 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:775:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + cvt.rn.bf16.f32 %rs129, %r10764; + cvt.rn.bf16.f32 %rs130, %r10763; + .loc 1 773 45 // 
c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:773:45 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + selp.b16 %rs131, %rs130, 0x0000, %p918; + selp.b16 %rs132, %rs129, 0x0000, %p919; + mov.b32 %r9666, {%rs132, %rs131}; + .loc 1 775 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:775:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + cvt.rn.bf16.f32 %rs133, %r10732; + cvt.rn.bf16.f32 %rs134, %r10731; + .loc 1 773 45 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:773:45 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + selp.b16 %rs135, %rs134, 0x0000, %p920; + selp.b16 %rs136, %rs133, 0x0000, %p921; + mov.b32 %r9667, {%rs136, %rs135}; + .loc 1 775 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:775:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + cvt.rn.bf16.f32 %rs137, %r10733; + .loc 1 773 45 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:773:45 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + selp.b16 %rs138, %rs137, 0x0000, %p922; + .loc 1 775 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:775:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + cvt.rn.bf16.f32 %rs139, %r10734; + .loc 1 773 45 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:773:45 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + selp.b16 %rs140, %rs139, 0x0000, %p923; + .loc 1 775 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:775:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + cvt.rn.bf16.f32 %rs141, %r10735; + .loc 1 773 45 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:773:45 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + selp.b16 %rs142, %rs141, 0x0000, %p924; + .loc 1 775 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:775:24 @[ 
c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + cvt.rn.bf16.f32 %rs143, %r10736; + .loc 1 773 45 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:773:45 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + selp.b16 %rs144, %rs143, 0x0000, %p925; + .loc 1 775 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:775:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + cvt.rn.bf16.f32 %rs145, %r10737; + .loc 1 773 45 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:773:45 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + selp.b16 %rs146, %rs145, 0x0000, %p926; + .loc 1 775 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:775:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + cvt.rn.bf16.f32 %rs147, %r10738; + .loc 1 773 45 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:773:45 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + selp.b16 %rs148, %rs147, 0x0000, %p927; + .loc 1 775 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:775:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + cvt.rn.bf16.f32 %rs149, %r10739; + .loc 1 773 45 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:773:45 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + selp.b16 %rs150, %rs149, 0x0000, %p928; + .loc 1 775 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:775:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + cvt.rn.bf16.f32 %rs151, %r10740; + .loc 1 773 45 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:773:45 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + selp.b16 %rs152, %rs151, 0x0000, %p929; + .loc 1 775 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:775:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + cvt.rn.bf16.f32 %rs153, %r10741; + .loc 1 773 
45 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:773:45 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + selp.b16 %rs154, %rs153, 0x0000, %p930; + .loc 1 775 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:775:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + cvt.rn.bf16.f32 %rs155, %r10742; + .loc 1 773 45 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:773:45 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + selp.b16 %rs156, %rs155, 0x0000, %p931; + .loc 1 775 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:775:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + cvt.rn.bf16.f32 %rs157, %r10743; + .loc 1 773 45 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:773:45 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + selp.b16 %rs158, %rs157, 0x0000, %p932; + .loc 1 775 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:775:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + cvt.rn.bf16.f32 %rs159, %r10744; + .loc 1 773 45 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:773:45 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + selp.b16 %rs160, %rs159, 0x0000, %p933; + .loc 1 775 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:775:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + cvt.rn.bf16.f32 %rs161, %r10745; + .loc 1 773 45 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:773:45 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + selp.b16 %rs162, %rs161, 0x0000, %p934; + .loc 1 775 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:775:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + cvt.rn.bf16.f32 %rs163, %r10746; + .loc 1 773 45 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:773:45 @[ 
c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + selp.b16 %rs164, %rs163, 0x0000, %p935; + .loc 1 775 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:775:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + cvt.rn.bf16.f32 %rs165, %r10747; + .loc 1 773 45 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:773:45 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + selp.b16 %rs166, %rs165, 0x0000, %p936; + .loc 1 775 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:775:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + cvt.rn.bf16.f32 %rs167, %r10748; + .loc 1 773 45 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:773:45 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + selp.b16 %rs168, %rs167, 0x0000, %p937; + .loc 1 775 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:775:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + cvt.rn.bf16.f32 %rs169, %r10749; + .loc 1 773 45 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:773:45 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + selp.b16 %rs170, %rs169, 0x0000, %p938; + .loc 1 775 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:775:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + cvt.rn.bf16.f32 %rs171, %r10750; + .loc 1 773 45 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:773:45 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + selp.b16 %rs172, %rs171, 0x0000, %p939; + .loc 1 775 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:775:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + cvt.rn.bf16.f32 %rs173, %r10751; + .loc 1 773 45 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:773:45 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + selp.b16 %rs174, %rs173, 0x0000, %p940; + .loc 
1 775 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:775:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + cvt.rn.bf16.f32 %rs175, %r10752; + .loc 1 773 45 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:773:45 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + selp.b16 %rs176, %rs175, 0x0000, %p941; + .loc 1 775 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:775:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + cvt.rn.bf16.f32 %rs177, %r10753; + .loc 1 773 45 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:773:45 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + selp.b16 %rs178, %rs177, 0x0000, %p942; + .loc 1 775 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:775:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + cvt.rn.bf16.f32 %rs179, %r10754; + .loc 1 773 45 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:773:45 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + selp.b16 %rs180, %rs179, 0x0000, %p943; + .loc 1 775 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:775:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + cvt.rn.bf16.f32 %rs181, %r10755; + .loc 1 773 45 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:773:45 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + selp.b16 %rs182, %rs181, 0x0000, %p944; + .loc 1 775 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:775:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + cvt.rn.bf16.f32 %rs183, %r10756; + .loc 1 773 45 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:773:45 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + selp.b16 %rs184, %rs183, 0x0000, %p945; + .loc 1 775 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:775:24 @[ 
c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + cvt.rn.bf16.f32 %rs185, %r10757; + .loc 1 773 45 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:773:45 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + selp.b16 %rs186, %rs185, 0x0000, %p946; + .loc 1 775 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:775:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + cvt.rn.bf16.f32 %rs187, %r10758; + .loc 1 773 45 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:773:45 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + selp.b16 %rs188, %rs187, 0x0000, %p947; + .loc 1 775 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:775:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + cvt.rn.bf16.f32 %rs189, %r10759; + .loc 1 773 45 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:773:45 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + selp.b16 %rs190, %rs189, 0x0000, %p948; + .loc 1 775 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:775:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + cvt.rn.bf16.f32 %rs191, %r10760; + .loc 1 773 45 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:773:45 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + selp.b16 %rs192, %rs191, 0x0000, %p949; + .loc 1 775 43 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:775:43 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + mov.b32 %r9668, {%rs138, %rs140}; + mov.b32 %r9669, {%rs142, %rs144}; + mov.b32 %r9798, {%rs146, %rs148}; + mov.b32 %r9799, {%rs150, %rs152}; + mov.b32 %r9800, {%rs154, %rs156}; + mov.b32 %r9801, {%rs158, %rs160}; + mov.b32 %r9930, {%rs162, %rs164}; + mov.b32 %r9931, {%rs166, %rs168}; + mov.b32 %r9932, {%rs170, %rs172}; + mov.b32 %r9933, {%rs174, %rs176}; + mov.b32 %r10062, {%rs178, %rs180}; + mov.b32 %r10063, {%rs182, 
%rs184}; + mov.b32 %r10064, {%rs186, %rs188}; + mov.b32 %r10065, {%rs190, %rs192}; + wgmma.fence.sync.aligned; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r14988,%r14989,%r14990,%r14991,%r14992,%r14993,%r14994,%r14995,%r14996,%r14997,%r14998,%r14999,%r15000,%r15001,%r15002,%r15003,%r15004,%r15005,%r15006,%r15007,%r15008,%r15009,%r15010,%r15011,%r15012,%r15013,%r15014,%r15015,%r15016,%r15017,%r15018,%r15019,%r15020,%r15021,%r15022,%r15023,%r15024,%r15025,%r15026,%r15027,%r15028,%r15029,%r15030,%r15031,%r15032,%r15033,%r15034,%r15035,%r15036,%r15037,%r15038,%r15039,%r15040,%r15041,%r15042,%r15043,%r15044,%r15045,%r15046,%r15047,%r15048,%r15049,%r15050,%r15051}, {%r9666,%r9667,%r9668,%r9669}, %rd719, %p619, 1, 1, 1; + // end inline asm + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r14988,%r14989,%r14990,%r14991,%r14992,%r14993,%r14994,%r14995,%r14996,%r14997,%r14998,%r14999,%r15000,%r15001,%r15002,%r15003,%r15004,%r15005,%r15006,%r15007,%r15008,%r15009,%r15010,%r15011,%r15012,%r15013,%r15014,%r15015,%r15016,%r15017,%r15018,%r15019,%r15020,%r15021,%r15022,%r15023,%r15024,%r15025,%r15026,%r15027,%r15028,%r15029,%r15030,%r15031,%r15032,%r15033,%r15034,%r15035,%r15036,%r15037,%r15038,%r15039,%r15040,%r15041,%r15042,%r15043,%r15044,%r15045,%r15046,%r15047,%r15048,%r15049,%r15050,%r15051}, {%r9798,%r9799,%r9800,%r9801}, %rd755, %p619, 1, 1, 1; + // end inline asm + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 
{%r14988,%r14989,%r14990,%r14991,%r14992,%r14993,%r14994,%r14995,%r14996,%r14997,%r14998,%r14999,%r15000,%r15001,%r15002,%r15003,%r15004,%r15005,%r15006,%r15007,%r15008,%r15009,%r15010,%r15011,%r15012,%r15013,%r15014,%r15015,%r15016,%r15017,%r15018,%r15019,%r15020,%r15021,%r15022,%r15023,%r15024,%r15025,%r15026,%r15027,%r15028,%r15029,%r15030,%r15031,%r15032,%r15033,%r15034,%r15035,%r15036,%r15037,%r15038,%r15039,%r15040,%r15041,%r15042,%r15043,%r15044,%r15045,%r15046,%r15047,%r15048,%r15049,%r15050,%r15051}, {%r9930,%r9931,%r9932,%r9933}, %rd756, %p619, 1, 1, 1; + // end inline asm + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r14988,%r14989,%r14990,%r14991,%r14992,%r14993,%r14994,%r14995,%r14996,%r14997,%r14998,%r14999,%r15000,%r15001,%r15002,%r15003,%r15004,%r15005,%r15006,%r15007,%r15008,%r15009,%r15010,%r15011,%r15012,%r15013,%r15014,%r15015,%r15016,%r15017,%r15018,%r15019,%r15020,%r15021,%r15022,%r15023,%r15024,%r15025,%r15026,%r15027,%r15028,%r15029,%r15030,%r15031,%r15032,%r15033,%r15034,%r15035,%r15036,%r15037,%r15038,%r15039,%r15040,%r15041,%r15042,%r15043,%r15044,%r15045,%r15046,%r15047,%r15048,%r15049,%r15050,%r15051}, {%r10062,%r10063,%r10064,%r10065}, %rd757, %p619, 1, 1, 1; + // end inline asm + wgmma.commit_group.sync.aligned; + .loc 1 789 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:789:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + // begin inline asm + mov.u64 %rd758, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd758, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r10066, 0x0; + @%p641 ld.global.L1::evict_last.L2::cache_hint.b32 { %r10066 }, [ %rd759 + 0 ], %rd758; + // end inline asm + .loc 1 790 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:790:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + // begin inline asm + mov.u64 %rd761, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd761, 1.0; + // end inline asm 
+ // begin inline asm + mov.u32 %r10067, 0x0; + @%p642 ld.global.L1::evict_last.L2::cache_hint.b32 { %r10067 }, [ %rd762 + 0 ], %rd761; + // end inline asm + .loc 1 792 34 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:792:34 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + sub.s32 %r10765, %r10067, %r10066; + .loc 1 792 48 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:792:48 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + shl.b32 %r10766, %r10765, 7; + .loc 1 792 63 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:792:63 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + add.s32 %r10767, %r10766, -64; + .loc 1 793 42 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:793:42 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + mad.lo.s32 %r14883, %r10767, %r10697, %r10698; + .loc 1 626 28 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:626:28 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + shl.b32 %r10768, %r14883, 12; + .loc 1 626 19 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:626:19 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + mul.wide.s32 %rd858, %r10768, 2; + add.s64 %rd1201, %rd1201, %rd858; + add.s64 %rd1200, %rd1200, %rd858; + add.s64 %rd1199, %rd1199, %rd858; + add.s64 %rd1198, %rd1198, %rd858; + .loc 1 627 28 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:627:28 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + shl.b32 %r10769, %r14883, 7; + .loc 1 627 19 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:627:19 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + mul.wide.s32 %rd859, %r10769, 2; + add.s64 %rd1197, %rd1197, %rd859; + add.s64 %rd1196, %rd1196, %rd859; + add.s64 %rd1195, %rd1195, %rd859; + add.s64 %rd1194, %rd1194, %rd859; + .loc 1 628 19 // 
c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:628:19 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + add.s32 %r1491, %r14883, %r15058; + add.s32 %r1490, %r14883, %r15057; + add.s32 %r1492, %r14883, %r14915; + add.s32 %r1493, %r14883, %r14914; + add.s32 %r1494, %r14883, %r14913; + add.s32 %r1495, %r14883, %r14912; + add.s32 %r1496, %r14883, %r14911; + add.s32 %r1497, %r14883, %r14910; + add.s32 %r1498, %r14883, %r14909; + add.s32 %r1499, %r14883, %r14908; + add.s32 %r1500, %r14883, %r14907; + add.s32 %r1501, %r14883, %r14906; + add.s32 %r1502, %r14883, %r14905; + add.s32 %r1503, %r14883, %r14904; + add.s32 %r1504, %r14883, %r14903; + add.s32 %r1505, %r14883, %r14902; + add.s32 %r14920, %r14883, %r14920; + add.s32 %r14921, %r14883, %r14921; + add.s32 %r14922, %r14883, %r14922; + add.s32 %r14923, %r14883, %r14923; + add.s32 %r14916, %r14883, %r14916; + add.s32 %r14917, %r14883, %r14917; + add.s32 %r14918, %r14883, %r14918; + add.s32 %r14919, %r14883, %r14919; + .loc 1 610 28 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:610:28 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + add.s32 %r10770, %r14899, 1; + setp.gt.s32 %p951, %r10770, 1; + selp.b32 %r14899, 0, %r10770, %p951; + add.s32 %r10771, %r14901, 1; + setp.gt.s32 %p952, %r10771, 2; + selp.b32 %r14901, 0, %r10771, %p952; + .loc 1 831 52 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:831:52 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + setp.lt.s32 %p953, %r14920, %r2358; + setp.lt.s32 %p954, %r14921, %r2358; + setp.lt.s32 %p955, %r14922, %r2358; + setp.lt.s32 %p956, %r14923, %r2358; + .loc 1 831 23 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:831:23 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + shl.b32 %r10772, %r14901, 14; + add.s32 %r10773, %r7585, %r10772; + bar.sync 0; + add.s32 %r10068, %r10773, %r745; + selp.b32 %r10774, 16, 0, %p953; + selp.b32 %r10069, %r10774, 0, 
%p675; + // begin inline asm + cp.async.cg.shared.global [ %r10068 + 0 ], [ %rd1201 + 0 ], 0x10, %r10069; + // end inline asm + add.s32 %r10070, %r10068, 2048; + selp.b32 %r10775, 16, 0, %p954; + selp.b32 %r10071, %r10775, 0, %p675; + // begin inline asm + cp.async.cg.shared.global [ %r10070 + 0 ], [ %rd1200 + 0 ], 0x10, %r10071; + // end inline asm + add.s32 %r10072, %r10068, 4096; + selp.b32 %r10776, 16, 0, %p955; + selp.b32 %r10073, %r10776, 0, %p675; + // begin inline asm + cp.async.cg.shared.global [ %r10072 + 0 ], [ %rd1199 + 0 ], 0x10, %r10073; + // end inline asm + add.s32 %r10074, %r10068, 6144; + selp.b32 %r10777, 16, 0, %p956; + selp.b32 %r10075, %r10777, 0, %p675; + // begin inline asm + cp.async.cg.shared.global [ %r10074 + 0 ], [ %rd1198 + 0 ], 0x10, %r10075; + // end inline asm + cp.async.commit_group; + .loc 1 674 52 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:674:52 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + setp.lt.s32 %p957, %r1490, %r2358; + setp.lt.s32 %p958, %r1491, %r2358; + setp.lt.s32 %p959, %r1492, %r2358; + setp.lt.s32 %p960, %r1493, %r2358; + setp.lt.s32 %p961, %r1494, %r2358; + setp.lt.s32 %p962, %r1495, %r2358; + setp.lt.s32 %p963, %r1496, %r2358; + setp.lt.s32 %p964, %r1497, %r2358; + setp.lt.s32 %p965, %r1498, %r2358; + setp.lt.s32 %p966, %r1499, %r2358; + setp.lt.s32 %p967, %r1500, %r2358; + setp.lt.s32 %p968, %r1501, %r2358; + setp.lt.s32 %p969, %r1502, %r2358; + setp.lt.s32 %p970, %r1503, %r2358; + setp.lt.s32 %p971, %r1504, %r2358; + setp.lt.s32 %p972, %r1505, %r2358; + .loc 1 674 28 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:674:28 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + mul.wide.s32 %rd860, %r1490, 4; + add.s64 %rd768, %rd142, %rd860; + mul.wide.s32 %rd861, %r1491, 4; + add.s64 %rd769, %rd142, %rd861; + mul.wide.s32 %rd862, %r1492, 4; + add.s64 %rd770, %rd142, %rd862; + mul.wide.s32 %rd863, %r1493, 4; + add.s64 %rd771, %rd142, %rd863; + 
mul.wide.s32 %rd864, %r1494, 4; + add.s64 %rd772, %rd142, %rd864; + mul.wide.s32 %rd865, %r1495, 4; + add.s64 %rd773, %rd142, %rd865; + mul.wide.s32 %rd866, %r1496, 4; + add.s64 %rd774, %rd142, %rd866; + mul.wide.s32 %rd867, %r1497, 4; + add.s64 %rd775, %rd142, %rd867; + mul.wide.s32 %rd868, %r1498, 4; + add.s64 %rd776, %rd142, %rd868; + mul.wide.s32 %rd869, %r1499, 4; + add.s64 %rd777, %rd142, %rd869; + mul.wide.s32 %rd870, %r1500, 4; + add.s64 %rd778, %rd142, %rd870; + mul.wide.s32 %rd871, %r1501, 4; + add.s64 %rd779, %rd142, %rd871; + mul.wide.s32 %rd872, %r1502, 4; + add.s64 %rd780, %rd142, %rd872; + mul.wide.s32 %rd873, %r1503, 4; + add.s64 %rd781, %rd142, %rd873; + mul.wide.s32 %rd874, %r1504, 4; + add.s64 %rd782, %rd142, %rd874; + mul.wide.s32 %rd875, %r1505, 4; + add.s64 %rd783, %rd142, %rd875; + .loc 1 674 22 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:674:22 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + shl.b32 %r10778, %r14899, 8; + add.s32 %r10779, %r10153, %r10778; + add.s32 %r10076, %r10779, %r767; + selp.b32 %r10780, 4, 0, %p957; + selp.b32 %r10117, %r10780, 0, %p675; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r10076 + 0 ], [ %rd768 + 0 ], 0x4, %r10117; + // end inline asm + add.s32 %r10078, %r10076, 4; + selp.b32 %r10781, 4, 0, %p958; + selp.b32 %r10119, %r10781, 0, %p675; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r10078 + 0 ], [ %rd769 + 0 ], 0x4, %r10119; + // end inline asm + add.s32 %r10080, %r10076, 32; + selp.b32 %r10782, 4, 0, %p959; + selp.b32 %r10121, %r10782, 0, %p675; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r10080 + 0 ], [ %rd770 + 0 ], 0x4, %r10121; + // end inline asm + add.s32 %r10082, %r10076, 36; + selp.b32 %r10783, 4, 0, %p960; + selp.b32 %r10123, %r10783, 0, %p675; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r10082 + 0 ], [ %rd771 + 0 ], 0x4, %r10123; + // end inline asm + add.s32 %r10084, %r10076, 64; + selp.b32 %r10784, 
4, 0, %p961; + selp.b32 %r10125, %r10784, 0, %p675; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r10084 + 0 ], [ %rd772 + 0 ], 0x4, %r10125; + // end inline asm + add.s32 %r10086, %r10076, 68; + selp.b32 %r10785, 4, 0, %p962; + selp.b32 %r10127, %r10785, 0, %p675; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r10086 + 0 ], [ %rd773 + 0 ], 0x4, %r10127; + // end inline asm + add.s32 %r10088, %r10076, 96; + selp.b32 %r10786, 4, 0, %p963; + selp.b32 %r10129, %r10786, 0, %p675; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r10088 + 0 ], [ %rd774 + 0 ], 0x4, %r10129; + // end inline asm + add.s32 %r10090, %r10076, 100; + selp.b32 %r10787, 4, 0, %p964; + selp.b32 %r10131, %r10787, 0, %p675; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r10090 + 0 ], [ %rd775 + 0 ], 0x4, %r10131; + // end inline asm + add.s32 %r10092, %r10076, 128; + selp.b32 %r10788, 4, 0, %p965; + selp.b32 %r10133, %r10788, 0, %p675; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r10092 + 0 ], [ %rd776 + 0 ], 0x4, %r10133; + // end inline asm + add.s32 %r10094, %r10076, 132; + selp.b32 %r10789, 4, 0, %p966; + selp.b32 %r10135, %r10789, 0, %p675; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r10094 + 0 ], [ %rd777 + 0 ], 0x4, %r10135; + // end inline asm + add.s32 %r10096, %r10076, 160; + selp.b32 %r10790, 4, 0, %p967; + selp.b32 %r10137, %r10790, 0, %p675; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r10096 + 0 ], [ %rd778 + 0 ], 0x4, %r10137; + // end inline asm + add.s32 %r10098, %r10076, 164; + selp.b32 %r10791, 4, 0, %p968; + selp.b32 %r10139, %r10791, 0, %p675; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r10098 + 0 ], [ %rd779 + 0 ], 0x4, %r10139; + // end inline asm + add.s32 %r10100, %r10076, 192; + selp.b32 %r10792, 4, 0, %p969; + selp.b32 %r10141, %r10792, 0, %p675; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r10100 + 0 ], [ %rd780 + 0 ], 0x4, %r10141; + // end 
inline asm + add.s32 %r10102, %r10076, 196; + selp.b32 %r10793, 4, 0, %p970; + selp.b32 %r10143, %r10793, 0, %p675; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r10102 + 0 ], [ %rd781 + 0 ], 0x4, %r10143; + // end inline asm + add.s32 %r10104, %r10076, 224; + selp.b32 %r10794, 4, 0, %p971; + selp.b32 %r10145, %r10794, 0, %p675; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r10104 + 0 ], [ %rd782 + 0 ], 0x4, %r10145; + // end inline asm + add.s32 %r10106, %r10076, 228; + selp.b32 %r10795, 4, 0, %p972; + selp.b32 %r10147, %r10795, 0, %p675; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r10106 + 0 ], [ %rd783 + 0 ], 0x4, %r10147; + // end inline asm + cp.async.commit_group; + .loc 1 833 52 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:833:52 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + setp.lt.s32 %p973, %r14916, %r2358; + setp.lt.s32 %p974, %r14917, %r2358; + setp.lt.s32 %p975, %r14918, %r2358; + setp.lt.s32 %p976, %r14919, %r2358; + .loc 1 833 23 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:833:23 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + add.s32 %r10796, %r10633, %r10772; + add.s32 %r10108, %r10796, %r745; + selp.b32 %r10797, 16, 0, %p973; + selp.b32 %r10109, %r10797, 0, %p675; + // begin inline asm + cp.async.cg.shared.global [ %r10108 + 0 ], [ %rd1197 + 0 ], 0x10, %r10109; + // end inline asm + add.s32 %r10110, %r10108, 2048; + selp.b32 %r10798, 16, 0, %p974; + selp.b32 %r10111, %r10798, 0, %p675; + // begin inline asm + cp.async.cg.shared.global [ %r10110 + 0 ], [ %rd1196 + 0 ], 0x10, %r10111; + // end inline asm + add.s32 %r10112, %r10108, 4096; + selp.b32 %r10799, 16, 0, %p975; + selp.b32 %r10113, %r10799, 0, %p675; + // begin inline asm + cp.async.cg.shared.global [ %r10112 + 0 ], [ %rd1195 + 0 ], 0x10, %r10113; + // end inline asm + add.s32 %r10114, %r10108, 6144; + selp.b32 %r10800, 16, 0, %p976; + selp.b32 %r10115, %r10800, 0, %p675; + 
// begin inline asm + cp.async.cg.shared.global [ %r10114 + 0 ], [ %rd1194 + 0 ], 0x10, %r10115; + // end inline asm + cp.async.commit_group; + .loc 1 748 29 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:748:29 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + add.s64 %rd788, %rd143, %rd860; + add.s64 %rd789, %rd143, %rd861; + add.s64 %rd790, %rd143, %rd862; + add.s64 %rd791, %rd143, %rd863; + add.s64 %rd792, %rd143, %rd864; + add.s64 %rd793, %rd143, %rd865; + add.s64 %rd794, %rd143, %rd866; + add.s64 %rd795, %rd143, %rd867; + add.s64 %rd796, %rd143, %rd868; + add.s64 %rd797, %rd143, %rd869; + add.s64 %rd798, %rd143, %rd870; + add.s64 %rd799, %rd143, %rd871; + add.s64 %rd800, %rd143, %rd872; + add.s64 %rd801, %rd143, %rd873; + add.s64 %rd802, %rd143, %rd874; + add.s64 %rd803, %rd143, %rd875; + .loc 1 748 21 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:748:21 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + add.s32 %r10801, %r10641, %r10778; + add.s32 %r10116, %r10801, %r767; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r10116 + 0 ], [ %rd788 + 0 ], 0x4, %r10117; + // end inline asm + add.s32 %r10118, %r10116, 4; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r10118 + 0 ], [ %rd789 + 0 ], 0x4, %r10119; + // end inline asm + add.s32 %r10120, %r10116, 32; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r10120 + 0 ], [ %rd790 + 0 ], 0x4, %r10121; + // end inline asm + add.s32 %r10122, %r10116, 36; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r10122 + 0 ], [ %rd791 + 0 ], 0x4, %r10123; + // end inline asm + add.s32 %r10124, %r10116, 64; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r10124 + 0 ], [ %rd792 + 0 ], 0x4, %r10125; + // end inline asm + add.s32 %r10126, %r10116, 68; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r10126 + 0 ], [ %rd793 + 0 ], 0x4, %r10127; + // end inline asm + add.s32 %r10128, %r10116, 96; + // 
begin inline asm + @%p978 cp.async.ca.shared.global [ %r10128 + 0 ], [ %rd794 + 0 ], 0x4, %r10129; + // end inline asm + add.s32 %r10130, %r10116, 100; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r10130 + 0 ], [ %rd795 + 0 ], 0x4, %r10131; + // end inline asm + add.s32 %r10132, %r10116, 128; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r10132 + 0 ], [ %rd796 + 0 ], 0x4, %r10133; + // end inline asm + add.s32 %r10134, %r10116, 132; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r10134 + 0 ], [ %rd797 + 0 ], 0x4, %r10135; + // end inline asm + add.s32 %r10136, %r10116, 160; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r10136 + 0 ], [ %rd798 + 0 ], 0x4, %r10137; + // end inline asm + add.s32 %r10138, %r10116, 164; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r10138 + 0 ], [ %rd799 + 0 ], 0x4, %r10139; + // end inline asm + add.s32 %r10140, %r10116, 192; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r10140 + 0 ], [ %rd800 + 0 ], 0x4, %r10141; + // end inline asm + add.s32 %r10142, %r10116, 196; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r10142 + 0 ], [ %rd801 + 0 ], 0x4, %r10143; + // end inline asm + add.s32 %r10144, %r10116, 224; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r10144 + 0 ], [ %rd802 + 0 ], 0x4, %r10145; + // end inline asm + add.s32 %r10146, %r10116, 228; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r10146 + 0 ], [ %rd803 + 0 ], 0x4, %r10147; + // end inline asm + cp.async.commit_group; + .loc 1 610 28 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:610:28 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + setp.ne.b32 %p977, %r966, %r1424; + mov.b32 %r14884, %r14915; + mov.b32 %r14885, %r14914; + mov.b32 %r14886, %r14913; + mov.b32 %r14887, %r14912; + mov.b32 %r14888, %r14911; + mov.b32 %r14889, %r14910; + mov.b32 %r14890, %r14909; + mov.b32 %r14891, %r14908; + mov.b32 %r14892, %r14907; + 
mov.b32 %r14893, %r14906; + mov.b32 %r14894, %r14905; + mov.b32 %r14895, %r14904; + mov.b32 %r14896, %r14903; + mov.b32 %r14897, %r14902; + mov.b32 %r14902, %r1505; + mov.b32 %r14903, %r1504; + mov.b32 %r14904, %r1503; + mov.b32 %r14905, %r1502; + mov.b32 %r14906, %r1501; + mov.b32 %r14907, %r1500; + mov.b32 %r14908, %r1499; + mov.b32 %r14909, %r1498; + mov.b32 %r14910, %r1497; + mov.b32 %r14911, %r1496; + mov.b32 %r14912, %r1495; + mov.b32 %r14913, %r1494; + mov.b32 %r14914, %r1493; + mov.b32 %r14915, %r1492; + mov.b32 %r15052, %r1424; + mov.b32 %r15055, %r15057; + mov.b32 %r15056, %r15058; + mov.b32 %r15057, %r1490; + mov.b32 %r15058, %r1491; + @%p977 bra $L__BB0_11; +$L__BB0_12: // %._crit_edge1621 + // in Loop: Header=BB0_9 Depth=1 + .loc 1 0 28 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:0:28 + setp.lt.s32 %p1042, %r744, 1; + .loc 1 610 28 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:610:28 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:298:16 ] + // begin inline asm + // wait for regs: 
%r14924,%r14925,%r14926,%r14927,%r14928,%r14929,%r14930,%r14931,%r14932,%r14933,%r14934,%r14935,%r14936,%r14937,%r14938,%r14939,%r14940,%r14941,%r14942,%r14943,%r14944,%r14945,%r14946,%r14947,%r14948,%r14949,%r14950,%r14951,%r14952,%r14953,%r14954,%r14955,%r14956,%r14957,%r14958,%r14959,%r14960,%r14961,%r14962,%r14963,%r14964,%r14965,%r14966,%r14967,%r14968,%r14969,%r14970,%r14971,%r14972,%r14973,%r14974,%r14975,%r14976,%r14977,%r14978,%r14979,%r14980,%r14981,%r14982,%r14983,%r14984,%r14985,%r14986,%r14987,%r14988,%r14989,%r14990,%r14991,%r14992,%r14993,%r14994,%r14995,%r14996,%r14997,%r14998,%r14999,%r15000,%r15001,%r15002,%r15003,%r15004,%r15005,%r15006,%r15007,%r15008,%r15009,%r15010,%r15011,%r15012,%r15013,%r15014,%r15015,%r15016,%r15017,%r15018,%r15019,%r15020,%r15021,%r15022,%r15023,%r15024,%r15025,%r15026,%r15027,%r15028,%r15029,%r15030,%r15031,%r15032,%r15033,%r15034,%r15035,%r15036,%r15037,%r15038,%r15039,%r15040,%r15041,%r15042,%r15043,%r15044,%r15045,%r15046,%r15047,%r15048,%r15049,%r15050,%r15051 + wgmma.wait_group.sync.aligned 0; + // end inline asm + cp.async.wait_group 0; + bar.sync 0; +$L__tmp18: + .loc 1 601 18 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:601:18 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + add.s64 %rd957, %rd140, %rd956; + add.s64 %rd959, %rd140, %rd958; + add.s64 %rd961, %rd140, %rd960; + add.s64 %rd963, %rd140, %rd962; + .loc 1 601 49 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:601:49 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + add.s64 %rd876, %rd957, %rd685; + add.s64 %rd877, %rd959, %rd685; + add.s64 %rd878, %rd961, %rd685; + add.s64 %rd879, %rd963, %rd685; + .loc 1 602 19 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:602:19 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + add.s64 %rd966, %rd141, %rd965; + add.s64 %rd968, %rd141, %rd967; + add.s64 %rd970, %rd141, %rd969; + add.s64 %rd972, %rd141, %rd971; + .loc 1 
602 51 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:602:51 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + add.s64 %rd896, %rd966, %rd685; + add.s64 %rd897, %rd968, %rd685; + add.s64 %rd898, %rd970, %rd685; + add.s64 %rd899, %rd972, %rd685; + .loc 1 831 23 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:831:23 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + // begin inline asm + cp.async.cg.shared.global [ %r11058 + 0 ], [ %rd876 + 0 ], 0x10, %r11059; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r11060 + 0 ], [ %rd877 + 0 ], 0x10, %r11061; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r11062 + 0 ], [ %rd878 + 0 ], 0x10, %r11063; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r11064 + 0 ], [ %rd879 + 0 ], 0x10, %r11065; + // end inline asm + cp.async.commit_group; + .loc 1 674 28 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:674:28 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + add.s64 %rd880, %rd142, %rd973; + cvt.s64.s32 %rd974, %r726; + add.s64 %rd975, %rd974, %rd144; + shl.b64 %rd976, %rd975, 2; + add.s64 %rd977, %rd142, %rd976; + add.s64 %rd881, %rd977, 4; + add.s64 %rd882, %rd977, 32; + add.s64 %rd883, %rd977, 36; + add.s64 %rd884, %rd142, %rd978; + add.s64 %rd885, %rd142, %rd979; + add.s64 %rd886, %rd142, %rd980; + add.s64 %rd887, %rd142, %rd981; + add.s64 %rd888, %rd142, %rd982; + add.s64 %rd889, %rd142, %rd983; + add.s64 %rd890, %rd142, %rd984; + add.s64 %rd891, %rd142, %rd985; + add.s64 %rd892, %rd142, %rd986; + add.s64 %rd893, %rd142, %rd987; + add.s64 %rd894, %rd142, %rd988; + add.s64 %rd895, %rd142, %rd989; + .loc 1 674 22 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:674:22 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11066 + 0 ], [ %rd880 + 0 ], 0x4, %r11067; + // end 
inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11068 + 0 ], [ %rd881 + 0 ], 0x4, %r11069; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11070 + 0 ], [ %rd882 + 0 ], 0x4, %r11071; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11072 + 0 ], [ %rd883 + 0 ], 0x4, %r11073; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11074 + 0 ], [ %rd884 + 0 ], 0x4, %r11075; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11076 + 0 ], [ %rd885 + 0 ], 0x4, %r11077; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11078 + 0 ], [ %rd886 + 0 ], 0x4, %r11079; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11080 + 0 ], [ %rd887 + 0 ], 0x4, %r11081; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11082 + 0 ], [ %rd888 + 0 ], 0x4, %r11083; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11084 + 0 ], [ %rd889 + 0 ], 0x4, %r11085; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11086 + 0 ], [ %rd890 + 0 ], 0x4, %r11087; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11088 + 0 ], [ %rd891 + 0 ], 0x4, %r11089; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11090 + 0 ], [ %rd892 + 0 ], 0x4, %r11091; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11092 + 0 ], [ %rd893 + 0 ], 0x4, %r11093; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11094 + 0 ], [ %rd894 + 0 ], 0x4, %r11095; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11096 + 0 ], [ %rd895 + 0 ], 0x4, %r11097; + // end inline asm + cp.async.commit_group; + .loc 1 833 23 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:833:23 @[ 
c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + // begin inline asm + cp.async.cg.shared.global [ %r11098 + 0 ], [ %rd896 + 0 ], 0x10, %r11059; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r11100 + 0 ], [ %rd897 + 0 ], 0x10, %r11061; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r11102 + 0 ], [ %rd898 + 0 ], 0x10, %r11063; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r11104 + 0 ], [ %rd899 + 0 ], 0x10, %r11065; + // end inline asm + cp.async.commit_group; + .loc 1 748 29 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:748:29 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + add.s64 %rd900, %rd143, %rd973; + add.s64 %rd990, %rd143, %rd976; + add.s64 %rd901, %rd990, 4; + add.s64 %rd902, %rd990, 32; + add.s64 %rd903, %rd990, 36; + add.s64 %rd904, %rd143, %rd978; + add.s64 %rd905, %rd143, %rd979; + add.s64 %rd906, %rd143, %rd980; + add.s64 %rd907, %rd143, %rd981; + add.s64 %rd908, %rd143, %rd982; + add.s64 %rd909, %rd143, %rd983; + add.s64 %rd910, %rd143, %rd984; + add.s64 %rd911, %rd143, %rd985; + add.s64 %rd912, %rd143, %rd986; + add.s64 %rd913, %rd143, %rd987; + add.s64 %rd914, %rd143, %rd988; + add.s64 %rd915, %rd143, %rd989; + .loc 1 748 21 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:748:21 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11106 + 0 ], [ %rd900 + 0 ], 0x4, %r11067; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11108 + 0 ], [ %rd901 + 0 ], 0x4, %r11069; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11110 + 0 ], [ %rd902 + 0 ], 0x4, %r11071; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11112 + 0 ], [ %rd903 + 0 ], 0x4, %r11073; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11114 + 0 ], [ 
%rd904 + 0 ], 0x4, %r11075; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11116 + 0 ], [ %rd905 + 0 ], 0x4, %r11077; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11118 + 0 ], [ %rd906 + 0 ], 0x4, %r11079; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11120 + 0 ], [ %rd907 + 0 ], 0x4, %r11081; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11122 + 0 ], [ %rd908 + 0 ], 0x4, %r11083; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11124 + 0 ], [ %rd909 + 0 ], 0x4, %r11085; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11126 + 0 ], [ %rd910 + 0 ], 0x4, %r11087; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11128 + 0 ], [ %rd911 + 0 ], 0x4, %r11089; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11130 + 0 ], [ %rd912 + 0 ], 0x4, %r11091; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11132 + 0 ], [ %rd913 + 0 ], 0x4, %r11093; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11134 + 0 ], [ %rd914 + 0 ], 0x4, %r11095; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11136 + 0 ], [ %rd915 + 0 ], 0x4, %r11097; + // end inline asm + cp.async.commit_group; + .loc 1 626 19 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:626:19 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + add.s64 %rd1209, %rd876, 524288; + add.s64 %rd1208, %rd877, 524288; + add.s64 %rd1207, %rd878, 524288; + add.s64 %rd1206, %rd879, 524288; + .loc 1 627 19 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:627:19 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + add.s64 %rd1205, %rd896, 16384; + add.s64 %rd1204, %rd897, 16384; + add.s64 %rd1203, %rd898, 16384; + add.s64 %rd1202, 
%rd899, 16384; + .loc 1 831 23 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:831:23 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + bar.sync 0; + // begin inline asm + cp.async.cg.shared.global [ %r11138 + 0 ], [ %rd1209 + 0 ], 0x10, %r11139; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r11140 + 0 ], [ %rd1208 + 0 ], 0x10, %r11141; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r11142 + 0 ], [ %rd1207 + 0 ], 0x10, %r11143; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r11144 + 0 ], [ %rd1206 + 0 ], 0x10, %r11145; + // end inline asm + cp.async.commit_group; + .loc 1 674 28 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:674:28 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + add.s64 %rd920, %rd880, 256; + add.s64 %rd991, %rd976, 256; + add.s64 %rd992, %rd142, %rd991; + add.s64 %rd921, %rd992, 4; + add.s64 %rd922, %rd992, 32; + add.s64 %rd923, %rd992, 36; + add.s64 %rd924, %rd884, 256; + add.s64 %rd925, %rd885, 256; + add.s64 %rd926, %rd886, 256; + add.s64 %rd927, %rd887, 256; + add.s64 %rd928, %rd888, 256; + add.s64 %rd929, %rd889, 256; + add.s64 %rd930, %rd890, 256; + add.s64 %rd931, %rd891, 256; + add.s64 %rd932, %rd892, 256; + add.s64 %rd933, %rd893, 256; + add.s64 %rd934, %rd894, 256; + add.s64 %rd935, %rd895, 256; + .loc 1 674 22 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:674:22 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11146 + 0 ], [ %rd920 + 0 ], 0x4, %r11147; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11148 + 0 ], [ %rd921 + 0 ], 0x4, %r11149; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11150 + 0 ], [ %rd922 + 0 ], 0x4, %r11151; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11152 + 0 ], [ %rd923 + 0 ], 
0x4, %r11153; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11154 + 0 ], [ %rd924 + 0 ], 0x4, %r11155; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11156 + 0 ], [ %rd925 + 0 ], 0x4, %r11157; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11158 + 0 ], [ %rd926 + 0 ], 0x4, %r11159; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11160 + 0 ], [ %rd927 + 0 ], 0x4, %r11161; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11162 + 0 ], [ %rd928 + 0 ], 0x4, %r11163; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11164 + 0 ], [ %rd929 + 0 ], 0x4, %r11165; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11166 + 0 ], [ %rd930 + 0 ], 0x4, %r11167; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11168 + 0 ], [ %rd931 + 0 ], 0x4, %r11169; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11170 + 0 ], [ %rd932 + 0 ], 0x4, %r11171; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11172 + 0 ], [ %rd933 + 0 ], 0x4, %r11173; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11174 + 0 ], [ %rd934 + 0 ], 0x4, %r11175; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11176 + 0 ], [ %rd935 + 0 ], 0x4, %r11177; + // end inline asm + cp.async.commit_group; + .loc 1 833 23 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:833:23 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + // begin inline asm + cp.async.cg.shared.global [ %r11178 + 0 ], [ %rd1205 + 0 ], 0x10, %r11139; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r11180 + 0 ], [ %rd1204 + 0 ], 0x10, %r11141; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r11182 + 0 
], [ %rd1203 + 0 ], 0x10, %r11143; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r11184 + 0 ], [ %rd1202 + 0 ], 0x10, %r11145; + // end inline asm + cp.async.commit_group; + .loc 1 748 29 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:748:29 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + add.s64 %rd940, %rd900, 256; + add.s64 %rd993, %rd143, %rd991; + add.s64 %rd941, %rd993, 4; + add.s64 %rd942, %rd993, 32; + add.s64 %rd943, %rd993, 36; + add.s64 %rd944, %rd904, 256; + add.s64 %rd945, %rd905, 256; + add.s64 %rd946, %rd906, 256; + add.s64 %rd947, %rd907, 256; + add.s64 %rd948, %rd908, 256; + add.s64 %rd949, %rd909, 256; + add.s64 %rd950, %rd910, 256; + add.s64 %rd951, %rd911, 256; + add.s64 %rd952, %rd912, 256; + add.s64 %rd953, %rd913, 256; + add.s64 %rd954, %rd914, 256; + add.s64 %rd955, %rd915, 256; + .loc 1 748 21 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:748:21 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11186 + 0 ], [ %rd940 + 0 ], 0x4, %r11147; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11188 + 0 ], [ %rd941 + 0 ], 0x4, %r11149; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11190 + 0 ], [ %rd942 + 0 ], 0x4, %r11151; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11192 + 0 ], [ %rd943 + 0 ], 0x4, %r11153; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11194 + 0 ], [ %rd944 + 0 ], 0x4, %r11155; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11196 + 0 ], [ %rd945 + 0 ], 0x4, %r11157; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11198 + 0 ], [ %rd946 + 0 ], 0x4, %r11159; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11200 + 0 ], [ %rd947 + 0 ], 0x4, %r11161; + 
// end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11202 + 0 ], [ %rd948 + 0 ], 0x4, %r11163; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11204 + 0 ], [ %rd949 + 0 ], 0x4, %r11165; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11206 + 0 ], [ %rd950 + 0 ], 0x4, %r11167; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11208 + 0 ], [ %rd951 + 0 ], 0x4, %r11169; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11210 + 0 ], [ %rd952 + 0 ], 0x4, %r11171; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11212 + 0 ], [ %rd953 + 0 ], 0x4, %r11173; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11214 + 0 ], [ %rd954 + 0 ], 0x4, %r11175; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11216 + 0 ], [ %rd955 + 0 ], 0x4, %r11177; + // end inline asm + cp.async.commit_group; + .loc 1 610 28 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:610:28 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + @%p1042 bra $L__BB0_15; +// %bb.13: // %.lr.ph1793.preheader + // in Loop: Header=BB0_9 Depth=1 + .loc 1 0 28 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:0:28 + mov.b32 %r11772, 0; + mov.b32 %r15216, 1; + mov.b32 %r15215, -1; + mov.b32 %r15199, %r743; + mov.b32 %r15200, %r742; + mov.b32 %r15201, %r741; + mov.b32 %r15202, %r740; + mov.b32 %r15203, %r739; + mov.b32 %r15204, %r738; + mov.b32 %r15205, %r737; + mov.b32 %r15206, %r736; + mov.b32 %r15207, %r735; + mov.b32 %r15208, %r734; + mov.b32 %r15209, %r733; + mov.b32 %r15210, %r732; + mov.b32 %r15211, %r731; + mov.b32 %r15212, %r730; + mov.b32 %r15213, %r729; + mov.b32 %r15214, %r728; + mov.b32 %r15217, %r15215; + mov.b32 %r15218, %r15216; + mov.b32 %r15219, %r939; + mov.b32 %r15220, %r938; + mov.b32 %r15221, %r937; + mov.b32 %r15222, 
%r936; + mov.b32 %r15223, %r935; + mov.b32 %r15224, %r934; + mov.b32 %r15225, %r933; + mov.b32 %r15226, %r932; + mov.b32 %r15227, %r931; + mov.b32 %r15228, %r930; + mov.b32 %r15229, %r929; + mov.b32 %r15230, %r928; + mov.b32 %r15231, %r927; + mov.b32 %r15232, %r926; + mov.b32 %r15233, %r925; + mov.b32 %r15234, %r924; + mov.b32 %r15235, %r940; + mov.b32 %r15236, %r941; + mov.b32 %r15237, %r942; + mov.b32 %r15238, %r943; + mov.b32 %r15239, %r940; + mov.b32 %r15240, %r941; + mov.b32 %r15241, %r942; + mov.b32 %r15242, %r943; + mov.b32 %r15371, %r11772; +$L__BB0_14: // %.lr.ph1793 + // Parent Loop BB0_9 Depth=1 + // => This Inner Loop Header: Depth=2 + .loc 1 610 28 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:610:28 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + setp.lt.s32 %p1099, %r15371, %r964; + setp.lt.s32 %p1065, %r15371, %r965; + add.s32 %r13471, %r15215, 1; + setp.gt.s32 %p1100, %r13471, 1; + selp.b32 %r15215, 0, %r13471, %p1100; + add.s32 %r13472, %r15217, 1; + setp.gt.s32 %p1101, %r13472, 2; + selp.b32 %r15217, 0, %r13472, %p1101; + .loc 1 831 52 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:831:52 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + setp.lt.s32 %p1102, %r15214, %r2358; + setp.lt.s32 %p1103, %r15213, %r2358; + setp.lt.s32 %p1104, %r15212, %r2358; + setp.lt.s32 %p1105, %r15211, %r2358; + setp.lt.s32 %p1106, %r15210, %r2358; + setp.lt.s32 %p1107, %r15209, %r2358; + setp.lt.s32 %p1108, %r15208, %r2358; + setp.lt.s32 %p1109, %r15207, %r2358; + setp.lt.s32 %p1110, %r15206, %r2358; + setp.lt.s32 %p1111, %r15205, %r2358; + setp.lt.s32 %p1112, %r15204, %r2358; + setp.lt.s32 %p1113, %r15203, %r2358; + setp.lt.s32 %p1114, %r15202, %r2358; + setp.lt.s32 %p1115, %r15201, %r2358; + setp.lt.s32 %p1116, %r15200, %r2358; + setp.lt.s32 %p1117, %r15199, %r2358; + .loc 1 831 23 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:831:23 @[ 
c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + cp.async.wait_group 4; + bar.sync 0; + shl.b32 %r13473, %r15217, 14; + add.s32 %r11774, %r7585, %r13473; + .loc 1 674 22 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:674:22 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + shl.b32 %r13475, %r15215, 8; + add.s32 %r13476, %r7585, 98304; + add.s32 %r13477, %r13476, %r13475; + add.s32 %r13478, %r13477, %r767; + ld.shared.v2.b32 {%r13479, %r13480}, [%r13478]; + ld.shared.v2.b32 {%r13481, %r13482}, [%r13478+32]; + ld.shared.v2.b32 {%r13483, %r13484}, [%r13478+64]; + ld.shared.v2.b32 {%r13485, %r13486}, [%r13478+96]; + ld.shared.v2.b32 {%r13487, %r13488}, [%r13478+128]; + ld.shared.v2.b32 {%r13489, %r13490}, [%r13478+160]; + ld.shared.v2.b32 {%r13491, %r13492}, [%r13478+192]; + ld.shared.v2.b32 {%r13493, %r13494}, [%r13478+224]; + .loc 1 675 26 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:675:26 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + setp.eq.f32 %p1118, %r13479, 0fFF800000; + setp.eq.f32 %p1119, %r13480, 0fFF800000; + setp.eq.f32 %p1120, %r13481, 0fFF800000; + setp.eq.f32 %p1121, %r13482, 0fFF800000; + setp.eq.f32 %p1122, %r13483, 0fFF800000; + setp.eq.f32 %p1123, %r13484, 0fFF800000; + setp.eq.f32 %p1124, %r13485, 0fFF800000; + setp.eq.f32 %p1125, %r13486, 0fFF800000; + setp.eq.f32 %p1126, %r13487, 0fFF800000; + setp.eq.f32 %p1127, %r13488, 0fFF800000; + setp.eq.f32 %p1128, %r13489, 0fFF800000; + setp.eq.f32 %p1129, %r13490, 0fFF800000; + setp.eq.f32 %p1130, %r13491, 0fFF800000; + setp.eq.f32 %p1131, %r13492, 0fFF800000; + setp.eq.f32 %p1132, %r13493, 0fFF800000; + setp.eq.f32 %p1133, %r13494, 0fFF800000; + .loc 1 675 46 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:675:46 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + selp.f32 %r13495, 0f00000000, %r13479, %p1118; + selp.f32 %r13496, 0f00000000, %r13480, %p1119; + selp.f32 %r13497, 
0f00000000, %r13481, %p1120; + selp.f32 %r13498, 0f00000000, %r13482, %p1121; + selp.f32 %r13499, 0f00000000, %r13483, %p1122; + selp.f32 %r13500, 0f00000000, %r13484, %p1123; + selp.f32 %r13501, 0f00000000, %r13485, %p1124; + selp.f32 %r13502, 0f00000000, %r13486, %p1125; + selp.f32 %r13503, 0f00000000, %r13487, %p1126; + selp.f32 %r13504, 0f00000000, %r13488, %p1127; + selp.f32 %r13505, 0f00000000, %r13489, %p1128; + selp.f32 %r13506, 0f00000000, %r13490, %p1129; + selp.f32 %r13507, 0f00000000, %r13491, %p1130; + selp.f32 %r13508, 0f00000000, %r13492, %p1131; + selp.f32 %r13509, 0f00000000, %r13493, %p1132; + selp.f32 %r13510, 0f00000000, %r13494, %p1133; + .loc 1 676 20 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:676:20 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + shfl.sync.idx.b32 %r13511, %r11, 0, 31, -1; + wgmma.fence.sync.aligned; + shl.b32 %r13512, %r13511, 11; + and.b32 %r13513, %r13512, 8192; + add.s32 %r11733, %r7585, 99328; + add.s32 %r13514, %r13513, %r11733; + bfe.u32 %r13515, %r13514, 4, 14; + cvt.u64.u32 %rd1080, %r13515; + or.b64 %rd994, %rd1080, 4611686293372403712; + bfe.u32 %r13516, %r11774, 4, 14; + cvt.u64.u32 %rd1081, %r13516; + or.b64 %rd995, %rd1081, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r11317,%r11318,%r11319,%r11320,%r11321,%r11322,%r11323,%r11324,%r11325,%r11326,%r11327,%r11328,%r11329,%r11330,%r11331,%r11332,%r11333,%r11334,%r11335,%r11336,%r11337,%r11338,%r11339,%r11340,%r11341,%r11342,%r11343,%r11344,%r11345,%r11346,%r11347,%r11348}, %rd994, %rd995, 0, 1, 1, 0, 0; + // end inline asm + or.b32 %r13517, %r13513, 32; + add.s32 %r13518, %r13517, %r11733; + bfe.u32 %r13519, %r13518, 4, 14; + cvt.u64.u32 %rd1082, %r13519; + or.b64 %rd996, %rd1082, 4611686293372403712; + add.s32 %r13520, %r11774, 32; + bfe.u32 %r13521, %r13520, 4, 14; + cvt.u64.u32 %rd1083, %r13521; + or.b64 %rd997, %rd1083, 4611686293338849280; + mov.pred %p1043, -1; + // 
begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r11317,%r11318,%r11319,%r11320,%r11321,%r11322,%r11323,%r11324,%r11325,%r11326,%r11327,%r11328,%r11329,%r11330,%r11331,%r11332,%r11333,%r11334,%r11335,%r11336,%r11337,%r11338,%r11339,%r11340,%r11341,%r11342,%r11343,%r11344,%r11345,%r11346,%r11347,%r11348}, %rd996, %rd997, %p1043, 1, 1, 0, 0; + // end inline asm + or.b32 %r13522, %r13513, 64; + add.s32 %r13523, %r13522, %r11733; + bfe.u32 %r13524, %r13523, 4, 14; + cvt.u64.u32 %rd1084, %r13524; + or.b64 %rd998, %rd1084, 4611686293372403712; + add.s32 %r13525, %r11774, 64; + bfe.u32 %r13526, %r13525, 4, 14; + cvt.u64.u32 %rd1085, %r13526; + or.b64 %rd999, %rd1085, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r11317,%r11318,%r11319,%r11320,%r11321,%r11322,%r11323,%r11324,%r11325,%r11326,%r11327,%r11328,%r11329,%r11330,%r11331,%r11332,%r11333,%r11334,%r11335,%r11336,%r11337,%r11338,%r11339,%r11340,%r11341,%r11342,%r11343,%r11344,%r11345,%r11346,%r11347,%r11348}, %rd998, %rd999, %p1043, 1, 1, 0, 0; + // end inline asm + or.b32 %r13527, %r13513, 96; + add.s32 %r13528, %r13527, %r11733; + bfe.u32 %r13529, %r13528, 4, 14; + cvt.u64.u32 %rd1086, %r13529; + or.b64 %rd1000, %rd1086, 4611686293372403712; + add.s32 %r13530, %r11774, 96; + bfe.u32 %r13531, %r13530, 4, 14; + cvt.u64.u32 %rd1087, %r13531; + or.b64 %rd1001, %rd1087, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r11317,%r11318,%r11319,%r11320,%r11321,%r11322,%r11323,%r11324,%r11325,%r11326,%r11327,%r11328,%r11329,%r11330,%r11331,%r11332,%r11333,%r11334,%r11335,%r11336,%r11337,%r11338,%r11339,%r11340,%r11341,%r11342,%r11343,%r11344,%r11345,%r11346,%r11347,%r11348}, %rd1000, %rd1001, %p1043, 1, 1, 0, 0; + // end inline asm + or.b32 %r13532, %r13513, 16384; + add.s32 %r13533, %r13532, %r11733; + bfe.u32 %r13534, %r13533, 4, 14; + cvt.u64.u32 %rd1088, %r13534; + or.b64 %rd1002, %rd1088, 
4611686293372403712; + add.s32 %r13535, %r11774, 8192; + bfe.u32 %r13536, %r13535, 4, 14; + cvt.u64.u32 %rd1089, %r13536; + or.b64 %rd1003, %rd1089, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r11317,%r11318,%r11319,%r11320,%r11321,%r11322,%r11323,%r11324,%r11325,%r11326,%r11327,%r11328,%r11329,%r11330,%r11331,%r11332,%r11333,%r11334,%r11335,%r11336,%r11337,%r11338,%r11339,%r11340,%r11341,%r11342,%r11343,%r11344,%r11345,%r11346,%r11347,%r11348}, %rd1002, %rd1003, %p1043, 1, 1, 0, 0; + // end inline asm + or.b32 %r13537, %r13513, 16416; + add.s32 %r13538, %r13537, %r11733; + bfe.u32 %r13539, %r13538, 4, 14; + cvt.u64.u32 %rd1090, %r13539; + or.b64 %rd1004, %rd1090, 4611686293372403712; + add.s32 %r13540, %r11774, 8224; + bfe.u32 %r13541, %r13540, 4, 14; + cvt.u64.u32 %rd1091, %r13541; + or.b64 %rd1005, %rd1091, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r11317,%r11318,%r11319,%r11320,%r11321,%r11322,%r11323,%r11324,%r11325,%r11326,%r11327,%r11328,%r11329,%r11330,%r11331,%r11332,%r11333,%r11334,%r11335,%r11336,%r11337,%r11338,%r11339,%r11340,%r11341,%r11342,%r11343,%r11344,%r11345,%r11346,%r11347,%r11348}, %rd1004, %rd1005, %p1043, 1, 1, 0, 0; + // end inline asm + or.b32 %r13542, %r13513, 16448; + add.s32 %r13543, %r13542, %r11733; + bfe.u32 %r13544, %r13543, 4, 14; + cvt.u64.u32 %rd1092, %r13544; + or.b64 %rd1006, %rd1092, 4611686293372403712; + add.s32 %r13545, %r11774, 8256; + bfe.u32 %r13546, %r13545, 4, 14; + cvt.u64.u32 %rd1093, %r13546; + or.b64 %rd1007, %rd1093, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r11317,%r11318,%r11319,%r11320,%r11321,%r11322,%r11323,%r11324,%r11325,%r11326,%r11327,%r11328,%r11329,%r11330,%r11331,%r11332,%r11333,%r11334,%r11335,%r11336,%r11337,%r11338,%r11339,%r11340,%r11341,%r11342,%r11343,%r11344,%r11345,%r11346,%r11347,%r11348}, %rd1006, %rd1007, %p1043, 1, 1, 0, 
0; + // end inline asm + or.b32 %r13547, %r13513, 16480; + add.s32 %r13548, %r13547, %r11733; + bfe.u32 %r13549, %r13548, 4, 14; + cvt.u64.u32 %rd1094, %r13549; + or.b64 %rd1008, %rd1094, 4611686293372403712; + add.s32 %r13550, %r11774, 8288; + bfe.u32 %r13551, %r13550, 4, 14; + cvt.u64.u32 %rd1095, %r13551; + or.b64 %rd1009, %rd1095, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r11317,%r11318,%r11319,%r11320,%r11321,%r11322,%r11323,%r11324,%r11325,%r11326,%r11327,%r11328,%r11329,%r11330,%r11331,%r11332,%r11333,%r11334,%r11335,%r11336,%r11337,%r11338,%r11339,%r11340,%r11341,%r11342,%r11343,%r11344,%r11345,%r11346,%r11347,%r11348}, %rd1008, %rd1009, %p1043, 1, 1, 0, 0; + // end inline asm + wgmma.commit_group.sync.aligned; + mov.b32 %r11736, %r11774; + mov.b32 %r11734, %r11772; + mov.b32 %r11735, %r11772; + mov.b32 %r11737, %r11772; + mov.b32 %r11738, %r11772; + // begin inline asm + // wait for regs: %r11317,%r11318,%r11319,%r11320,%r11321,%r11322,%r11323,%r11324,%r11325,%r11326,%r11327,%r11328,%r11329,%r11330,%r11331,%r11332,%r11333,%r11334,%r11335,%r11336,%r11337,%r11338,%r11339,%r11340,%r11341,%r11342,%r11343,%r11344,%r11345,%r11346,%r11347,%r11348,%r11733,%r11734,%r11735,%r11736,%r11737,%r11738 + wgmma.wait_group.sync.aligned 0; + // end inline asm + .loc 1 678 15 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:678:15 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + mul.f32 %r13552, %r11317, 0f3DB504F3; + mul.f32 %r13553, %r11318, 0f3DB504F3; + mul.f32 %r13554, %r11319, 0f3DB504F3; + mul.f32 %r13555, %r11320, 0f3DB504F3; + mul.f32 %r13556, %r11321, 0f3DB504F3; + mul.f32 %r13557, %r11322, 0f3DB504F3; + mul.f32 %r13558, %r11323, 0f3DB504F3; + mul.f32 %r13559, %r11324, 0f3DB504F3; + mul.f32 %r13560, %r11325, 0f3DB504F3; + mul.f32 %r13561, %r11326, 0f3DB504F3; + mul.f32 %r13562, %r11327, 0f3DB504F3; + mul.f32 %r13563, %r11328, 0f3DB504F3; + mul.f32 %r13564, %r11329, 0f3DB504F3; + 
mul.f32 %r13565, %r11330, 0f3DB504F3; + mul.f32 %r13566, %r11331, 0f3DB504F3; + mul.f32 %r13567, %r11332, 0f3DB504F3; + mul.f32 %r13568, %r11333, 0f3DB504F3; + mul.f32 %r13569, %r11334, 0f3DB504F3; + mul.f32 %r13570, %r11335, 0f3DB504F3; + mul.f32 %r13571, %r11336, 0f3DB504F3; + mul.f32 %r13572, %r11337, 0f3DB504F3; + mul.f32 %r13573, %r11338, 0f3DB504F3; + mul.f32 %r13574, %r11339, 0f3DB504F3; + mul.f32 %r13575, %r11340, 0f3DB504F3; + mul.f32 %r13576, %r11341, 0f3DB504F3; + mul.f32 %r13577, %r11342, 0f3DB504F3; + mul.f32 %r13578, %r11343, 0f3DB504F3; + mul.f32 %r13579, %r11344, 0f3DB504F3; + mul.f32 %r13580, %r11345, 0f3DB504F3; + mul.f32 %r13581, %r11346, 0f3DB504F3; + mul.f32 %r13582, %r11347, 0f3DB504F3; + mul.f32 %r13583, %r11348, 0f3DB504F3; + .loc 1 739 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:739:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + mul.f32 %r13584, %r13552, 0f3FB8AA3B; + .loc 1 692 78 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:692:78 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + selp.f32 %r13585, %r13584, 0fFF800000, %p1102; + .loc 1 739 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:739:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + mul.f32 %r13586, %r13553, 0f3FB8AA3B; + .loc 1 692 78 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:692:78 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + selp.f32 %r13587, %r13586, 0fFF800000, %p1103; + .loc 1 739 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:739:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + mul.f32 %r13588, %r13554, 0f3FB8AA3B; + .loc 1 692 78 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:692:78 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + selp.f32 %r13589, %r13588, 0fFF800000, %p1102; + .loc 1 739 27 // 
c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:739:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + mul.f32 %r13590, %r13555, 0f3FB8AA3B; + .loc 1 692 78 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:692:78 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + selp.f32 %r13591, %r13590, 0fFF800000, %p1103; + .loc 1 739 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:739:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + mul.f32 %r13592, %r13556, 0f3FB8AA3B; + .loc 1 692 78 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:692:78 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + selp.f32 %r13593, %r13592, 0fFF800000, %p1104; + .loc 1 739 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:739:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + mul.f32 %r13594, %r13557, 0f3FB8AA3B; + .loc 1 692 78 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:692:78 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + selp.f32 %r13595, %r13594, 0fFF800000, %p1105; + .loc 1 739 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:739:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + mul.f32 %r13596, %r13558, 0f3FB8AA3B; + .loc 1 692 78 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:692:78 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + selp.f32 %r13597, %r13596, 0fFF800000, %p1104; + .loc 1 739 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:739:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + mul.f32 %r13598, %r13559, 0f3FB8AA3B; + .loc 1 692 78 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:692:78 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + selp.f32 %r13599, %r13598, 0fFF800000, %p1105; + .loc 1 739 27 // 
c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:739:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + mul.f32 %r13600, %r13560, 0f3FB8AA3B; + .loc 1 692 78 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:692:78 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + selp.f32 %r13601, %r13600, 0fFF800000, %p1106; + .loc 1 739 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:739:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + mul.f32 %r13602, %r13561, 0f3FB8AA3B; + .loc 1 692 78 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:692:78 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + selp.f32 %r13603, %r13602, 0fFF800000, %p1107; + .loc 1 739 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:739:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + mul.f32 %r13604, %r13562, 0f3FB8AA3B; + .loc 1 692 78 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:692:78 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + selp.f32 %r13605, %r13604, 0fFF800000, %p1106; + .loc 1 739 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:739:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + mul.f32 %r13606, %r13563, 0f3FB8AA3B; + .loc 1 692 78 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:692:78 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + selp.f32 %r13607, %r13606, 0fFF800000, %p1107; + .loc 1 739 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:739:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + mul.f32 %r13608, %r13564, 0f3FB8AA3B; + .loc 1 692 78 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:692:78 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + selp.f32 %r13609, %r13608, 0fFF800000, %p1108; + .loc 1 739 27 // 
c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:739:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + mul.f32 %r13610, %r13565, 0f3FB8AA3B; + .loc 1 692 78 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:692:78 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + selp.f32 %r13611, %r13610, 0fFF800000, %p1109; + .loc 1 739 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:739:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + mul.f32 %r13612, %r13566, 0f3FB8AA3B; + .loc 1 692 78 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:692:78 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + selp.f32 %r13613, %r13612, 0fFF800000, %p1108; + .loc 1 739 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:739:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + mul.f32 %r13614, %r13567, 0f3FB8AA3B; + .loc 1 692 78 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:692:78 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + selp.f32 %r13615, %r13614, 0fFF800000, %p1109; + .loc 1 739 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:739:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + mul.f32 %r13616, %r13568, 0f3FB8AA3B; + .loc 1 692 78 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:692:78 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + selp.f32 %r13617, %r13616, 0fFF800000, %p1110; + .loc 1 739 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:739:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + mul.f32 %r13618, %r13569, 0f3FB8AA3B; + .loc 1 692 78 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:692:78 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + selp.f32 %r13619, %r13618, 0fFF800000, %p1111; + .loc 1 739 27 // 
c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:739:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + mul.f32 %r13620, %r13570, 0f3FB8AA3B; + .loc 1 692 78 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:692:78 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + selp.f32 %r13621, %r13620, 0fFF800000, %p1110; + .loc 1 739 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:739:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + mul.f32 %r13622, %r13571, 0f3FB8AA3B; + .loc 1 692 78 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:692:78 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + selp.f32 %r13623, %r13622, 0fFF800000, %p1111; + .loc 1 739 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:739:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + mul.f32 %r13624, %r13572, 0f3FB8AA3B; + .loc 1 692 78 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:692:78 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + selp.f32 %r13625, %r13624, 0fFF800000, %p1112; + .loc 1 739 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:739:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + mul.f32 %r13626, %r13573, 0f3FB8AA3B; + .loc 1 692 78 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:692:78 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + selp.f32 %r13627, %r13626, 0fFF800000, %p1113; + .loc 1 739 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:739:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + mul.f32 %r13628, %r13574, 0f3FB8AA3B; + .loc 1 692 78 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:692:78 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + selp.f32 %r13629, %r13628, 0fFF800000, %p1112; + .loc 1 739 27 // 
c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:739:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + mul.f32 %r13630, %r13575, 0f3FB8AA3B; + .loc 1 692 78 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:692:78 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + selp.f32 %r13631, %r13630, 0fFF800000, %p1113; + .loc 1 739 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:739:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + mul.f32 %r13632, %r13576, 0f3FB8AA3B; + .loc 1 692 78 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:692:78 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + selp.f32 %r13633, %r13632, 0fFF800000, %p1114; + .loc 1 739 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:739:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + mul.f32 %r13634, %r13577, 0f3FB8AA3B; + .loc 1 692 78 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:692:78 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + selp.f32 %r13635, %r13634, 0fFF800000, %p1115; + .loc 1 739 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:739:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + mul.f32 %r13636, %r13578, 0f3FB8AA3B; + .loc 1 692 78 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:692:78 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + selp.f32 %r13637, %r13636, 0fFF800000, %p1114; + .loc 1 739 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:739:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + mul.f32 %r13638, %r13579, 0f3FB8AA3B; + .loc 1 692 78 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:692:78 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + selp.f32 %r13639, %r13638, 0fFF800000, %p1115; + .loc 1 739 27 // 
c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:739:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + mul.f32 %r13640, %r13580, 0f3FB8AA3B; + .loc 1 692 78 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:692:78 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + selp.f32 %r13641, %r13640, 0fFF800000, %p1116; + .loc 1 739 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:739:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + mul.f32 %r13642, %r13581, 0f3FB8AA3B; + .loc 1 692 78 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:692:78 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + selp.f32 %r13643, %r13642, 0fFF800000, %p1117; + .loc 1 739 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:739:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + mul.f32 %r13644, %r13582, 0f3FB8AA3B; + .loc 1 692 78 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:692:78 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + selp.f32 %r13645, %r13644, 0fFF800000, %p1116; + .loc 1 739 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:739:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + mul.f32 %r13646, %r13583, 0f3FB8AA3B; + .loc 1 692 78 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:692:78 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + selp.f32 %r13647, %r13646, 0fFF800000, %p1117; + .loc 1 740 40 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:740:40 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + sub.f32 %r13648, %r13585, %r13495; + sub.f32 %r13649, %r13587, %r13496; + sub.f32 %r13650, %r13589, %r13495; + sub.f32 %r13651, %r13591, %r13496; + sub.f32 %r13652, %r13593, %r13497; + sub.f32 %r13653, %r13595, %r13498; + sub.f32 %r13654, %r13597, %r13497; + sub.f32 %r13655, %r13599, %r13498; + sub.f32 
%r13656, %r13601, %r13499; + sub.f32 %r13657, %r13603, %r13500; + sub.f32 %r13658, %r13605, %r13499; + sub.f32 %r13659, %r13607, %r13500; + sub.f32 %r13660, %r13609, %r13501; + sub.f32 %r13661, %r13611, %r13502; + sub.f32 %r13662, %r13613, %r13501; + sub.f32 %r13663, %r13615, %r13502; + sub.f32 %r13664, %r13617, %r13503; + sub.f32 %r13665, %r13619, %r13504; + sub.f32 %r13666, %r13621, %r13503; + sub.f32 %r13667, %r13623, %r13504; + sub.f32 %r13668, %r13625, %r13505; + sub.f32 %r13669, %r13627, %r13506; + sub.f32 %r13670, %r13629, %r13505; + sub.f32 %r13671, %r13631, %r13506; + sub.f32 %r13672, %r13633, %r13507; + sub.f32 %r13673, %r13635, %r13508; + sub.f32 %r13674, %r13637, %r13507; + sub.f32 %r13675, %r13639, %r13508; + sub.f32 %r13676, %r13641, %r13509; + sub.f32 %r13677, %r13643, %r13510; + sub.f32 %r13678, %r13645, %r13509; + sub.f32 %r13679, %r13647, %r13510; + .loc 1 740 22 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:740:22 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + ex2.approx.ftz.f32 %r13680, %r13648; + ex2.approx.ftz.f32 %r13681, %r13649; + ex2.approx.ftz.f32 %r13682, %r13650; + ex2.approx.ftz.f32 %r13683, %r13651; + ex2.approx.ftz.f32 %r13684, %r13652; + ex2.approx.ftz.f32 %r13685, %r13653; + ex2.approx.ftz.f32 %r13686, %r13654; + ex2.approx.ftz.f32 %r13687, %r13655; + ex2.approx.ftz.f32 %r13688, %r13656; + ex2.approx.ftz.f32 %r13689, %r13657; + ex2.approx.ftz.f32 %r13690, %r13658; + ex2.approx.ftz.f32 %r13691, %r13659; + ex2.approx.ftz.f32 %r13692, %r13660; + ex2.approx.ftz.f32 %r13693, %r13661; + ex2.approx.ftz.f32 %r13694, %r13662; + ex2.approx.ftz.f32 %r13695, %r13663; + ex2.approx.ftz.f32 %r13696, %r13664; + ex2.approx.ftz.f32 %r13697, %r13665; + ex2.approx.ftz.f32 %r13698, %r13666; + ex2.approx.ftz.f32 %r13699, %r13667; + ex2.approx.ftz.f32 %r13700, %r13668; + ex2.approx.ftz.f32 %r13701, %r13669; + ex2.approx.ftz.f32 %r13702, %r13670; + ex2.approx.ftz.f32 %r13703, %r13671; + ex2.approx.ftz.f32 %r13704, 
%r13672; + ex2.approx.ftz.f32 %r13705, %r13673; + ex2.approx.ftz.f32 %r13706, %r13674; + ex2.approx.ftz.f32 %r13707, %r13675; + ex2.approx.ftz.f32 %r13708, %r13676; + ex2.approx.ftz.f32 %r13709, %r13677; + ex2.approx.ftz.f32 %r13710, %r13678; + ex2.approx.ftz.f32 %r13711, %r13679; + .loc 1 833 23 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:833:23 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + add.s32 %r13712, %r7585, 49152; + add.s32 %r12820, %r13712, %r13473; + .loc 1 744 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:744:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + cvt.rn.bf16x2.f32 %r11905, %r13681, %r13680; + cvt.rn.bf16x2.f32 %r11906, %r13683, %r13682; + cvt.rn.bf16x2.f32 %r11907, %r13685, %r13684; + cvt.rn.bf16x2.f32 %r11908, %r13687, %r13686; + cvt.rn.bf16x2.f32 %r12037, %r13689, %r13688; + cvt.rn.bf16x2.f32 %r12038, %r13691, %r13690; + cvt.rn.bf16x2.f32 %r12039, %r13693, %r13692; + cvt.rn.bf16x2.f32 %r12040, %r13695, %r13694; + cvt.rn.bf16x2.f32 %r12169, %r13697, %r13696; + cvt.rn.bf16x2.f32 %r12170, %r13699, %r13698; + cvt.rn.bf16x2.f32 %r12171, %r13701, %r13700; + cvt.rn.bf16x2.f32 %r12172, %r13703, %r13702; + cvt.rn.bf16x2.f32 %r12301, %r13705, %r13704; + cvt.rn.bf16x2.f32 %r12302, %r13707, %r13706; + cvt.rn.bf16x2.f32 %r12303, %r13709, %r13708; + cvt.rn.bf16x2.f32 %r12304, %r13711, %r13710; + .loc 1 744 43 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:744:43 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + wgmma.fence.sync.aligned; + bfe.u32 %r13713, %r12820, 4, 14; + cvt.u64.u32 %rd1096, %r13713; + or.b64 %rd1010, %rd1096, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 
{%r14924,%r14925,%r14926,%r14927,%r14928,%r14929,%r14930,%r14931,%r14932,%r14933,%r14934,%r14935,%r14936,%r14937,%r14938,%r14939,%r14940,%r14941,%r14942,%r14943,%r14944,%r14945,%r14946,%r14947,%r14948,%r14949,%r14950,%r14951,%r14952,%r14953,%r14954,%r14955,%r14956,%r14957,%r14958,%r14959,%r14960,%r14961,%r14962,%r14963,%r14964,%r14965,%r14966,%r14967,%r14968,%r14969,%r14970,%r14971,%r14972,%r14973,%r14974,%r14975,%r14976,%r14977,%r14978,%r14979,%r14980,%r14981,%r14982,%r14983,%r14984,%r14985,%r14986,%r14987}, {%r11905,%r11906,%r11907,%r11908}, %rd1010, %p1043, 1, 1, 1; + // end inline asm + add.s32 %r13714, %r12820, 2048; + bfe.u32 %r13715, %r13714, 4, 14; + cvt.u64.u32 %rd1097, %r13715; + or.b64 %rd1011, %rd1097, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r14924,%r14925,%r14926,%r14927,%r14928,%r14929,%r14930,%r14931,%r14932,%r14933,%r14934,%r14935,%r14936,%r14937,%r14938,%r14939,%r14940,%r14941,%r14942,%r14943,%r14944,%r14945,%r14946,%r14947,%r14948,%r14949,%r14950,%r14951,%r14952,%r14953,%r14954,%r14955,%r14956,%r14957,%r14958,%r14959,%r14960,%r14961,%r14962,%r14963,%r14964,%r14965,%r14966,%r14967,%r14968,%r14969,%r14970,%r14971,%r14972,%r14973,%r14974,%r14975,%r14976,%r14977,%r14978,%r14979,%r14980,%r14981,%r14982,%r14983,%r14984,%r14985,%r14986,%r14987}, {%r12037,%r12038,%r12039,%r12040}, %rd1011, %p1043, 1, 1, 1; + // end inline asm + add.s32 %r13716, %r12820, 4096; + bfe.u32 %r13717, %r13716, 4, 14; + cvt.u64.u32 %rd1098, %r13717; + or.b64 %rd1012, %rd1098, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 
{%r14924,%r14925,%r14926,%r14927,%r14928,%r14929,%r14930,%r14931,%r14932,%r14933,%r14934,%r14935,%r14936,%r14937,%r14938,%r14939,%r14940,%r14941,%r14942,%r14943,%r14944,%r14945,%r14946,%r14947,%r14948,%r14949,%r14950,%r14951,%r14952,%r14953,%r14954,%r14955,%r14956,%r14957,%r14958,%r14959,%r14960,%r14961,%r14962,%r14963,%r14964,%r14965,%r14966,%r14967,%r14968,%r14969,%r14970,%r14971,%r14972,%r14973,%r14974,%r14975,%r14976,%r14977,%r14978,%r14979,%r14980,%r14981,%r14982,%r14983,%r14984,%r14985,%r14986,%r14987}, {%r12169,%r12170,%r12171,%r12172}, %rd1012, %p1043, 1, 1, 1; + // end inline asm + add.s32 %r13718, %r12820, 6144; + bfe.u32 %r13719, %r13718, 4, 14; + cvt.u64.u32 %rd1099, %r13719; + or.b64 %rd1013, %rd1099, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r14924,%r14925,%r14926,%r14927,%r14928,%r14929,%r14930,%r14931,%r14932,%r14933,%r14934,%r14935,%r14936,%r14937,%r14938,%r14939,%r14940,%r14941,%r14942,%r14943,%r14944,%r14945,%r14946,%r14947,%r14948,%r14949,%r14950,%r14951,%r14952,%r14953,%r14954,%r14955,%r14956,%r14957,%r14958,%r14959,%r14960,%r14961,%r14962,%r14963,%r14964,%r14965,%r14966,%r14967,%r14968,%r14969,%r14970,%r14971,%r14972,%r14973,%r14974,%r14975,%r14976,%r14977,%r14978,%r14979,%r14980,%r14981,%r14982,%r14983,%r14984,%r14985,%r14986,%r14987}, {%r12301,%r12302,%r12303,%r12304}, %rd1013, %p1043, 1, 1, 1; + // end inline asm + wgmma.commit_group.sync.aligned; + .loc 1 748 21 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:748:21 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + add.s32 %r13720, %r7585, 98816; + add.s32 %r13721, %r13720, %r13475; + add.s32 %r13722, %r13721, %r767; + ld.shared.v2.b32 {%r13723, %r13724}, [%r13722]; + ld.shared.v2.b32 {%r13725, %r13726}, [%r13722+32]; + ld.shared.v2.b32 {%r13727, %r13728}, [%r13722+64]; + ld.shared.v2.b32 {%r13729, %r13730}, [%r13722+96]; + ld.shared.v2.b32 {%r13731, %r13732}, [%r13722+128]; + ld.shared.v2.b32 
{%r13733, %r13734}, [%r13722+160]; + ld.shared.v2.b32 {%r13735, %r13736}, [%r13722+192]; + ld.shared.v2.b32 {%r13737, %r13738}, [%r13722+224]; + .loc 1 750 20 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:750:20 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + wgmma.fence.sync.aligned; + add.s32 %r12817, %r7585, 132096; + add.s32 %r13739, %r13513, %r12817; + bfe.u32 %r13740, %r13739, 4, 14; + cvt.u64.u32 %rd1100, %r13740; + or.b64 %rd1014, %rd1100, 4611686293372403712; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r12401,%r12402,%r12403,%r12404,%r12405,%r12406,%r12407,%r12408,%r12409,%r12410,%r12411,%r12412,%r12413,%r12414,%r12415,%r12416,%r12417,%r12418,%r12419,%r12420,%r12421,%r12422,%r12423,%r12424,%r12425,%r12426,%r12427,%r12428,%r12429,%r12430,%r12431,%r12432}, %rd1014, %rd1010, 0, 1, 1, 0, 0; + // end inline asm + add.s32 %r13741, %r13517, %r12817; + bfe.u32 %r13742, %r13741, 4, 14; + cvt.u64.u32 %rd1101, %r13742; + or.b64 %rd1016, %rd1101, 4611686293372403712; + add.s32 %r13743, %r12820, 32; + bfe.u32 %r13744, %r13743, 4, 14; + cvt.u64.u32 %rd1102, %r13744; + or.b64 %rd1017, %rd1102, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r12401,%r12402,%r12403,%r12404,%r12405,%r12406,%r12407,%r12408,%r12409,%r12410,%r12411,%r12412,%r12413,%r12414,%r12415,%r12416,%r12417,%r12418,%r12419,%r12420,%r12421,%r12422,%r12423,%r12424,%r12425,%r12426,%r12427,%r12428,%r12429,%r12430,%r12431,%r12432}, %rd1016, %rd1017, %p1043, 1, 1, 0, 0; + // end inline asm + add.s32 %r13745, %r13522, %r12817; + bfe.u32 %r13746, %r13745, 4, 14; + cvt.u64.u32 %rd1103, %r13746; + or.b64 %rd1018, %rd1103, 4611686293372403712; + add.s32 %r13747, %r12820, 64; + bfe.u32 %r13748, %r13747, 4, 14; + cvt.u64.u32 %rd1104, %r13748; + or.b64 %rd1019, %rd1104, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 
{%r12401,%r12402,%r12403,%r12404,%r12405,%r12406,%r12407,%r12408,%r12409,%r12410,%r12411,%r12412,%r12413,%r12414,%r12415,%r12416,%r12417,%r12418,%r12419,%r12420,%r12421,%r12422,%r12423,%r12424,%r12425,%r12426,%r12427,%r12428,%r12429,%r12430,%r12431,%r12432}, %rd1018, %rd1019, %p1043, 1, 1, 0, 0; + // end inline asm + add.s32 %r13749, %r13527, %r12817; + bfe.u32 %r13750, %r13749, 4, 14; + cvt.u64.u32 %rd1105, %r13750; + or.b64 %rd1020, %rd1105, 4611686293372403712; + add.s32 %r13751, %r12820, 96; + bfe.u32 %r13752, %r13751, 4, 14; + cvt.u64.u32 %rd1106, %r13752; + or.b64 %rd1021, %rd1106, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r12401,%r12402,%r12403,%r12404,%r12405,%r12406,%r12407,%r12408,%r12409,%r12410,%r12411,%r12412,%r12413,%r12414,%r12415,%r12416,%r12417,%r12418,%r12419,%r12420,%r12421,%r12422,%r12423,%r12424,%r12425,%r12426,%r12427,%r12428,%r12429,%r12430,%r12431,%r12432}, %rd1020, %rd1021, %p1043, 1, 1, 0, 0; + // end inline asm + add.s32 %r13753, %r13532, %r12817; + bfe.u32 %r13754, %r13753, 4, 14; + cvt.u64.u32 %rd1107, %r13754; + or.b64 %rd1022, %rd1107, 4611686293372403712; + add.s32 %r13755, %r12820, 8192; + bfe.u32 %r13756, %r13755, 4, 14; + cvt.u64.u32 %rd1108, %r13756; + or.b64 %rd1023, %rd1108, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r12401,%r12402,%r12403,%r12404,%r12405,%r12406,%r12407,%r12408,%r12409,%r12410,%r12411,%r12412,%r12413,%r12414,%r12415,%r12416,%r12417,%r12418,%r12419,%r12420,%r12421,%r12422,%r12423,%r12424,%r12425,%r12426,%r12427,%r12428,%r12429,%r12430,%r12431,%r12432}, %rd1022, %rd1023, %p1043, 1, 1, 0, 0; + // end inline asm + add.s32 %r13757, %r13537, %r12817; + bfe.u32 %r13758, %r13757, 4, 14; + cvt.u64.u32 %rd1109, %r13758; + or.b64 %rd1024, %rd1109, 4611686293372403712; + add.s32 %r13759, %r12820, 8224; + bfe.u32 %r13760, %r13759, 4, 14; + cvt.u64.u32 %rd1110, %r13760; + or.b64 %rd1025, %rd1110, 
4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r12401,%r12402,%r12403,%r12404,%r12405,%r12406,%r12407,%r12408,%r12409,%r12410,%r12411,%r12412,%r12413,%r12414,%r12415,%r12416,%r12417,%r12418,%r12419,%r12420,%r12421,%r12422,%r12423,%r12424,%r12425,%r12426,%r12427,%r12428,%r12429,%r12430,%r12431,%r12432}, %rd1024, %rd1025, %p1043, 1, 1, 0, 0; + // end inline asm + add.s32 %r13761, %r13542, %r12817; + bfe.u32 %r13762, %r13761, 4, 14; + cvt.u64.u32 %rd1111, %r13762; + or.b64 %rd1026, %rd1111, 4611686293372403712; + add.s32 %r13763, %r12820, 8256; + bfe.u32 %r13764, %r13763, 4, 14; + cvt.u64.u32 %rd1112, %r13764; + or.b64 %rd1027, %rd1112, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r12401,%r12402,%r12403,%r12404,%r12405,%r12406,%r12407,%r12408,%r12409,%r12410,%r12411,%r12412,%r12413,%r12414,%r12415,%r12416,%r12417,%r12418,%r12419,%r12420,%r12421,%r12422,%r12423,%r12424,%r12425,%r12426,%r12427,%r12428,%r12429,%r12430,%r12431,%r12432}, %rd1026, %rd1027, %p1043, 1, 1, 0, 0; + // end inline asm + add.s32 %r13765, %r13547, %r12817; + bfe.u32 %r13766, %r13765, 4, 14; + cvt.u64.u32 %rd1113, %r13766; + or.b64 %rd1028, %rd1113, 4611686293372403712; + add.s32 %r13767, %r12820, 8288; + bfe.u32 %r13768, %r13767, 4, 14; + cvt.u64.u32 %rd1114, %r13768; + or.b64 %rd1029, %rd1114, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r12401,%r12402,%r12403,%r12404,%r12405,%r12406,%r12407,%r12408,%r12409,%r12410,%r12411,%r12412,%r12413,%r12414,%r12415,%r12416,%r12417,%r12418,%r12419,%r12420,%r12421,%r12422,%r12423,%r12424,%r12425,%r12426,%r12427,%r12428,%r12429,%r12430,%r12431,%r12432}, %rd1028, %rd1029, %p1043, 1, 1, 0, 0; + // end inline asm + wgmma.commit_group.sync.aligned; + mov.b32 %r12822, %r11772; + mov.b32 %r12818, %r11772; + mov.b32 %r12819, %r11772; + mov.b32 %r12821, %r11772; + // begin inline asm + // wait for regs: 
%r12401,%r12402,%r12403,%r12404,%r12405,%r12406,%r12407,%r12408,%r12409,%r12410,%r12411,%r12412,%r12413,%r12414,%r12415,%r12416,%r12417,%r12418,%r12419,%r12420,%r12421,%r12422,%r12423,%r12424,%r12425,%r12426,%r12427,%r12428,%r12429,%r12430,%r12431,%r12432,%r12817,%r12818,%r12819,%r12820,%r12821,%r12822 + wgmma.wait_group.sync.aligned 0; + // end inline asm + .loc 1 751 22 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:751:22 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + sub.f32 %r13769, %r12401, %r13723; + sub.f32 %r13770, %r12402, %r13724; + sub.f32 %r13771, %r12403, %r13723; + sub.f32 %r13772, %r12404, %r13724; + sub.f32 %r13773, %r12405, %r13725; + sub.f32 %r13774, %r12406, %r13726; + sub.f32 %r13775, %r12407, %r13725; + sub.f32 %r13776, %r12408, %r13726; + sub.f32 %r13777, %r12409, %r13727; + sub.f32 %r13778, %r12410, %r13728; + sub.f32 %r13779, %r12411, %r13727; + sub.f32 %r13780, %r12412, %r13728; + sub.f32 %r13781, %r12413, %r13729; + sub.f32 %r13782, %r12414, %r13730; + sub.f32 %r13783, %r12415, %r13729; + sub.f32 %r13784, %r12416, %r13730; + sub.f32 %r13785, %r12417, %r13731; + sub.f32 %r13786, %r12418, %r13732; + sub.f32 %r13787, %r12419, %r13731; + sub.f32 %r13788, %r12420, %r13732; + sub.f32 %r13789, %r12421, %r13733; + sub.f32 %r13790, %r12422, %r13734; + sub.f32 %r13791, %r12423, %r13733; + sub.f32 %r13792, %r12424, %r13734; + sub.f32 %r13793, %r12425, %r13735; + sub.f32 %r13794, %r12426, %r13736; + sub.f32 %r13795, %r12427, %r13735; + sub.f32 %r13796, %r12428, %r13736; + sub.f32 %r13797, %r12429, %r13737; + sub.f32 %r13798, %r12430, %r13738; + sub.f32 %r13799, %r12431, %r13737; + sub.f32 %r13800, %r12432, %r13738; + .loc 1 751 16 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:751:16 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + mul.f32 %r13801, %r13680, %r13769; + mul.f32 %r13802, %r13681, %r13770; + mul.f32 %r13803, %r13682, %r13771; + mul.f32 %r13804, %r13683, %r13772; + 
mul.f32 %r13805, %r13684, %r13773; + mul.f32 %r13806, %r13685, %r13774; + mul.f32 %r13807, %r13686, %r13775; + mul.f32 %r13808, %r13687, %r13776; + mul.f32 %r13809, %r13688, %r13777; + mul.f32 %r13810, %r13689, %r13778; + mul.f32 %r13811, %r13690, %r13779; + mul.f32 %r13812, %r13691, %r13780; + mul.f32 %r13813, %r13692, %r13781; + mul.f32 %r13814, %r13693, %r13782; + mul.f32 %r13815, %r13694, %r13783; + mul.f32 %r13816, %r13695, %r13784; + mul.f32 %r13817, %r13696, %r13785; + mul.f32 %r13818, %r13697, %r13786; + mul.f32 %r13819, %r13698, %r13787; + mul.f32 %r13820, %r13699, %r13788; + mul.f32 %r13821, %r13700, %r13789; + mul.f32 %r13822, %r13701, %r13790; + mul.f32 %r13823, %r13702, %r13791; + mul.f32 %r13824, %r13703, %r13792; + mul.f32 %r13825, %r13704, %r13793; + mul.f32 %r13826, %r13705, %r13794; + mul.f32 %r13827, %r13706, %r13795; + mul.f32 %r13828, %r13707, %r13796; + mul.f32 %r13829, %r13708, %r13797; + mul.f32 %r13830, %r13709, %r13798; + mul.f32 %r13831, %r13710, %r13799; + mul.f32 %r13832, %r13711, %r13800; + .loc 1 775 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:775:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + cvt.rn.bf16.f32 %rs193, %r13801; + .loc 1 759 70 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:759:70 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + selp.b16 %rs194, %rs193, 0x0000, %p1102; + .loc 1 775 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:775:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + cvt.rn.bf16.f32 %rs195, %r13802; + .loc 1 759 70 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:759:70 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + selp.b16 %rs196, %rs195, 0x0000, %p1103; + .loc 1 775 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:775:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + cvt.rn.bf16.f32 %rs197, %r13803; + .loc 1 759 70 // 
c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:759:70 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + selp.b16 %rs198, %rs197, 0x0000, %p1102; + .loc 1 775 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:775:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + cvt.rn.bf16.f32 %rs199, %r13804; + .loc 1 759 70 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:759:70 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + selp.b16 %rs200, %rs199, 0x0000, %p1103; + .loc 1 775 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:775:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + cvt.rn.bf16.f32 %rs201, %r13805; + .loc 1 759 70 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:759:70 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + selp.b16 %rs202, %rs201, 0x0000, %p1104; + .loc 1 775 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:775:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + cvt.rn.bf16.f32 %rs203, %r13806; + .loc 1 759 70 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:759:70 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + selp.b16 %rs204, %rs203, 0x0000, %p1105; + .loc 1 775 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:775:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + cvt.rn.bf16.f32 %rs205, %r13807; + .loc 1 759 70 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:759:70 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + selp.b16 %rs206, %rs205, 0x0000, %p1104; + .loc 1 775 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:775:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + cvt.rn.bf16.f32 %rs207, %r13808; + .loc 1 759 70 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:759:70 @[ 
c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + selp.b16 %rs208, %rs207, 0x0000, %p1105; + .loc 1 775 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:775:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + cvt.rn.bf16.f32 %rs209, %r13809; + .loc 1 759 70 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:759:70 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + selp.b16 %rs210, %rs209, 0x0000, %p1106; + .loc 1 775 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:775:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + cvt.rn.bf16.f32 %rs211, %r13810; + .loc 1 759 70 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:759:70 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + selp.b16 %rs212, %rs211, 0x0000, %p1107; + .loc 1 775 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:775:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + cvt.rn.bf16.f32 %rs213, %r13811; + .loc 1 759 70 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:759:70 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + selp.b16 %rs214, %rs213, 0x0000, %p1106; + .loc 1 775 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:775:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + cvt.rn.bf16.f32 %rs215, %r13812; + .loc 1 759 70 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:759:70 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + selp.b16 %rs216, %rs215, 0x0000, %p1107; + .loc 1 775 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:775:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + cvt.rn.bf16.f32 %rs217, %r13813; + .loc 1 759 70 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:759:70 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + selp.b16 %rs218, %rs217, 0x0000, %p1108; + 
.loc 1 775 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:775:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + cvt.rn.bf16.f32 %rs219, %r13814; + .loc 1 759 70 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:759:70 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + selp.b16 %rs220, %rs219, 0x0000, %p1109; + .loc 1 775 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:775:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + cvt.rn.bf16.f32 %rs221, %r13815; + .loc 1 759 70 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:759:70 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + selp.b16 %rs222, %rs221, 0x0000, %p1108; + .loc 1 775 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:775:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + cvt.rn.bf16.f32 %rs223, %r13816; + .loc 1 759 70 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:759:70 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + selp.b16 %rs224, %rs223, 0x0000, %p1109; + .loc 1 775 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:775:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + cvt.rn.bf16.f32 %rs225, %r13817; + .loc 1 759 70 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:759:70 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + selp.b16 %rs226, %rs225, 0x0000, %p1110; + .loc 1 775 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:775:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + cvt.rn.bf16.f32 %rs227, %r13818; + .loc 1 759 70 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:759:70 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + selp.b16 %rs228, %rs227, 0x0000, %p1111; + .loc 1 775 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:775:24 @[ 
c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + cvt.rn.bf16.f32 %rs229, %r13819; + .loc 1 759 70 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:759:70 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + selp.b16 %rs230, %rs229, 0x0000, %p1110; + .loc 1 775 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:775:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + cvt.rn.bf16.f32 %rs231, %r13820; + .loc 1 759 70 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:759:70 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + selp.b16 %rs232, %rs231, 0x0000, %p1111; + .loc 1 775 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:775:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + cvt.rn.bf16.f32 %rs233, %r13821; + .loc 1 759 70 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:759:70 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + selp.b16 %rs234, %rs233, 0x0000, %p1112; + .loc 1 775 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:775:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + cvt.rn.bf16.f32 %rs235, %r13822; + .loc 1 759 70 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:759:70 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + selp.b16 %rs236, %rs235, 0x0000, %p1113; + .loc 1 775 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:775:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + cvt.rn.bf16.f32 %rs237, %r13823; + .loc 1 759 70 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:759:70 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + selp.b16 %rs238, %rs237, 0x0000, %p1112; + .loc 1 775 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:775:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + cvt.rn.bf16.f32 %rs239, %r13824; + .loc 1 
759 70 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:759:70 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + selp.b16 %rs240, %rs239, 0x0000, %p1113; + .loc 1 775 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:775:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + cvt.rn.bf16.f32 %rs241, %r13825; + .loc 1 759 70 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:759:70 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + selp.b16 %rs242, %rs241, 0x0000, %p1114; + .loc 1 775 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:775:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + cvt.rn.bf16.f32 %rs243, %r13826; + .loc 1 759 70 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:759:70 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + selp.b16 %rs244, %rs243, 0x0000, %p1115; + .loc 1 775 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:775:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + cvt.rn.bf16.f32 %rs245, %r13827; + .loc 1 759 70 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:759:70 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + selp.b16 %rs246, %rs245, 0x0000, %p1114; + .loc 1 775 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:775:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + cvt.rn.bf16.f32 %rs247, %r13828; + .loc 1 759 70 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:759:70 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + selp.b16 %rs248, %rs247, 0x0000, %p1115; + .loc 1 775 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:775:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + cvt.rn.bf16.f32 %rs249, %r13829; + .loc 1 759 70 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:759:70 @[ 
c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + selp.b16 %rs250, %rs249, 0x0000, %p1116; + .loc 1 775 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:775:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + cvt.rn.bf16.f32 %rs251, %r13830; + .loc 1 759 70 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:759:70 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + selp.b16 %rs252, %rs251, 0x0000, %p1117; + .loc 1 775 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:775:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + cvt.rn.bf16.f32 %rs253, %r13831; + .loc 1 759 70 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:759:70 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + selp.b16 %rs254, %rs253, 0x0000, %p1116; + .loc 1 775 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:775:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + cvt.rn.bf16.f32 %rs255, %r13832; + .loc 1 759 70 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:759:70 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + selp.b16 %rs256, %rs255, 0x0000, %p1117; + .loc 1 775 43 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:775:43 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + mov.b32 %r12989, {%rs194, %rs196}; + mov.b32 %r12990, {%rs198, %rs200}; + mov.b32 %r12991, {%rs202, %rs204}; + mov.b32 %r12992, {%rs206, %rs208}; + mov.b32 %r13121, {%rs210, %rs212}; + mov.b32 %r13122, {%rs214, %rs216}; + mov.b32 %r13123, {%rs218, %rs220}; + mov.b32 %r13124, {%rs222, %rs224}; + mov.b32 %r13253, {%rs226, %rs228}; + mov.b32 %r13254, {%rs230, %rs232}; + mov.b32 %r13255, {%rs234, %rs236}; + mov.b32 %r13256, {%rs238, %rs240}; + mov.b32 %r13385, {%rs242, %rs244}; + mov.b32 %r13386, {%rs246, %rs248}; + mov.b32 %r13387, {%rs250, %rs252}; + mov.b32 %r13388, {%rs254, %rs256}; + 
wgmma.fence.sync.aligned; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r14988,%r14989,%r14990,%r14991,%r14992,%r14993,%r14994,%r14995,%r14996,%r14997,%r14998,%r14999,%r15000,%r15001,%r15002,%r15003,%r15004,%r15005,%r15006,%r15007,%r15008,%r15009,%r15010,%r15011,%r15012,%r15013,%r15014,%r15015,%r15016,%r15017,%r15018,%r15019,%r15020,%r15021,%r15022,%r15023,%r15024,%r15025,%r15026,%r15027,%r15028,%r15029,%r15030,%r15031,%r15032,%r15033,%r15034,%r15035,%r15036,%r15037,%r15038,%r15039,%r15040,%r15041,%r15042,%r15043,%r15044,%r15045,%r15046,%r15047,%r15048,%r15049,%r15050,%r15051}, {%r12989,%r12990,%r12991,%r12992}, %rd995, %p1043, 1, 1, 1; + // end inline asm + add.s32 %r13833, %r11774, 2048; + bfe.u32 %r13834, %r13833, 4, 14; + cvt.u64.u32 %rd1115, %r13834; + or.b64 %rd1031, %rd1115, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r14988,%r14989,%r14990,%r14991,%r14992,%r14993,%r14994,%r14995,%r14996,%r14997,%r14998,%r14999,%r15000,%r15001,%r15002,%r15003,%r15004,%r15005,%r15006,%r15007,%r15008,%r15009,%r15010,%r15011,%r15012,%r15013,%r15014,%r15015,%r15016,%r15017,%r15018,%r15019,%r15020,%r15021,%r15022,%r15023,%r15024,%r15025,%r15026,%r15027,%r15028,%r15029,%r15030,%r15031,%r15032,%r15033,%r15034,%r15035,%r15036,%r15037,%r15038,%r15039,%r15040,%r15041,%r15042,%r15043,%r15044,%r15045,%r15046,%r15047,%r15048,%r15049,%r15050,%r15051}, {%r13121,%r13122,%r13123,%r13124}, %rd1031, %p1043, 1, 1, 1; + // end inline asm + add.s32 %r13835, %r11774, 4096; + bfe.u32 %r13836, %r13835, 4, 14; + cvt.u64.u32 %rd1116, %r13836; + or.b64 %rd1032, %rd1116, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 
{%r14988,%r14989,%r14990,%r14991,%r14992,%r14993,%r14994,%r14995,%r14996,%r14997,%r14998,%r14999,%r15000,%r15001,%r15002,%r15003,%r15004,%r15005,%r15006,%r15007,%r15008,%r15009,%r15010,%r15011,%r15012,%r15013,%r15014,%r15015,%r15016,%r15017,%r15018,%r15019,%r15020,%r15021,%r15022,%r15023,%r15024,%r15025,%r15026,%r15027,%r15028,%r15029,%r15030,%r15031,%r15032,%r15033,%r15034,%r15035,%r15036,%r15037,%r15038,%r15039,%r15040,%r15041,%r15042,%r15043,%r15044,%r15045,%r15046,%r15047,%r15048,%r15049,%r15050,%r15051}, {%r13253,%r13254,%r13255,%r13256}, %rd1032, %p1043, 1, 1, 1; + // end inline asm + add.s32 %r13837, %r11774, 6144; + bfe.u32 %r13838, %r13837, 4, 14; + cvt.u64.u32 %rd1117, %r13838; + or.b64 %rd1033, %rd1117, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r14988,%r14989,%r14990,%r14991,%r14992,%r14993,%r14994,%r14995,%r14996,%r14997,%r14998,%r14999,%r15000,%r15001,%r15002,%r15003,%r15004,%r15005,%r15006,%r15007,%r15008,%r15009,%r15010,%r15011,%r15012,%r15013,%r15014,%r15015,%r15016,%r15017,%r15018,%r15019,%r15020,%r15021,%r15022,%r15023,%r15024,%r15025,%r15026,%r15027,%r15028,%r15029,%r15030,%r15031,%r15032,%r15033,%r15034,%r15035,%r15036,%r15037,%r15038,%r15039,%r15040,%r15041,%r15042,%r15043,%r15044,%r15045,%r15046,%r15047,%r15048,%r15049,%r15050,%r15051}, {%r13385,%r13386,%r13387,%r13388}, %rd1033, %p1043, 1, 1, 1; + // end inline asm + wgmma.commit_group.sync.aligned; + .loc 1 610 28 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:610:28 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + add.s32 %r2075, %r15371, 1; + .loc 1 788 33 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:788:33 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + shr.u32 %r13839, %r2075, 1; + .loc 1 789 38 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:789:38 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + mad.wide.u32 %rd1035, %r13839, 4, 
%rd563; + .loc 1 789 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:789:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + // begin inline asm + mov.u64 %rd1034, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd1034, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r13389, 0x0; + @%p1065 ld.global.L1::evict_last.L2::cache_hint.b32 { %r13389 }, [ %rd1035 + 0 ], %rd1034; + // end inline asm + .loc 1 790 109 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:790:109 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + add.s32 %r13840, %r13839, 1; + .loc 1 790 113 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:790:113 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + setp.lt.s32 %p1134, %r13840, %r7563; + .loc 1 790 55 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:790:55 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + add.s64 %rd1038, %rd1035, 4; + .loc 1 610 28 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:610:28 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + and.pred %p1066, %p1065, %p1134; + .loc 1 790 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:790:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + // begin inline asm + mov.u64 %rd1037, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd1037, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r13390, 0x0; + @%p1066 ld.global.L1::evict_last.L2::cache_hint.b32 { %r13390 }, [ %rd1038 + 0 ], %rd1037; + // end inline asm + .loc 1 791 35 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:791:35 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + and.b32 %r13841, %r15371, 1; + .loc 1 792 34 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:792:34 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + sub.s32 %r13842, %r13390, %r13389; + .loc 1 792 48 
// c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:792:48 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + shl.b32 %r13843, %r13842, 7; + .loc 1 792 63 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:792:63 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + add.s32 %r13844, %r13843, -64; + .loc 1 793 29 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:793:29 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + xor.b32 %r13845, %r13841, 1; + .loc 1 793 61 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:793:61 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + shl.b32 %r13846, %r13841, 6; + .loc 1 793 42 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:793:42 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + mad.lo.s32 %r13847, %r13844, %r13845, %r13846; + .loc 1 626 28 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:626:28 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + shl.b32 %r13848, %r13847, 12; + .loc 1 626 19 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:626:19 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + mul.wide.s32 %rd1118, %r13848, 2; + add.s64 %rd1209, %rd1209, %rd1118; + add.s64 %rd1208, %rd1208, %rd1118; + add.s64 %rd1207, %rd1207, %rd1118; + add.s64 %rd1206, %rd1206, %rd1118; + .loc 1 627 28 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:627:28 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + shl.b32 %r13849, %r13847, 7; + .loc 1 627 19 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:627:19 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + mul.wide.s32 %rd1119, %r13849, 2; + add.s64 %rd1205, %rd1205, %rd1119; + add.s64 %rd1204, %rd1204, %rd1119; + add.s64 %rd1203, %rd1203, %rd1119; + add.s64 %rd1202, %rd1202, %rd1119; + .loc 1 628 19 // 
c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:628:19 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + add.s32 %r2076, %r13847, %r15234; + add.s32 %r2077, %r13847, %r15233; + add.s32 %r2078, %r13847, %r15232; + add.s32 %r2079, %r13847, %r15231; + add.s32 %r2080, %r13847, %r15230; + add.s32 %r2081, %r13847, %r15229; + add.s32 %r2082, %r13847, %r15228; + add.s32 %r2083, %r13847, %r15227; + add.s32 %r2084, %r13847, %r15226; + add.s32 %r2085, %r13847, %r15225; + add.s32 %r2086, %r13847, %r15224; + add.s32 %r2087, %r13847, %r15223; + add.s32 %r2088, %r13847, %r15222; + add.s32 %r2089, %r13847, %r15221; + add.s32 %r2090, %r13847, %r15220; + add.s32 %r2091, %r13847, %r15219; + add.s32 %r15239, %r13847, %r15239; + add.s32 %r15240, %r13847, %r15240; + add.s32 %r15241, %r13847, %r15241; + add.s32 %r15242, %r13847, %r15242; + add.s32 %r15235, %r13847, %r15235; + add.s32 %r15236, %r13847, %r15236; + add.s32 %r15237, %r13847, %r15237; + add.s32 %r15238, %r13847, %r15238; + .loc 1 610 28 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:610:28 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + add.s32 %r13850, %r15216, 1; + setp.gt.s32 %p1135, %r13850, 1; + selp.b32 %r15216, 0, %r13850, %p1135; + add.s32 %r13851, %r15218, 1; + setp.gt.s32 %p1136, %r13851, 2; + selp.b32 %r15218, 0, %r13851, %p1136; + .loc 1 831 52 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:831:52 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + setp.lt.s32 %p1137, %r15239, %r2358; + setp.lt.s32 %p1138, %r15240, %r2358; + setp.lt.s32 %p1139, %r15241, %r2358; + setp.lt.s32 %p1140, %r15242, %r2358; + .loc 1 831 23 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:831:23 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + shl.b32 %r13852, %r15218, 14; + add.s32 %r13853, %r7585, %r13852; + bar.sync 0; + add.s32 %r13391, %r13853, %r745; + selp.b32 %r13854, 16, 0, %p1137; + selp.b32 %r13392, %r13854, 
0, %p1099; + // begin inline asm + cp.async.cg.shared.global [ %r13391 + 0 ], [ %rd1209 + 0 ], 0x10, %r13392; + // end inline asm + add.s32 %r13393, %r13391, 2048; + selp.b32 %r13855, 16, 0, %p1138; + selp.b32 %r13394, %r13855, 0, %p1099; + // begin inline asm + cp.async.cg.shared.global [ %r13393 + 0 ], [ %rd1208 + 0 ], 0x10, %r13394; + // end inline asm + add.s32 %r13395, %r13391, 4096; + selp.b32 %r13856, 16, 0, %p1139; + selp.b32 %r13396, %r13856, 0, %p1099; + // begin inline asm + cp.async.cg.shared.global [ %r13395 + 0 ], [ %rd1207 + 0 ], 0x10, %r13396; + // end inline asm + add.s32 %r13397, %r13391, 6144; + selp.b32 %r13857, 16, 0, %p1140; + selp.b32 %r13398, %r13857, 0, %p1099; + // begin inline asm + cp.async.cg.shared.global [ %r13397 + 0 ], [ %rd1206 + 0 ], 0x10, %r13398; + // end inline asm + cp.async.commit_group; + .loc 1 674 52 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:674:52 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + setp.lt.s32 %p1141, %r2076, %r2358; + setp.lt.s32 %p1142, %r2077, %r2358; + setp.lt.s32 %p1143, %r2078, %r2358; + setp.lt.s32 %p1144, %r2079, %r2358; + setp.lt.s32 %p1145, %r2080, %r2358; + setp.lt.s32 %p1146, %r2081, %r2358; + setp.lt.s32 %p1147, %r2082, %r2358; + setp.lt.s32 %p1148, %r2083, %r2358; + setp.lt.s32 %p1149, %r2084, %r2358; + setp.lt.s32 %p1150, %r2085, %r2358; + setp.lt.s32 %p1151, %r2086, %r2358; + setp.lt.s32 %p1152, %r2087, %r2358; + setp.lt.s32 %p1153, %r2088, %r2358; + setp.lt.s32 %p1154, %r2089, %r2358; + setp.lt.s32 %p1155, %r2090, %r2358; + setp.lt.s32 %p1156, %r2091, %r2358; + .loc 1 674 28 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:674:28 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + mul.wide.s32 %rd1120, %r2076, 4; + add.s64 %rd1044, %rd142, %rd1120; + mul.wide.s32 %rd1121, %r2077, 4; + add.s64 %rd1045, %rd142, %rd1121; + mul.wide.s32 %rd1122, %r2078, 4; + add.s64 %rd1046, %rd142, %rd1122; + mul.wide.s32 %rd1123, %r2079, 4; + 
add.s64 %rd1047, %rd142, %rd1123; + mul.wide.s32 %rd1124, %r2080, 4; + add.s64 %rd1048, %rd142, %rd1124; + mul.wide.s32 %rd1125, %r2081, 4; + add.s64 %rd1049, %rd142, %rd1125; + mul.wide.s32 %rd1126, %r2082, 4; + add.s64 %rd1050, %rd142, %rd1126; + mul.wide.s32 %rd1127, %r2083, 4; + add.s64 %rd1051, %rd142, %rd1127; + mul.wide.s32 %rd1128, %r2084, 4; + add.s64 %rd1052, %rd142, %rd1128; + mul.wide.s32 %rd1129, %r2085, 4; + add.s64 %rd1053, %rd142, %rd1129; + mul.wide.s32 %rd1130, %r2086, 4; + add.s64 %rd1054, %rd142, %rd1130; + mul.wide.s32 %rd1131, %r2087, 4; + add.s64 %rd1055, %rd142, %rd1131; + mul.wide.s32 %rd1132, %r2088, 4; + add.s64 %rd1056, %rd142, %rd1132; + mul.wide.s32 %rd1133, %r2089, 4; + add.s64 %rd1057, %rd142, %rd1133; + mul.wide.s32 %rd1134, %r2090, 4; + add.s64 %rd1058, %rd142, %rd1134; + mul.wide.s32 %rd1135, %r2091, 4; + add.s64 %rd1059, %rd142, %rd1135; + .loc 1 674 22 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:674:22 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + shl.b32 %r13858, %r15216, 8; + add.s32 %r13859, %r13476, %r13858; + add.s32 %r13399, %r13859, %r767; + selp.b32 %r13860, 4, 0, %p1141; + selp.b32 %r13440, %r13860, 0, %p1099; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r13399 + 0 ], [ %rd1044 + 0 ], 0x4, %r13440; + // end inline asm + add.s32 %r13401, %r13399, 4; + selp.b32 %r13861, 4, 0, %p1142; + selp.b32 %r13442, %r13861, 0, %p1099; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r13401 + 0 ], [ %rd1045 + 0 ], 0x4, %r13442; + // end inline asm + add.s32 %r13403, %r13399, 32; + selp.b32 %r13862, 4, 0, %p1143; + selp.b32 %r13444, %r13862, 0, %p1099; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r13403 + 0 ], [ %rd1046 + 0 ], 0x4, %r13444; + // end inline asm + add.s32 %r13405, %r13399, 36; + selp.b32 %r13863, 4, 0, %p1144; + selp.b32 %r13446, %r13863, 0, %p1099; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r13405 + 0 ], [ %rd1047 + 0 ], 
0x4, %r13446; + // end inline asm + add.s32 %r13407, %r13399, 64; + selp.b32 %r13864, 4, 0, %p1145; + selp.b32 %r13448, %r13864, 0, %p1099; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r13407 + 0 ], [ %rd1048 + 0 ], 0x4, %r13448; + // end inline asm + add.s32 %r13409, %r13399, 68; + selp.b32 %r13865, 4, 0, %p1146; + selp.b32 %r13450, %r13865, 0, %p1099; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r13409 + 0 ], [ %rd1049 + 0 ], 0x4, %r13450; + // end inline asm + add.s32 %r13411, %r13399, 96; + selp.b32 %r13866, 4, 0, %p1147; + selp.b32 %r13452, %r13866, 0, %p1099; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r13411 + 0 ], [ %rd1050 + 0 ], 0x4, %r13452; + // end inline asm + add.s32 %r13413, %r13399, 100; + selp.b32 %r13867, 4, 0, %p1148; + selp.b32 %r13454, %r13867, 0, %p1099; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r13413 + 0 ], [ %rd1051 + 0 ], 0x4, %r13454; + // end inline asm + add.s32 %r13415, %r13399, 128; + selp.b32 %r13868, 4, 0, %p1149; + selp.b32 %r13456, %r13868, 0, %p1099; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r13415 + 0 ], [ %rd1052 + 0 ], 0x4, %r13456; + // end inline asm + add.s32 %r13417, %r13399, 132; + selp.b32 %r13869, 4, 0, %p1150; + selp.b32 %r13458, %r13869, 0, %p1099; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r13417 + 0 ], [ %rd1053 + 0 ], 0x4, %r13458; + // end inline asm + add.s32 %r13419, %r13399, 160; + selp.b32 %r13870, 4, 0, %p1151; + selp.b32 %r13460, %r13870, 0, %p1099; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r13419 + 0 ], [ %rd1054 + 0 ], 0x4, %r13460; + // end inline asm + add.s32 %r13421, %r13399, 164; + selp.b32 %r13871, 4, 0, %p1152; + selp.b32 %r13462, %r13871, 0, %p1099; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r13421 + 0 ], [ %rd1055 + 0 ], 0x4, %r13462; + // end inline asm + add.s32 %r13423, %r13399, 192; + selp.b32 %r13872, 4, 0, %p1153; + selp.b32 %r13464, %r13872, 0, %p1099; + // 
begin inline asm + @%p978 cp.async.ca.shared.global [ %r13423 + 0 ], [ %rd1056 + 0 ], 0x4, %r13464; + // end inline asm + add.s32 %r13425, %r13399, 196; + selp.b32 %r13873, 4, 0, %p1154; + selp.b32 %r13466, %r13873, 0, %p1099; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r13425 + 0 ], [ %rd1057 + 0 ], 0x4, %r13466; + // end inline asm + add.s32 %r13427, %r13399, 224; + selp.b32 %r13874, 4, 0, %p1155; + selp.b32 %r13468, %r13874, 0, %p1099; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r13427 + 0 ], [ %rd1058 + 0 ], 0x4, %r13468; + // end inline asm + add.s32 %r13429, %r13399, 228; + selp.b32 %r13875, 4, 0, %p1156; + selp.b32 %r13470, %r13875, 0, %p1099; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r13429 + 0 ], [ %rd1059 + 0 ], 0x4, %r13470; + // end inline asm + cp.async.commit_group; + .loc 1 833 52 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:833:52 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + setp.lt.s32 %p1157, %r15235, %r2358; + setp.lt.s32 %p1158, %r15236, %r2358; + setp.lt.s32 %p1159, %r15237, %r2358; + setp.lt.s32 %p1160, %r15238, %r2358; + .loc 1 833 23 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:833:23 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + add.s32 %r13876, %r13712, %r13852; + add.s32 %r13431, %r13876, %r745; + selp.b32 %r13877, 16, 0, %p1157; + selp.b32 %r13432, %r13877, 0, %p1099; + // begin inline asm + cp.async.cg.shared.global [ %r13431 + 0 ], [ %rd1205 + 0 ], 0x10, %r13432; + // end inline asm + add.s32 %r13433, %r13431, 2048; + selp.b32 %r13878, 16, 0, %p1158; + selp.b32 %r13434, %r13878, 0, %p1099; + // begin inline asm + cp.async.cg.shared.global [ %r13433 + 0 ], [ %rd1204 + 0 ], 0x10, %r13434; + // end inline asm + add.s32 %r13435, %r13431, 4096; + selp.b32 %r13879, 16, 0, %p1159; + selp.b32 %r13436, %r13879, 0, %p1099; + // begin inline asm + cp.async.cg.shared.global [ %r13435 + 0 ], [ %rd1203 + 0 ], 0x10, %r13436; 
+ // end inline asm + add.s32 %r13437, %r13431, 6144; + selp.b32 %r13880, 16, 0, %p1160; + selp.b32 %r13438, %r13880, 0, %p1099; + // begin inline asm + cp.async.cg.shared.global [ %r13437 + 0 ], [ %rd1202 + 0 ], 0x10, %r13438; + // end inline asm + cp.async.commit_group; + .loc 1 748 29 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:748:29 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + add.s64 %rd1064, %rd143, %rd1120; + add.s64 %rd1065, %rd143, %rd1121; + add.s64 %rd1066, %rd143, %rd1122; + add.s64 %rd1067, %rd143, %rd1123; + add.s64 %rd1068, %rd143, %rd1124; + add.s64 %rd1069, %rd143, %rd1125; + add.s64 %rd1070, %rd143, %rd1126; + add.s64 %rd1071, %rd143, %rd1127; + add.s64 %rd1072, %rd143, %rd1128; + add.s64 %rd1073, %rd143, %rd1129; + add.s64 %rd1074, %rd143, %rd1130; + add.s64 %rd1075, %rd143, %rd1131; + add.s64 %rd1076, %rd143, %rd1132; + add.s64 %rd1077, %rd143, %rd1133; + add.s64 %rd1078, %rd143, %rd1134; + add.s64 %rd1079, %rd143, %rd1135; + .loc 1 748 21 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:748:21 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + add.s32 %r13881, %r13720, %r13858; + add.s32 %r13439, %r13881, %r767; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r13439 + 0 ], [ %rd1064 + 0 ], 0x4, %r13440; + // end inline asm + add.s32 %r13441, %r13439, 4; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r13441 + 0 ], [ %rd1065 + 0 ], 0x4, %r13442; + // end inline asm + add.s32 %r13443, %r13439, 32; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r13443 + 0 ], [ %rd1066 + 0 ], 0x4, %r13444; + // end inline asm + add.s32 %r13445, %r13439, 36; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r13445 + 0 ], [ %rd1067 + 0 ], 0x4, %r13446; + // end inline asm + add.s32 %r13447, %r13439, 64; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r13447 + 0 ], [ %rd1068 + 0 ], 0x4, %r13448; + // end inline asm + add.s32 %r13449, 
%r13439, 68; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r13449 + 0 ], [ %rd1069 + 0 ], 0x4, %r13450; + // end inline asm + add.s32 %r13451, %r13439, 96; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r13451 + 0 ], [ %rd1070 + 0 ], 0x4, %r13452; + // end inline asm + add.s32 %r13453, %r13439, 100; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r13453 + 0 ], [ %rd1071 + 0 ], 0x4, %r13454; + // end inline asm + add.s32 %r13455, %r13439, 128; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r13455 + 0 ], [ %rd1072 + 0 ], 0x4, %r13456; + // end inline asm + add.s32 %r13457, %r13439, 132; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r13457 + 0 ], [ %rd1073 + 0 ], 0x4, %r13458; + // end inline asm + add.s32 %r13459, %r13439, 160; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r13459 + 0 ], [ %rd1074 + 0 ], 0x4, %r13460; + // end inline asm + add.s32 %r13461, %r13439, 164; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r13461 + 0 ], [ %rd1075 + 0 ], 0x4, %r13462; + // end inline asm + add.s32 %r13463, %r13439, 192; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r13463 + 0 ], [ %rd1076 + 0 ], 0x4, %r13464; + // end inline asm + add.s32 %r13465, %r13439, 196; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r13465 + 0 ], [ %rd1077 + 0 ], 0x4, %r13466; + // end inline asm + add.s32 %r13467, %r13439, 224; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r13467 + 0 ], [ %rd1078 + 0 ], 0x4, %r13468; + // end inline asm + add.s32 %r13469, %r13439, 228; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r13469 + 0 ], [ %rd1079 + 0 ], 0x4, %r13470; + // end inline asm + cp.async.commit_group; + .loc 1 610 28 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:610:28 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:318:20 ] + setp.ne.b32 %p1161, %r967, %r2075; + mov.b32 %r15199, %r15219; + mov.b32 %r15200, %r15220; + mov.b32 
%r15201, %r15221; + mov.b32 %r15202, %r15222; + mov.b32 %r15203, %r15223; + mov.b32 %r15204, %r15224; + mov.b32 %r15205, %r15225; + mov.b32 %r15206, %r15226; + mov.b32 %r15207, %r15227; + mov.b32 %r15208, %r15228; + mov.b32 %r15209, %r15229; + mov.b32 %r15210, %r15230; + mov.b32 %r15211, %r15231; + mov.b32 %r15212, %r15232; + mov.b32 %r15213, %r15233; + mov.b32 %r15214, %r15234; + mov.b32 %r15219, %r2091; + mov.b32 %r15220, %r2090; + mov.b32 %r15221, %r2089; + mov.b32 %r15222, %r2088; + mov.b32 %r15223, %r2087; + mov.b32 %r15224, %r2086; + mov.b32 %r15225, %r2085; + mov.b32 %r15226, %r2084; + mov.b32 %r15227, %r2083; + mov.b32 %r15228, %r2082; + mov.b32 %r15229, %r2081; + mov.b32 %r15230, %r2080; + mov.b32 %r15231, %r2079; + mov.b32 %r15232, %r2078; + mov.b32 %r15233, %r2077; + mov.b32 %r15234, %r2076; + mov.b32 %r15371, %r2075; + @%p1161 bra $L__BB0_14; + bra.uni $L__BB0_15; +$L__tmp19: +$L__BB0_1: + .loc 1 0 28 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:0:28 + ld.param.b32 %r2362, [triton_tem_fused_zeros_1_param_22]; + ld.param.b32 %r2361, [triton_tem_fused_zeros_1_param_21]; + ld.param.b32 %r2360, [triton_tem_fused_zeros_1_param_20]; + ld.param.b64 %rd204, [triton_tem_fused_zeros_1_param_13]; + ld.param.b64 %rd203, [triton_tem_fused_zeros_1_param_12]; + ld.param.b64 %rd200, [triton_tem_fused_zeros_1_param_9]; + ld.param.b64 %rd199, [triton_tem_fused_zeros_1_param_8]; + ld.param.b64 %rd198, [triton_tem_fused_zeros_1_param_6]; +$L__tmp20: + .loc 2 41 22 // standard.py:41:22 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:113:34 ] + add.s32 %r2548, %r2358, 127; + .loc 2 41 28 // standard.py:41:28 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:113:34 ] + shr.s32 %r2549, %r2548, 31; + shr.u32 %r2550, %r2549, 25; + add.s32 %r2551, %r2548, %r2550; + shr.s32 %r2552, %r2551, 7; +$L__tmp21: + .loc 1 140 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:140:24 + sub.s32 %r2553, %r4, %r5; + .loc 1 144 29 // 
c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:144:29 + div.s32 %r2555, %r2553, %r2552; + .loc 1 144 54 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:144:54 + shl.b32 %r2556, %r7, 2; + .loc 1 144 44 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:144:44 + add.s32 %r2557, %r2555, %r2556; + .loc 1 145 35 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:145:35 + mul.lo.s32 %r2558, %r2555, %r2552; + sub.s32 %r2559, %r2553, %r2558; + .loc 1 154 78 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:154:78 + mad.lo.s32 %r2560, %r2360, %r8, %r2559; + .loc 1 155 68 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:155:68 + mad.lo.s32 %r2561, %r2361, %r8, %r2559; + mul.lo.s32 %r2562, %r2561, %r2362; + .loc 1 158 30 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:158:30 + shl.b32 %r2563, %r2557, 7; + .loc 1 158 40 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:158:40 + mad.lo.s32 %r2564, %r1, %r6, %r2563; + .loc 1 159 55 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:159:55 + mul.lo.s32 %r2565, %r2, %r6; + .loc 1 159 42 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:159:42 + mad.lo.s32 %r2566, %r2557, %r3, %r2565; + .loc 1 161 30 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:161:30 + shl.b32 %r2567, %r6, 5; + .loc 1 161 35 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:161:35 + add.s32 %r2568, %r2557, %r2567; + .loc 1 161 46 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:161:46 + mul.lo.s32 %r2569, %r2568, %r2358; + .loc 1 163 17 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:163:17 + mul.wide.s32 %rd253, %r2564, 2; + add.s64 %rd254, %rd194, %rd253; + .loc 1 164 19 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:164:19 + mad.wide.s32 %rd255, %r2566, 2, %rd197; + .loc 1 168 21 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:168:21 + mul.wide.s32 %rd256, %r2569, 4; + add.s64 %rd257, %rd195, %rd256; + 
.loc 1 169 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:169:25 + add.s64 %rd258, %rd196, %rd256; + .loc 1 174 36 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:174:36 + shl.b32 %r2570, %r2559, 7; + .loc 1 175 29 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:175:29 + or.b32 %r23, %r2570, %r13; + or.b32 %r24, %r2570, %r14; + or.b32 %r25, %r2570, %r15; + or.b32 %r26, %r2570, %r16; + or.b32 %r27, %r2570, %r17; + or.b32 %r28, %r2570, %r18; + or.b32 %r29, %r2570, %r19; + or.b32 %r30, %r2570, %r20; + or.b32 %r31, %r2570, %r21; + or.b32 %r32, %r2570, %r22; +$L__tmp22: + .loc 1 825 38 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:825:38 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:178:107 ] + shl.b32 %r2571, %r23, 12; + shl.b32 %r2572, %r24, 12; + shl.b32 %r2573, %r25, 12; + shl.b32 %r2574, %r26, 12; + shl.b32 %r2575, %r27, 12; + shl.b32 %r2576, %r28, 12; + shl.b32 %r2577, %r29, 12; + shl.b32 %r2578, %r30, 12; + .loc 1 825 20 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:825:20 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:178:107 ] + mad.wide.s32 %rd259, %r2571, 2, %rd254; + mad.wide.s32 %rd260, %r2572, 2, %rd254; + mad.wide.s32 %rd261, %r2573, 2, %rd254; + mad.wide.s32 %rd262, %r2574, 2, %rd254; + mad.wide.s32 %rd263, %r2575, 2, %rd254; + mad.wide.s32 %rd264, %r2576, 2, %rd254; + mad.wide.s32 %rd265, %r2577, 2, %rd254; + mad.wide.s32 %rd266, %r2578, 2, %rd254; + .loc 1 825 56 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:825:56 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:178:107 ] + shl.b32 %r2579, %r10, 3; + and.b32 %r2580, %r2579, 120; + .loc 1 825 49 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:825:49 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:178:107 ] + cvt.u64.u32 %rd13, %r2580; + mul.wide.u32 %rd267, %r2580, 2; + add.s64 %rd213, %rd259, %rd267; + add.s64 %rd214, %rd260, %rd267; + add.s64 %rd215, %rd261, %rd267; + 
add.s64 %rd216, %rd262, %rd267; + add.s64 %rd217, %rd263, %rd267; + add.s64 %rd218, %rd264, %rd267; + add.s64 %rd219, %rd265, %rd267; + add.s64 %rd220, %rd266, %rd267; + .loc 1 833 52 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:833:52 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:178:107 ] + setp.lt.s32 %p14, %r23, %r2358; + setp.lt.s32 %p15, %r24, %r2358; + setp.lt.s32 %p16, %r25, %r2358; + setp.lt.s32 %p17, %r26, %r2358; + setp.lt.s32 %p18, %r27, %r2358; + setp.lt.s32 %p19, %r28, %r2358; + setp.lt.s32 %p20, %r29, %r2358; + setp.lt.s32 %p21, %r30, %r2358; + mov.b32 %r14520, 0; + .loc 1 833 23 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:833:23 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:178:107 ] + // begin inline asm + mov.u32 %r2381, %r14520; + mov.u32 %r2382, %r14520; + mov.u32 %r2383, %r14520; + mov.u32 %r2384, %r14520; + @%p14 ld.global.v4.b32 { %r2381, %r2382, %r2383, %r2384 }, [ %rd213 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r2389, %r14520; + mov.u32 %r2390, %r14520; + mov.u32 %r2391, %r14520; + mov.u32 %r2392, %r14520; + @%p15 ld.global.v4.b32 { %r2389, %r2390, %r2391, %r2392 }, [ %rd214 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r2397, %r14520; + mov.u32 %r2398, %r14520; + mov.u32 %r2399, %r14520; + mov.u32 %r2400, %r14520; + @%p16 ld.global.v4.b32 { %r2397, %r2398, %r2399, %r2400 }, [ %rd215 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r2405, %r14520; + mov.u32 %r2406, %r14520; + mov.u32 %r2407, %r14520; + mov.u32 %r2408, %r14520; + @%p17 ld.global.v4.b32 { %r2405, %r2406, %r2407, %r2408 }, [ %rd216 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r2413, %r14520; + mov.u32 %r2414, %r14520; + mov.u32 %r2415, %r14520; + mov.u32 %r2416, %r14520; + @%p18 ld.global.v4.b32 { %r2413, %r2414, %r2415, %r2416 }, [ %rd217 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r2421, %r14520; + mov.u32 %r2422, %r14520; + mov.u32 %r2423, 
%r14520; + mov.u32 %r2424, %r14520; + @%p19 ld.global.v4.b32 { %r2421, %r2422, %r2423, %r2424 }, [ %rd218 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r2429, %r14520; + mov.u32 %r2430, %r14520; + mov.u32 %r2431, %r14520; + mov.u32 %r2432, %r14520; + @%p20 ld.global.v4.b32 { %r2429, %r2430, %r2431, %r2432 }, [ %rd219 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r2437, %r14520; + mov.u32 %r2438, %r14520; + mov.u32 %r2439, %r14520; + mov.u32 %r2440, %r14520; + @%p21 ld.global.v4.b32 { %r2437, %r2438, %r2439, %r2440 }, [ %rd220 + 0 ]; + // end inline asm + shl.b32 %r2581, %r10, 4; + and.b32 %r2582, %r2581, 112; + shl.b32 %r2583, %r12, 3; + and.b32 %r2584, %r10, 112; + and.b32 %r2585, %r10, 8; + shl.b32 %r2586, %r2585, 11; + or.b32 %r2587, %r2582, %r2583; + xor.b32 %r2588, %r2587, %r2584; + or.b32 %r2589, %r2588, %r2586; + mov.b32 %r2590, global_smem; + add.s32 %r2591, %r2590, %r2589; + st.shared.v4.b32 [%r2591+98304], {%r2381, %r2382, %r2383, %r2384}; + st.shared.v4.b32 [%r2591+100352], {%r2389, %r2390, %r2391, %r2392}; + st.shared.v4.b32 [%r2591+102400], {%r2397, %r2398, %r2399, %r2400}; + st.shared.v4.b32 [%r2591+104448], {%r2405, %r2406, %r2407, %r2408}; + st.shared.v4.b32 [%r2591+106496], {%r2413, %r2414, %r2415, %r2416}; + st.shared.v4.b32 [%r2591+108544], {%r2421, %r2422, %r2423, %r2424}; + st.shared.v4.b32 [%r2591+110592], {%r2429, %r2430, %r2431, %r2432}; + st.shared.v4.b32 [%r2591+112640], {%r2437, %r2438, %r2439, %r2440}; +$L__tmp23: + .loc 1 825 38 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:825:38 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:179:111 ] + shl.b32 %r2592, %r23, 7; + shl.b32 %r2593, %r24, 7; + shl.b32 %r2594, %r25, 7; + shl.b32 %r2595, %r26, 7; + shl.b32 %r2596, %r27, 7; + shl.b32 %r2597, %r28, 7; + shl.b32 %r2598, %r29, 7; + shl.b32 %r2599, %r30, 7; + .loc 1 825 20 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:825:20 @[ 
c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:179:111 ] + mad.wide.s32 %rd268, %r2592, 2, %rd255; + mad.wide.s32 %rd269, %r2593, 2, %rd255; + mad.wide.s32 %rd270, %r2594, 2, %rd255; + mad.wide.s32 %rd271, %r2595, 2, %rd255; + mad.wide.s32 %rd272, %r2596, 2, %rd255; + mad.wide.s32 %rd273, %r2597, 2, %rd255; + mad.wide.s32 %rd274, %r2598, 2, %rd255; + mad.wide.s32 %rd275, %r2599, 2, %rd255; + .loc 1 825 49 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:825:49 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:179:111 ] + add.s64 %rd221, %rd268, %rd267; + add.s64 %rd222, %rd269, %rd267; + add.s64 %rd223, %rd270, %rd267; + add.s64 %rd224, %rd271, %rd267; + add.s64 %rd225, %rd272, %rd267; + add.s64 %rd226, %rd273, %rd267; + add.s64 %rd227, %rd274, %rd267; + add.s64 %rd228, %rd275, %rd267; + .loc 1 833 23 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:833:23 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:179:111 ] + // begin inline asm + mov.u32 %r2445, %r14520; + mov.u32 %r2446, %r14520; + mov.u32 %r2447, %r14520; + mov.u32 %r2448, %r14520; + @%p14 ld.global.v4.b32 { %r2445, %r2446, %r2447, %r2448 }, [ %rd221 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r2453, %r14520; + mov.u32 %r2454, %r14520; + mov.u32 %r2455, %r14520; + mov.u32 %r2456, %r14520; + @%p15 ld.global.v4.b32 { %r2453, %r2454, %r2455, %r2456 }, [ %rd222 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r2461, %r14520; + mov.u32 %r2462, %r14520; + mov.u32 %r2463, %r14520; + mov.u32 %r2464, %r14520; + @%p16 ld.global.v4.b32 { %r2461, %r2462, %r2463, %r2464 }, [ %rd223 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r2469, %r14520; + mov.u32 %r2470, %r14520; + mov.u32 %r2471, %r14520; + mov.u32 %r2472, %r14520; + @%p17 ld.global.v4.b32 { %r2469, %r2470, %r2471, %r2472 }, [ %rd224 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r2477, %r14520; + mov.u32 %r2478, %r14520; + mov.u32 %r2479, %r14520; + mov.u32 
%r2480, %r14520; + @%p18 ld.global.v4.b32 { %r2477, %r2478, %r2479, %r2480 }, [ %rd225 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r2485, %r14520; + mov.u32 %r2486, %r14520; + mov.u32 %r2487, %r14520; + mov.u32 %r2488, %r14520; + @%p19 ld.global.v4.b32 { %r2485, %r2486, %r2487, %r2488 }, [ %rd226 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r2493, %r14520; + mov.u32 %r2494, %r14520; + mov.u32 %r2495, %r14520; + mov.u32 %r2496, %r14520; + @%p20 ld.global.v4.b32 { %r2493, %r2494, %r2495, %r2496 }, [ %rd227 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r2501, %r14520; + mov.u32 %r2502, %r14520; + mov.u32 %r2503, %r14520; + mov.u32 %r2504, %r14520; + @%p21 ld.global.v4.b32 { %r2501, %r2502, %r2503, %r2504 }, [ %rd228 + 0 ]; + // end inline asm + st.shared.v4.b32 [%r2591+131072], {%r2445, %r2446, %r2447, %r2448}; + st.shared.v4.b32 [%r2591+133120], {%r2453, %r2454, %r2455, %r2456}; + st.shared.v4.b32 [%r2591+135168], {%r2461, %r2462, %r2463, %r2464}; + st.shared.v4.b32 [%r2591+137216], {%r2469, %r2470, %r2471, %r2472}; + st.shared.v4.b32 [%r2591+139264], {%r2477, %r2478, %r2479, %r2480}; + st.shared.v4.b32 [%r2591+141312], {%r2485, %r2486, %r2487, %r2488}; + st.shared.v4.b32 [%r2591+143360], {%r2493, %r2494, %r2495, %r2496}; + st.shared.v4.b32 [%r2591+145408], {%r2501, %r2502, %r2503, %r2504}; +$L__tmp24: + .loc 1 188 58 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:188:58 + setp.lt.s32 %p30, %r31, %r2358; + setp.lt.s32 %p31, %r32, %r2358; + .loc 1 188 34 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:188:34 + mul.wide.s32 %rd276, %r31, 4; + add.s64 %rd229, %rd258, %rd276; + cvt.s64.s32 %rd277, %r2570; + cvt.u64.u32 %rd278, %r21; + or.b64 %rd279, %rd277, %rd278; + shl.b64 %rd280, %rd279, 2; + add.s64 %rd281, %rd258, %rd280; + add.s64 %rd230, %rd281, 32; + .loc 1 188 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:188:25 + // begin inline asm + mov.u32 %r2509, 0x0; + @%p30 ld.global.b32 { 
%r2509 }, [ %rd229 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r2510, 0x0; + @%p31 ld.global.b32 { %r2510 }, [ %rd230 + 0 ]; + // end inline asm + .loc 1 189 33 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:189:33 + add.s64 %rd231, %rd257, %rd276; + add.s64 %rd282, %rd257, %rd280; + add.s64 %rd232, %rd282, 32; + .loc 1 189 26 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:189:26 + // begin inline asm + mov.u32 %r2511, 0x0; + @%p30 ld.global.b32 { %r2511 }, [ %rd231 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r2512, 0x0; + @%p31 ld.global.b32 { %r2512 }, [ %rd232 + 0 ]; + // end inline asm + .loc 1 190 30 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:190:30 + setp.eq.f32 %p35, %r2511, 0fFF800000; + setp.eq.f32 %p36, %r2512, 0fFF800000; + .loc 1 190 50 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:190:50 + selp.f32 %r35, 0f00000000, %r2511, %p35; + selp.f32 %r36, 0f00000000, %r2512, %p36; + .loc 1 195 30 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:195:30 + cvt.s64.s32 %rd14, %r2562; + mad.wide.s32 %rd233, %r2562, 4, %rd200; + .loc 1 196 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:196:27 + // begin inline asm + mov.u32 %r2513, 0x0; + ld.global.b32 { %r2513 }, [ %rd233 + 0 ]; + // end inline asm + .loc 1 196 41 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:196:41 + shl.b32 %r37, %r2513, 7; + .loc 1 197 53 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:197:53 + cvt.s64.s32 %rd16, %r2560; + mad.wide.s32 %rd234, %r2560, 4, %rd199; + .loc 1 197 39 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:197:39 + // begin inline asm + mov.u32 %r2514, 0x0; + ld.global.b32 { %r2514 }, [ %rd234 + 0 ]; + // end inline asm + .loc 1 199 42 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:199:42 + and.b32 %r39, %r10, 3; + shl.b32 %r42, %r39, 1; + or.b32 %r41, %r42, 1; + or.b32 %r45, %r42, 9; + or.b32 %r44, %r42, 8; + or.b32 %r49, 
%r42, 16; + or.b32 %r48, %r42, 17; + or.b32 %r47, %r42, 24; + or.b32 %r46, %r42, 25; + or.b32 %r57, %r42, 32; + or.b32 %r56, %r42, 33; + or.b32 %r55, %r42, 40; + or.b32 %r54, %r42, 41; + or.b32 %r53, %r42, 48; + or.b32 %r52, %r42, 49; + or.b32 %r51, %r42, 56; + or.b32 %r50, %r42, 57; + .loc 1 199 29 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:199:29 + or.b32 %r2600, %r37, %r13; + or.b32 %r2601, %r37, %r14; + or.b32 %r2602, %r37, %r15; + or.b32 %r2603, %r37, %r16; +$L__tmp25: + .loc 1 390 37 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:390:37 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + shl.b32 %r2604, %r2600, 7; + shl.b32 %r2605, %r2601, 7; + shl.b32 %r2606, %r2602, 7; + shl.b32 %r2607, %r2603, 7; + .loc 1 390 18 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:390:18 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + mul.wide.s32 %rd283, %r2604, 2; + add.s64 %rd284, %rd1, %rd283; + mul.wide.s32 %rd285, %r2605, 2; + add.s64 %rd286, %rd1, %rd285; + mul.wide.s32 %rd287, %r2606, 2; + add.s64 %rd288, %rd1, %rd287; + mul.wide.s32 %rd289, %r2607, 2; + add.s64 %rd290, %rd1, %rd289; + .loc 1 390 49 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:390:49 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + add.s64 %rd237, %rd284, %rd267; + add.s64 %rd238, %rd286, %rd267; + add.s64 %rd239, %rd288, %rd267; + add.s64 %rd240, %rd290, %rd267; + .loc 1 391 18 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:391:18 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + add.s64 %rd291, %rd2, %rd283; + add.s64 %rd292, %rd2, %rd285; + add.s64 %rd293, %rd2, %rd287; + add.s64 %rd294, %rd2, %rd289; + .loc 1 391 49 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:391:49 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + add.s64 %rd241, %rd291, %rd267; + add.s64 %rd242, %rd292, %rd267; + add.s64 %rd243, %rd293, %rd267; + 
add.s64 %rd244, %rd294, %rd267; + .loc 1 395 43 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:395:43 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + shl.b32 %r2608, %r2514, 1; + .loc 2 41 22 // standard.py:41:22 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + add.s32 %r2609, %r2359, 63; + .loc 2 41 28 // standard.py:41:28 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + shr.s32 %r2610, %r2609, 31; + shr.u32 %r2611, %r2610, 26; + add.s32 %r2612, %r2609, %r2611; + shr.s32 %r2613, %r2612, 6; + .loc 1 395 101 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:395:101 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + max.s32 %r58, %r2613, 1; + .loc 1 395 63 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:395:63 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + min.s32 %r59, %r2608, %r58; + .loc 1 485 34 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:485:34 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + mad.wide.u32 %rd236, %r6, 8, %rd207; + .loc 1 397 28 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:397:28 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + setp.lt.s32 %p37, %r2608, 1; + setp.gt.s32 %p34, %r2608, 0; + .loc 1 485 23 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:485:23 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + // begin inline asm + mov.u64 %rd17, 0x0; + @%p34 ld.global.b64 { %rd17 }, [ %rd236 + 0 ]; + // end inline asm + .loc 1 831 52 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:831:52 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + setp.lt.s32 %p38, %r2600, %r2359; + setp.lt.s32 %p39, %r2601, %r2359; + setp.lt.s32 %p40, %r2602, %r2359; + setp.lt.s32 %p41, %r2603, %r2359; + .loc 1 831 23 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:831:23 @[ 
c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + shl.b32 %r2614, %r2585, 10; + or.b32 %r60, %r2588, %r2614; + add.s32 %r5079, %r2590, %r60; + selp.b32 %r2615, 16, 0, %p38; + selp.b32 %r2524, %r2615, 0, %p34; + // begin inline asm + cp.async.cg.shared.global [ %r5079 + 0 ], [ %rd237 + 0 ], 0x10, %r2524; + // end inline asm + add.s32 %r5081, %r5079, 2048; + selp.b32 %r2616, 16, 0, %p39; + selp.b32 %r2526, %r2616, 0, %p34; + // begin inline asm + cp.async.cg.shared.global [ %r5081 + 0 ], [ %rd238 + 0 ], 0x10, %r2526; + // end inline asm + add.s32 %r5083, %r5079, 4096; + selp.b32 %r2617, 16, 0, %p40; + selp.b32 %r2528, %r2617, 0, %p34; + // begin inline asm + cp.async.cg.shared.global [ %r5083 + 0 ], [ %rd239 + 0 ], 0x10, %r2528; + // end inline asm + add.s32 %r5085, %r5079, 6144; + selp.b32 %r2618, 16, 0, %p41; + selp.b32 %r2530, %r2618, 0, %p34; + // begin inline asm + cp.async.cg.shared.global [ %r5085 + 0 ], [ %rd240 + 0 ], 0x10, %r2530; + // end inline asm + cp.async.commit_group; + add.s32 %r2523, %r5079, 49152; + // begin inline asm + cp.async.cg.shared.global [ %r2523 + 0 ], [ %rd241 + 0 ], 0x10, %r2524; + // end inline asm + add.s32 %r2525, %r5079, 51200; + // begin inline asm + cp.async.cg.shared.global [ %r2525 + 0 ], [ %rd242 + 0 ], 0x10, %r2526; + // end inline asm + add.s32 %r2527, %r5079, 53248; + // begin inline asm + cp.async.cg.shared.global [ %r2527 + 0 ], [ %rd243 + 0 ], 0x10, %r2528; + // end inline asm + add.s32 %r2529, %r5079, 55296; + // begin inline asm + cp.async.cg.shared.global [ %r2529 + 0 ], [ %rd244 + 0 ], 0x10, %r2530; + // end inline asm + cp.async.commit_group; + .loc 1 397 28 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:397:28 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + setp.gt.s32 %p42, %r59, 1; + .loc 1 414 19 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:414:19 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + add.s64 %rd1184, %rd237, 16384; 
+ add.s64 %rd1183, %rd238, 16384; + add.s64 %rd1182, %rd239, 16384; + add.s64 %rd1181, %rd240, 16384; + .loc 1 415 19 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:415:19 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + add.s64 %rd1180, %rd241, 16384; + add.s64 %rd1179, %rd242, 16384; + add.s64 %rd1178, %rd243, 16384; + add.s64 %rd1177, %rd244, 16384; + .loc 1 417 19 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:417:19 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + or.b32 %r14455, %r2600, 64; + or.b32 %r14454, %r2601, 64; + or.b32 %r14453, %r2602, 64; + or.b32 %r14452, %r2603, 64; + .loc 1 831 52 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:831:52 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + setp.lt.s32 %p43, %r14455, %r2359; + setp.lt.s32 %p44, %r14454, %r2359; + setp.lt.s32 %p45, %r14453, %r2359; + setp.lt.s32 %p46, %r14452, %r2359; + .loc 1 831 23 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:831:23 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + bar.sync 0; + add.s32 %r2531, %r5079, 16384; + selp.b32 %r2619, 16, 0, %p43; + selp.b32 %r2540, %r2619, 0, %p42; + // begin inline asm + cp.async.cg.shared.global [ %r2531 + 0 ], [ %rd1184 + 0 ], 0x10, %r2540; + // end inline asm + add.s32 %r2533, %r5079, 18432; + selp.b32 %r2620, 16, 0, %p44; + selp.b32 %r2542, %r2620, 0, %p42; + // begin inline asm + cp.async.cg.shared.global [ %r2533 + 0 ], [ %rd1183 + 0 ], 0x10, %r2542; + // end inline asm + add.s32 %r2535, %r5079, 20480; + selp.b32 %r2621, 16, 0, %p45; + selp.b32 %r2544, %r2621, 0, %p42; + // begin inline asm + cp.async.cg.shared.global [ %r2535 + 0 ], [ %rd1182 + 0 ], 0x10, %r2544; + // end inline asm + add.s32 %r2537, %r5079, 22528; + selp.b32 %r2622, 16, 0, %p46; + selp.b32 %r2546, %r2622, 0, %p42; + // begin inline asm + cp.async.cg.shared.global [ %r2537 + 0 ], [ %rd1181 + 0 ], 0x10, %r2546; + // end inline asm + 
cp.async.commit_group; + add.s32 %r2539, %r5079, 65536; + // begin inline asm + cp.async.cg.shared.global [ %r2539 + 0 ], [ %rd1180 + 0 ], 0x10, %r2540; + // end inline asm + add.s32 %r2541, %r5079, 67584; + // begin inline asm + cp.async.cg.shared.global [ %r2541 + 0 ], [ %rd1179 + 0 ], 0x10, %r2542; + // end inline asm + add.s32 %r2543, %r5079, 69632; + // begin inline asm + cp.async.cg.shared.global [ %r2543 + 0 ], [ %rd1178 + 0 ], 0x10, %r2544; + // end inline asm + add.s32 %r2545, %r5079, 71680; + // begin inline asm + cp.async.cg.shared.global [ %r2545 + 0 ], [ %rd1177 + 0 ], 0x10, %r2546; + // end inline asm + cp.async.commit_group; + .loc 1 459 19 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:459:19 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + // begin inline asm + fence.proxy.async.shared::cta; + // end inline asm + mov.b32 %r14456, 0f00000000; + mov.b32 %r14457, %r14456; + mov.b32 %r14458, %r14456; + mov.b32 %r14459, %r14456; + mov.b32 %r14460, %r14456; + mov.b32 %r14461, %r14456; + mov.b32 %r14462, %r14456; + mov.b32 %r14463, %r14456; + mov.b32 %r14464, %r14456; + mov.b32 %r14465, %r14456; + mov.b32 %r14466, %r14456; + mov.b32 %r14467, %r14456; + mov.b32 %r14468, %r14456; + mov.b32 %r14469, %r14456; + mov.b32 %r14470, %r14456; + mov.b32 %r14471, %r14456; + mov.b32 %r14472, %r14456; + mov.b32 %r14473, %r14456; + mov.b32 %r14474, %r14456; + mov.b32 %r14475, %r14456; + mov.b32 %r14476, %r14456; + mov.b32 %r14477, %r14456; + mov.b32 %r14478, %r14456; + mov.b32 %r14479, %r14456; + mov.b32 %r14480, %r14456; + mov.b32 %r14481, %r14456; + mov.b32 %r14482, %r14456; + mov.b32 %r14483, %r14456; + mov.b32 %r14484, %r14456; + mov.b32 %r14485, %r14456; + mov.b32 %r14486, %r14456; + mov.b32 %r14487, %r14456; + mov.b32 %r14488, %r14456; + mov.b32 %r14489, %r14456; + mov.b32 %r14490, %r14456; + mov.b32 %r14491, %r14456; + mov.b32 %r14492, %r14456; + mov.b32 %r14493, %r14456; + mov.b32 %r14494, %r14456; + mov.b32 %r14495, %r14456; + 
mov.b32 %r14496, %r14456; + mov.b32 %r14497, %r14456; + mov.b32 %r14498, %r14456; + mov.b32 %r14499, %r14456; + mov.b32 %r14500, %r14456; + mov.b32 %r14501, %r14456; + mov.b32 %r14502, %r14456; + mov.b32 %r14503, %r14456; + mov.b32 %r14504, %r14456; + mov.b32 %r14505, %r14456; + mov.b32 %r14506, %r14456; + mov.b32 %r14507, %r14456; + mov.b32 %r14508, %r14456; + mov.b32 %r14509, %r14456; + mov.b32 %r14510, %r14456; + mov.b32 %r14511, %r14456; + mov.b32 %r14512, %r14456; + mov.b32 %r14513, %r14456; + mov.b32 %r14514, %r14456; + mov.b32 %r14515, %r14456; + mov.b32 %r14516, %r14456; + mov.b32 %r14517, %r14456; + mov.b32 %r14518, %r14456; + mov.b32 %r14519, %r14456; + .loc 1 397 28 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:397:28 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + @%p37 bra $L__BB0_4; +// %bb.2: // %.lr.ph + .loc 1 798 21 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:798:21 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + rem.s32 %r100, %r32, %r2358; + .loc 1 487 23 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:487:23 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + cvt.s64.s32 %rd295, %r100; + .loc 1 488 23 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:488:23 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + setp.gt.s64 %p1, %rd17, %rd295; + .loc 1 798 21 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:798:21 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + rem.s32 %r106, %r31, %r2358; + .loc 1 487 23 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:487:23 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + cvt.s64.s32 %rd296, %r106; + .loc 1 488 23 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:488:23 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + setp.gt.s64 %p3, %rd17, %rd296; +$L__tmp26: + .loc 1 199 29 // 
c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:199:29 + or.b32 %r14522, %r37, %r50; + or.b32 %r14521, %r37, %r51; + or.b32 %r14524, %r37, %r52; + or.b32 %r14523, %r37, %r53; + or.b32 %r14526, %r37, %r54; + or.b32 %r14525, %r37, %r55; + or.b32 %r14528, %r37, %r56; + or.b32 %r14527, %r37, %r57; + or.b32 %r14530, %r37, %r46; + or.b32 %r14529, %r37, %r47; + or.b32 %r14532, %r37, %r48; + or.b32 %r14531, %r37, %r49; + or.b32 %r14533, %r37, %r44; + or.b32 %r14534, %r37, %r45; + or.b32 %r14536, %r37, %r41; + or.b32 %r14535, %r37, %r42; + add.s32 %r97, %r59, -2; + add.s32 %r98, %r59, -1; +$L__tmp27: + .loc 1 397 28 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:397:28 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + max.s32 %r99, %r59, 1; + .loc 1 531 19 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:531:19 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + mov.b64 %rd28, {%r2510, %r2510}; + mov.b64 %rd29, {%r2509, %r2509}; + mov.b32 %r14456, 0f00000000; + mov.b32 %r14451, 1; + mov.b32 %r14450, -1; + mov.b32 %r14449, 64; + mov.b32 %r14457, %r14456; + mov.b32 %r14458, %r14456; + mov.b32 %r14459, %r14456; + mov.b32 %r14460, %r14456; + mov.b32 %r14461, %r14456; + mov.b32 %r14462, %r14456; + mov.b32 %r14463, %r14456; + mov.b32 %r14464, %r14456; + mov.b32 %r14465, %r14456; + mov.b32 %r14466, %r14456; + mov.b32 %r14467, %r14456; + mov.b32 %r14468, %r14456; + mov.b32 %r14469, %r14456; + mov.b32 %r14470, %r14456; + mov.b32 %r14471, %r14456; + mov.b32 %r14472, %r14456; + mov.b32 %r14473, %r14456; + mov.b32 %r14474, %r14456; + mov.b32 %r14475, %r14456; + mov.b32 %r14476, %r14456; + mov.b32 %r14477, %r14456; + mov.b32 %r14478, %r14456; + mov.b32 %r14479, %r14456; + mov.b32 %r14480, %r14456; + mov.b32 %r14481, %r14456; + mov.b32 %r14482, %r14456; + mov.b32 %r14483, %r14456; + mov.b32 %r14484, %r14456; + mov.b32 %r14485, %r14456; + mov.b32 %r14486, %r14456; + mov.b32 %r14487, %r14456; + mov.b32 %r14488, %r14456; + 
mov.b32 %r14489, %r14456; + mov.b32 %r14490, %r14456; + mov.b32 %r14491, %r14456; + mov.b32 %r14492, %r14456; + mov.b32 %r14493, %r14456; + mov.b32 %r14494, %r14456; + mov.b32 %r14495, %r14456; + mov.b32 %r14496, %r14456; + mov.b32 %r14497, %r14456; + mov.b32 %r14498, %r14456; + mov.b32 %r14499, %r14456; + mov.b32 %r14500, %r14456; + mov.b32 %r14501, %r14456; + mov.b32 %r14502, %r14456; + mov.b32 %r14503, %r14456; + mov.b32 %r14504, %r14456; + mov.b32 %r14505, %r14456; + mov.b32 %r14506, %r14456; + mov.b32 %r14507, %r14456; + mov.b32 %r14508, %r14456; + mov.b32 %r14509, %r14456; + mov.b32 %r14510, %r14456; + mov.b32 %r14511, %r14456; + mov.b32 %r14512, %r14456; + mov.b32 %r14513, %r14456; + mov.b32 %r14514, %r14456; + mov.b32 %r14515, %r14456; + mov.b32 %r14516, %r14456; + mov.b32 %r14517, %r14456; + mov.b32 %r14518, %r14456; + mov.b32 %r14519, %r14456; +$L__BB0_3: // %__nv_exp2f.exit1434 + // =>This Inner Loop Header: Depth=1 + .loc 1 397 28 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:397:28 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + setp.lt.s32 %p67, %r14520, %r97; + setp.lt.s32 %p65, %r14520, %r98; + add.s32 %r4286, %r14450, 1; + setp.gt.s32 %p68, %r4286, 2; + selp.b32 %r14450, 0, %r4286, %p68; + .loc 1 831 52 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:831:52 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + setp.lt.s32 %p69, %r14535, %r2359; + setp.lt.s32 %p70, %r14536, %r2359; + setp.lt.s32 %p71, %r14533, %r2359; + setp.lt.s32 %p72, %r14534, %r2359; + setp.lt.s32 %p73, %r14531, %r2359; + setp.lt.s32 %p74, %r14532, %r2359; + setp.lt.s32 %p75, %r14529, %r2359; + setp.lt.s32 %p76, %r14530, %r2359; + setp.lt.s32 %p77, %r14527, %r2359; + setp.lt.s32 %p78, %r14528, %r2359; + setp.lt.s32 %p79, %r14525, %r2359; + setp.lt.s32 %p80, %r14526, %r2359; + setp.lt.s32 %p81, %r14523, %r2359; + setp.lt.s32 %p82, %r14524, %r2359; + setp.lt.s32 %p83, %r14521, %r2359; + setp.lt.s32 %p84, %r14522, 
%r2359; + .loc 1 831 23 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:831:23 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + cp.async.wait_group 2; + bar.sync 0; + shl.b32 %r4287, %r14450, 14; + add.s32 %r3181, %r2590, %r4287; + .loc 1 459 19 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:459:19 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + shfl.sync.idx.b32 %r4289, %r11, 0, 31, -1; + wgmma.fence.sync.aligned; + shl.b32 %r4290, %r4289, 11; + and.b32 %r4291, %r4290, 8192; + add.s32 %r3140, %r2590, 98304; + add.s32 %r4292, %r4291, %r3140; + bfe.u32 %r4293, %r4292, 4, 14; + cvt.u64.u32 %rd347, %r4293; + or.b64 %rd297, %rd347, 4611686293372403712; + bfe.u32 %r4294, %r3181, 4, 14; + cvt.u64.u32 %rd348, %r4294; + or.b64 %rd298, %rd348, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r2724,%r2725,%r2726,%r2727,%r2728,%r2729,%r2730,%r2731,%r2732,%r2733,%r2734,%r2735,%r2736,%r2737,%r2738,%r2739,%r2740,%r2741,%r2742,%r2743,%r2744,%r2745,%r2746,%r2747,%r2748,%r2749,%r2750,%r2751,%r2752,%r2753,%r2754,%r2755}, %rd297, %rd298, 0, 1, 1, 0, 0; + // end inline asm + or.b32 %r4295, %r4291, 32; + add.s32 %r4296, %r4295, %r3140; + bfe.u32 %r4297, %r4296, 4, 14; + cvt.u64.u32 %rd349, %r4297; + or.b64 %rd299, %rd349, 4611686293372403712; + add.s32 %r4298, %r3181, 32; + bfe.u32 %r4299, %r4298, 4, 14; + cvt.u64.u32 %rd350, %r4299; + or.b64 %rd300, %rd350, 4611686293338849280; + mov.pred %p47, -1; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r2724,%r2725,%r2726,%r2727,%r2728,%r2729,%r2730,%r2731,%r2732,%r2733,%r2734,%r2735,%r2736,%r2737,%r2738,%r2739,%r2740,%r2741,%r2742,%r2743,%r2744,%r2745,%r2746,%r2747,%r2748,%r2749,%r2750,%r2751,%r2752,%r2753,%r2754,%r2755}, %rd299, %rd300, %p47, 1, 1, 0, 0; + // end inline asm + or.b32 %r4300, %r4291, 64; + add.s32 %r4301, %r4300, %r3140; + bfe.u32 %r4302, %r4301, 4, 14; + cvt.u64.u32 %rd351, 
%r4302; + or.b64 %rd301, %rd351, 4611686293372403712; + add.s32 %r4303, %r3181, 64; + bfe.u32 %r4304, %r4303, 4, 14; + cvt.u64.u32 %rd352, %r4304; + or.b64 %rd302, %rd352, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r2724,%r2725,%r2726,%r2727,%r2728,%r2729,%r2730,%r2731,%r2732,%r2733,%r2734,%r2735,%r2736,%r2737,%r2738,%r2739,%r2740,%r2741,%r2742,%r2743,%r2744,%r2745,%r2746,%r2747,%r2748,%r2749,%r2750,%r2751,%r2752,%r2753,%r2754,%r2755}, %rd301, %rd302, %p47, 1, 1, 0, 0; + // end inline asm + or.b32 %r4305, %r4291, 96; + add.s32 %r4306, %r4305, %r3140; + bfe.u32 %r4307, %r4306, 4, 14; + cvt.u64.u32 %rd353, %r4307; + or.b64 %rd303, %rd353, 4611686293372403712; + add.s32 %r4308, %r3181, 96; + bfe.u32 %r4309, %r4308, 4, 14; + cvt.u64.u32 %rd354, %r4309; + or.b64 %rd304, %rd354, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r2724,%r2725,%r2726,%r2727,%r2728,%r2729,%r2730,%r2731,%r2732,%r2733,%r2734,%r2735,%r2736,%r2737,%r2738,%r2739,%r2740,%r2741,%r2742,%r2743,%r2744,%r2745,%r2746,%r2747,%r2748,%r2749,%r2750,%r2751,%r2752,%r2753,%r2754,%r2755}, %rd303, %rd304, %p47, 1, 1, 0, 0; + // end inline asm + or.b32 %r4310, %r4291, 16384; + add.s32 %r4311, %r4310, %r3140; + bfe.u32 %r4312, %r4311, 4, 14; + cvt.u64.u32 %rd355, %r4312; + or.b64 %rd305, %rd355, 4611686293372403712; + add.s32 %r4313, %r3181, 8192; + bfe.u32 %r4314, %r4313, 4, 14; + cvt.u64.u32 %rd356, %r4314; + or.b64 %rd306, %rd356, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r2724,%r2725,%r2726,%r2727,%r2728,%r2729,%r2730,%r2731,%r2732,%r2733,%r2734,%r2735,%r2736,%r2737,%r2738,%r2739,%r2740,%r2741,%r2742,%r2743,%r2744,%r2745,%r2746,%r2747,%r2748,%r2749,%r2750,%r2751,%r2752,%r2753,%r2754,%r2755}, %rd305, %rd306, %p47, 1, 1, 0, 0; + // end inline asm + or.b32 %r4315, %r4291, 16416; + add.s32 %r4316, %r4315, %r3140; + bfe.u32 %r4317, %r4316, 4, 14; + 
cvt.u64.u32 %rd357, %r4317; + or.b64 %rd307, %rd357, 4611686293372403712; + add.s32 %r4318, %r3181, 8224; + bfe.u32 %r4319, %r4318, 4, 14; + cvt.u64.u32 %rd358, %r4319; + or.b64 %rd308, %rd358, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r2724,%r2725,%r2726,%r2727,%r2728,%r2729,%r2730,%r2731,%r2732,%r2733,%r2734,%r2735,%r2736,%r2737,%r2738,%r2739,%r2740,%r2741,%r2742,%r2743,%r2744,%r2745,%r2746,%r2747,%r2748,%r2749,%r2750,%r2751,%r2752,%r2753,%r2754,%r2755}, %rd307, %rd308, %p47, 1, 1, 0, 0; + // end inline asm + or.b32 %r4320, %r4291, 16448; + add.s32 %r4321, %r4320, %r3140; + bfe.u32 %r4322, %r4321, 4, 14; + cvt.u64.u32 %rd359, %r4322; + or.b64 %rd309, %rd359, 4611686293372403712; + add.s32 %r4323, %r3181, 8256; + bfe.u32 %r4324, %r4323, 4, 14; + cvt.u64.u32 %rd360, %r4324; + or.b64 %rd310, %rd360, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r2724,%r2725,%r2726,%r2727,%r2728,%r2729,%r2730,%r2731,%r2732,%r2733,%r2734,%r2735,%r2736,%r2737,%r2738,%r2739,%r2740,%r2741,%r2742,%r2743,%r2744,%r2745,%r2746,%r2747,%r2748,%r2749,%r2750,%r2751,%r2752,%r2753,%r2754,%r2755}, %rd309, %rd310, %p47, 1, 1, 0, 0; + // end inline asm + or.b32 %r4325, %r4291, 16480; + add.s32 %r4326, %r4325, %r3140; + bfe.u32 %r4327, %r4326, 4, 14; + cvt.u64.u32 %rd361, %r4327; + or.b64 %rd311, %rd361, 4611686293372403712; + add.s32 %r4328, %r3181, 8288; + bfe.u32 %r4329, %r4328, 4, 14; + cvt.u64.u32 %rd362, %r4329; + or.b64 %rd312, %rd362, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r2724,%r2725,%r2726,%r2727,%r2728,%r2729,%r2730,%r2731,%r2732,%r2733,%r2734,%r2735,%r2736,%r2737,%r2738,%r2739,%r2740,%r2741,%r2742,%r2743,%r2744,%r2745,%r2746,%r2747,%r2748,%r2749,%r2750,%r2751,%r2752,%r2753,%r2754,%r2755}, %rd311, %rd312, %p47, 1, 1, 0, 0; + // end inline asm + wgmma.commit_group.sync.aligned; + mov.b32 %r3697, 0; + mov.b32 %r3141, 
%r3697; + mov.b32 %r3142, %r3697; + mov.b32 %r3144, %r3697; + mov.b32 %r3145, %r3697; + mov.b32 %r3143, %r3181; + // begin inline asm + // wait for regs: %r2724,%r2725,%r2726,%r2727,%r2728,%r2729,%r2730,%r2731,%r2732,%r2733,%r2734,%r2735,%r2736,%r2737,%r2738,%r2739,%r2740,%r2741,%r2742,%r2743,%r2744,%r2745,%r2746,%r2747,%r2748,%r2749,%r2750,%r2751,%r2752,%r2753,%r2754,%r2755,%r3140,%r3141,%r3142,%r3143,%r3144,%r3145 + wgmma.wait_group.sync.aligned 0; + // end inline asm + .loc 1 461 14 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:461:14 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + mul.f32 %r4330, %r2724, 0f3DB504F3; + mul.f32 %r4331, %r2725, 0f3DB504F3; + mul.f32 %r4332, %r2726, 0f3DB504F3; + mul.f32 %r4333, %r2727, 0f3DB504F3; + mul.f32 %r4334, %r2728, 0f3DB504F3; + mul.f32 %r4335, %r2729, 0f3DB504F3; + mul.f32 %r4336, %r2730, 0f3DB504F3; + mul.f32 %r4337, %r2731, 0f3DB504F3; + mul.f32 %r4338, %r2732, 0f3DB504F3; + mul.f32 %r4339, %r2733, 0f3DB504F3; + mul.f32 %r4340, %r2734, 0f3DB504F3; + mul.f32 %r4341, %r2735, 0f3DB504F3; + mul.f32 %r4342, %r2736, 0f3DB504F3; + mul.f32 %r4343, %r2737, 0f3DB504F3; + mul.f32 %r4344, %r2738, 0f3DB504F3; + mul.f32 %r4345, %r2739, 0f3DB504F3; + mul.f32 %r4346, %r2740, 0f3DB504F3; + mul.f32 %r4347, %r2741, 0f3DB504F3; + mul.f32 %r4348, %r2742, 0f3DB504F3; + mul.f32 %r4349, %r2743, 0f3DB504F3; + mul.f32 %r4350, %r2744, 0f3DB504F3; + mul.f32 %r4351, %r2745, 0f3DB504F3; + mul.f32 %r4352, %r2746, 0f3DB504F3; + mul.f32 %r4353, %r2747, 0f3DB504F3; + mul.f32 %r4354, %r2748, 0f3DB504F3; + mul.f32 %r4355, %r2749, 0f3DB504F3; + mul.f32 %r4356, %r2750, 0f3DB504F3; + mul.f32 %r4357, %r2751, 0f3DB504F3; + mul.f32 %r4358, %r2752, 0f3DB504F3; + mul.f32 %r4359, %r2753, 0f3DB504F3; + mul.f32 %r4360, %r2754, 0f3DB504F3; + mul.f32 %r4361, %r2755, 0f3DB504F3; + .loc 1 798 21 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:798:21 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + 
rem.s32 %r4362, %r14536, %r2359; + rem.s32 %r4363, %r14535, %r2359; + .loc 1 482 23 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:482:23 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + setp.le.s32 %p85, %r4363, %r100; + setp.le.s32 %p86, %r4362, %r100; + .loc 1 490 23 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:490:23 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + and.pred %p87, %p1, %p86; + and.pred %p88, %p1, %p85; + .loc 1 493 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:493:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + setp.ge.s32 %p89, %r4362, %r2366; + setp.ge.s32 %p90, %r4363, %r2366; + .loc 1 494 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:494:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + rem.s32 %r4364, %r4363, %r2366; + rem.s32 %r4365, %r4362, %r2366; + .loc 1 496 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:496:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + setp.ne.b32 %p91, %r4365, 0; + setp.ne.b32 %p92, %r4364, 0; + .loc 1 499 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:499:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + xor.b32 %r4366, %r4364, %r2366; + xor.b32 %r4367, %r4365, %r2366; + .loc 1 502 39 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:502:39 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + shr.s32 %r4368, %r4366, 31; + and.b32 %r4369, %r4368, %r2366; + selp.b32 %r4370, %r4369, 0, %p92; + shr.s32 %r4371, %r4367, 31; + and.b32 %r4372, %r4371, %r2366; + selp.b32 %r4373, %r4372, 0, %p91; + add.s32 %r4374, %r4373, %r4365; + add.s32 %r4375, %r4370, %r4364; + .loc 1 504 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:504:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + cvt.s64.s32 %rd363, %r4375; + cvt.s64.s32 %rd364, %r4374; + 
setp.gt.s64 %p93, %rd17, %rd364; + setp.gt.s64 %p94, %rd17, %rd363; + .loc 1 505 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:505:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + and.pred %p95, %p90, %p94; + and.pred %p96, %p89, %p93; + .loc 1 506 23 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:506:23 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + sub.s32 %r4376, %r4362, %r100; + sub.s32 %r4377, %r4363, %r100; + .loc 1 507 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:507:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + rem.s32 %r4378, %r4377, %r2366; + rem.s32 %r4379, %r4376, %r2366; + .loc 1 508 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:508:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + setp.ne.b32 %p97, %r4379, 0; + setp.ne.b32 %p98, %r4378, 0; + .loc 1 510 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:510:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + xor.b32 %r4380, %r4379, %r2366; + xor.b32 %r4381, %r4378, %r2366; + .loc 1 513 39 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:513:39 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + shr.s32 %r4382, %r4381, 31; + and.b32 %r4383, %r4382, %r2366; + selp.b32 %r4384, %r4383, 0, %p98; + shr.s32 %r4385, %r4380, 31; + and.b32 %r4386, %r4385, %r2366; + selp.b32 %r4387, %r4386, 0, %p97; + .loc 1 514 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:514:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + neg.s32 %r4388, %r4387; + neg.s32 %r4389, %r4384; + setp.eq.b32 %p99, %r4378, %r4389; + setp.eq.b32 %p100, %r4379, %r4388; + .loc 1 515 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:515:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + and.pred %p101, %p96, %p100; + and.pred %p102, %p95, %p99; + .loc 1 516 24 // 
c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:516:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + or.pred %p103, %p88, %p102; + or.pred %p104, %p87, %p101; + .loc 1 482 23 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:482:23 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + setp.le.s32 %p105, %r4363, %r106; + setp.le.s32 %p106, %r4362, %r106; + .loc 1 490 23 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:490:23 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + and.pred %p107, %p3, %p106; + and.pred %p108, %p3, %p105; + .loc 1 506 23 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:506:23 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + sub.s32 %r4390, %r4362, %r106; + sub.s32 %r4391, %r4363, %r106; + .loc 1 507 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:507:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + rem.s32 %r4392, %r4391, %r2366; + rem.s32 %r4393, %r4390, %r2366; + .loc 1 508 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:508:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + setp.ne.b32 %p109, %r4393, 0; + setp.ne.b32 %p110, %r4392, 0; + .loc 1 510 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:510:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + xor.b32 %r4394, %r4393, %r2366; + xor.b32 %r4395, %r4392, %r2366; + .loc 1 513 39 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:513:39 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + shr.s32 %r4396, %r4395, 31; + and.b32 %r4397, %r4396, %r2366; + selp.b32 %r4398, %r4397, 0, %p110; + shr.s32 %r4399, %r4394, 31; + and.b32 %r4400, %r4399, %r2366; + selp.b32 %r4401, %r4400, 0, %p109; + .loc 1 514 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:514:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + 
neg.s32 %r4402, %r4401; + neg.s32 %r4403, %r4398; + setp.eq.b32 %p111, %r4392, %r4403; + setp.eq.b32 %p112, %r4393, %r4402; + .loc 1 515 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:515:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + and.pred %p113, %p96, %p112; + and.pred %p114, %p95, %p111; + .loc 1 516 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:516:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + or.pred %p115, %p108, %p114; + or.pred %p116, %p107, %p113; + .loc 1 521 69 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:521:69 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + and.pred %p117, %p116, %p70; + and.pred %p118, %p115, %p69; + and.pred %p119, %p104, %p70; + and.pred %p120, %p103, %p69; + .loc 1 798 21 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:798:21 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + rem.s32 %r4404, %r14534, %r2359; + rem.s32 %r4405, %r14533, %r2359; + .loc 1 482 23 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:482:23 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + setp.le.s32 %p121, %r4405, %r100; + setp.le.s32 %p122, %r4404, %r100; + .loc 1 490 23 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:490:23 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + and.pred %p123, %p1, %p122; + and.pred %p124, %p1, %p121; + .loc 1 493 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:493:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + setp.ge.s32 %p125, %r4404, %r2366; + setp.ge.s32 %p126, %r4405, %r2366; + .loc 1 494 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:494:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + rem.s32 %r4406, %r4405, %r2366; + rem.s32 %r4407, %r4404, %r2366; + .loc 1 496 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:496:25 
@[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + setp.ne.b32 %p127, %r4407, 0; + setp.ne.b32 %p128, %r4406, 0; + .loc 1 499 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:499:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + xor.b32 %r4408, %r4406, %r2366; + xor.b32 %r4409, %r4407, %r2366; + .loc 1 502 39 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:502:39 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + shr.s32 %r4410, %r4408, 31; + and.b32 %r4411, %r4410, %r2366; + selp.b32 %r4412, %r4411, 0, %p128; + shr.s32 %r4413, %r4409, 31; + and.b32 %r4414, %r4413, %r2366; + selp.b32 %r4415, %r4414, 0, %p127; + add.s32 %r4416, %r4415, %r4407; + add.s32 %r4417, %r4412, %r4406; + .loc 1 504 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:504:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + cvt.s64.s32 %rd365, %r4417; + cvt.s64.s32 %rd366, %r4416; + setp.gt.s64 %p129, %rd17, %rd366; + setp.gt.s64 %p130, %rd17, %rd365; + .loc 1 505 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:505:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + and.pred %p131, %p126, %p130; + and.pred %p132, %p125, %p129; + .loc 1 506 23 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:506:23 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + sub.s32 %r4418, %r4404, %r100; + sub.s32 %r4419, %r4405, %r100; + .loc 1 507 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:507:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + rem.s32 %r4420, %r4419, %r2366; + rem.s32 %r4421, %r4418, %r2366; + .loc 1 508 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:508:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + setp.ne.b32 %p133, %r4421, 0; + setp.ne.b32 %p134, %r4420, 0; + .loc 1 510 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:510:25 @[ 
c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + xor.b32 %r4422, %r4421, %r2366; + xor.b32 %r4423, %r4420, %r2366; + .loc 1 513 39 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:513:39 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + shr.s32 %r4424, %r4423, 31; + and.b32 %r4425, %r4424, %r2366; + selp.b32 %r4426, %r4425, 0, %p134; + shr.s32 %r4427, %r4422, 31; + and.b32 %r4428, %r4427, %r2366; + selp.b32 %r4429, %r4428, 0, %p133; + .loc 1 514 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:514:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + neg.s32 %r4430, %r4429; + neg.s32 %r4431, %r4426; + setp.eq.b32 %p135, %r4420, %r4431; + setp.eq.b32 %p136, %r4421, %r4430; + .loc 1 515 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:515:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + and.pred %p137, %p132, %p136; + and.pred %p138, %p131, %p135; + .loc 1 516 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:516:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + or.pred %p139, %p124, %p138; + or.pred %p140, %p123, %p137; + .loc 1 482 23 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:482:23 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + setp.le.s32 %p141, %r4405, %r106; + setp.le.s32 %p142, %r4404, %r106; + .loc 1 490 23 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:490:23 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + and.pred %p143, %p3, %p142; + and.pred %p144, %p3, %p141; + .loc 1 506 23 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:506:23 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + sub.s32 %r4432, %r4404, %r106; + sub.s32 %r4433, %r4405, %r106; + .loc 1 507 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:507:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + rem.s32 
%r4434, %r4433, %r2366; + rem.s32 %r4435, %r4432, %r2366; + .loc 1 508 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:508:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + setp.ne.b32 %p145, %r4435, 0; + setp.ne.b32 %p146, %r4434, 0; + .loc 1 510 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:510:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + xor.b32 %r4436, %r4435, %r2366; + xor.b32 %r4437, %r4434, %r2366; + .loc 1 513 39 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:513:39 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + shr.s32 %r4438, %r4437, 31; + and.b32 %r4439, %r4438, %r2366; + selp.b32 %r4440, %r4439, 0, %p146; + shr.s32 %r4441, %r4436, 31; + and.b32 %r4442, %r4441, %r2366; + selp.b32 %r4443, %r4442, 0, %p145; + .loc 1 514 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:514:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + neg.s32 %r4444, %r4443; + neg.s32 %r4445, %r4440; + setp.eq.b32 %p147, %r4434, %r4445; + setp.eq.b32 %p148, %r4435, %r4444; + .loc 1 515 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:515:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + and.pred %p149, %p132, %p148; + and.pred %p150, %p131, %p147; + .loc 1 516 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:516:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + or.pred %p151, %p144, %p150; + or.pred %p152, %p143, %p149; + .loc 1 521 69 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:521:69 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + and.pred %p153, %p152, %p72; + and.pred %p154, %p151, %p71; + and.pred %p155, %p140, %p72; + and.pred %p156, %p139, %p71; + .loc 1 798 21 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:798:21 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + rem.s32 %r4446, %r14532, 
%r2359; + rem.s32 %r4447, %r14531, %r2359; + .loc 1 482 23 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:482:23 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + setp.le.s32 %p157, %r4447, %r100; + setp.le.s32 %p158, %r4446, %r100; + .loc 1 490 23 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:490:23 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + and.pred %p159, %p1, %p158; + and.pred %p160, %p1, %p157; + .loc 1 493 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:493:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + setp.ge.s32 %p161, %r4446, %r2366; + setp.ge.s32 %p162, %r4447, %r2366; + .loc 1 494 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:494:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + rem.s32 %r4448, %r4447, %r2366; + rem.s32 %r4449, %r4446, %r2366; + .loc 1 496 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:496:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + setp.ne.b32 %p163, %r4449, 0; + setp.ne.b32 %p164, %r4448, 0; + .loc 1 499 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:499:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + xor.b32 %r4450, %r4448, %r2366; + xor.b32 %r4451, %r4449, %r2366; + .loc 1 502 39 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:502:39 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + shr.s32 %r4452, %r4450, 31; + and.b32 %r4453, %r4452, %r2366; + selp.b32 %r4454, %r4453, 0, %p164; + shr.s32 %r4455, %r4451, 31; + and.b32 %r4456, %r4455, %r2366; + selp.b32 %r4457, %r4456, 0, %p163; + add.s32 %r4458, %r4457, %r4449; + add.s32 %r4459, %r4454, %r4448; + .loc 1 504 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:504:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + cvt.s64.s32 %rd367, %r4459; + cvt.s64.s32 %rd368, %r4458; + setp.gt.s64 %p165, 
%rd17, %rd368; + setp.gt.s64 %p166, %rd17, %rd367; + .loc 1 505 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:505:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + and.pred %p167, %p162, %p166; + and.pred %p168, %p161, %p165; + .loc 1 506 23 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:506:23 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + sub.s32 %r4460, %r4446, %r100; + sub.s32 %r4461, %r4447, %r100; + .loc 1 507 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:507:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + rem.s32 %r4462, %r4461, %r2366; + rem.s32 %r4463, %r4460, %r2366; + .loc 1 508 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:508:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + setp.ne.b32 %p169, %r4463, 0; + setp.ne.b32 %p170, %r4462, 0; + .loc 1 510 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:510:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + xor.b32 %r4464, %r4463, %r2366; + xor.b32 %r4465, %r4462, %r2366; + .loc 1 513 39 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:513:39 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + shr.s32 %r4466, %r4465, 31; + and.b32 %r4467, %r4466, %r2366; + selp.b32 %r4468, %r4467, 0, %p170; + shr.s32 %r4469, %r4464, 31; + and.b32 %r4470, %r4469, %r2366; + selp.b32 %r4471, %r4470, 0, %p169; + .loc 1 514 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:514:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + neg.s32 %r4472, %r4471; + neg.s32 %r4473, %r4468; + setp.eq.b32 %p171, %r4462, %r4473; + setp.eq.b32 %p172, %r4463, %r4472; + .loc 1 515 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:515:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + and.pred %p173, %p168, %p172; + and.pred %p174, %p167, %p171; + .loc 1 516 24 // 
c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:516:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + or.pred %p175, %p160, %p174; + or.pred %p176, %p159, %p173; + .loc 1 482 23 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:482:23 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + setp.le.s32 %p177, %r4447, %r106; + setp.le.s32 %p178, %r4446, %r106; + .loc 1 490 23 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:490:23 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + and.pred %p179, %p3, %p178; + and.pred %p180, %p3, %p177; + .loc 1 506 23 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:506:23 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + sub.s32 %r4474, %r4446, %r106; + sub.s32 %r4475, %r4447, %r106; + .loc 1 507 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:507:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + rem.s32 %r4476, %r4475, %r2366; + rem.s32 %r4477, %r4474, %r2366; + .loc 1 508 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:508:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + setp.ne.b32 %p181, %r4477, 0; + setp.ne.b32 %p182, %r4476, 0; + .loc 1 510 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:510:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + xor.b32 %r4478, %r4477, %r2366; + xor.b32 %r4479, %r4476, %r2366; + .loc 1 513 39 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:513:39 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + shr.s32 %r4480, %r4479, 31; + and.b32 %r4481, %r4480, %r2366; + selp.b32 %r4482, %r4481, 0, %p182; + shr.s32 %r4483, %r4478, 31; + and.b32 %r4484, %r4483, %r2366; + selp.b32 %r4485, %r4484, 0, %p181; + .loc 1 514 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:514:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + 
neg.s32 %r4486, %r4485; + neg.s32 %r4487, %r4482; + setp.eq.b32 %p183, %r4476, %r4487; + setp.eq.b32 %p184, %r4477, %r4486; + .loc 1 515 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:515:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + and.pred %p185, %p168, %p184; + and.pred %p186, %p167, %p183; + .loc 1 516 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:516:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + or.pred %p187, %p180, %p186; + or.pred %p188, %p179, %p185; + .loc 1 521 69 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:521:69 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + and.pred %p189, %p188, %p74; + and.pred %p190, %p187, %p73; + and.pred %p191, %p176, %p74; + and.pred %p192, %p175, %p73; + .loc 1 798 21 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:798:21 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + rem.s32 %r4488, %r14530, %r2359; + rem.s32 %r4489, %r14529, %r2359; + .loc 1 482 23 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:482:23 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + setp.le.s32 %p193, %r4489, %r100; + setp.le.s32 %p194, %r4488, %r100; + .loc 1 490 23 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:490:23 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + and.pred %p195, %p1, %p194; + and.pred %p196, %p1, %p193; + .loc 1 493 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:493:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + setp.ge.s32 %p197, %r4488, %r2366; + setp.ge.s32 %p198, %r4489, %r2366; + .loc 1 494 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:494:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + rem.s32 %r4490, %r4489, %r2366; + rem.s32 %r4491, %r4488, %r2366; + .loc 1 496 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:496:25 
@[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + setp.ne.b32 %p199, %r4491, 0; + setp.ne.b32 %p200, %r4490, 0; + .loc 1 499 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:499:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + xor.b32 %r4492, %r4490, %r2366; + xor.b32 %r4493, %r4491, %r2366; + .loc 1 502 39 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:502:39 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + shr.s32 %r4494, %r4492, 31; + and.b32 %r4495, %r4494, %r2366; + selp.b32 %r4496, %r4495, 0, %p200; + shr.s32 %r4497, %r4493, 31; + and.b32 %r4498, %r4497, %r2366; + selp.b32 %r4499, %r4498, 0, %p199; + add.s32 %r4500, %r4499, %r4491; + add.s32 %r4501, %r4496, %r4490; + .loc 1 504 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:504:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + cvt.s64.s32 %rd369, %r4501; + cvt.s64.s32 %rd370, %r4500; + setp.gt.s64 %p201, %rd17, %rd370; + setp.gt.s64 %p202, %rd17, %rd369; + .loc 1 505 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:505:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + and.pred %p203, %p198, %p202; + and.pred %p204, %p197, %p201; + .loc 1 506 23 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:506:23 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + sub.s32 %r4502, %r4488, %r100; + sub.s32 %r4503, %r4489, %r100; + .loc 1 507 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:507:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + rem.s32 %r4504, %r4503, %r2366; + rem.s32 %r4505, %r4502, %r2366; + .loc 1 508 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:508:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + setp.ne.b32 %p205, %r4505, 0; + setp.ne.b32 %p206, %r4504, 0; + .loc 1 510 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:510:25 @[ 
c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + xor.b32 %r4506, %r4505, %r2366; + xor.b32 %r4507, %r4504, %r2366; + .loc 1 513 39 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:513:39 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + shr.s32 %r4508, %r4507, 31; + and.b32 %r4509, %r4508, %r2366; + selp.b32 %r4510, %r4509, 0, %p206; + shr.s32 %r4511, %r4506, 31; + and.b32 %r4512, %r4511, %r2366; + selp.b32 %r4513, %r4512, 0, %p205; + .loc 1 514 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:514:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + neg.s32 %r4514, %r4513; + neg.s32 %r4515, %r4510; + setp.eq.b32 %p207, %r4504, %r4515; + setp.eq.b32 %p208, %r4505, %r4514; + .loc 1 515 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:515:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + and.pred %p209, %p204, %p208; + and.pred %p210, %p203, %p207; + .loc 1 516 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:516:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + or.pred %p211, %p196, %p210; + or.pred %p212, %p195, %p209; + .loc 1 482 23 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:482:23 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + setp.le.s32 %p213, %r4489, %r106; + setp.le.s32 %p214, %r4488, %r106; + .loc 1 490 23 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:490:23 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + and.pred %p215, %p3, %p214; + and.pred %p216, %p3, %p213; + .loc 1 506 23 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:506:23 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + sub.s32 %r4516, %r4488, %r106; + sub.s32 %r4517, %r4489, %r106; + .loc 1 507 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:507:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + rem.s32 
%r4518, %r4517, %r2366; + rem.s32 %r4519, %r4516, %r2366; + .loc 1 508 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:508:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + setp.ne.b32 %p217, %r4519, 0; + setp.ne.b32 %p218, %r4518, 0; + .loc 1 510 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:510:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + xor.b32 %r4520, %r4519, %r2366; + xor.b32 %r4521, %r4518, %r2366; + .loc 1 513 39 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:513:39 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + shr.s32 %r4522, %r4521, 31; + and.b32 %r4523, %r4522, %r2366; + selp.b32 %r4524, %r4523, 0, %p218; + shr.s32 %r4525, %r4520, 31; + and.b32 %r4526, %r4525, %r2366; + selp.b32 %r4527, %r4526, 0, %p217; + .loc 1 514 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:514:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + neg.s32 %r4528, %r4527; + neg.s32 %r4529, %r4524; + setp.eq.b32 %p219, %r4518, %r4529; + setp.eq.b32 %p220, %r4519, %r4528; + .loc 1 515 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:515:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + and.pred %p221, %p204, %p220; + and.pred %p222, %p203, %p219; + .loc 1 516 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:516:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + or.pred %p223, %p216, %p222; + or.pred %p224, %p215, %p221; + .loc 1 521 69 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:521:69 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + and.pred %p225, %p224, %p76; + and.pred %p226, %p223, %p75; + and.pred %p227, %p212, %p76; + and.pred %p228, %p211, %p75; + .loc 1 798 21 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:798:21 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + rem.s32 %r4530, %r14528, 
%r2359; + rem.s32 %r4531, %r14527, %r2359; + .loc 1 482 23 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:482:23 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + setp.le.s32 %p229, %r4531, %r100; + setp.le.s32 %p230, %r4530, %r100; + .loc 1 490 23 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:490:23 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + and.pred %p231, %p1, %p230; + and.pred %p232, %p1, %p229; + .loc 1 493 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:493:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + setp.ge.s32 %p233, %r4530, %r2366; + setp.ge.s32 %p234, %r4531, %r2366; + .loc 1 494 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:494:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + rem.s32 %r4532, %r4531, %r2366; + rem.s32 %r4533, %r4530, %r2366; + .loc 1 496 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:496:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + setp.ne.b32 %p235, %r4533, 0; + setp.ne.b32 %p236, %r4532, 0; + .loc 1 499 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:499:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + xor.b32 %r4534, %r4532, %r2366; + xor.b32 %r4535, %r4533, %r2366; + .loc 1 502 39 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:502:39 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + shr.s32 %r4536, %r4534, 31; + and.b32 %r4537, %r4536, %r2366; + selp.b32 %r4538, %r4537, 0, %p236; + shr.s32 %r4539, %r4535, 31; + and.b32 %r4540, %r4539, %r2366; + selp.b32 %r4541, %r4540, 0, %p235; + add.s32 %r4542, %r4541, %r4533; + add.s32 %r4543, %r4538, %r4532; + .loc 1 504 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:504:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + cvt.s64.s32 %rd371, %r4543; + cvt.s64.s32 %rd372, %r4542; + setp.gt.s64 %p237, 
%rd17, %rd372; + setp.gt.s64 %p238, %rd17, %rd371; + .loc 1 505 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:505:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + and.pred %p239, %p234, %p238; + and.pred %p240, %p233, %p237; + .loc 1 506 23 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:506:23 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + sub.s32 %r4544, %r4530, %r100; + sub.s32 %r4545, %r4531, %r100; + .loc 1 507 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:507:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + rem.s32 %r4546, %r4545, %r2366; + rem.s32 %r4547, %r4544, %r2366; + .loc 1 508 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:508:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + setp.ne.b32 %p241, %r4547, 0; + setp.ne.b32 %p242, %r4546, 0; + .loc 1 510 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:510:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + xor.b32 %r4548, %r4547, %r2366; + xor.b32 %r4549, %r4546, %r2366; + .loc 1 513 39 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:513:39 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + shr.s32 %r4550, %r4549, 31; + and.b32 %r4551, %r4550, %r2366; + selp.b32 %r4552, %r4551, 0, %p242; + shr.s32 %r4553, %r4548, 31; + and.b32 %r4554, %r4553, %r2366; + selp.b32 %r4555, %r4554, 0, %p241; + .loc 1 514 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:514:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + neg.s32 %r4556, %r4555; + neg.s32 %r4557, %r4552; + setp.eq.b32 %p243, %r4546, %r4557; + setp.eq.b32 %p244, %r4547, %r4556; + .loc 1 515 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:515:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + and.pred %p245, %p240, %p244; + and.pred %p246, %p239, %p243; + .loc 1 516 24 // 
c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:516:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + or.pred %p247, %p232, %p246; + or.pred %p248, %p231, %p245; + .loc 1 482 23 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:482:23 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + setp.le.s32 %p249, %r4531, %r106; + setp.le.s32 %p250, %r4530, %r106; + .loc 1 490 23 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:490:23 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + and.pred %p251, %p3, %p250; + and.pred %p252, %p3, %p249; + .loc 1 506 23 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:506:23 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + sub.s32 %r4558, %r4530, %r106; + sub.s32 %r4559, %r4531, %r106; + .loc 1 507 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:507:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + rem.s32 %r4560, %r4559, %r2366; + rem.s32 %r4561, %r4558, %r2366; + .loc 1 508 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:508:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + setp.ne.b32 %p253, %r4561, 0; + setp.ne.b32 %p254, %r4560, 0; + .loc 1 510 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:510:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + xor.b32 %r4562, %r4561, %r2366; + xor.b32 %r4563, %r4560, %r2366; + .loc 1 513 39 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:513:39 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + shr.s32 %r4564, %r4563, 31; + and.b32 %r4565, %r4564, %r2366; + selp.b32 %r4566, %r4565, 0, %p254; + shr.s32 %r4567, %r4562, 31; + and.b32 %r4568, %r4567, %r2366; + selp.b32 %r4569, %r4568, 0, %p253; + .loc 1 514 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:514:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + 
neg.s32 %r4570, %r4569; + neg.s32 %r4571, %r4566; + setp.eq.b32 %p255, %r4560, %r4571; + setp.eq.b32 %p256, %r4561, %r4570; + .loc 1 515 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:515:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + and.pred %p257, %p240, %p256; + and.pred %p258, %p239, %p255; + .loc 1 516 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:516:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + or.pred %p259, %p252, %p258; + or.pred %p260, %p251, %p257; + .loc 1 521 69 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:521:69 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + and.pred %p261, %p260, %p78; + and.pred %p262, %p259, %p77; + and.pred %p263, %p248, %p78; + and.pred %p264, %p247, %p77; + .loc 1 798 21 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:798:21 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + rem.s32 %r4572, %r14526, %r2359; + rem.s32 %r4573, %r14525, %r2359; + .loc 1 482 23 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:482:23 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + setp.le.s32 %p265, %r4573, %r100; + setp.le.s32 %p266, %r4572, %r100; + .loc 1 490 23 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:490:23 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + and.pred %p267, %p1, %p266; + and.pred %p268, %p1, %p265; + .loc 1 493 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:493:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + setp.ge.s32 %p269, %r4572, %r2366; + setp.ge.s32 %p270, %r4573, %r2366; + .loc 1 494 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:494:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + rem.s32 %r4574, %r4573, %r2366; + rem.s32 %r4575, %r4572, %r2366; + .loc 1 496 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:496:25 
@[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + setp.ne.b32 %p271, %r4575, 0; + setp.ne.b32 %p272, %r4574, 0; + .loc 1 499 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:499:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + xor.b32 %r4576, %r4574, %r2366; + xor.b32 %r4577, %r4575, %r2366; + .loc 1 502 39 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:502:39 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + shr.s32 %r4578, %r4576, 31; + and.b32 %r4579, %r4578, %r2366; + selp.b32 %r4580, %r4579, 0, %p272; + shr.s32 %r4581, %r4577, 31; + and.b32 %r4582, %r4581, %r2366; + selp.b32 %r4583, %r4582, 0, %p271; + add.s32 %r4584, %r4583, %r4575; + add.s32 %r4585, %r4580, %r4574; + .loc 1 504 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:504:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + cvt.s64.s32 %rd373, %r4585; + cvt.s64.s32 %rd374, %r4584; + setp.gt.s64 %p273, %rd17, %rd374; + setp.gt.s64 %p274, %rd17, %rd373; + .loc 1 505 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:505:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + and.pred %p275, %p270, %p274; + and.pred %p276, %p269, %p273; + .loc 1 506 23 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:506:23 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + sub.s32 %r4586, %r4572, %r100; + sub.s32 %r4587, %r4573, %r100; + .loc 1 507 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:507:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + rem.s32 %r4588, %r4587, %r2366; + rem.s32 %r4589, %r4586, %r2366; + .loc 1 508 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:508:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + setp.ne.b32 %p277, %r4589, 0; + setp.ne.b32 %p278, %r4588, 0; + .loc 1 510 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:510:25 @[ 
c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + xor.b32 %r4590, %r4589, %r2366; + xor.b32 %r4591, %r4588, %r2366; + .loc 1 513 39 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:513:39 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + shr.s32 %r4592, %r4591, 31; + and.b32 %r4593, %r4592, %r2366; + selp.b32 %r4594, %r4593, 0, %p278; + shr.s32 %r4595, %r4590, 31; + and.b32 %r4596, %r4595, %r2366; + selp.b32 %r4597, %r4596, 0, %p277; + .loc 1 514 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:514:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + neg.s32 %r4598, %r4597; + neg.s32 %r4599, %r4594; + setp.eq.b32 %p279, %r4588, %r4599; + setp.eq.b32 %p280, %r4589, %r4598; + .loc 1 515 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:515:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + and.pred %p281, %p276, %p280; + and.pred %p282, %p275, %p279; + .loc 1 516 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:516:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + or.pred %p283, %p268, %p282; + or.pred %p284, %p267, %p281; + .loc 1 482 23 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:482:23 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + setp.le.s32 %p285, %r4573, %r106; + setp.le.s32 %p286, %r4572, %r106; + .loc 1 490 23 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:490:23 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + and.pred %p287, %p3, %p286; + and.pred %p288, %p3, %p285; + .loc 1 506 23 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:506:23 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + sub.s32 %r4600, %r4572, %r106; + sub.s32 %r4601, %r4573, %r106; + .loc 1 507 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:507:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + rem.s32 
%r4602, %r4601, %r2366; + rem.s32 %r4603, %r4600, %r2366; + .loc 1 508 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:508:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + setp.ne.b32 %p289, %r4603, 0; + setp.ne.b32 %p290, %r4602, 0; + .loc 1 510 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:510:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + xor.b32 %r4604, %r4603, %r2366; + xor.b32 %r4605, %r4602, %r2366; + .loc 1 513 39 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:513:39 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + shr.s32 %r4606, %r4605, 31; + and.b32 %r4607, %r4606, %r2366; + selp.b32 %r4608, %r4607, 0, %p290; + shr.s32 %r4609, %r4604, 31; + and.b32 %r4610, %r4609, %r2366; + selp.b32 %r4611, %r4610, 0, %p289; + .loc 1 514 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:514:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + neg.s32 %r4612, %r4611; + neg.s32 %r4613, %r4608; + setp.eq.b32 %p291, %r4602, %r4613; + setp.eq.b32 %p292, %r4603, %r4612; + .loc 1 515 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:515:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + and.pred %p293, %p276, %p292; + and.pred %p294, %p275, %p291; + .loc 1 516 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:516:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + or.pred %p295, %p288, %p294; + or.pred %p296, %p287, %p293; + .loc 1 521 69 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:521:69 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + and.pred %p297, %p296, %p80; + and.pred %p298, %p295, %p79; + and.pred %p299, %p284, %p80; + and.pred %p300, %p283, %p79; + .loc 1 798 21 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:798:21 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + rem.s32 %r4614, %r14524, 
%r2359; + rem.s32 %r4615, %r14523, %r2359; + .loc 1 482 23 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:482:23 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + setp.le.s32 %p301, %r4615, %r100; + setp.le.s32 %p302, %r4614, %r100; + .loc 1 490 23 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:490:23 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + and.pred %p303, %p1, %p302; + and.pred %p304, %p1, %p301; + .loc 1 493 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:493:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + setp.ge.s32 %p305, %r4614, %r2366; + setp.ge.s32 %p306, %r4615, %r2366; + .loc 1 494 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:494:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + rem.s32 %r4616, %r4615, %r2366; + rem.s32 %r4617, %r4614, %r2366; + .loc 1 496 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:496:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + setp.ne.b32 %p307, %r4617, 0; + setp.ne.b32 %p308, %r4616, 0; + .loc 1 499 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:499:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + xor.b32 %r4618, %r4616, %r2366; + xor.b32 %r4619, %r4617, %r2366; + .loc 1 502 39 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:502:39 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + shr.s32 %r4620, %r4618, 31; + and.b32 %r4621, %r4620, %r2366; + selp.b32 %r4622, %r4621, 0, %p308; + shr.s32 %r4623, %r4619, 31; + and.b32 %r4624, %r4623, %r2366; + selp.b32 %r4625, %r4624, 0, %p307; + add.s32 %r4626, %r4625, %r4617; + add.s32 %r4627, %r4622, %r4616; + .loc 1 504 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:504:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + cvt.s64.s32 %rd375, %r4627; + cvt.s64.s32 %rd376, %r4626; + setp.gt.s64 %p309, 
%rd17, %rd376; + setp.gt.s64 %p310, %rd17, %rd375; + .loc 1 505 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:505:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + and.pred %p311, %p306, %p310; + and.pred %p312, %p305, %p309; + .loc 1 506 23 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:506:23 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + sub.s32 %r4628, %r4614, %r100; + sub.s32 %r4629, %r4615, %r100; + .loc 1 507 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:507:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + rem.s32 %r4630, %r4629, %r2366; + rem.s32 %r4631, %r4628, %r2366; + .loc 1 508 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:508:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + setp.ne.b32 %p313, %r4631, 0; + setp.ne.b32 %p314, %r4630, 0; + .loc 1 510 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:510:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + xor.b32 %r4632, %r4631, %r2366; + xor.b32 %r4633, %r4630, %r2366; + .loc 1 513 39 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:513:39 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + shr.s32 %r4634, %r4633, 31; + and.b32 %r4635, %r4634, %r2366; + selp.b32 %r4636, %r4635, 0, %p314; + shr.s32 %r4637, %r4632, 31; + and.b32 %r4638, %r4637, %r2366; + selp.b32 %r4639, %r4638, 0, %p313; + .loc 1 514 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:514:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + neg.s32 %r4640, %r4639; + neg.s32 %r4641, %r4636; + setp.eq.b32 %p315, %r4630, %r4641; + setp.eq.b32 %p316, %r4631, %r4640; + .loc 1 515 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:515:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + and.pred %p317, %p312, %p316; + and.pred %p318, %p311, %p315; + .loc 1 516 24 // 
c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:516:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + or.pred %p319, %p304, %p318; + or.pred %p320, %p303, %p317; + .loc 1 482 23 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:482:23 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + setp.le.s32 %p321, %r4615, %r106; + setp.le.s32 %p322, %r4614, %r106; + .loc 1 490 23 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:490:23 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + and.pred %p323, %p3, %p322; + and.pred %p324, %p3, %p321; + .loc 1 506 23 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:506:23 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + sub.s32 %r4642, %r4614, %r106; + sub.s32 %r4643, %r4615, %r106; + .loc 1 507 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:507:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + rem.s32 %r4644, %r4643, %r2366; + rem.s32 %r4645, %r4642, %r2366; + .loc 1 508 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:508:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + setp.ne.b32 %p325, %r4645, 0; + setp.ne.b32 %p326, %r4644, 0; + .loc 1 510 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:510:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + xor.b32 %r4646, %r4645, %r2366; + xor.b32 %r4647, %r4644, %r2366; + .loc 1 513 39 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:513:39 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + shr.s32 %r4648, %r4647, 31; + and.b32 %r4649, %r4648, %r2366; + selp.b32 %r4650, %r4649, 0, %p326; + shr.s32 %r4651, %r4646, 31; + and.b32 %r4652, %r4651, %r2366; + selp.b32 %r4653, %r4652, 0, %p325; + .loc 1 514 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:514:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + 
neg.s32 %r4654, %r4653; + neg.s32 %r4655, %r4650; + setp.eq.b32 %p327, %r4644, %r4655; + setp.eq.b32 %p328, %r4645, %r4654; + .loc 1 515 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:515:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + and.pred %p329, %p312, %p328; + and.pred %p330, %p311, %p327; + .loc 1 516 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:516:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + or.pred %p331, %p324, %p330; + or.pred %p332, %p323, %p329; + .loc 1 521 69 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:521:69 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + and.pred %p333, %p332, %p82; + and.pred %p334, %p331, %p81; + and.pred %p335, %p320, %p82; + and.pred %p336, %p319, %p81; + .loc 1 798 21 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:798:21 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + rem.s32 %r4656, %r14522, %r2359; + rem.s32 %r4657, %r14521, %r2359; + .loc 1 482 23 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:482:23 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + setp.le.s32 %p337, %r4657, %r100; + setp.le.s32 %p338, %r4656, %r100; + .loc 1 490 23 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:490:23 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + and.pred %p339, %p1, %p338; + and.pred %p340, %p1, %p337; + .loc 1 493 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:493:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + setp.ge.s32 %p341, %r4656, %r2366; + setp.ge.s32 %p342, %r4657, %r2366; + .loc 1 494 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:494:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + rem.s32 %r4658, %r4657, %r2366; + rem.s32 %r4659, %r4656, %r2366; + .loc 1 496 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:496:25 
@[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + setp.ne.b32 %p343, %r4659, 0; + setp.ne.b32 %p344, %r4658, 0; + .loc 1 499 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:499:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + xor.b32 %r4660, %r4658, %r2366; + xor.b32 %r4661, %r4659, %r2366; + .loc 1 502 39 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:502:39 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + shr.s32 %r4662, %r4660, 31; + and.b32 %r4663, %r4662, %r2366; + selp.b32 %r4664, %r4663, 0, %p344; + shr.s32 %r4665, %r4661, 31; + and.b32 %r4666, %r4665, %r2366; + selp.b32 %r4667, %r4666, 0, %p343; + add.s32 %r4668, %r4667, %r4659; + add.s32 %r4669, %r4664, %r4658; + .loc 1 504 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:504:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + cvt.s64.s32 %rd377, %r4669; + cvt.s64.s32 %rd378, %r4668; + setp.gt.s64 %p345, %rd17, %rd378; + setp.gt.s64 %p346, %rd17, %rd377; + .loc 1 505 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:505:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + and.pred %p347, %p342, %p346; + and.pred %p348, %p341, %p345; + .loc 1 506 23 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:506:23 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + sub.s32 %r4670, %r4656, %r100; + sub.s32 %r4671, %r4657, %r100; + .loc 1 507 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:507:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + rem.s32 %r4672, %r4671, %r2366; + rem.s32 %r4673, %r4670, %r2366; + .loc 1 508 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:508:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + setp.ne.b32 %p349, %r4673, 0; + setp.ne.b32 %p350, %r4672, 0; + .loc 1 510 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:510:25 @[ 
c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + xor.b32 %r4674, %r4673, %r2366; + xor.b32 %r4675, %r4672, %r2366; + .loc 1 513 39 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:513:39 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + shr.s32 %r4676, %r4675, 31; + and.b32 %r4677, %r4676, %r2366; + selp.b32 %r4678, %r4677, 0, %p350; + shr.s32 %r4679, %r4674, 31; + and.b32 %r4680, %r4679, %r2366; + selp.b32 %r4681, %r4680, 0, %p349; + .loc 1 514 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:514:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + neg.s32 %r4682, %r4681; + neg.s32 %r4683, %r4678; + setp.eq.b32 %p351, %r4672, %r4683; + setp.eq.b32 %p352, %r4673, %r4682; + .loc 1 515 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:515:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + and.pred %p353, %p348, %p352; + and.pred %p354, %p347, %p351; + .loc 1 516 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:516:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + or.pred %p355, %p340, %p354; + or.pred %p356, %p339, %p353; + .loc 1 482 23 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:482:23 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + setp.le.s32 %p357, %r4657, %r106; + setp.le.s32 %p358, %r4656, %r106; + .loc 1 490 23 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:490:23 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + and.pred %p359, %p3, %p358; + and.pred %p360, %p3, %p357; + .loc 1 506 23 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:506:23 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + sub.s32 %r4684, %r4656, %r106; + sub.s32 %r4685, %r4657, %r106; + .loc 1 507 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:507:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + rem.s32 
%r4686, %r4685, %r2366; + rem.s32 %r4687, %r4684, %r2366; + .loc 1 508 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:508:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + setp.ne.b32 %p361, %r4687, 0; + setp.ne.b32 %p362, %r4686, 0; + .loc 1 510 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:510:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + xor.b32 %r4688, %r4687, %r2366; + xor.b32 %r4689, %r4686, %r2366; + .loc 1 513 39 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:513:39 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + shr.s32 %r4690, %r4689, 31; + and.b32 %r4691, %r4690, %r2366; + selp.b32 %r4692, %r4691, 0, %p362; + shr.s32 %r4693, %r4688, 31; + and.b32 %r4694, %r4693, %r2366; + selp.b32 %r4695, %r4694, 0, %p361; + .loc 1 514 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:514:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + neg.s32 %r4696, %r4695; + neg.s32 %r4697, %r4692; + setp.eq.b32 %p363, %r4686, %r4697; + setp.eq.b32 %p364, %r4687, %r4696; + .loc 1 515 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:515:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + and.pred %p365, %p348, %p364; + and.pred %p366, %p347, %p363; + .loc 1 516 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:516:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + or.pred %p367, %p360, %p366; + or.pred %p368, %p359, %p365; + .loc 1 521 69 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:521:69 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + and.pred %p369, %p368, %p84; + and.pred %p370, %p367, %p83; + and.pred %p371, %p356, %p84; + and.pred %p372, %p355, %p83; + .loc 1 524 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:524:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + mul.f32 %r4698, %r4330, 
0f3FB8AA3B; + .loc 1 521 69 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:521:69 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + selp.f32 %r4699, %r4698, 0fFF800000, %p118; + .loc 1 524 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:524:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + mul.f32 %r4700, %r4331, 0f3FB8AA3B; + .loc 1 521 69 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:521:69 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + selp.f32 %r4701, %r4700, 0fFF800000, %p117; + .loc 1 524 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:524:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + mul.f32 %r4702, %r4332, 0f3FB8AA3B; + .loc 1 521 69 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:521:69 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + selp.f32 %r4703, %r4702, 0fFF800000, %p120; + .loc 1 524 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:524:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + mul.f32 %r4704, %r4333, 0f3FB8AA3B; + .loc 1 521 69 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:521:69 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + selp.f32 %r4705, %r4704, 0fFF800000, %p119; + .loc 1 524 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:524:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + mul.f32 %r4706, %r4334, 0f3FB8AA3B; + .loc 1 521 69 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:521:69 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + selp.f32 %r4707, %r4706, 0fFF800000, %p154; + .loc 1 524 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:524:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + mul.f32 %r4708, %r4335, 0f3FB8AA3B; + .loc 1 521 69 // 
c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:521:69 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + selp.f32 %r4709, %r4708, 0fFF800000, %p153; + .loc 1 524 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:524:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + mul.f32 %r4710, %r4336, 0f3FB8AA3B; + .loc 1 521 69 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:521:69 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + selp.f32 %r4711, %r4710, 0fFF800000, %p156; + .loc 1 524 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:524:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + mul.f32 %r4712, %r4337, 0f3FB8AA3B; + .loc 1 521 69 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:521:69 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + selp.f32 %r4713, %r4712, 0fFF800000, %p155; + .loc 1 524 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:524:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + mul.f32 %r4714, %r4338, 0f3FB8AA3B; + .loc 1 521 69 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:521:69 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + selp.f32 %r4715, %r4714, 0fFF800000, %p190; + .loc 1 524 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:524:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + mul.f32 %r4716, %r4339, 0f3FB8AA3B; + .loc 1 521 69 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:521:69 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + selp.f32 %r4717, %r4716, 0fFF800000, %p189; + .loc 1 524 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:524:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + mul.f32 %r4718, %r4340, 0f3FB8AA3B; + .loc 1 521 69 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:521:69 @[ 
c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + selp.f32 %r4719, %r4718, 0fFF800000, %p192; + .loc 1 524 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:524:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + mul.f32 %r4720, %r4341, 0f3FB8AA3B; + .loc 1 521 69 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:521:69 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + selp.f32 %r4721, %r4720, 0fFF800000, %p191; + .loc 1 524 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:524:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + mul.f32 %r4722, %r4342, 0f3FB8AA3B; + .loc 1 521 69 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:521:69 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + selp.f32 %r4723, %r4722, 0fFF800000, %p226; + .loc 1 524 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:524:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + mul.f32 %r4724, %r4343, 0f3FB8AA3B; + .loc 1 521 69 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:521:69 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + selp.f32 %r4725, %r4724, 0fFF800000, %p225; + .loc 1 524 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:524:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + mul.f32 %r4726, %r4344, 0f3FB8AA3B; + .loc 1 521 69 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:521:69 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + selp.f32 %r4727, %r4726, 0fFF800000, %p228; + .loc 1 524 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:524:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + mul.f32 %r4728, %r4345, 0f3FB8AA3B; + .loc 1 521 69 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:521:69 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + selp.f32 
%r4729, %r4728, 0fFF800000, %p227; + .loc 1 524 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:524:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + mul.f32 %r4730, %r4346, 0f3FB8AA3B; + .loc 1 521 69 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:521:69 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + selp.f32 %r4731, %r4730, 0fFF800000, %p262; + .loc 1 524 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:524:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + mul.f32 %r4732, %r4347, 0f3FB8AA3B; + .loc 1 521 69 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:521:69 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + selp.f32 %r4733, %r4732, 0fFF800000, %p261; + .loc 1 524 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:524:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + mul.f32 %r4734, %r4348, 0f3FB8AA3B; + .loc 1 521 69 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:521:69 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + selp.f32 %r4735, %r4734, 0fFF800000, %p264; + .loc 1 524 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:524:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + mul.f32 %r4736, %r4349, 0f3FB8AA3B; + .loc 1 521 69 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:521:69 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + selp.f32 %r4737, %r4736, 0fFF800000, %p263; + .loc 1 524 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:524:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + mul.f32 %r4738, %r4350, 0f3FB8AA3B; + .loc 1 521 69 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:521:69 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + selp.f32 %r4739, %r4738, 0fFF800000, %p298; + .loc 1 524 27 // 
c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:524:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + mul.f32 %r4740, %r4351, 0f3FB8AA3B; + .loc 1 521 69 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:521:69 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + selp.f32 %r4741, %r4740, 0fFF800000, %p297; + .loc 1 524 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:524:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + mul.f32 %r4742, %r4352, 0f3FB8AA3B; + .loc 1 521 69 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:521:69 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + selp.f32 %r4743, %r4742, 0fFF800000, %p300; + .loc 1 524 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:524:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + mul.f32 %r4744, %r4353, 0f3FB8AA3B; + .loc 1 521 69 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:521:69 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + selp.f32 %r4745, %r4744, 0fFF800000, %p299; + .loc 1 524 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:524:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + mul.f32 %r4746, %r4354, 0f3FB8AA3B; + .loc 1 521 69 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:521:69 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + selp.f32 %r4747, %r4746, 0fFF800000, %p334; + .loc 1 524 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:524:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + mul.f32 %r4748, %r4355, 0f3FB8AA3B; + .loc 1 521 69 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:521:69 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + selp.f32 %r4749, %r4748, 0fFF800000, %p333; + .loc 1 524 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:524:27 @[ 
c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + mul.f32 %r4750, %r4356, 0f3FB8AA3B; + .loc 1 521 69 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:521:69 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + selp.f32 %r4751, %r4750, 0fFF800000, %p336; + .loc 1 524 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:524:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + mul.f32 %r4752, %r4357, 0f3FB8AA3B; + .loc 1 521 69 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:521:69 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + selp.f32 %r4753, %r4752, 0fFF800000, %p335; + .loc 1 524 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:524:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + mul.f32 %r4754, %r4358, 0f3FB8AA3B; + .loc 1 521 69 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:521:69 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + selp.f32 %r4755, %r4754, 0fFF800000, %p370; + .loc 1 524 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:524:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + mul.f32 %r4756, %r4359, 0f3FB8AA3B; + .loc 1 521 69 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:521:69 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + selp.f32 %r4757, %r4756, 0fFF800000, %p369; + .loc 1 524 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:524:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + mul.f32 %r4758, %r4360, 0f3FB8AA3B; + .loc 1 521 69 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:521:69 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + selp.f32 %r4759, %r4758, 0fFF800000, %p372; + .loc 1 524 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:524:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + mul.f32 
%r4760, %r4361, 0f3FB8AA3B; + .loc 1 521 69 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:521:69 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + selp.f32 %r4761, %r4760, 0fFF800000, %p371; + .loc 1 525 39 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:525:39 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + sub.f32 %r4762, %r4699, %r35; + sub.f32 %r4763, %r4701, %r35; + sub.f32 %r4764, %r4703, %r36; + sub.f32 %r4765, %r4705, %r36; + sub.f32 %r4766, %r4707, %r35; + sub.f32 %r4767, %r4709, %r35; + sub.f32 %r4768, %r4711, %r36; + sub.f32 %r4769, %r4713, %r36; + sub.f32 %r4770, %r4715, %r35; + sub.f32 %r4771, %r4717, %r35; + sub.f32 %r4772, %r4719, %r36; + sub.f32 %r4773, %r4721, %r36; + sub.f32 %r4774, %r4723, %r35; + sub.f32 %r4775, %r4725, %r35; + sub.f32 %r4776, %r4727, %r36; + sub.f32 %r4777, %r4729, %r36; + sub.f32 %r4778, %r4731, %r35; + sub.f32 %r4779, %r4733, %r35; + sub.f32 %r4780, %r4735, %r36; + sub.f32 %r4781, %r4737, %r36; + sub.f32 %r4782, %r4739, %r35; + sub.f32 %r4783, %r4741, %r35; + sub.f32 %r4784, %r4743, %r36; + sub.f32 %r4785, %r4745, %r36; + sub.f32 %r4786, %r4747, %r35; + sub.f32 %r4787, %r4749, %r35; + sub.f32 %r4788, %r4751, %r36; + sub.f32 %r4789, %r4753, %r36; + sub.f32 %r4790, %r4755, %r35; + sub.f32 %r4791, %r4757, %r35; + sub.f32 %r4792, %r4759, %r36; + sub.f32 %r4793, %r4761, %r36; + .loc 1 525 21 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:525:21 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + ex2.approx.ftz.f32 %r4794, %r4762; + ex2.approx.ftz.f32 %r4795, %r4763; + ex2.approx.ftz.f32 %r4796, %r4764; + ex2.approx.ftz.f32 %r4797, %r4765; + ex2.approx.ftz.f32 %r4798, %r4766; + ex2.approx.ftz.f32 %r4799, %r4767; + ex2.approx.ftz.f32 %r4800, %r4768; + ex2.approx.ftz.f32 %r4801, %r4769; + ex2.approx.ftz.f32 %r4802, %r4770; + ex2.approx.ftz.f32 %r4803, %r4771; + ex2.approx.ftz.f32 %r4804, %r4772; + ex2.approx.ftz.f32 %r4805, %r4773; + 
ex2.approx.ftz.f32 %r4806, %r4774; + ex2.approx.ftz.f32 %r4807, %r4775; + ex2.approx.ftz.f32 %r4808, %r4776; + ex2.approx.ftz.f32 %r4809, %r4777; + ex2.approx.ftz.f32 %r4810, %r4778; + ex2.approx.ftz.f32 %r4811, %r4779; + ex2.approx.ftz.f32 %r4812, %r4780; + ex2.approx.ftz.f32 %r4813, %r4781; + ex2.approx.ftz.f32 %r4814, %r4782; + ex2.approx.ftz.f32 %r4815, %r4783; + ex2.approx.ftz.f32 %r4816, %r4784; + ex2.approx.ftz.f32 %r4817, %r4785; + ex2.approx.ftz.f32 %r4818, %r4786; + ex2.approx.ftz.f32 %r4819, %r4787; + ex2.approx.ftz.f32 %r4820, %r4788; + ex2.approx.ftz.f32 %r4821, %r4789; + ex2.approx.ftz.f32 %r4822, %r4790; + ex2.approx.ftz.f32 %r4823, %r4791; + ex2.approx.ftz.f32 %r4824, %r4792; + ex2.approx.ftz.f32 %r4825, %r4793; + .loc 1 831 23 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:831:23 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + add.s32 %r4826, %r2590, 49152; + add.s32 %r3699, %r4826, %r4287; + .loc 1 530 20 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:530:20 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + wgmma.fence.sync.aligned; + add.s32 %r3696, %r2590, 131072; + add.s32 %r4827, %r4291, %r3696; + bfe.u32 %r4828, %r4827, 4, 14; + cvt.u64.u32 %rd379, %r4828; + or.b64 %rd313, %rd379, 4611686293372403712; + bfe.u32 %r4829, %r3699, 4, 14; + cvt.u64.u32 %rd380, %r4829; + or.b64 %rd314, %rd380, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r3280,%r3281,%r3282,%r3283,%r3284,%r3285,%r3286,%r3287,%r3288,%r3289,%r3290,%r3291,%r3292,%r3293,%r3294,%r3295,%r3296,%r3297,%r3298,%r3299,%r3300,%r3301,%r3302,%r3303,%r3304,%r3305,%r3306,%r3307,%r3308,%r3309,%r3310,%r3311}, %rd313, %rd314, 0, 1, 1, 0, 0; + // end inline asm + add.s32 %r4830, %r4295, %r3696; + bfe.u32 %r4831, %r4830, 4, 14; + cvt.u64.u32 %rd381, %r4831; + or.b64 %rd315, %rd381, 4611686293372403712; + add.s32 %r4832, %r3699, 32; + bfe.u32 %r4833, %r4832, 4, 14; + cvt.u64.u32 
%rd382, %r4833; + or.b64 %rd316, %rd382, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r3280,%r3281,%r3282,%r3283,%r3284,%r3285,%r3286,%r3287,%r3288,%r3289,%r3290,%r3291,%r3292,%r3293,%r3294,%r3295,%r3296,%r3297,%r3298,%r3299,%r3300,%r3301,%r3302,%r3303,%r3304,%r3305,%r3306,%r3307,%r3308,%r3309,%r3310,%r3311}, %rd315, %rd316, %p47, 1, 1, 0, 0; + // end inline asm + add.s32 %r4834, %r4300, %r3696; + bfe.u32 %r4835, %r4834, 4, 14; + cvt.u64.u32 %rd383, %r4835; + or.b64 %rd317, %rd383, 4611686293372403712; + add.s32 %r4836, %r3699, 64; + bfe.u32 %r4837, %r4836, 4, 14; + cvt.u64.u32 %rd384, %r4837; + or.b64 %rd318, %rd384, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r3280,%r3281,%r3282,%r3283,%r3284,%r3285,%r3286,%r3287,%r3288,%r3289,%r3290,%r3291,%r3292,%r3293,%r3294,%r3295,%r3296,%r3297,%r3298,%r3299,%r3300,%r3301,%r3302,%r3303,%r3304,%r3305,%r3306,%r3307,%r3308,%r3309,%r3310,%r3311}, %rd317, %rd318, %p47, 1, 1, 0, 0; + // end inline asm + add.s32 %r4838, %r4305, %r3696; + bfe.u32 %r4839, %r4838, 4, 14; + cvt.u64.u32 %rd385, %r4839; + or.b64 %rd319, %rd385, 4611686293372403712; + add.s32 %r4840, %r3699, 96; + bfe.u32 %r4841, %r4840, 4, 14; + cvt.u64.u32 %rd386, %r4841; + or.b64 %rd320, %rd386, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r3280,%r3281,%r3282,%r3283,%r3284,%r3285,%r3286,%r3287,%r3288,%r3289,%r3290,%r3291,%r3292,%r3293,%r3294,%r3295,%r3296,%r3297,%r3298,%r3299,%r3300,%r3301,%r3302,%r3303,%r3304,%r3305,%r3306,%r3307,%r3308,%r3309,%r3310,%r3311}, %rd319, %rd320, %p47, 1, 1, 0, 0; + // end inline asm + add.s32 %r4842, %r4310, %r3696; + bfe.u32 %r4843, %r4842, 4, 14; + cvt.u64.u32 %rd387, %r4843; + or.b64 %rd321, %rd387, 4611686293372403712; + add.s32 %r4844, %r3699, 8192; + bfe.u32 %r4845, %r4844, 4, 14; + cvt.u64.u32 %rd388, %r4845; + or.b64 %rd322, %rd388, 4611686293338849280; + // begin 
inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r3280,%r3281,%r3282,%r3283,%r3284,%r3285,%r3286,%r3287,%r3288,%r3289,%r3290,%r3291,%r3292,%r3293,%r3294,%r3295,%r3296,%r3297,%r3298,%r3299,%r3300,%r3301,%r3302,%r3303,%r3304,%r3305,%r3306,%r3307,%r3308,%r3309,%r3310,%r3311}, %rd321, %rd322, %p47, 1, 1, 0, 0; + // end inline asm + add.s32 %r4846, %r4315, %r3696; + bfe.u32 %r4847, %r4846, 4, 14; + cvt.u64.u32 %rd389, %r4847; + or.b64 %rd323, %rd389, 4611686293372403712; + add.s32 %r4848, %r3699, 8224; + bfe.u32 %r4849, %r4848, 4, 14; + cvt.u64.u32 %rd390, %r4849; + or.b64 %rd324, %rd390, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r3280,%r3281,%r3282,%r3283,%r3284,%r3285,%r3286,%r3287,%r3288,%r3289,%r3290,%r3291,%r3292,%r3293,%r3294,%r3295,%r3296,%r3297,%r3298,%r3299,%r3300,%r3301,%r3302,%r3303,%r3304,%r3305,%r3306,%r3307,%r3308,%r3309,%r3310,%r3311}, %rd323, %rd324, %p47, 1, 1, 0, 0; + // end inline asm + add.s32 %r4850, %r4320, %r3696; + bfe.u32 %r4851, %r4850, 4, 14; + cvt.u64.u32 %rd391, %r4851; + or.b64 %rd325, %rd391, 4611686293372403712; + add.s32 %r4852, %r3699, 8256; + bfe.u32 %r4853, %r4852, 4, 14; + cvt.u64.u32 %rd392, %r4853; + or.b64 %rd326, %rd392, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r3280,%r3281,%r3282,%r3283,%r3284,%r3285,%r3286,%r3287,%r3288,%r3289,%r3290,%r3291,%r3292,%r3293,%r3294,%r3295,%r3296,%r3297,%r3298,%r3299,%r3300,%r3301,%r3302,%r3303,%r3304,%r3305,%r3306,%r3307,%r3308,%r3309,%r3310,%r3311}, %rd325, %rd326, %p47, 1, 1, 0, 0; + // end inline asm + add.s32 %r4854, %r4325, %r3696; + bfe.u32 %r4855, %r4854, 4, 14; + cvt.u64.u32 %rd393, %r4855; + or.b64 %rd327, %rd393, 4611686293372403712; + add.s32 %r4856, %r3699, 8288; + bfe.u32 %r4857, %r4856, 4, 14; + cvt.u64.u32 %rd394, %r4857; + or.b64 %rd328, %rd394, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 
{%r3280,%r3281,%r3282,%r3283,%r3284,%r3285,%r3286,%r3287,%r3288,%r3289,%r3290,%r3291,%r3292,%r3293,%r3294,%r3295,%r3296,%r3297,%r3298,%r3299,%r3300,%r3301,%r3302,%r3303,%r3304,%r3305,%r3306,%r3307,%r3308,%r3309,%r3310,%r3311}, %rd327, %rd328, %p47, 1, 1, 0, 0; + // end inline asm + wgmma.commit_group.sync.aligned; + mov.b32 %r3698, %r3697; + mov.b32 %r3700, %r3697; + mov.b32 %r3701, %r3697; + // begin inline asm + // wait for regs: %r3280,%r3281,%r3282,%r3283,%r3284,%r3285,%r3286,%r3287,%r3288,%r3289,%r3290,%r3291,%r3292,%r3293,%r3294,%r3295,%r3296,%r3297,%r3298,%r3299,%r3300,%r3301,%r3302,%r3303,%r3304,%r3305,%r3306,%r3307,%r3308,%r3309,%r3310,%r3311,%r3696,%r3697,%r3698,%r3699,%r3700,%r3701 + wgmma.wait_group.sync.aligned 0; + // end inline asm + .loc 1 531 19 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:531:19 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + mov.b64 {%r4858, %r4859}, %rd29; + sub.f32 %r4860, %r3280, %r4858; + sub.f32 %r4861, %r3281, %r4859; + .loc 1 531 14 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:531:14 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + mul.f32 %r4862, %r4795, %r4861; + mul.f32 %r4863, %r4794, %r4860; + .loc 1 551 15 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:551:15 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + cvt.rn.bf16.f32 %rs1, %r4863; + cvt.rn.bf16.f32 %rs2, %r4862; + .loc 1 549 43 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:549:43 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + selp.b16 %rs3, %rs2, 0x0000, %p117; + selp.b16 %rs4, %rs1, 0x0000, %p118; + mov.b32 %r3868, {%rs4, %rs3}; + .loc 1 531 19 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:531:19 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + mov.b64 {%r4864, %r4865}, %rd28; + sub.f32 %r4866, %r3282, %r4864; + sub.f32 %r4867, %r3283, %r4865; + .loc 1 531 14 // 
c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:531:14 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + mul.f32 %r4868, %r4797, %r4867; + mul.f32 %r4869, %r4796, %r4866; + .loc 1 551 15 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:551:15 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + cvt.rn.bf16.f32 %rs5, %r4869; + cvt.rn.bf16.f32 %rs6, %r4868; + .loc 1 549 43 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:549:43 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + selp.b16 %rs7, %rs6, 0x0000, %p119; + selp.b16 %rs8, %rs5, 0x0000, %p120; + mov.b32 %r3869, {%rs8, %rs7}; + .loc 1 531 19 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:531:19 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + sub.f32 %r4870, %r3284, %r4858; + sub.f32 %r4871, %r3285, %r4859; + .loc 1 531 14 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:531:14 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + mul.f32 %r4872, %r4799, %r4871; + mul.f32 %r4873, %r4798, %r4870; + .loc 1 551 15 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:551:15 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + cvt.rn.bf16.f32 %rs9, %r4873; + cvt.rn.bf16.f32 %rs10, %r4872; + .loc 1 549 43 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:549:43 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + selp.b16 %rs11, %rs10, 0x0000, %p153; + selp.b16 %rs12, %rs9, 0x0000, %p154; + mov.b32 %r3870, {%rs12, %rs11}; + .loc 1 531 19 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:531:19 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + sub.f32 %r4874, %r3286, %r4864; + sub.f32 %r4875, %r3287, %r4865; + .loc 1 531 14 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:531:14 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + mul.f32 %r4876, %r4801, %r4875; + mul.f32 
%r4877, %r4800, %r4874; + .loc 1 551 15 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:551:15 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + cvt.rn.bf16.f32 %rs13, %r4877; + cvt.rn.bf16.f32 %rs14, %r4876; + .loc 1 549 43 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:549:43 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + selp.b16 %rs15, %rs14, 0x0000, %p155; + selp.b16 %rs16, %rs13, 0x0000, %p156; + mov.b32 %r3871, {%rs16, %rs15}; + .loc 1 531 19 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:531:19 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + sub.f32 %r4878, %r3288, %r4858; + sub.f32 %r4879, %r3289, %r4859; + .loc 1 531 14 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:531:14 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + mul.f32 %r4880, %r4803, %r4879; + mul.f32 %r4881, %r4802, %r4878; + .loc 1 551 15 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:551:15 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + cvt.rn.bf16.f32 %rs17, %r4881; + cvt.rn.bf16.f32 %rs18, %r4880; + .loc 1 549 43 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:549:43 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + selp.b16 %rs19, %rs18, 0x0000, %p189; + selp.b16 %rs20, %rs17, 0x0000, %p190; + mov.b32 %r4000, {%rs20, %rs19}; + .loc 1 531 19 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:531:19 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + sub.f32 %r4882, %r3290, %r4864; + sub.f32 %r4883, %r3291, %r4865; + .loc 1 531 14 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:531:14 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + mul.f32 %r4884, %r4805, %r4883; + mul.f32 %r4885, %r4804, %r4882; + .loc 1 551 15 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:551:15 @[ 
c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + cvt.rn.bf16.f32 %rs21, %r4885; + cvt.rn.bf16.f32 %rs22, %r4884; + .loc 1 549 43 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:549:43 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + selp.b16 %rs23, %rs22, 0x0000, %p191; + selp.b16 %rs24, %rs21, 0x0000, %p192; + mov.b32 %r4001, {%rs24, %rs23}; + .loc 1 531 19 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:531:19 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + sub.f32 %r4886, %r3292, %r4858; + sub.f32 %r4887, %r3293, %r4859; + .loc 1 531 14 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:531:14 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + mul.f32 %r4888, %r4807, %r4887; + mul.f32 %r4889, %r4806, %r4886; + .loc 1 551 15 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:551:15 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + cvt.rn.bf16.f32 %rs25, %r4889; + cvt.rn.bf16.f32 %rs26, %r4888; + .loc 1 549 43 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:549:43 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + selp.b16 %rs27, %rs26, 0x0000, %p225; + selp.b16 %rs28, %rs25, 0x0000, %p226; + mov.b32 %r4002, {%rs28, %rs27}; + .loc 1 531 19 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:531:19 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + sub.f32 %r4890, %r3294, %r4864; + sub.f32 %r4891, %r3295, %r4865; + .loc 1 531 14 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:531:14 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + mul.f32 %r4892, %r4809, %r4891; + mul.f32 %r4893, %r4808, %r4890; + .loc 1 551 15 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:551:15 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + cvt.rn.bf16.f32 %rs29, %r4893; + cvt.rn.bf16.f32 %rs30, %r4892; + .loc 1 549 43 // 
c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:549:43 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + selp.b16 %rs31, %rs30, 0x0000, %p227; + selp.b16 %rs32, %rs29, 0x0000, %p228; + mov.b32 %r4003, {%rs32, %rs31}; + .loc 1 531 19 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:531:19 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + sub.f32 %r4894, %r3296, %r4858; + sub.f32 %r4895, %r3297, %r4859; + .loc 1 531 14 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:531:14 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + mul.f32 %r4896, %r4811, %r4895; + mul.f32 %r4897, %r4810, %r4894; + .loc 1 551 15 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:551:15 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + cvt.rn.bf16.f32 %rs33, %r4897; + cvt.rn.bf16.f32 %rs34, %r4896; + .loc 1 549 43 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:549:43 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + selp.b16 %rs35, %rs34, 0x0000, %p261; + selp.b16 %rs36, %rs33, 0x0000, %p262; + mov.b32 %r4132, {%rs36, %rs35}; + .loc 1 531 19 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:531:19 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + sub.f32 %r4898, %r3298, %r4864; + sub.f32 %r4899, %r3299, %r4865; + .loc 1 531 14 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:531:14 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + mul.f32 %r4900, %r4813, %r4899; + mul.f32 %r4901, %r4812, %r4898; + .loc 1 551 15 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:551:15 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + cvt.rn.bf16.f32 %rs37, %r4901; + cvt.rn.bf16.f32 %rs38, %r4900; + .loc 1 549 43 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:549:43 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + selp.b16 %rs39, %rs38, 0x0000, 
%p263; + selp.b16 %rs40, %rs37, 0x0000, %p264; + mov.b32 %r4133, {%rs40, %rs39}; + .loc 1 531 19 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:531:19 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + sub.f32 %r4902, %r3300, %r4858; + sub.f32 %r4903, %r3301, %r4859; + .loc 1 531 14 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:531:14 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + mul.f32 %r4904, %r4815, %r4903; + mul.f32 %r4905, %r4814, %r4902; + .loc 1 551 15 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:551:15 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + cvt.rn.bf16.f32 %rs41, %r4905; + cvt.rn.bf16.f32 %rs42, %r4904; + .loc 1 549 43 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:549:43 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + selp.b16 %rs43, %rs42, 0x0000, %p297; + selp.b16 %rs44, %rs41, 0x0000, %p298; + mov.b32 %r4134, {%rs44, %rs43}; + .loc 1 531 19 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:531:19 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + sub.f32 %r4906, %r3302, %r4864; + sub.f32 %r4907, %r3303, %r4865; + .loc 1 531 14 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:531:14 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + mul.f32 %r4908, %r4817, %r4907; + mul.f32 %r4909, %r4816, %r4906; + .loc 1 551 15 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:551:15 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + cvt.rn.bf16.f32 %rs45, %r4909; + cvt.rn.bf16.f32 %rs46, %r4908; + .loc 1 549 43 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:549:43 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + selp.b16 %rs47, %rs46, 0x0000, %p299; + selp.b16 %rs48, %rs45, 0x0000, %p300; + mov.b32 %r4135, {%rs48, %rs47}; + .loc 1 531 19 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:531:19 @[ 
c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + sub.f32 %r4910, %r3304, %r4858; + sub.f32 %r4911, %r3305, %r4859; + .loc 1 531 14 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:531:14 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + mul.f32 %r4912, %r4819, %r4911; + mul.f32 %r4913, %r4818, %r4910; + .loc 1 551 15 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:551:15 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + cvt.rn.bf16.f32 %rs49, %r4913; + cvt.rn.bf16.f32 %rs50, %r4912; + .loc 1 549 43 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:549:43 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + selp.b16 %rs51, %rs50, 0x0000, %p333; + selp.b16 %rs52, %rs49, 0x0000, %p334; + mov.b32 %r4264, {%rs52, %rs51}; + .loc 1 531 19 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:531:19 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + sub.f32 %r4914, %r3306, %r4864; + sub.f32 %r4915, %r3307, %r4865; + .loc 1 531 14 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:531:14 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + mul.f32 %r4916, %r4821, %r4915; + mul.f32 %r4917, %r4820, %r4914; + .loc 1 551 15 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:551:15 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + cvt.rn.bf16.f32 %rs53, %r4917; + cvt.rn.bf16.f32 %rs54, %r4916; + .loc 1 549 43 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:549:43 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + selp.b16 %rs55, %rs54, 0x0000, %p335; + selp.b16 %rs56, %rs53, 0x0000, %p336; + mov.b32 %r4265, {%rs56, %rs55}; + .loc 1 531 19 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:531:19 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + sub.f32 %r4918, %r3308, %r4858; + sub.f32 %r4919, %r3309, %r4859; + .loc 1 531 14 // 
c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:531:14 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + mul.f32 %r4920, %r4823, %r4919; + mul.f32 %r4921, %r4822, %r4918; + .loc 1 551 15 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:551:15 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + cvt.rn.bf16.f32 %rs57, %r4921; + cvt.rn.bf16.f32 %rs58, %r4920; + .loc 1 549 43 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:549:43 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + selp.b16 %rs59, %rs58, 0x0000, %p369; + selp.b16 %rs60, %rs57, 0x0000, %p370; + mov.b32 %r4266, {%rs60, %rs59}; + .loc 1 531 19 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:531:19 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + sub.f32 %r4922, %r3310, %r4864; + sub.f32 %r4923, %r3311, %r4865; + .loc 1 531 14 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:531:14 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + mul.f32 %r4924, %r4825, %r4923; + mul.f32 %r4925, %r4824, %r4922; + .loc 1 551 15 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:551:15 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + cvt.rn.bf16.f32 %rs61, %r4925; + cvt.rn.bf16.f32 %rs62, %r4924; + .loc 1 549 43 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:549:43 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + selp.b16 %rs63, %rs62, 0x0000, %p371; + selp.b16 %rs64, %rs61, 0x0000, %p372; + mov.b32 %r4267, {%rs64, %rs63}; + .loc 1 553 21 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:553:21 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + wgmma.fence.sync.aligned; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 
{%r14456,%r14457,%r14458,%r14459,%r14460,%r14461,%r14462,%r14463,%r14464,%r14465,%r14466,%r14467,%r14468,%r14469,%r14470,%r14471,%r14472,%r14473,%r14474,%r14475,%r14476,%r14477,%r14478,%r14479,%r14480,%r14481,%r14482,%r14483,%r14484,%r14485,%r14486,%r14487,%r14488,%r14489,%r14490,%r14491,%r14492,%r14493,%r14494,%r14495,%r14496,%r14497,%r14498,%r14499,%r14500,%r14501,%r14502,%r14503,%r14504,%r14505,%r14506,%r14507,%r14508,%r14509,%r14510,%r14511,%r14512,%r14513,%r14514,%r14515,%r14516,%r14517,%r14518,%r14519}, {%r3868,%r3869,%r3870,%r3871}, %rd298, %p47, 1, 1, 1; + // end inline asm + add.s32 %r4926, %r3181, 2048; + bfe.u32 %r4927, %r4926, 4, 14; + cvt.u64.u32 %rd395, %r4927; + or.b64 %rd330, %rd395, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r14456,%r14457,%r14458,%r14459,%r14460,%r14461,%r14462,%r14463,%r14464,%r14465,%r14466,%r14467,%r14468,%r14469,%r14470,%r14471,%r14472,%r14473,%r14474,%r14475,%r14476,%r14477,%r14478,%r14479,%r14480,%r14481,%r14482,%r14483,%r14484,%r14485,%r14486,%r14487,%r14488,%r14489,%r14490,%r14491,%r14492,%r14493,%r14494,%r14495,%r14496,%r14497,%r14498,%r14499,%r14500,%r14501,%r14502,%r14503,%r14504,%r14505,%r14506,%r14507,%r14508,%r14509,%r14510,%r14511,%r14512,%r14513,%r14514,%r14515,%r14516,%r14517,%r14518,%r14519}, {%r4000,%r4001,%r4002,%r4003}, %rd330, %p47, 1, 1, 1; + // end inline asm + add.s32 %r4928, %r3181, 4096; + bfe.u32 %r4929, %r4928, 4, 14; + cvt.u64.u32 %rd396, %r4929; + or.b64 %rd331, %rd396, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 
{%r14456,%r14457,%r14458,%r14459,%r14460,%r14461,%r14462,%r14463,%r14464,%r14465,%r14466,%r14467,%r14468,%r14469,%r14470,%r14471,%r14472,%r14473,%r14474,%r14475,%r14476,%r14477,%r14478,%r14479,%r14480,%r14481,%r14482,%r14483,%r14484,%r14485,%r14486,%r14487,%r14488,%r14489,%r14490,%r14491,%r14492,%r14493,%r14494,%r14495,%r14496,%r14497,%r14498,%r14499,%r14500,%r14501,%r14502,%r14503,%r14504,%r14505,%r14506,%r14507,%r14508,%r14509,%r14510,%r14511,%r14512,%r14513,%r14514,%r14515,%r14516,%r14517,%r14518,%r14519}, {%r4132,%r4133,%r4134,%r4135}, %rd331, %p47, 1, 1, 1; + // end inline asm + add.s32 %r4930, %r3181, 6144; + bfe.u32 %r4931, %r4930, 4, 14; + cvt.u64.u32 %rd397, %r4931; + or.b64 %rd332, %rd397, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r14456,%r14457,%r14458,%r14459,%r14460,%r14461,%r14462,%r14463,%r14464,%r14465,%r14466,%r14467,%r14468,%r14469,%r14470,%r14471,%r14472,%r14473,%r14474,%r14475,%r14476,%r14477,%r14478,%r14479,%r14480,%r14481,%r14482,%r14483,%r14484,%r14485,%r14486,%r14487,%r14488,%r14489,%r14490,%r14491,%r14492,%r14493,%r14494,%r14495,%r14496,%r14497,%r14498,%r14499,%r14500,%r14501,%r14502,%r14503,%r14504,%r14505,%r14506,%r14507,%r14508,%r14509,%r14510,%r14511,%r14512,%r14513,%r14514,%r14515,%r14516,%r14517,%r14518,%r14519}, {%r4264,%r4265,%r4266,%r4267}, %rd332, %p47, 1, 1, 1; + // end inline asm + wgmma.commit_group.sync.aligned; + .loc 1 417 19 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:417:19 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + add.s32 %r14535, %r14449, %r14535; + add.s32 %r14536, %r14449, %r14536; + add.s32 %r14533, %r14449, %r14533; + add.s32 %r14534, %r14449, %r14534; + add.s32 %r14531, %r14449, %r14531; + add.s32 %r14532, %r14449, %r14532; + add.s32 %r14529, %r14449, %r14529; + add.s32 %r14530, %r14449, %r14530; + add.s32 %r14527, %r14449, %r14527; + add.s32 %r14528, %r14449, %r14528; + add.s32 %r14525, %r14449, %r14525; + add.s32 
%r14526, %r14449, %r14526; + add.s32 %r14523, %r14449, %r14523; + add.s32 %r14524, %r14449, %r14524; + add.s32 %r14521, %r14449, %r14521; + add.s32 %r14522, %r14449, %r14522; + .loc 1 397 28 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:397:28 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + add.s32 %r277, %r14520, 1; + .loc 1 788 33 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:788:33 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + shr.u32 %r4932, %r277, 1; + .loc 1 789 38 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:789:38 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + mad.wide.u32 %rd334, %r4932, 4, %rd233; + .loc 1 789 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:789:24 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + // begin inline asm + mov.u64 %rd333, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd333, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r4268, 0x0; + @%p65 ld.global.L1::evict_last.L2::cache_hint.b32 { %r4268 }, [ %rd334 + 0 ], %rd333; + // end inline asm + .loc 1 790 109 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:790:109 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + add.s32 %r4933, %r4932, 1; + .loc 1 790 113 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:790:113 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + setp.lt.s32 %p373, %r4933, %r2514; + .loc 1 790 55 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:790:55 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + add.s64 %rd337, %rd334, 4; + .loc 1 397 28 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:397:28 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + and.pred %p66, %p65, %p373; + .loc 1 790 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:790:25 @[ 
c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + // begin inline asm + mov.u64 %rd336, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd336, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r4269, 0x0; + @%p66 ld.global.L1::evict_last.L2::cache_hint.b32 { %r4269 }, [ %rd337 + 0 ], %rd336; + // end inline asm + .loc 1 791 35 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:791:35 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + and.b32 %r4934, %r14520, 1; + .loc 1 792 34 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:792:34 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + sub.s32 %r4935, %r4269, %r4268; + .loc 1 792 48 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:792:48 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + shl.b32 %r4936, %r4935, 7; + .loc 1 792 63 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:792:63 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + add.s32 %r4937, %r4936, -64; + .loc 1 793 29 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:793:29 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + xor.b32 %r4938, %r4934, 1; + .loc 1 793 61 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:793:61 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + shl.b32 %r4939, %r4934, 6; + .loc 1 793 42 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:793:42 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + mad.lo.s32 %r14449, %r4937, %r4938, %r4939; + .loc 1 414 28 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:414:28 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + shl.b32 %r4940, %r14449, 7; + .loc 1 414 19 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:414:19 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + mul.wide.s32 %rd398, %r4940, 2; + add.s64 
%rd1184, %rd1184, %rd398; + add.s64 %rd1183, %rd1183, %rd398; + add.s64 %rd1182, %rd1182, %rd398; + add.s64 %rd1181, %rd1181, %rd398; + .loc 1 415 19 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:415:19 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + add.s64 %rd1180, %rd1180, %rd398; + add.s64 %rd1179, %rd1179, %rd398; + add.s64 %rd1178, %rd1178, %rd398; + add.s64 %rd1177, %rd1177, %rd398; + .loc 1 417 19 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:417:19 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + add.s32 %r14455, %r14449, %r14455; + add.s32 %r14454, %r14449, %r14454; + add.s32 %r14453, %r14449, %r14453; + add.s32 %r14452, %r14449, %r14452; + .loc 1 397 28 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:397:28 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + add.s32 %r4941, %r14451, 1; + setp.gt.s32 %p374, %r4941, 2; + selp.b32 %r14451, 0, %r4941, %p374; + .loc 1 831 52 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:831:52 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + setp.lt.s32 %p375, %r14455, %r2359; + setp.lt.s32 %p376, %r14454, %r2359; + setp.lt.s32 %p377, %r14453, %r2359; + setp.lt.s32 %p378, %r14452, %r2359; + .loc 1 831 23 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:831:23 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + shl.b32 %r4942, %r14451, 14; + add.s32 %r4943, %r2590, %r4942; + bar.sync 0; + add.s32 %r4270, %r4943, %r60; + selp.b32 %r4944, 16, 0, %p375; + selp.b32 %r4279, %r4944, 0, %p67; + // begin inline asm + cp.async.cg.shared.global [ %r4270 + 0 ], [ %rd1184 + 0 ], 0x10, %r4279; + // end inline asm + add.s32 %r4272, %r4270, 2048; + selp.b32 %r4945, 16, 0, %p376; + selp.b32 %r4281, %r4945, 0, %p67; + // begin inline asm + cp.async.cg.shared.global [ %r4272 + 0 ], [ %rd1183 + 0 ], 0x10, %r4281; + // end inline asm + add.s32 %r4274, %r4270, 4096; + selp.b32 %r4946, 16, 
0, %p377; + selp.b32 %r4283, %r4946, 0, %p67; + // begin inline asm + cp.async.cg.shared.global [ %r4274 + 0 ], [ %rd1182 + 0 ], 0x10, %r4283; + // end inline asm + add.s32 %r4276, %r4270, 6144; + selp.b32 %r4947, 16, 0, %p378; + selp.b32 %r4285, %r4947, 0, %p67; + // begin inline asm + cp.async.cg.shared.global [ %r4276 + 0 ], [ %rd1181 + 0 ], 0x10, %r4285; + // end inline asm + cp.async.commit_group; + add.s32 %r4948, %r4826, %r4942; + add.s32 %r4278, %r4948, %r60; + // begin inline asm + cp.async.cg.shared.global [ %r4278 + 0 ], [ %rd1180 + 0 ], 0x10, %r4279; + // end inline asm + add.s32 %r4280, %r4278, 2048; + // begin inline asm + cp.async.cg.shared.global [ %r4280 + 0 ], [ %rd1179 + 0 ], 0x10, %r4281; + // end inline asm + add.s32 %r4282, %r4278, 4096; + // begin inline asm + cp.async.cg.shared.global [ %r4282 + 0 ], [ %rd1178 + 0 ], 0x10, %r4283; + // end inline asm + add.s32 %r4284, %r4278, 6144; + // begin inline asm + cp.async.cg.shared.global [ %r4284 + 0 ], [ %rd1177 + 0 ], 0x10, %r4285; + // end inline asm + cp.async.commit_group; + .loc 1 397 28 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:397:28 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + setp.ne.b32 %p379, %r99, %r277; + mov.b32 %r14520, %r277; + @%p379 bra $L__BB0_3; +$L__tmp28: +$L__BB0_4: // %._crit_edge + .loc 1 0 0 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:0 + add.s64 %rd4, %rd198, %rd253; +$L__tmp29: + cvt.s64.s32 %rd5, %r2571; + cvt.s64.s32 %rd6, %r2572; + cvt.s64.s32 %rd7, %r2573; + cvt.s64.s32 %rd8, %r2574; + cvt.s64.s32 %rd9, %r2575; + cvt.s64.s32 %rd10, %r2576; + cvt.s64.s32 %rd11, %r2577; + cvt.s64.s32 %rd12, %r2578; +$L__tmp30: + .loc 1 397 28 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:397:28 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:207:12 ] + // begin inline asm + // wait for regs: 
%r14456,%r14457,%r14458,%r14459,%r14460,%r14461,%r14462,%r14463,%r14464,%r14465,%r14466,%r14467,%r14468,%r14469,%r14470,%r14471,%r14472,%r14473,%r14474,%r14475,%r14476,%r14477,%r14478,%r14479,%r14480,%r14481,%r14482,%r14483,%r14484,%r14485,%r14486,%r14487,%r14488,%r14489,%r14490,%r14491,%r14492,%r14493,%r14494,%r14495,%r14496,%r14497,%r14498,%r14499,%r14500,%r14501,%r14502,%r14503,%r14504,%r14505,%r14506,%r14507,%r14508,%r14509,%r14510,%r14511,%r14512,%r14513,%r14514,%r14515,%r14516,%r14517,%r14518,%r14519 + wgmma.wait_group.sync.aligned 0; + // end inline asm + cp.async.wait_group 0; + bar.sync 0; +$L__tmp31: + .loc 1 214 39 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:214:39 + shl.b64 %rd417, %rd14, 2; + add.s64 %rd399, %rd204, %rd417; + .loc 1 215 31 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:215:31 + // begin inline asm + mov.u32 %r5077, 0x0; + ld.global.b32 { %r5077 }, [ %rd399 + 0 ]; + // end inline asm + .loc 1 215 45 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:215:45 + shl.b32 %r348, %r5077, 7; + .loc 1 216 62 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:216:62 + shl.b64 %rd418, %rd16, 2; + add.s64 %rd400, %rd203, %rd418; + .loc 1 216 43 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:216:43 + // begin inline asm + mov.u32 %r5078, 0x0; + ld.global.b32 { %r5078 }, [ %rd400 + 0 ]; + // end inline asm + .loc 1 218 33 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:218:33 + or.b32 %r5111, %r348, %r13; + or.b32 %r5112, %r348, %r14; + or.b32 %r5113, %r348, %r15; + or.b32 %r5114, %r348, %r16; +$L__tmp32: + .loc 1 390 37 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:390:37 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + shl.b32 %r5115, %r5111, 7; + shl.b32 %r5116, %r5112, 7; + shl.b32 %r5117, %r5113, 7; + shl.b32 %r5118, %r5114, 7; + .loc 1 390 18 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:390:18 @[ 
c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + mul.wide.s32 %rd419, %r5115, 2; + add.s64 %rd420, %rd1, %rd419; + mul.wide.s32 %rd421, %r5116, 2; + add.s64 %rd422, %rd1, %rd421; + mul.wide.s32 %rd423, %r5117, 2; + add.s64 %rd424, %rd1, %rd423; + mul.wide.s32 %rd425, %r5118, 2; + add.s64 %rd426, %rd1, %rd425; + .loc 1 390 49 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:390:49 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + shl.b64 %rd427, %rd13, 1; + add.s64 %rd401, %rd420, %rd427; + add.s64 %rd402, %rd422, %rd427; + add.s64 %rd403, %rd424, %rd427; + add.s64 %rd404, %rd426, %rd427; + .loc 1 391 18 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:391:18 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + add.s64 %rd428, %rd2, %rd419; + add.s64 %rd429, %rd2, %rd421; + add.s64 %rd430, %rd2, %rd423; + add.s64 %rd431, %rd2, %rd425; + .loc 1 391 49 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:391:49 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + add.s64 %rd405, %rd428, %rd427; + add.s64 %rd406, %rd429, %rd427; + add.s64 %rd407, %rd430, %rd427; + add.s64 %rd408, %rd431, %rd427; + .loc 1 395 43 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:395:43 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + shl.b32 %r5119, %r5078, 1; + .loc 1 395 63 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:395:63 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + min.s32 %r350, %r5119, %r58; + .loc 1 397 28 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:397:28 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + setp.lt.s32 %p380, %r5119, 1; + setp.gt.s32 %p381, %r5119, 0; + .loc 1 831 52 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:831:52 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + setp.lt.s32 %p382, %r5111, %r2359; + setp.lt.s32 %p383, %r5112, 
%r2359; + setp.lt.s32 %p384, %r5113, %r2359; + setp.lt.s32 %p385, %r5114, %r2359; + .loc 1 831 23 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:831:23 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + selp.b32 %r5120, 16, 0, %p382; + selp.b32 %r5088, %r5120, 0, %p381; + // begin inline asm + cp.async.cg.shared.global [ %r5079 + 0 ], [ %rd401 + 0 ], 0x10, %r5088; + // end inline asm + selp.b32 %r5121, 16, 0, %p383; + selp.b32 %r5090, %r5121, 0, %p381; + // begin inline asm + cp.async.cg.shared.global [ %r5081 + 0 ], [ %rd402 + 0 ], 0x10, %r5090; + // end inline asm + selp.b32 %r5122, 16, 0, %p384; + selp.b32 %r5092, %r5122, 0, %p381; + // begin inline asm + cp.async.cg.shared.global [ %r5083 + 0 ], [ %rd403 + 0 ], 0x10, %r5092; + // end inline asm + selp.b32 %r5123, 16, 0, %p385; + selp.b32 %r5094, %r5123, 0, %p381; + // begin inline asm + cp.async.cg.shared.global [ %r5085 + 0 ], [ %rd404 + 0 ], 0x10, %r5094; + // end inline asm + cp.async.commit_group; + // begin inline asm + cp.async.cg.shared.global [ %r2523 + 0 ], [ %rd405 + 0 ], 0x10, %r5088; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r2525 + 0 ], [ %rd406 + 0 ], 0x10, %r5090; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r2527 + 0 ], [ %rd407 + 0 ], 0x10, %r5092; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r2529 + 0 ], [ %rd408 + 0 ], 0x10, %r5094; + // end inline asm + cp.async.commit_group; + .loc 1 397 28 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:397:28 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + setp.gt.s32 %p386, %r350, 1; + .loc 1 414 19 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:414:19 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + add.s64 %rd1192, %rd401, 16384; + add.s64 %rd1191, %rd402, 16384; + add.s64 %rd1190, %rd403, 16384; + add.s64 %rd1189, %rd404, 16384; + .loc 1 415 19 // 
c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:415:19 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + add.s64 %rd1188, %rd405, 16384; + add.s64 %rd1187, %rd406, 16384; + add.s64 %rd1186, %rd407, 16384; + add.s64 %rd1185, %rd408, 16384; + .loc 1 417 19 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:417:19 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + or.b32 %r14607, %r5111, 64; + or.b32 %r14606, %r5112, 64; + or.b32 %r14605, %r5113, 64; + or.b32 %r14604, %r5114, 64; + .loc 1 831 52 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:831:52 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + setp.lt.s32 %p387, %r14607, %r2359; + setp.lt.s32 %p388, %r14606, %r2359; + setp.lt.s32 %p389, %r14605, %r2359; + setp.lt.s32 %p390, %r14604, %r2359; + .loc 1 831 23 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:831:23 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + bar.sync 0; + selp.b32 %r5124, 16, 0, %p387; + selp.b32 %r5104, %r5124, 0, %p386; + // begin inline asm + cp.async.cg.shared.global [ %r2531 + 0 ], [ %rd1192 + 0 ], 0x10, %r5104; + // end inline asm + selp.b32 %r5125, 16, 0, %p388; + selp.b32 %r5106, %r5125, 0, %p386; + // begin inline asm + cp.async.cg.shared.global [ %r2533 + 0 ], [ %rd1191 + 0 ], 0x10, %r5106; + // end inline asm + selp.b32 %r5126, 16, 0, %p389; + selp.b32 %r5108, %r5126, 0, %p386; + // begin inline asm + cp.async.cg.shared.global [ %r2535 + 0 ], [ %rd1190 + 0 ], 0x10, %r5108; + // end inline asm + selp.b32 %r5127, 16, 0, %p390; + selp.b32 %r5110, %r5127, 0, %p386; + // begin inline asm + cp.async.cg.shared.global [ %r2537 + 0 ], [ %rd1189 + 0 ], 0x10, %r5110; + // end inline asm + cp.async.commit_group; + // begin inline asm + cp.async.cg.shared.global [ %r2539 + 0 ], [ %rd1188 + 0 ], 0x10, %r5104; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r2541 + 0 ], [ %rd1187 + 0 ], 0x10, %r5106; + // end 
inline asm + // begin inline asm + cp.async.cg.shared.global [ %r2543 + 0 ], [ %rd1186 + 0 ], 0x10, %r5108; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r2545 + 0 ], [ %rd1185 + 0 ], 0x10, %r5110; + // end inline asm + cp.async.commit_group; + .loc 1 459 19 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:459:19 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + // begin inline asm + fence.proxy.async.shared::cta; + // end inline asm + .loc 1 397 28 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:397:28 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + @%p380 bra $L__BB0_7; +$L__tmp33: +// %bb.5: // %.lr.ph1592 + .loc 1 218 33 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:218:33 + or.b32 %r14688, %r348, %r42; + or.b32 %r14687, %r348, %r41; + or.b32 %r14686, %r348, %r44; + or.b32 %r14681, %r348, %r46; + or.b32 %r14673, %r348, %r50; + or.b32 %r14685, %r348, %r45; + or.b32 %r14682, %r348, %r47; + or.b32 %r14674, %r348, %r51; + or.b32 %r14683, %r348, %r48; + or.b32 %r14675, %r348, %r52; + or.b32 %r14684, %r348, %r49; + or.b32 %r14676, %r348, %r53; + or.b32 %r14677, %r348, %r54; + or.b32 %r14678, %r348, %r55; + or.b32 %r14679, %r348, %r56; + or.b32 %r14680, %r348, %r57; + add.s32 %r435, %r350, -2; + add.s32 %r436, %r350, -1; +$L__tmp34: + .loc 1 397 28 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:397:28 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + max.s32 %r437, %r350, 1; + mov.b32 %r5683, 0; + mov.b32 %r14603, 1; + mov.b32 %r14602, -1; + mov.b32 %r14601, 64; + mov.b32 %r14672, %r5683; +$L__BB0_6: // %__nv_exp2f.exit1338 + // =>This Inner Loop Header: Depth=1 + setp.lt.s32 %p411, %r14672, %r435; + setp.lt.s32 %p409, %r14672, %r436; + add.s32 %r6790, %r14602, 1; + setp.gt.s32 %p412, %r6790, 2; + selp.b32 %r14602, 0, %r6790, %p412; + .loc 1 831 52 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:831:52 @[ 
c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + setp.lt.s32 %p413, %r14688, %r2359; + setp.lt.s32 %p414, %r14687, %r2359; + setp.lt.s32 %p415, %r14686, %r2359; + setp.lt.s32 %p416, %r14685, %r2359; + setp.lt.s32 %p417, %r14684, %r2359; + setp.lt.s32 %p418, %r14683, %r2359; + setp.lt.s32 %p419, %r14682, %r2359; + setp.lt.s32 %p420, %r14681, %r2359; + setp.lt.s32 %p421, %r14680, %r2359; + setp.lt.s32 %p422, %r14679, %r2359; + setp.lt.s32 %p423, %r14678, %r2359; + setp.lt.s32 %p424, %r14677, %r2359; + setp.lt.s32 %p425, %r14676, %r2359; + setp.lt.s32 %p426, %r14675, %r2359; + setp.lt.s32 %p427, %r14674, %r2359; + setp.lt.s32 %p428, %r14673, %r2359; + .loc 1 831 23 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:831:23 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + cp.async.wait_group 2; + bar.sync 0; + shl.b32 %r6791, %r14602, 14; + add.s32 %r5685, %r2590, %r6791; + .loc 1 459 19 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:459:19 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + shfl.sync.idx.b32 %r6793, %r11, 0, 31, -1; + wgmma.fence.sync.aligned; + shl.b32 %r6794, %r6793, 11; + and.b32 %r6795, %r6794, 8192; + add.s32 %r5644, %r2590, 98304; + add.s32 %r6796, %r6795, %r5644; + bfe.u32 %r6797, %r6796, 4, 14; + cvt.u64.u32 %rd482, %r6797; + or.b64 %rd432, %rd482, 4611686293372403712; + bfe.u32 %r6798, %r5685, 4, 14; + cvt.u64.u32 %rd483, %r6798; + or.b64 %rd433, %rd483, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r5228,%r5229,%r5230,%r5231,%r5232,%r5233,%r5234,%r5235,%r5236,%r5237,%r5238,%r5239,%r5240,%r5241,%r5242,%r5243,%r5244,%r5245,%r5246,%r5247,%r5248,%r5249,%r5250,%r5251,%r5252,%r5253,%r5254,%r5255,%r5256,%r5257,%r5258,%r5259}, %rd432, %rd433, 0, 1, 1, 0, 0; + // end inline asm + or.b32 %r6799, %r6795, 32; + add.s32 %r6800, %r6799, %r5644; + bfe.u32 %r6801, %r6800, 4, 14; + cvt.u64.u32 %rd484, %r6801; + or.b64 %rd434, 
%rd484, 4611686293372403712; + add.s32 %r6802, %r5685, 32; + bfe.u32 %r6803, %r6802, 4, 14; + cvt.u64.u32 %rd485, %r6803; + or.b64 %rd435, %rd485, 4611686293338849280; + mov.pred %p391, -1; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r5228,%r5229,%r5230,%r5231,%r5232,%r5233,%r5234,%r5235,%r5236,%r5237,%r5238,%r5239,%r5240,%r5241,%r5242,%r5243,%r5244,%r5245,%r5246,%r5247,%r5248,%r5249,%r5250,%r5251,%r5252,%r5253,%r5254,%r5255,%r5256,%r5257,%r5258,%r5259}, %rd434, %rd435, %p391, 1, 1, 0, 0; + // end inline asm + or.b32 %r6804, %r6795, 64; + add.s32 %r6805, %r6804, %r5644; + bfe.u32 %r6806, %r6805, 4, 14; + cvt.u64.u32 %rd486, %r6806; + or.b64 %rd436, %rd486, 4611686293372403712; + add.s32 %r6807, %r5685, 64; + bfe.u32 %r6808, %r6807, 4, 14; + cvt.u64.u32 %rd487, %r6808; + or.b64 %rd437, %rd487, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r5228,%r5229,%r5230,%r5231,%r5232,%r5233,%r5234,%r5235,%r5236,%r5237,%r5238,%r5239,%r5240,%r5241,%r5242,%r5243,%r5244,%r5245,%r5246,%r5247,%r5248,%r5249,%r5250,%r5251,%r5252,%r5253,%r5254,%r5255,%r5256,%r5257,%r5258,%r5259}, %rd436, %rd437, %p391, 1, 1, 0, 0; + // end inline asm + or.b32 %r6809, %r6795, 96; + add.s32 %r6810, %r6809, %r5644; + bfe.u32 %r6811, %r6810, 4, 14; + cvt.u64.u32 %rd488, %r6811; + or.b64 %rd438, %rd488, 4611686293372403712; + add.s32 %r6812, %r5685, 96; + bfe.u32 %r6813, %r6812, 4, 14; + cvt.u64.u32 %rd489, %r6813; + or.b64 %rd439, %rd489, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r5228,%r5229,%r5230,%r5231,%r5232,%r5233,%r5234,%r5235,%r5236,%r5237,%r5238,%r5239,%r5240,%r5241,%r5242,%r5243,%r5244,%r5245,%r5246,%r5247,%r5248,%r5249,%r5250,%r5251,%r5252,%r5253,%r5254,%r5255,%r5256,%r5257,%r5258,%r5259}, %rd438, %rd439, %p391, 1, 1, 0, 0; + // end inline asm + or.b32 %r6814, %r6795, 16384; + add.s32 %r6815, %r6814, %r5644; + bfe.u32 %r6816, %r6815, 4, 14; + 
cvt.u64.u32 %rd490, %r6816; + or.b64 %rd440, %rd490, 4611686293372403712; + add.s32 %r6817, %r5685, 8192; + bfe.u32 %r6818, %r6817, 4, 14; + cvt.u64.u32 %rd491, %r6818; + or.b64 %rd441, %rd491, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r5228,%r5229,%r5230,%r5231,%r5232,%r5233,%r5234,%r5235,%r5236,%r5237,%r5238,%r5239,%r5240,%r5241,%r5242,%r5243,%r5244,%r5245,%r5246,%r5247,%r5248,%r5249,%r5250,%r5251,%r5252,%r5253,%r5254,%r5255,%r5256,%r5257,%r5258,%r5259}, %rd440, %rd441, %p391, 1, 1, 0, 0; + // end inline asm + or.b32 %r6819, %r6795, 16416; + add.s32 %r6820, %r6819, %r5644; + bfe.u32 %r6821, %r6820, 4, 14; + cvt.u64.u32 %rd492, %r6821; + or.b64 %rd442, %rd492, 4611686293372403712; + add.s32 %r6822, %r5685, 8224; + bfe.u32 %r6823, %r6822, 4, 14; + cvt.u64.u32 %rd493, %r6823; + or.b64 %rd443, %rd493, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r5228,%r5229,%r5230,%r5231,%r5232,%r5233,%r5234,%r5235,%r5236,%r5237,%r5238,%r5239,%r5240,%r5241,%r5242,%r5243,%r5244,%r5245,%r5246,%r5247,%r5248,%r5249,%r5250,%r5251,%r5252,%r5253,%r5254,%r5255,%r5256,%r5257,%r5258,%r5259}, %rd442, %rd443, %p391, 1, 1, 0, 0; + // end inline asm + or.b32 %r6824, %r6795, 16448; + add.s32 %r6825, %r6824, %r5644; + bfe.u32 %r6826, %r6825, 4, 14; + cvt.u64.u32 %rd494, %r6826; + or.b64 %rd444, %rd494, 4611686293372403712; + add.s32 %r6827, %r5685, 8256; + bfe.u32 %r6828, %r6827, 4, 14; + cvt.u64.u32 %rd495, %r6828; + or.b64 %rd445, %rd495, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r5228,%r5229,%r5230,%r5231,%r5232,%r5233,%r5234,%r5235,%r5236,%r5237,%r5238,%r5239,%r5240,%r5241,%r5242,%r5243,%r5244,%r5245,%r5246,%r5247,%r5248,%r5249,%r5250,%r5251,%r5252,%r5253,%r5254,%r5255,%r5256,%r5257,%r5258,%r5259}, %rd444, %rd445, %p391, 1, 1, 0, 0; + // end inline asm + or.b32 %r6829, %r6795, 16480; + add.s32 %r6830, %r6829, %r5644; + bfe.u32 
%r6831, %r6830, 4, 14; + cvt.u64.u32 %rd496, %r6831; + or.b64 %rd446, %rd496, 4611686293372403712; + add.s32 %r6832, %r5685, 8288; + bfe.u32 %r6833, %r6832, 4, 14; + cvt.u64.u32 %rd497, %r6833; + or.b64 %rd447, %rd497, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r5228,%r5229,%r5230,%r5231,%r5232,%r5233,%r5234,%r5235,%r5236,%r5237,%r5238,%r5239,%r5240,%r5241,%r5242,%r5243,%r5244,%r5245,%r5246,%r5247,%r5248,%r5249,%r5250,%r5251,%r5252,%r5253,%r5254,%r5255,%r5256,%r5257,%r5258,%r5259}, %rd446, %rd447, %p391, 1, 1, 0, 0; + // end inline asm + wgmma.commit_group.sync.aligned; + mov.b32 %r5647, %r5685; + mov.b32 %r5645, %r5683; + mov.b32 %r5646, %r5683; + mov.b32 %r5648, %r5683; + mov.b32 %r5649, %r5683; + // begin inline asm + // wait for regs: %r5228,%r5229,%r5230,%r5231,%r5232,%r5233,%r5234,%r5235,%r5236,%r5237,%r5238,%r5239,%r5240,%r5241,%r5242,%r5243,%r5244,%r5245,%r5246,%r5247,%r5248,%r5249,%r5250,%r5251,%r5252,%r5253,%r5254,%r5255,%r5256,%r5257,%r5258,%r5259,%r5644,%r5645,%r5646,%r5647,%r5648,%r5649 + wgmma.wait_group.sync.aligned 0; + // end inline asm + .loc 1 461 14 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:461:14 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + mul.f32 %r6834, %r5228, 0f3DB504F3; + mul.f32 %r6835, %r5229, 0f3DB504F3; + mul.f32 %r6836, %r5230, 0f3DB504F3; + mul.f32 %r6837, %r5231, 0f3DB504F3; + mul.f32 %r6838, %r5232, 0f3DB504F3; + mul.f32 %r6839, %r5233, 0f3DB504F3; + mul.f32 %r6840, %r5234, 0f3DB504F3; + mul.f32 %r6841, %r5235, 0f3DB504F3; + mul.f32 %r6842, %r5236, 0f3DB504F3; + mul.f32 %r6843, %r5237, 0f3DB504F3; + mul.f32 %r6844, %r5238, 0f3DB504F3; + mul.f32 %r6845, %r5239, 0f3DB504F3; + mul.f32 %r6846, %r5240, 0f3DB504F3; + mul.f32 %r6847, %r5241, 0f3DB504F3; + mul.f32 %r6848, %r5242, 0f3DB504F3; + mul.f32 %r6849, %r5243, 0f3DB504F3; + mul.f32 %r6850, %r5244, 0f3DB504F3; + mul.f32 %r6851, %r5245, 0f3DB504F3; + mul.f32 %r6852, %r5246, 0f3DB504F3; 
+ mul.f32 %r6853, %r5247, 0f3DB504F3; + mul.f32 %r6854, %r5248, 0f3DB504F3; + mul.f32 %r6855, %r5249, 0f3DB504F3; + mul.f32 %r6856, %r5250, 0f3DB504F3; + mul.f32 %r6857, %r5251, 0f3DB504F3; + mul.f32 %r6858, %r5252, 0f3DB504F3; + mul.f32 %r6859, %r5253, 0f3DB504F3; + mul.f32 %r6860, %r5254, 0f3DB504F3; + mul.f32 %r6861, %r5255, 0f3DB504F3; + mul.f32 %r6862, %r5256, 0f3DB504F3; + mul.f32 %r6863, %r5257, 0f3DB504F3; + mul.f32 %r6864, %r5258, 0f3DB504F3; + mul.f32 %r6865, %r5259, 0f3DB504F3; + .loc 1 524 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:524:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + mul.f32 %r6866, %r6834, 0f3FB8AA3B; + .loc 1 476 79 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:476:79 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + selp.f32 %r6867, %r6866, 0fFF800000, %p413; + .loc 1 524 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:524:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + mul.f32 %r6868, %r6835, 0f3FB8AA3B; + .loc 1 476 79 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:476:79 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + selp.f32 %r6869, %r6868, 0fFF800000, %p414; + .loc 1 524 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:524:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + mul.f32 %r6870, %r6836, 0f3FB8AA3B; + .loc 1 476 79 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:476:79 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + selp.f32 %r6871, %r6870, 0fFF800000, %p413; + .loc 1 524 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:524:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + mul.f32 %r6872, %r6837, 0f3FB8AA3B; + .loc 1 476 79 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:476:79 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + selp.f32 
%r6873, %r6872, 0fFF800000, %p414; + .loc 1 524 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:524:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + mul.f32 %r6874, %r6838, 0f3FB8AA3B; + .loc 1 476 79 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:476:79 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + selp.f32 %r6875, %r6874, 0fFF800000, %p415; + .loc 1 524 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:524:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + mul.f32 %r6876, %r6839, 0f3FB8AA3B; + .loc 1 476 79 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:476:79 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + selp.f32 %r6877, %r6876, 0fFF800000, %p416; + .loc 1 524 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:524:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + mul.f32 %r6878, %r6840, 0f3FB8AA3B; + .loc 1 476 79 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:476:79 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + selp.f32 %r6879, %r6878, 0fFF800000, %p415; + .loc 1 524 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:524:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + mul.f32 %r6880, %r6841, 0f3FB8AA3B; + .loc 1 476 79 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:476:79 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + selp.f32 %r6881, %r6880, 0fFF800000, %p416; + .loc 1 524 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:524:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + mul.f32 %r6882, %r6842, 0f3FB8AA3B; + .loc 1 476 79 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:476:79 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + selp.f32 %r6883, %r6882, 0fFF800000, %p417; + .loc 1 524 27 // 
c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:524:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + mul.f32 %r6884, %r6843, 0f3FB8AA3B; + .loc 1 476 79 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:476:79 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + selp.f32 %r6885, %r6884, 0fFF800000, %p418; + .loc 1 524 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:524:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + mul.f32 %r6886, %r6844, 0f3FB8AA3B; + .loc 1 476 79 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:476:79 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + selp.f32 %r6887, %r6886, 0fFF800000, %p417; + .loc 1 524 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:524:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + mul.f32 %r6888, %r6845, 0f3FB8AA3B; + .loc 1 476 79 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:476:79 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + selp.f32 %r6889, %r6888, 0fFF800000, %p418; + .loc 1 524 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:524:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + mul.f32 %r6890, %r6846, 0f3FB8AA3B; + .loc 1 476 79 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:476:79 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + selp.f32 %r6891, %r6890, 0fFF800000, %p419; + .loc 1 524 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:524:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + mul.f32 %r6892, %r6847, 0f3FB8AA3B; + .loc 1 476 79 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:476:79 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + selp.f32 %r6893, %r6892, 0fFF800000, %p420; + .loc 1 524 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:524:27 @[ 
c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + mul.f32 %r6894, %r6848, 0f3FB8AA3B; + .loc 1 476 79 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:476:79 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + selp.f32 %r6895, %r6894, 0fFF800000, %p419; + .loc 1 524 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:524:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + mul.f32 %r6896, %r6849, 0f3FB8AA3B; + .loc 1 476 79 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:476:79 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + selp.f32 %r6897, %r6896, 0fFF800000, %p420; + .loc 1 524 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:524:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + mul.f32 %r6898, %r6850, 0f3FB8AA3B; + .loc 1 476 79 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:476:79 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + selp.f32 %r6899, %r6898, 0fFF800000, %p421; + .loc 1 524 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:524:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + mul.f32 %r6900, %r6851, 0f3FB8AA3B; + .loc 1 476 79 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:476:79 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + selp.f32 %r6901, %r6900, 0fFF800000, %p422; + .loc 1 524 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:524:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + mul.f32 %r6902, %r6852, 0f3FB8AA3B; + .loc 1 476 79 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:476:79 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + selp.f32 %r6903, %r6902, 0fFF800000, %p421; + .loc 1 524 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:524:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + mul.f32 
%r6904, %r6853, 0f3FB8AA3B; + .loc 1 476 79 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:476:79 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + selp.f32 %r6905, %r6904, 0fFF800000, %p422; + .loc 1 524 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:524:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + mul.f32 %r6906, %r6854, 0f3FB8AA3B; + .loc 1 476 79 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:476:79 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + selp.f32 %r6907, %r6906, 0fFF800000, %p423; + .loc 1 524 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:524:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + mul.f32 %r6908, %r6855, 0f3FB8AA3B; + .loc 1 476 79 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:476:79 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + selp.f32 %r6909, %r6908, 0fFF800000, %p424; + .loc 1 524 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:524:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + mul.f32 %r6910, %r6856, 0f3FB8AA3B; + .loc 1 476 79 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:476:79 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + selp.f32 %r6911, %r6910, 0fFF800000, %p423; + .loc 1 524 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:524:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + mul.f32 %r6912, %r6857, 0f3FB8AA3B; + .loc 1 476 79 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:476:79 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + selp.f32 %r6913, %r6912, 0fFF800000, %p424; + .loc 1 524 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:524:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + mul.f32 %r6914, %r6858, 0f3FB8AA3B; + .loc 1 476 79 // 
c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:476:79 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + selp.f32 %r6915, %r6914, 0fFF800000, %p425; + .loc 1 524 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:524:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + mul.f32 %r6916, %r6859, 0f3FB8AA3B; + .loc 1 476 79 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:476:79 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + selp.f32 %r6917, %r6916, 0fFF800000, %p426; + .loc 1 524 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:524:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + mul.f32 %r6918, %r6860, 0f3FB8AA3B; + .loc 1 476 79 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:476:79 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + selp.f32 %r6919, %r6918, 0fFF800000, %p425; + .loc 1 524 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:524:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + mul.f32 %r6920, %r6861, 0f3FB8AA3B; + .loc 1 476 79 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:476:79 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + selp.f32 %r6921, %r6920, 0fFF800000, %p426; + .loc 1 524 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:524:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + mul.f32 %r6922, %r6862, 0f3FB8AA3B; + .loc 1 476 79 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:476:79 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + selp.f32 %r6923, %r6922, 0fFF800000, %p427; + .loc 1 524 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:524:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + mul.f32 %r6924, %r6863, 0f3FB8AA3B; + .loc 1 476 79 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:476:79 @[ 
c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + selp.f32 %r6925, %r6924, 0fFF800000, %p428; + .loc 1 524 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:524:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + mul.f32 %r6926, %r6864, 0f3FB8AA3B; + .loc 1 476 79 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:476:79 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + selp.f32 %r6927, %r6926, 0fFF800000, %p427; + .loc 1 524 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:524:27 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + mul.f32 %r6928, %r6865, 0f3FB8AA3B; + .loc 1 476 79 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:476:79 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + selp.f32 %r6929, %r6928, 0fFF800000, %p428; + .loc 1 525 39 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:525:39 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + sub.f32 %r6930, %r6867, %r35; + sub.f32 %r6931, %r6869, %r35; + sub.f32 %r6932, %r6871, %r36; + sub.f32 %r6933, %r6873, %r36; + sub.f32 %r6934, %r6875, %r35; + sub.f32 %r6935, %r6877, %r35; + sub.f32 %r6936, %r6879, %r36; + sub.f32 %r6937, %r6881, %r36; + sub.f32 %r6938, %r6883, %r35; + sub.f32 %r6939, %r6885, %r35; + sub.f32 %r6940, %r6887, %r36; + sub.f32 %r6941, %r6889, %r36; + sub.f32 %r6942, %r6891, %r35; + sub.f32 %r6943, %r6893, %r35; + sub.f32 %r6944, %r6895, %r36; + sub.f32 %r6945, %r6897, %r36; + sub.f32 %r6946, %r6899, %r35; + sub.f32 %r6947, %r6901, %r35; + sub.f32 %r6948, %r6903, %r36; + sub.f32 %r6949, %r6905, %r36; + sub.f32 %r6950, %r6907, %r35; + sub.f32 %r6951, %r6909, %r35; + sub.f32 %r6952, %r6911, %r36; + sub.f32 %r6953, %r6913, %r36; + sub.f32 %r6954, %r6915, %r35; + sub.f32 %r6955, %r6917, %r35; + sub.f32 %r6956, %r6919, %r36; + sub.f32 %r6957, %r6921, %r36; + sub.f32 %r6958, %r6923, %r35; + sub.f32 %r6959, %r6925, %r35; + sub.f32 
%r6960, %r6927, %r36; + sub.f32 %r6961, %r6929, %r36; + .loc 1 525 21 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:525:21 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + ex2.approx.ftz.f32 %r6962, %r6930; + ex2.approx.ftz.f32 %r6963, %r6931; + ex2.approx.ftz.f32 %r6964, %r6932; + ex2.approx.ftz.f32 %r6965, %r6933; + ex2.approx.ftz.f32 %r6966, %r6934; + ex2.approx.ftz.f32 %r6967, %r6935; + ex2.approx.ftz.f32 %r6968, %r6936; + ex2.approx.ftz.f32 %r6969, %r6937; + ex2.approx.ftz.f32 %r6970, %r6938; + ex2.approx.ftz.f32 %r6971, %r6939; + ex2.approx.ftz.f32 %r6972, %r6940; + ex2.approx.ftz.f32 %r6973, %r6941; + ex2.approx.ftz.f32 %r6974, %r6942; + ex2.approx.ftz.f32 %r6975, %r6943; + ex2.approx.ftz.f32 %r6976, %r6944; + ex2.approx.ftz.f32 %r6977, %r6945; + ex2.approx.ftz.f32 %r6978, %r6946; + ex2.approx.ftz.f32 %r6979, %r6947; + ex2.approx.ftz.f32 %r6980, %r6948; + ex2.approx.ftz.f32 %r6981, %r6949; + ex2.approx.ftz.f32 %r6982, %r6950; + ex2.approx.ftz.f32 %r6983, %r6951; + ex2.approx.ftz.f32 %r6984, %r6952; + ex2.approx.ftz.f32 %r6985, %r6953; + ex2.approx.ftz.f32 %r6986, %r6954; + ex2.approx.ftz.f32 %r6987, %r6955; + ex2.approx.ftz.f32 %r6988, %r6956; + ex2.approx.ftz.f32 %r6989, %r6957; + ex2.approx.ftz.f32 %r6990, %r6958; + ex2.approx.ftz.f32 %r6991, %r6959; + ex2.approx.ftz.f32 %r6992, %r6960; + ex2.approx.ftz.f32 %r6993, %r6961; + .loc 1 831 23 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:831:23 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + add.s32 %r6994, %r2590, 49152; + add.s32 %r6203, %r6994, %r6791; + .loc 1 530 20 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:530:20 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + wgmma.fence.sync.aligned; + add.s32 %r6200, %r2590, 131072; + add.s32 %r6995, %r6795, %r6200; + bfe.u32 %r6996, %r6995, 4, 14; + cvt.u64.u32 %rd498, %r6996; + or.b64 %rd448, %rd498, 4611686293372403712; + bfe.u32 %r6997, %r6203, 4, 14; + 
cvt.u64.u32 %rd499, %r6997; + or.b64 %rd449, %rd499, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r5784,%r5785,%r5786,%r5787,%r5788,%r5789,%r5790,%r5791,%r5792,%r5793,%r5794,%r5795,%r5796,%r5797,%r5798,%r5799,%r5800,%r5801,%r5802,%r5803,%r5804,%r5805,%r5806,%r5807,%r5808,%r5809,%r5810,%r5811,%r5812,%r5813,%r5814,%r5815}, %rd448, %rd449, 0, 1, 1, 0, 0; + // end inline asm + add.s32 %r6998, %r6799, %r6200; + bfe.u32 %r6999, %r6998, 4, 14; + cvt.u64.u32 %rd500, %r6999; + or.b64 %rd450, %rd500, 4611686293372403712; + add.s32 %r7000, %r6203, 32; + bfe.u32 %r7001, %r7000, 4, 14; + cvt.u64.u32 %rd501, %r7001; + or.b64 %rd451, %rd501, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r5784,%r5785,%r5786,%r5787,%r5788,%r5789,%r5790,%r5791,%r5792,%r5793,%r5794,%r5795,%r5796,%r5797,%r5798,%r5799,%r5800,%r5801,%r5802,%r5803,%r5804,%r5805,%r5806,%r5807,%r5808,%r5809,%r5810,%r5811,%r5812,%r5813,%r5814,%r5815}, %rd450, %rd451, %p391, 1, 1, 0, 0; + // end inline asm + add.s32 %r7002, %r6804, %r6200; + bfe.u32 %r7003, %r7002, 4, 14; + cvt.u64.u32 %rd502, %r7003; + or.b64 %rd452, %rd502, 4611686293372403712; + add.s32 %r7004, %r6203, 64; + bfe.u32 %r7005, %r7004, 4, 14; + cvt.u64.u32 %rd503, %r7005; + or.b64 %rd453, %rd503, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r5784,%r5785,%r5786,%r5787,%r5788,%r5789,%r5790,%r5791,%r5792,%r5793,%r5794,%r5795,%r5796,%r5797,%r5798,%r5799,%r5800,%r5801,%r5802,%r5803,%r5804,%r5805,%r5806,%r5807,%r5808,%r5809,%r5810,%r5811,%r5812,%r5813,%r5814,%r5815}, %rd452, %rd453, %p391, 1, 1, 0, 0; + // end inline asm + add.s32 %r7006, %r6809, %r6200; + bfe.u32 %r7007, %r7006, 4, 14; + cvt.u64.u32 %rd504, %r7007; + or.b64 %rd454, %rd504, 4611686293372403712; + add.s32 %r7008, %r6203, 96; + bfe.u32 %r7009, %r7008, 4, 14; + cvt.u64.u32 %rd505, %r7009; + or.b64 %rd455, %rd505, 4611686293338849280; + // 
begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r5784,%r5785,%r5786,%r5787,%r5788,%r5789,%r5790,%r5791,%r5792,%r5793,%r5794,%r5795,%r5796,%r5797,%r5798,%r5799,%r5800,%r5801,%r5802,%r5803,%r5804,%r5805,%r5806,%r5807,%r5808,%r5809,%r5810,%r5811,%r5812,%r5813,%r5814,%r5815}, %rd454, %rd455, %p391, 1, 1, 0, 0; + // end inline asm + add.s32 %r7010, %r6814, %r6200; + bfe.u32 %r7011, %r7010, 4, 14; + cvt.u64.u32 %rd506, %r7011; + or.b64 %rd456, %rd506, 4611686293372403712; + add.s32 %r7012, %r6203, 8192; + bfe.u32 %r7013, %r7012, 4, 14; + cvt.u64.u32 %rd507, %r7013; + or.b64 %rd457, %rd507, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r5784,%r5785,%r5786,%r5787,%r5788,%r5789,%r5790,%r5791,%r5792,%r5793,%r5794,%r5795,%r5796,%r5797,%r5798,%r5799,%r5800,%r5801,%r5802,%r5803,%r5804,%r5805,%r5806,%r5807,%r5808,%r5809,%r5810,%r5811,%r5812,%r5813,%r5814,%r5815}, %rd456, %rd457, %p391, 1, 1, 0, 0; + // end inline asm + add.s32 %r7014, %r6819, %r6200; + bfe.u32 %r7015, %r7014, 4, 14; + cvt.u64.u32 %rd508, %r7015; + or.b64 %rd458, %rd508, 4611686293372403712; + add.s32 %r7016, %r6203, 8224; + bfe.u32 %r7017, %r7016, 4, 14; + cvt.u64.u32 %rd509, %r7017; + or.b64 %rd459, %rd509, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r5784,%r5785,%r5786,%r5787,%r5788,%r5789,%r5790,%r5791,%r5792,%r5793,%r5794,%r5795,%r5796,%r5797,%r5798,%r5799,%r5800,%r5801,%r5802,%r5803,%r5804,%r5805,%r5806,%r5807,%r5808,%r5809,%r5810,%r5811,%r5812,%r5813,%r5814,%r5815}, %rd458, %rd459, %p391, 1, 1, 0, 0; + // end inline asm + add.s32 %r7018, %r6824, %r6200; + bfe.u32 %r7019, %r7018, 4, 14; + cvt.u64.u32 %rd510, %r7019; + or.b64 %rd460, %rd510, 4611686293372403712; + add.s32 %r7020, %r6203, 8256; + bfe.u32 %r7021, %r7020, 4, 14; + cvt.u64.u32 %rd511, %r7021; + or.b64 %rd461, %rd511, 4611686293338849280; + // begin inline asm + 
wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r5784,%r5785,%r5786,%r5787,%r5788,%r5789,%r5790,%r5791,%r5792,%r5793,%r5794,%r5795,%r5796,%r5797,%r5798,%r5799,%r5800,%r5801,%r5802,%r5803,%r5804,%r5805,%r5806,%r5807,%r5808,%r5809,%r5810,%r5811,%r5812,%r5813,%r5814,%r5815}, %rd460, %rd461, %p391, 1, 1, 0, 0; + // end inline asm + add.s32 %r7022, %r6829, %r6200; + bfe.u32 %r7023, %r7022, 4, 14; + cvt.u64.u32 %rd512, %r7023; + or.b64 %rd462, %rd512, 4611686293372403712; + add.s32 %r7024, %r6203, 8288; + bfe.u32 %r7025, %r7024, 4, 14; + cvt.u64.u32 %rd513, %r7025; + or.b64 %rd463, %rd513, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r5784,%r5785,%r5786,%r5787,%r5788,%r5789,%r5790,%r5791,%r5792,%r5793,%r5794,%r5795,%r5796,%r5797,%r5798,%r5799,%r5800,%r5801,%r5802,%r5803,%r5804,%r5805,%r5806,%r5807,%r5808,%r5809,%r5810,%r5811,%r5812,%r5813,%r5814,%r5815}, %rd462, %rd463, %p391, 1, 1, 0, 0; + // end inline asm + wgmma.commit_group.sync.aligned; + mov.b32 %r6205, %r5683; + mov.b32 %r6201, %r5683; + mov.b32 %r6202, %r5683; + mov.b32 %r6204, %r5683; + // begin inline asm + // wait for regs: %r5784,%r5785,%r5786,%r5787,%r5788,%r5789,%r5790,%r5791,%r5792,%r5793,%r5794,%r5795,%r5796,%r5797,%r5798,%r5799,%r5800,%r5801,%r5802,%r5803,%r5804,%r5805,%r5806,%r5807,%r5808,%r5809,%r5810,%r5811,%r5812,%r5813,%r5814,%r5815,%r6200,%r6201,%r6202,%r6203,%r6204,%r6205 + wgmma.wait_group.sync.aligned 0; + // end inline asm + .loc 1 531 19 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:531:19 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + sub.f32 %r7026, %r5784, %r2509; + sub.f32 %r7027, %r5785, %r2509; + sub.f32 %r7028, %r5786, %r2510; + sub.f32 %r7029, %r5787, %r2510; + sub.f32 %r7030, %r5788, %r2509; + sub.f32 %r7031, %r5789, %r2509; + sub.f32 %r7032, %r5790, %r2510; + sub.f32 %r7033, %r5791, %r2510; + sub.f32 %r7034, %r5792, %r2509; + sub.f32 %r7035, %r5793, %r2509; + sub.f32 %r7036, %r5794, 
%r2510; + sub.f32 %r7037, %r5795, %r2510; + sub.f32 %r7038, %r5796, %r2509; + sub.f32 %r7039, %r5797, %r2509; + sub.f32 %r7040, %r5798, %r2510; + sub.f32 %r7041, %r5799, %r2510; + sub.f32 %r7042, %r5800, %r2509; + sub.f32 %r7043, %r5801, %r2509; + sub.f32 %r7044, %r5802, %r2510; + sub.f32 %r7045, %r5803, %r2510; + sub.f32 %r7046, %r5804, %r2509; + sub.f32 %r7047, %r5805, %r2509; + sub.f32 %r7048, %r5806, %r2510; + sub.f32 %r7049, %r5807, %r2510; + sub.f32 %r7050, %r5808, %r2509; + sub.f32 %r7051, %r5809, %r2509; + sub.f32 %r7052, %r5810, %r2510; + sub.f32 %r7053, %r5811, %r2510; + sub.f32 %r7054, %r5812, %r2509; + sub.f32 %r7055, %r5813, %r2509; + sub.f32 %r7056, %r5814, %r2510; + sub.f32 %r7057, %r5815, %r2510; + .loc 1 531 14 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:531:14 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + mul.f32 %r7058, %r6962, %r7026; + mul.f32 %r7059, %r6963, %r7027; + mul.f32 %r7060, %r6964, %r7028; + mul.f32 %r7061, %r6965, %r7029; + mul.f32 %r7062, %r6966, %r7030; + mul.f32 %r7063, %r6967, %r7031; + mul.f32 %r7064, %r6968, %r7032; + mul.f32 %r7065, %r6969, %r7033; + mul.f32 %r7066, %r6970, %r7034; + mul.f32 %r7067, %r6971, %r7035; + mul.f32 %r7068, %r6972, %r7036; + mul.f32 %r7069, %r6973, %r7037; + mul.f32 %r7070, %r6974, %r7038; + mul.f32 %r7071, %r6975, %r7039; + mul.f32 %r7072, %r6976, %r7040; + mul.f32 %r7073, %r6977, %r7041; + mul.f32 %r7074, %r6978, %r7042; + mul.f32 %r7075, %r6979, %r7043; + mul.f32 %r7076, %r6980, %r7044; + mul.f32 %r7077, %r6981, %r7045; + mul.f32 %r7078, %r6982, %r7046; + mul.f32 %r7079, %r6983, %r7047; + mul.f32 %r7080, %r6984, %r7048; + mul.f32 %r7081, %r6985, %r7049; + mul.f32 %r7082, %r6986, %r7050; + mul.f32 %r7083, %r6987, %r7051; + mul.f32 %r7084, %r6988, %r7052; + mul.f32 %r7085, %r6989, %r7053; + mul.f32 %r7086, %r6990, %r7054; + mul.f32 %r7087, %r6991, %r7055; + mul.f32 %r7088, %r6992, %r7056; + mul.f32 %r7089, %r6993, %r7057; + .loc 1 551 15 // 
c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:551:15 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + cvt.rn.bf16.f32 %rs65, %r7058; + .loc 1 538 71 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:538:71 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + selp.b16 %rs66, %rs65, 0x0000, %p413; + .loc 1 551 15 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:551:15 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + cvt.rn.bf16.f32 %rs67, %r7059; + .loc 1 538 71 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:538:71 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + selp.b16 %rs68, %rs67, 0x0000, %p414; + .loc 1 551 15 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:551:15 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + cvt.rn.bf16.f32 %rs69, %r7060; + .loc 1 538 71 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:538:71 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + selp.b16 %rs70, %rs69, 0x0000, %p413; + .loc 1 551 15 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:551:15 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + cvt.rn.bf16.f32 %rs71, %r7061; + .loc 1 538 71 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:538:71 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + selp.b16 %rs72, %rs71, 0x0000, %p414; + .loc 1 551 15 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:551:15 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + cvt.rn.bf16.f32 %rs73, %r7062; + .loc 1 538 71 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:538:71 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + selp.b16 %rs74, %rs73, 0x0000, %p415; + .loc 1 551 15 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:551:15 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + 
cvt.rn.bf16.f32 %rs75, %r7063; + .loc 1 538 71 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:538:71 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + selp.b16 %rs76, %rs75, 0x0000, %p416; + .loc 1 551 15 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:551:15 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + cvt.rn.bf16.f32 %rs77, %r7064; + .loc 1 538 71 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:538:71 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + selp.b16 %rs78, %rs77, 0x0000, %p415; + .loc 1 551 15 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:551:15 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + cvt.rn.bf16.f32 %rs79, %r7065; + .loc 1 538 71 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:538:71 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + selp.b16 %rs80, %rs79, 0x0000, %p416; + .loc 1 551 15 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:551:15 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + cvt.rn.bf16.f32 %rs81, %r7066; + .loc 1 538 71 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:538:71 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + selp.b16 %rs82, %rs81, 0x0000, %p417; + .loc 1 551 15 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:551:15 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + cvt.rn.bf16.f32 %rs83, %r7067; + .loc 1 538 71 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:538:71 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + selp.b16 %rs84, %rs83, 0x0000, %p418; + .loc 1 551 15 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:551:15 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + cvt.rn.bf16.f32 %rs85, %r7068; + .loc 1 538 71 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:538:71 @[ 
c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + selp.b16 %rs86, %rs85, 0x0000, %p417; + .loc 1 551 15 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:551:15 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + cvt.rn.bf16.f32 %rs87, %r7069; + .loc 1 538 71 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:538:71 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + selp.b16 %rs88, %rs87, 0x0000, %p418; + .loc 1 551 15 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:551:15 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + cvt.rn.bf16.f32 %rs89, %r7070; + .loc 1 538 71 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:538:71 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + selp.b16 %rs90, %rs89, 0x0000, %p419; + .loc 1 551 15 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:551:15 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + cvt.rn.bf16.f32 %rs91, %r7071; + .loc 1 538 71 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:538:71 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + selp.b16 %rs92, %rs91, 0x0000, %p420; + .loc 1 551 15 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:551:15 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + cvt.rn.bf16.f32 %rs93, %r7072; + .loc 1 538 71 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:538:71 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + selp.b16 %rs94, %rs93, 0x0000, %p419; + .loc 1 551 15 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:551:15 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + cvt.rn.bf16.f32 %rs95, %r7073; + .loc 1 538 71 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:538:71 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + selp.b16 %rs96, %rs95, 0x0000, %p420; + .loc 1 551 15 // 
c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:551:15 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + cvt.rn.bf16.f32 %rs97, %r7074; + .loc 1 538 71 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:538:71 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + selp.b16 %rs98, %rs97, 0x0000, %p421; + .loc 1 551 15 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:551:15 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + cvt.rn.bf16.f32 %rs99, %r7075; + .loc 1 538 71 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:538:71 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + selp.b16 %rs100, %rs99, 0x0000, %p422; + .loc 1 551 15 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:551:15 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + cvt.rn.bf16.f32 %rs101, %r7076; + .loc 1 538 71 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:538:71 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + selp.b16 %rs102, %rs101, 0x0000, %p421; + .loc 1 551 15 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:551:15 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + cvt.rn.bf16.f32 %rs103, %r7077; + .loc 1 538 71 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:538:71 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + selp.b16 %rs104, %rs103, 0x0000, %p422; + .loc 1 551 15 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:551:15 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + cvt.rn.bf16.f32 %rs105, %r7078; + .loc 1 538 71 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:538:71 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + selp.b16 %rs106, %rs105, 0x0000, %p423; + .loc 1 551 15 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:551:15 @[ 
c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + cvt.rn.bf16.f32 %rs107, %r7079; + .loc 1 538 71 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:538:71 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + selp.b16 %rs108, %rs107, 0x0000, %p424; + .loc 1 551 15 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:551:15 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + cvt.rn.bf16.f32 %rs109, %r7080; + .loc 1 538 71 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:538:71 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + selp.b16 %rs110, %rs109, 0x0000, %p423; + .loc 1 551 15 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:551:15 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + cvt.rn.bf16.f32 %rs111, %r7081; + .loc 1 538 71 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:538:71 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + selp.b16 %rs112, %rs111, 0x0000, %p424; + .loc 1 551 15 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:551:15 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + cvt.rn.bf16.f32 %rs113, %r7082; + .loc 1 538 71 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:538:71 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + selp.b16 %rs114, %rs113, 0x0000, %p425; + .loc 1 551 15 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:551:15 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + cvt.rn.bf16.f32 %rs115, %r7083; + .loc 1 538 71 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:538:71 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + selp.b16 %rs116, %rs115, 0x0000, %p426; + .loc 1 551 15 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:551:15 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + cvt.rn.bf16.f32 %rs117, %r7084; + .loc 1 538 71 // 
c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:538:71 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + selp.b16 %rs118, %rs117, 0x0000, %p425; + .loc 1 551 15 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:551:15 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + cvt.rn.bf16.f32 %rs119, %r7085; + .loc 1 538 71 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:538:71 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + selp.b16 %rs120, %rs119, 0x0000, %p426; + .loc 1 551 15 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:551:15 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + cvt.rn.bf16.f32 %rs121, %r7086; + .loc 1 538 71 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:538:71 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + selp.b16 %rs122, %rs121, 0x0000, %p427; + .loc 1 551 15 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:551:15 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + cvt.rn.bf16.f32 %rs123, %r7087; + .loc 1 538 71 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:538:71 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + selp.b16 %rs124, %rs123, 0x0000, %p428; + .loc 1 551 15 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:551:15 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + cvt.rn.bf16.f32 %rs125, %r7088; + .loc 1 538 71 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:538:71 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + selp.b16 %rs126, %rs125, 0x0000, %p427; + .loc 1 551 15 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:551:15 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + cvt.rn.bf16.f32 %rs127, %r7089; + .loc 1 538 71 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:538:71 @[ 
c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + selp.b16 %rs128, %rs127, 0x0000, %p428; + .loc 1 553 21 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:553:21 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + mov.b32 %r6372, {%rs66, %rs68}; + mov.b32 %r6373, {%rs70, %rs72}; + mov.b32 %r6374, {%rs74, %rs76}; + mov.b32 %r6375, {%rs78, %rs80}; + mov.b32 %r6504, {%rs82, %rs84}; + mov.b32 %r6505, {%rs86, %rs88}; + mov.b32 %r6506, {%rs90, %rs92}; + mov.b32 %r6507, {%rs94, %rs96}; + mov.b32 %r6636, {%rs98, %rs100}; + mov.b32 %r6637, {%rs102, %rs104}; + mov.b32 %r6638, {%rs106, %rs108}; + mov.b32 %r6639, {%rs110, %rs112}; + mov.b32 %r6768, {%rs114, %rs116}; + mov.b32 %r6769, {%rs118, %rs120}; + mov.b32 %r6770, {%rs122, %rs124}; + mov.b32 %r6771, {%rs126, %rs128}; + wgmma.fence.sync.aligned; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r14456,%r14457,%r14458,%r14459,%r14460,%r14461,%r14462,%r14463,%r14464,%r14465,%r14466,%r14467,%r14468,%r14469,%r14470,%r14471,%r14472,%r14473,%r14474,%r14475,%r14476,%r14477,%r14478,%r14479,%r14480,%r14481,%r14482,%r14483,%r14484,%r14485,%r14486,%r14487,%r14488,%r14489,%r14490,%r14491,%r14492,%r14493,%r14494,%r14495,%r14496,%r14497,%r14498,%r14499,%r14500,%r14501,%r14502,%r14503,%r14504,%r14505,%r14506,%r14507,%r14508,%r14509,%r14510,%r14511,%r14512,%r14513,%r14514,%r14515,%r14516,%r14517,%r14518,%r14519}, {%r6372,%r6373,%r6374,%r6375}, %rd433, %p391, 1, 1, 1; + // end inline asm + add.s32 %r7090, %r5685, 2048; + bfe.u32 %r7091, %r7090, 4, 14; + cvt.u64.u32 %rd514, %r7091; + or.b64 %rd465, %rd514, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 
{%r14456,%r14457,%r14458,%r14459,%r14460,%r14461,%r14462,%r14463,%r14464,%r14465,%r14466,%r14467,%r14468,%r14469,%r14470,%r14471,%r14472,%r14473,%r14474,%r14475,%r14476,%r14477,%r14478,%r14479,%r14480,%r14481,%r14482,%r14483,%r14484,%r14485,%r14486,%r14487,%r14488,%r14489,%r14490,%r14491,%r14492,%r14493,%r14494,%r14495,%r14496,%r14497,%r14498,%r14499,%r14500,%r14501,%r14502,%r14503,%r14504,%r14505,%r14506,%r14507,%r14508,%r14509,%r14510,%r14511,%r14512,%r14513,%r14514,%r14515,%r14516,%r14517,%r14518,%r14519}, {%r6504,%r6505,%r6506,%r6507}, %rd465, %p391, 1, 1, 1; + // end inline asm + add.s32 %r7092, %r5685, 4096; + bfe.u32 %r7093, %r7092, 4, 14; + cvt.u64.u32 %rd515, %r7093; + or.b64 %rd466, %rd515, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r14456,%r14457,%r14458,%r14459,%r14460,%r14461,%r14462,%r14463,%r14464,%r14465,%r14466,%r14467,%r14468,%r14469,%r14470,%r14471,%r14472,%r14473,%r14474,%r14475,%r14476,%r14477,%r14478,%r14479,%r14480,%r14481,%r14482,%r14483,%r14484,%r14485,%r14486,%r14487,%r14488,%r14489,%r14490,%r14491,%r14492,%r14493,%r14494,%r14495,%r14496,%r14497,%r14498,%r14499,%r14500,%r14501,%r14502,%r14503,%r14504,%r14505,%r14506,%r14507,%r14508,%r14509,%r14510,%r14511,%r14512,%r14513,%r14514,%r14515,%r14516,%r14517,%r14518,%r14519}, {%r6636,%r6637,%r6638,%r6639}, %rd466, %p391, 1, 1, 1; + // end inline asm + add.s32 %r7094, %r5685, 6144; + bfe.u32 %r7095, %r7094, 4, 14; + cvt.u64.u32 %rd516, %r7095; + or.b64 %rd467, %rd516, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 
{%r14456,%r14457,%r14458,%r14459,%r14460,%r14461,%r14462,%r14463,%r14464,%r14465,%r14466,%r14467,%r14468,%r14469,%r14470,%r14471,%r14472,%r14473,%r14474,%r14475,%r14476,%r14477,%r14478,%r14479,%r14480,%r14481,%r14482,%r14483,%r14484,%r14485,%r14486,%r14487,%r14488,%r14489,%r14490,%r14491,%r14492,%r14493,%r14494,%r14495,%r14496,%r14497,%r14498,%r14499,%r14500,%r14501,%r14502,%r14503,%r14504,%r14505,%r14506,%r14507,%r14508,%r14509,%r14510,%r14511,%r14512,%r14513,%r14514,%r14515,%r14516,%r14517,%r14518,%r14519}, {%r6768,%r6769,%r6770,%r6771}, %rd467, %p391, 1, 1, 1; + // end inline asm + wgmma.commit_group.sync.aligned; + .loc 1 417 19 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:417:19 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + add.s32 %r14673, %r14601, %r14673; + add.s32 %r14674, %r14601, %r14674; + add.s32 %r14675, %r14601, %r14675; + add.s32 %r14676, %r14601, %r14676; + add.s32 %r14677, %r14601, %r14677; + add.s32 %r14678, %r14601, %r14678; + add.s32 %r14679, %r14601, %r14679; + add.s32 %r14680, %r14601, %r14680; + add.s32 %r14681, %r14601, %r14681; + add.s32 %r14682, %r14601, %r14682; + add.s32 %r14683, %r14601, %r14683; + add.s32 %r14684, %r14601, %r14684; + add.s32 %r14685, %r14601, %r14685; + add.s32 %r14686, %r14601, %r14686; + add.s32 %r14687, %r14601, %r14687; + add.s32 %r14688, %r14601, %r14688; + .loc 1 397 28 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:397:28 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + add.s32 %r607, %r14672, 1; + .loc 1 788 33 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:788:33 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + shr.u32 %r7096, %r607, 1; + .loc 1 789 38 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:789:38 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + mad.wide.u32 %rd469, %r7096, 4, %rd399; + .loc 1 789 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:789:24 
@[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + // begin inline asm + mov.u64 %rd468, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd468, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r6772, 0x0; + @%p409 ld.global.L1::evict_last.L2::cache_hint.b32 { %r6772 }, [ %rd469 + 0 ], %rd468; + // end inline asm + .loc 1 790 109 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:790:109 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + add.s32 %r7097, %r7096, 1; + .loc 1 790 113 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:790:113 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + setp.lt.s32 %p429, %r7097, %r5078; + .loc 1 790 55 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:790:55 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + add.s64 %rd472, %rd469, 4; + .loc 1 397 28 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:397:28 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + and.pred %p410, %p409, %p429; + .loc 1 790 25 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:790:25 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + // begin inline asm + mov.u64 %rd471, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd471, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r6773, 0x0; + @%p410 ld.global.L1::evict_last.L2::cache_hint.b32 { %r6773 }, [ %rd472 + 0 ], %rd471; + // end inline asm + .loc 1 791 35 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:791:35 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + and.b32 %r7098, %r14672, 1; + .loc 1 792 34 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:792:34 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + sub.s32 %r7099, %r6773, %r6772; + .loc 1 792 48 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:792:48 @[ 
c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + shl.b32 %r7100, %r7099, 7; + .loc 1 792 63 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:792:63 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + add.s32 %r7101, %r7100, -64; + .loc 1 793 29 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:793:29 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + xor.b32 %r7102, %r7098, 1; + .loc 1 793 61 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:793:61 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + shl.b32 %r7103, %r7098, 6; + .loc 1 793 42 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:793:42 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + mad.lo.s32 %r14601, %r7101, %r7102, %r7103; + .loc 1 414 28 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:414:28 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + shl.b32 %r7104, %r14601, 7; + .loc 1 414 19 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:414:19 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + mul.wide.s32 %rd517, %r7104, 2; + add.s64 %rd1192, %rd1192, %rd517; + add.s64 %rd1191, %rd1191, %rd517; + add.s64 %rd1190, %rd1190, %rd517; + add.s64 %rd1189, %rd1189, %rd517; + .loc 1 415 19 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:415:19 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + add.s64 %rd1188, %rd1188, %rd517; + add.s64 %rd1187, %rd1187, %rd517; + add.s64 %rd1186, %rd1186, %rd517; + add.s64 %rd1185, %rd1185, %rd517; + .loc 1 417 19 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:417:19 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + add.s32 %r14607, %r14601, %r14607; + add.s32 %r14606, %r14601, %r14606; + add.s32 %r14605, %r14601, %r14605; + add.s32 %r14604, %r14601, %r14604; + .loc 1 397 28 // 
c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:397:28 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + add.s32 %r7105, %r14603, 1; + setp.gt.s32 %p430, %r7105, 2; + selp.b32 %r14603, 0, %r7105, %p430; + .loc 1 831 52 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:831:52 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + setp.lt.s32 %p431, %r14607, %r2359; + setp.lt.s32 %p432, %r14606, %r2359; + setp.lt.s32 %p433, %r14605, %r2359; + setp.lt.s32 %p434, %r14604, %r2359; + .loc 1 831 23 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:831:23 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + shl.b32 %r7106, %r14603, 14; + add.s32 %r7107, %r2590, %r7106; + bar.sync 0; + add.s32 %r6774, %r7107, %r60; + selp.b32 %r7108, 16, 0, %p431; + selp.b32 %r6783, %r7108, 0, %p411; + // begin inline asm + cp.async.cg.shared.global [ %r6774 + 0 ], [ %rd1192 + 0 ], 0x10, %r6783; + // end inline asm + add.s32 %r6776, %r6774, 2048; + selp.b32 %r7109, 16, 0, %p432; + selp.b32 %r6785, %r7109, 0, %p411; + // begin inline asm + cp.async.cg.shared.global [ %r6776 + 0 ], [ %rd1191 + 0 ], 0x10, %r6785; + // end inline asm + add.s32 %r6778, %r6774, 4096; + selp.b32 %r7110, 16, 0, %p433; + selp.b32 %r6787, %r7110, 0, %p411; + // begin inline asm + cp.async.cg.shared.global [ %r6778 + 0 ], [ %rd1190 + 0 ], 0x10, %r6787; + // end inline asm + add.s32 %r6780, %r6774, 6144; + selp.b32 %r7111, 16, 0, %p434; + selp.b32 %r6789, %r7111, 0, %p411; + // begin inline asm + cp.async.cg.shared.global [ %r6780 + 0 ], [ %rd1189 + 0 ], 0x10, %r6789; + // end inline asm + cp.async.commit_group; + add.s32 %r7112, %r6994, %r7106; + add.s32 %r6782, %r7112, %r60; + // begin inline asm + cp.async.cg.shared.global [ %r6782 + 0 ], [ %rd1188 + 0 ], 0x10, %r6783; + // end inline asm + add.s32 %r6784, %r6782, 2048; + // begin inline asm + cp.async.cg.shared.global [ %r6784 + 0 ], [ %rd1187 + 0 ], 0x10, %r6785; + // end inline asm + 
add.s32 %r6786, %r6782, 4096; + // begin inline asm + cp.async.cg.shared.global [ %r6786 + 0 ], [ %rd1186 + 0 ], 0x10, %r6787; + // end inline asm + add.s32 %r6788, %r6782, 6144; + // begin inline asm + cp.async.cg.shared.global [ %r6788 + 0 ], [ %rd1185 + 0 ], 0x10, %r6789; + // end inline asm + cp.async.commit_group; + .loc 1 397 28 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:397:28 @[ c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:226:16 ] + setp.ne.b32 %p435, %r437, %r607; + mov.b32 %r14672, %r607; + @%p435 bra $L__BB0_6; +$L__BB0_7: // %._crit_edge1593 + // begin inline asm + // wait for regs: %r14456,%r14457,%r14458,%r14459,%r14460,%r14461,%r14462,%r14463,%r14464,%r14465,%r14466,%r14467,%r14468,%r14469,%r14470,%r14471,%r14472,%r14473,%r14474,%r14475,%r14476,%r14477,%r14478,%r14479,%r14480,%r14481,%r14482,%r14483,%r14484,%r14485,%r14486,%r14487,%r14488,%r14489,%r14490,%r14491,%r14492,%r14493,%r14494,%r14495,%r14496,%r14497,%r14498,%r14499,%r14500,%r14501,%r14502,%r14503,%r14504,%r14505,%r14506,%r14507,%r14508,%r14509,%r14510,%r14511,%r14512,%r14513,%r14514,%r14515,%r14516,%r14517,%r14518,%r14519 + wgmma.wait_group.sync.aligned 0; + // end inline asm + cp.async.wait_group 0; + bar.sync 0; +$L__tmp35: + .loc 1 231 24 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:231:24 + shl.b64 %rd526, %rd5, 1; + add.s64 %rd527, %rd4, %rd526; + shl.b64 %rd528, %rd6, 1; + add.s64 %rd529, %rd4, %rd528; + shl.b64 %rd530, %rd7, 1; + add.s64 %rd531, %rd4, %rd530; + shl.b64 %rd532, %rd8, 1; + add.s64 %rd533, %rd4, %rd532; + shl.b64 %rd534, %rd9, 1; + add.s64 %rd535, %rd4, %rd534; + shl.b64 %rd536, %rd10, 1; + add.s64 %rd537, %rd4, %rd536; + shl.b64 %rd538, %rd11, 1; + add.s64 %rd539, %rd4, %rd538; + shl.b64 %rd540, %rd12, 1; + add.s64 %rd541, %rd4, %rd540; + .loc 1 231 56 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:231:56 + add.s64 %rd518, %rd527, %rd427; + add.s64 %rd519, %rd529, %rd427; + add.s64 %rd520, %rd531, %rd427; + add.s64 
%rd521, %rd533, %rd427; + add.s64 %rd522, %rd535, %rd427; + add.s64 %rd523, %rd537, %rd427; + add.s64 %rd524, %rd539, %rd427; + add.s64 %rd525, %rd541, %rd427; + .loc 1 232 14 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:232:14 + mul.f32 %r7313, %r14456, 0f3DB504F3; + mul.f32 %r7314, %r14457, 0f3DB504F3; + .loc 1 236 30 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:236:30 + cvt.rn.bf16x2.f32 %r7315, %r7314, %r7313; + .loc 1 232 14 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:232:14 + mul.f32 %r7316, %r14458, 0f3DB504F3; + mul.f32 %r7317, %r14459, 0f3DB504F3; + .loc 1 236 30 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:236:30 + cvt.rn.bf16x2.f32 %r7318, %r7317, %r7316; + .loc 1 232 14 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:232:14 + mul.f32 %r7319, %r14460, 0f3DB504F3; + mul.f32 %r7320, %r14461, 0f3DB504F3; + .loc 1 236 30 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:236:30 + cvt.rn.bf16x2.f32 %r7321, %r7320, %r7319; + .loc 1 232 14 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:232:14 + mul.f32 %r7322, %r14462, 0f3DB504F3; + mul.f32 %r7323, %r14463, 0f3DB504F3; + .loc 1 236 30 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:236:30 + cvt.rn.bf16x2.f32 %r7324, %r7323, %r7322; + .loc 1 232 14 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:232:14 + mul.f32 %r7325, %r14464, 0f3DB504F3; + mul.f32 %r7326, %r14465, 0f3DB504F3; + .loc 1 236 30 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:236:30 + cvt.rn.bf16x2.f32 %r7327, %r7326, %r7325; + .loc 1 232 14 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:232:14 + mul.f32 %r7328, %r14466, 0f3DB504F3; + mul.f32 %r7329, %r14467, 0f3DB504F3; + .loc 1 236 30 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:236:30 + cvt.rn.bf16x2.f32 %r7330, %r7329, %r7328; + .loc 1 232 14 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:232:14 + mul.f32 %r7331, %r14468, 0f3DB504F3; + 
mul.f32 %r7332, %r14469, 0f3DB504F3; + .loc 1 236 30 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:236:30 + cvt.rn.bf16x2.f32 %r7333, %r7332, %r7331; + .loc 1 232 14 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:232:14 + mul.f32 %r7334, %r14470, 0f3DB504F3; + mul.f32 %r7335, %r14471, 0f3DB504F3; + .loc 1 236 30 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:236:30 + cvt.rn.bf16x2.f32 %r7336, %r7335, %r7334; + .loc 1 232 14 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:232:14 + mul.f32 %r7337, %r14472, 0f3DB504F3; + mul.f32 %r7338, %r14473, 0f3DB504F3; + .loc 1 236 30 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:236:30 + cvt.rn.bf16x2.f32 %r7339, %r7338, %r7337; + .loc 1 232 14 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:232:14 + mul.f32 %r7340, %r14474, 0f3DB504F3; + mul.f32 %r7341, %r14475, 0f3DB504F3; + .loc 1 236 30 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:236:30 + cvt.rn.bf16x2.f32 %r7342, %r7341, %r7340; + .loc 1 232 14 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:232:14 + mul.f32 %r7343, %r14476, 0f3DB504F3; + mul.f32 %r7344, %r14477, 0f3DB504F3; + .loc 1 236 30 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:236:30 + cvt.rn.bf16x2.f32 %r7345, %r7344, %r7343; + .loc 1 232 14 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:232:14 + mul.f32 %r7346, %r14478, 0f3DB504F3; + mul.f32 %r7347, %r14479, 0f3DB504F3; + .loc 1 236 30 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:236:30 + cvt.rn.bf16x2.f32 %r7348, %r7347, %r7346; + .loc 1 232 14 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:232:14 + mul.f32 %r7349, %r14480, 0f3DB504F3; + mul.f32 %r7350, %r14481, 0f3DB504F3; + .loc 1 236 30 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:236:30 + cvt.rn.bf16x2.f32 %r7351, %r7350, %r7349; + .loc 1 232 14 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:232:14 + mul.f32 %r7352, %r14482, 0f3DB504F3; 
+ mul.f32 %r7353, %r14483, 0f3DB504F3; + .loc 1 236 30 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:236:30 + cvt.rn.bf16x2.f32 %r7354, %r7353, %r7352; + .loc 1 232 14 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:232:14 + mul.f32 %r7355, %r14484, 0f3DB504F3; + mul.f32 %r7356, %r14485, 0f3DB504F3; + .loc 1 236 30 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:236:30 + cvt.rn.bf16x2.f32 %r7357, %r7356, %r7355; + .loc 1 232 14 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:232:14 + mul.f32 %r7358, %r14486, 0f3DB504F3; + mul.f32 %r7359, %r14487, 0f3DB504F3; + .loc 1 236 30 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:236:30 + cvt.rn.bf16x2.f32 %r7360, %r7359, %r7358; + .loc 1 232 14 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:232:14 + mul.f32 %r7361, %r14488, 0f3DB504F3; + mul.f32 %r7362, %r14489, 0f3DB504F3; + .loc 1 236 30 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:236:30 + cvt.rn.bf16x2.f32 %r7363, %r7362, %r7361; + .loc 1 232 14 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:232:14 + mul.f32 %r7364, %r14490, 0f3DB504F3; + mul.f32 %r7365, %r14491, 0f3DB504F3; + .loc 1 236 30 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:236:30 + cvt.rn.bf16x2.f32 %r7366, %r7365, %r7364; + .loc 1 232 14 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:232:14 + mul.f32 %r7367, %r14492, 0f3DB504F3; + mul.f32 %r7368, %r14493, 0f3DB504F3; + .loc 1 236 30 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:236:30 + cvt.rn.bf16x2.f32 %r7369, %r7368, %r7367; + .loc 1 232 14 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:232:14 + mul.f32 %r7370, %r14494, 0f3DB504F3; + mul.f32 %r7371, %r14495, 0f3DB504F3; + .loc 1 236 30 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:236:30 + cvt.rn.bf16x2.f32 %r7372, %r7371, %r7370; + .loc 1 232 14 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:232:14 + mul.f32 %r7373, %r14496, 
0f3DB504F3; + mul.f32 %r7374, %r14497, 0f3DB504F3; + .loc 1 236 30 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:236:30 + cvt.rn.bf16x2.f32 %r7375, %r7374, %r7373; + .loc 1 232 14 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:232:14 + mul.f32 %r7376, %r14498, 0f3DB504F3; + mul.f32 %r7377, %r14499, 0f3DB504F3; + .loc 1 236 30 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:236:30 + cvt.rn.bf16x2.f32 %r7378, %r7377, %r7376; + .loc 1 232 14 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:232:14 + mul.f32 %r7379, %r14500, 0f3DB504F3; + mul.f32 %r7380, %r14501, 0f3DB504F3; + .loc 1 236 30 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:236:30 + cvt.rn.bf16x2.f32 %r7381, %r7380, %r7379; + .loc 1 232 14 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:232:14 + mul.f32 %r7382, %r14502, 0f3DB504F3; + mul.f32 %r7383, %r14503, 0f3DB504F3; + .loc 1 236 30 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:236:30 + cvt.rn.bf16x2.f32 %r7384, %r7383, %r7382; + .loc 1 232 14 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:232:14 + mul.f32 %r7385, %r14504, 0f3DB504F3; + mul.f32 %r7386, %r14505, 0f3DB504F3; + .loc 1 236 30 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:236:30 + cvt.rn.bf16x2.f32 %r7387, %r7386, %r7385; + .loc 1 232 14 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:232:14 + mul.f32 %r7388, %r14506, 0f3DB504F3; + mul.f32 %r7389, %r14507, 0f3DB504F3; + .loc 1 236 30 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:236:30 + cvt.rn.bf16x2.f32 %r7390, %r7389, %r7388; + .loc 1 232 14 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:232:14 + mul.f32 %r7391, %r14508, 0f3DB504F3; + mul.f32 %r7392, %r14509, 0f3DB504F3; + .loc 1 236 30 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:236:30 + cvt.rn.bf16x2.f32 %r7393, %r7392, %r7391; + .loc 1 232 14 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:232:14 + mul.f32 %r7394, 
%r14510, 0f3DB504F3; + mul.f32 %r7395, %r14511, 0f3DB504F3; + .loc 1 236 30 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:236:30 + cvt.rn.bf16x2.f32 %r7396, %r7395, %r7394; + .loc 1 232 14 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:232:14 + mul.f32 %r7397, %r14512, 0f3DB504F3; + mul.f32 %r7398, %r14513, 0f3DB504F3; + .loc 1 236 30 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:236:30 + cvt.rn.bf16x2.f32 %r7399, %r7398, %r7397; + .loc 1 232 14 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:232:14 + mul.f32 %r7400, %r14514, 0f3DB504F3; + mul.f32 %r7401, %r14515, 0f3DB504F3; + .loc 1 236 30 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:236:30 + cvt.rn.bf16x2.f32 %r7402, %r7401, %r7400; + .loc 1 232 14 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:232:14 + mul.f32 %r7403, %r14516, 0f3DB504F3; + mul.f32 %r7404, %r14517, 0f3DB504F3; + .loc 1 236 30 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:236:30 + cvt.rn.bf16x2.f32 %r7405, %r7404, %r7403; + .loc 1 232 14 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:232:14 + mul.f32 %r7406, %r14518, 0f3DB504F3; + mul.f32 %r7407, %r14519, 0f3DB504F3; + .loc 1 236 30 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:236:30 + cvt.rn.bf16x2.f32 %r7408, %r7407, %r7406; + shl.b32 %r7409, %r39, 13; + shl.b32 %r7410, %r10, 5; + and.b32 %r7411, %r7410, 7264; + and.b32 %r7412, %r10, 24; + shl.b32 %r7413, %r7412, 4; + shl.b32 %r7414, %r10, 2; + and.b32 %r7415, %r7414, 16; + or.b32 %r7416, %r7409, %r7415; + or.b32 %r7417, %r7411, %r7413; + or.b32 %r7418, %r7416, %r7417; + add.s32 %r7420, %r2590, %r7418; + st.shared.v4.b32 [%r7420], {%r7315, %r7321, %r7327, %r7333}; + st.shared.v4.b32 [%r7420+512], {%r7318, %r7324, %r7330, %r7336}; + xor.b32 %r7421, %r7418, 32; + add.s32 %r7422, %r2590, %r7421; + st.shared.v4.b32 [%r7422], {%r7339, %r7345, %r7351, %r7357}; + st.shared.v4.b32 [%r7422+512], {%r7342, %r7348, %r7354, %r7360}; + xor.b32 
%r7423, %r7418, 64; + add.s32 %r7424, %r2590, %r7423; + st.shared.v4.b32 [%r7424], {%r7363, %r7369, %r7375, %r7381}; + st.shared.v4.b32 [%r7424+512], {%r7366, %r7372, %r7378, %r7384}; + xor.b32 %r7425, %r7418, 96; + add.s32 %r7426, %r2590, %r7425; + st.shared.v4.b32 [%r7426], {%r7387, %r7393, %r7399, %r7405}; + st.shared.v4.b32 [%r7426+512], {%r7390, %r7396, %r7402, %r7408}; + bar.sync 0; + shl.b32 %r7427, %r7412, 10; + shl.b32 %r7428, %r39, 5; + and.b32 %r7429, %r7414, 1008; + or.b32 %r7430, %r7427, %r7428; + xor.b32 %r7431, %r7430, %r7429; + add.s32 %r7245, %r2590, %r7431; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r7281, %r7282, %r7283, %r7284}, [%r7245]; + // end inline asm + add.s32 %r7250, %r7245, 1024; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r7285, %r7286, %r7287, %r7288}, [%r7250]; + // end inline asm + add.s32 %r7255, %r7245, 2048; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r7289, %r7290, %r7291, %r7292}, [%r7255]; + // end inline asm + add.s32 %r7260, %r7245, 3072; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r7293, %r7294, %r7295, %r7296}, [%r7260]; + // end inline asm + add.s32 %r7265, %r7245, 4096; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r7297, %r7298, %r7299, %r7300}, [%r7265]; + // end inline asm + add.s32 %r7270, %r7245, 5120; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r7301, %r7302, %r7303, %r7304}, [%r7270]; + // end inline asm + add.s32 %r7275, %r7245, 6144; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r7305, %r7306, %r7307, %r7308}, [%r7275]; + // end inline asm + add.s32 %r7280, %r7245, 7168; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r7309, %r7310, %r7311, %r7312}, [%r7280]; + // end inline asm + // begin inline asm + @%p14 st.global.v4.b32 [ %rd518 + 0 ], { %r7281, %r7282, %r7283, %r7284 }; + // end inline asm + // begin inline asm + @%p15 
st.global.v4.b32 [ %rd519 + 0 ], { %r7285, %r7286, %r7287, %r7288 }; + // end inline asm + // begin inline asm + @%p16 st.global.v4.b32 [ %rd520 + 0 ], { %r7289, %r7290, %r7291, %r7292 }; + // end inline asm + // begin inline asm + @%p17 st.global.v4.b32 [ %rd521 + 0 ], { %r7293, %r7294, %r7295, %r7296 }; + // end inline asm + // begin inline asm + @%p18 st.global.v4.b32 [ %rd522 + 0 ], { %r7297, %r7298, %r7299, %r7300 }; + // end inline asm + // begin inline asm + @%p19 st.global.v4.b32 [ %rd523 + 0 ], { %r7301, %r7302, %r7303, %r7304 }; + // end inline asm + // begin inline asm + @%p20 st.global.v4.b32 [ %rd524 + 0 ], { %r7305, %r7306, %r7307, %r7308 }; + // end inline asm + // begin inline asm + @%p21 st.global.v4.b32 [ %rd525 + 0 ], { %r7309, %r7310, %r7311, %r7312 }; + // end inline asm + .loc 1 139 7 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:139:7 + bra.uni $L__BB0_17; +$L__BB0_16: + .loc 1 323 23 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:323:23 + shl.b64 %rd1152, %rd71, 1; + add.s64 %rd1153, %rd3, %rd1152; + shl.b64 %rd1154, %rd72, 1; + add.s64 %rd1155, %rd3, %rd1154; + shl.b64 %rd1156, %rd73, 1; + add.s64 %rd1157, %rd3, %rd1156; + shl.b64 %rd1158, %rd74, 1; + add.s64 %rd1159, %rd3, %rd1158; + shl.b64 %rd1160, %rd75, 1; + add.s64 %rd1161, %rd3, %rd1160; + shl.b64 %rd1162, %rd76, 1; + add.s64 %rd1163, %rd3, %rd1162; + shl.b64 %rd1164, %rd77, 1; + add.s64 %rd1165, %rd3, %rd1164; + shl.b64 %rd1166, %rd78, 1; + add.s64 %rd1167, %rd3, %rd1166; + .loc 1 323 55 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:323:55 + add.s64 %rd1136, %rd1153, %rd685; + add.s64 %rd1137, %rd1155, %rd685; + add.s64 %rd1138, %rd1157, %rd685; + add.s64 %rd1139, %rd1159, %rd685; + add.s64 %rd1140, %rd1161, %rd685; + add.s64 %rd1141, %rd1163, %rd685; + add.s64 %rd1142, %rd1165, %rd685; + add.s64 %rd1143, %rd1167, %rd685; + .loc 1 332 30 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:332:30 + cvt.rn.bf16x2.f32 %r14282, %r14925, 
%r14924; + cvt.rn.bf16x2.f32 %r14283, %r14927, %r14926; + cvt.rn.bf16x2.f32 %r14284, %r14929, %r14928; + cvt.rn.bf16x2.f32 %r14285, %r14931, %r14930; + cvt.rn.bf16x2.f32 %r14286, %r14933, %r14932; + cvt.rn.bf16x2.f32 %r14287, %r14935, %r14934; + cvt.rn.bf16x2.f32 %r14288, %r14937, %r14936; + cvt.rn.bf16x2.f32 %r14289, %r14939, %r14938; + cvt.rn.bf16x2.f32 %r14290, %r14941, %r14940; + cvt.rn.bf16x2.f32 %r14291, %r14943, %r14942; + cvt.rn.bf16x2.f32 %r14292, %r14945, %r14944; + cvt.rn.bf16x2.f32 %r14293, %r14947, %r14946; + cvt.rn.bf16x2.f32 %r14294, %r14949, %r14948; + cvt.rn.bf16x2.f32 %r14295, %r14951, %r14950; + cvt.rn.bf16x2.f32 %r14296, %r14953, %r14952; + cvt.rn.bf16x2.f32 %r14297, %r14955, %r14954; + cvt.rn.bf16x2.f32 %r14298, %r14957, %r14956; + cvt.rn.bf16x2.f32 %r14299, %r14959, %r14958; + cvt.rn.bf16x2.f32 %r14300, %r14961, %r14960; + cvt.rn.bf16x2.f32 %r14301, %r14963, %r14962; + cvt.rn.bf16x2.f32 %r14302, %r14965, %r14964; + cvt.rn.bf16x2.f32 %r14303, %r14967, %r14966; + cvt.rn.bf16x2.f32 %r14304, %r14969, %r14968; + cvt.rn.bf16x2.f32 %r14305, %r14971, %r14970; + cvt.rn.bf16x2.f32 %r14306, %r14973, %r14972; + cvt.rn.bf16x2.f32 %r14307, %r14975, %r14974; + cvt.rn.bf16x2.f32 %r14308, %r14977, %r14976; + cvt.rn.bf16x2.f32 %r14309, %r14979, %r14978; + cvt.rn.bf16x2.f32 %r14310, %r14981, %r14980; + cvt.rn.bf16x2.f32 %r14311, %r14983, %r14982; + cvt.rn.bf16x2.f32 %r14312, %r14985, %r14984; + cvt.rn.bf16x2.f32 %r14313, %r14987, %r14986; + shl.b32 %r14314, %r691, 13; + shl.b32 %r14315, %r10, 5; + and.b32 %r14316, %r14315, 7264; + and.b32 %r14317, %r10, 24; + shl.b32 %r14318, %r14317, 4; + shl.b32 %r14319, %r10, 2; + and.b32 %r14320, %r14319, 16; + or.b32 %r14321, %r14314, %r14320; + or.b32 %r14322, %r14316, %r14318; + or.b32 %r14323, %r14321, %r14322; + add.s32 %r14325, %r7585, %r14323; + st.shared.v4.b32 [%r14325], {%r14282, %r14284, %r14286, %r14288}; + st.shared.v4.b32 [%r14325+512], {%r14283, %r14285, %r14287, %r14289}; + xor.b32 %r14326, %r14323, 32; + 
add.s32 %r14327, %r7585, %r14326; + st.shared.v4.b32 [%r14327], {%r14290, %r14292, %r14294, %r14296}; + st.shared.v4.b32 [%r14327+512], {%r14291, %r14293, %r14295, %r14297}; + xor.b32 %r14328, %r14323, 64; + add.s32 %r14329, %r7585, %r14328; + st.shared.v4.b32 [%r14329], {%r14298, %r14300, %r14302, %r14304}; + st.shared.v4.b32 [%r14329+512], {%r14299, %r14301, %r14303, %r14305}; + xor.b32 %r14330, %r14323, 96; + add.s32 %r14331, %r7585, %r14330; + st.shared.v4.b32 [%r14331], {%r14306, %r14308, %r14310, %r14312}; + st.shared.v4.b32 [%r14331+512], {%r14307, %r14309, %r14311, %r14313}; + bar.sync 0; + shl.b32 %r14332, %r14317, 10; + shl.b32 %r14333, %r691, 5; + and.b32 %r14334, %r14319, 1008; + or.b32 %r14335, %r14332, %r14333; + xor.b32 %r14336, %r14335, %r14334; + add.s32 %r14142, %r7585, %r14336; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r14178, %r14179, %r14180, %r14181}, [%r14142]; + // end inline asm + add.s32 %r14147, %r14142, 1024; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r14182, %r14183, %r14184, %r14185}, [%r14147]; + // end inline asm + add.s32 %r14152, %r14142, 2048; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r14186, %r14187, %r14188, %r14189}, [%r14152]; + // end inline asm + add.s32 %r14157, %r14142, 3072; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r14190, %r14191, %r14192, %r14193}, [%r14157]; + // end inline asm + add.s32 %r14162, %r14142, 4096; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r14194, %r14195, %r14196, %r14197}, [%r14162]; + // end inline asm + add.s32 %r14167, %r14142, 5120; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r14198, %r14199, %r14200, %r14201}, [%r14167]; + // end inline asm + add.s32 %r14172, %r14142, 6144; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r14202, %r14203, %r14204, %r14205}, [%r14172]; + // end inline asm + add.s32 %r14177, %r14142, 7168; + // begin 
inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r14206, %r14207, %r14208, %r14209}, [%r14177]; + // end inline asm + // begin inline asm + @%p1163 st.global.v4.b32 [ %rd1136 + 0 ], { %r14178, %r14179, %r14180, %r14181 }; + // end inline asm + // begin inline asm + @%p1164 st.global.v4.b32 [ %rd1137 + 0 ], { %r14182, %r14183, %r14184, %r14185 }; + // end inline asm + // begin inline asm + @%p1165 st.global.v4.b32 [ %rd1138 + 0 ], { %r14186, %r14187, %r14188, %r14189 }; + // end inline asm + // begin inline asm + @%p1166 st.global.v4.b32 [ %rd1139 + 0 ], { %r14190, %r14191, %r14192, %r14193 }; + // end inline asm + // begin inline asm + @%p1167 st.global.v4.b32 [ %rd1140 + 0 ], { %r14194, %r14195, %r14196, %r14197 }; + // end inline asm + // begin inline asm + @%p1168 st.global.v4.b32 [ %rd1141 + 0 ], { %r14198, %r14199, %r14200, %r14201 }; + // end inline asm + // begin inline asm + @%p1169 st.global.v4.b32 [ %rd1142 + 0 ], { %r14202, %r14203, %r14204, %r14205 }; + // end inline asm + // begin inline asm + @%p1170 st.global.v4.b32 [ %rd1143 + 0 ], { %r14206, %r14207, %r14208, %r14209 }; + // end inline asm + .loc 1 334 14 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:334:14 + mul.f32 %r14337, %r14988, 0f3DB504F3; + mul.f32 %r14338, %r14989, 0f3DB504F3; + mul.f32 %r14339, %r14990, 0f3DB504F3; + mul.f32 %r14340, %r14991, 0f3DB504F3; + mul.f32 %r14341, %r14992, 0f3DB504F3; + mul.f32 %r14342, %r14993, 0f3DB504F3; + mul.f32 %r14343, %r14994, 0f3DB504F3; + mul.f32 %r14344, %r14995, 0f3DB504F3; + mul.f32 %r14345, %r14996, 0f3DB504F3; + mul.f32 %r14346, %r14997, 0f3DB504F3; + mul.f32 %r14347, %r14998, 0f3DB504F3; + mul.f32 %r14348, %r14999, 0f3DB504F3; + mul.f32 %r14349, %r15000, 0f3DB504F3; + mul.f32 %r14350, %r15001, 0f3DB504F3; + mul.f32 %r14351, %r15002, 0f3DB504F3; + mul.f32 %r14352, %r15003, 0f3DB504F3; + mul.f32 %r14353, %r15004, 0f3DB504F3; + mul.f32 %r14354, %r15005, 0f3DB504F3; + mul.f32 %r14355, %r15006, 0f3DB504F3; + mul.f32 %r14356, 
%r15007, 0f3DB504F3; + mul.f32 %r14357, %r15008, 0f3DB504F3; + mul.f32 %r14358, %r15009, 0f3DB504F3; + mul.f32 %r14359, %r15010, 0f3DB504F3; + mul.f32 %r14360, %r15011, 0f3DB504F3; + mul.f32 %r14361, %r15012, 0f3DB504F3; + mul.f32 %r14362, %r15013, 0f3DB504F3; + mul.f32 %r14363, %r15014, 0f3DB504F3; + mul.f32 %r14364, %r15015, 0f3DB504F3; + mul.f32 %r14365, %r15016, 0f3DB504F3; + mul.f32 %r14366, %r15017, 0f3DB504F3; + mul.f32 %r14367, %r15018, 0f3DB504F3; + mul.f32 %r14368, %r15019, 0f3DB504F3; + mul.f32 %r14369, %r15020, 0f3DB504F3; + mul.f32 %r14370, %r15021, 0f3DB504F3; + mul.f32 %r14371, %r15022, 0f3DB504F3; + mul.f32 %r14372, %r15023, 0f3DB504F3; + mul.f32 %r14373, %r15024, 0f3DB504F3; + mul.f32 %r14374, %r15025, 0f3DB504F3; + mul.f32 %r14375, %r15026, 0f3DB504F3; + mul.f32 %r14376, %r15027, 0f3DB504F3; + mul.f32 %r14377, %r15028, 0f3DB504F3; + mul.f32 %r14378, %r15029, 0f3DB504F3; + mul.f32 %r14379, %r15030, 0f3DB504F3; + mul.f32 %r14380, %r15031, 0f3DB504F3; + mul.f32 %r14381, %r15032, 0f3DB504F3; + mul.f32 %r14382, %r15033, 0f3DB504F3; + mul.f32 %r14383, %r15034, 0f3DB504F3; + mul.f32 %r14384, %r15035, 0f3DB504F3; + mul.f32 %r14385, %r15036, 0f3DB504F3; + mul.f32 %r14386, %r15037, 0f3DB504F3; + mul.f32 %r14387, %r15038, 0f3DB504F3; + mul.f32 %r14388, %r15039, 0f3DB504F3; + mul.f32 %r14389, %r15040, 0f3DB504F3; + mul.f32 %r14390, %r15041, 0f3DB504F3; + mul.f32 %r14391, %r15042, 0f3DB504F3; + mul.f32 %r14392, %r15043, 0f3DB504F3; + mul.f32 %r14393, %r15044, 0f3DB504F3; + mul.f32 %r14394, %r15045, 0f3DB504F3; + mul.f32 %r14395, %r15046, 0f3DB504F3; + mul.f32 %r14396, %r15047, 0f3DB504F3; + mul.f32 %r14397, %r15048, 0f3DB504F3; + mul.f32 %r14398, %r15049, 0f3DB504F3; + mul.f32 %r14399, %r15050, 0f3DB504F3; + mul.f32 %r14400, %r15051, 0f3DB504F3; + .loc 1 344 27 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:344:27 + or.b64 %rd1169, %rd71, %rd79; + cvt.u32.u64 %r14401, %rd1169; + or.b64 %rd1170, %rd72, %rd79; + cvt.u32.u64 %r14402, %rd1170; + or.b64 
%rd1171, %rd73, %rd79; + cvt.u32.u64 %r14403, %rd1171; + or.b64 %rd1172, %rd74, %rd79; + cvt.u32.u64 %r14404, %rd1172; + or.b64 %rd1173, %rd75, %rd79; + cvt.u32.u64 %r14405, %rd1173; + or.b64 %rd1174, %rd76, %rd79; + cvt.u32.u64 %r14406, %rd1174; + or.b64 %rd1175, %rd77, %rd79; + cvt.u32.u64 %r14407, %rd1175; + or.b64 %rd1176, %rd78, %rd79; + cvt.u32.u64 %r14408, %rd1176; + .loc 1 344 59 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:344:59 + add.s32 %r14409, %r9, %r14401; + add.s32 %r14410, %r9, %r14402; + add.s32 %r14411, %r9, %r14403; + add.s32 %r14412, %r9, %r14404; + add.s32 %r14413, %r9, %r14405; + add.s32 %r14414, %r9, %r14406; + add.s32 %r14415, %r9, %r14407; + add.s32 %r14416, %r9, %r14408; + .loc 1 345 29 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:345:29 + mad.wide.s32 %rd1144, %r14409, 2, %rd208; + mad.wide.s32 %rd1145, %r14410, 2, %rd208; + mad.wide.s32 %rd1146, %r14411, 2, %rd208; + mad.wide.s32 %rd1147, %r14412, 2, %rd208; + mad.wide.s32 %rd1148, %r14413, 2, %rd208; + mad.wide.s32 %rd1149, %r14414, 2, %rd208; + mad.wide.s32 %rd1150, %r14415, 2, %rd208; + mad.wide.s32 %rd1151, %r14416, 2, %rd208; + .loc 1 345 69 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:345:69 + cvt.rn.bf16x2.f32 %r14417, %r14338, %r14337; + cvt.rn.bf16x2.f32 %r14418, %r14340, %r14339; + cvt.rn.bf16x2.f32 %r14419, %r14342, %r14341; + cvt.rn.bf16x2.f32 %r14420, %r14344, %r14343; + cvt.rn.bf16x2.f32 %r14421, %r14346, %r14345; + cvt.rn.bf16x2.f32 %r14422, %r14348, %r14347; + cvt.rn.bf16x2.f32 %r14423, %r14350, %r14349; + cvt.rn.bf16x2.f32 %r14424, %r14352, %r14351; + cvt.rn.bf16x2.f32 %r14425, %r14354, %r14353; + cvt.rn.bf16x2.f32 %r14426, %r14356, %r14355; + cvt.rn.bf16x2.f32 %r14427, %r14358, %r14357; + cvt.rn.bf16x2.f32 %r14428, %r14360, %r14359; + cvt.rn.bf16x2.f32 %r14429, %r14362, %r14361; + cvt.rn.bf16x2.f32 %r14430, %r14364, %r14363; + cvt.rn.bf16x2.f32 %r14431, %r14366, %r14365; + cvt.rn.bf16x2.f32 %r14432, %r14368, %r14367; + 
cvt.rn.bf16x2.f32 %r14433, %r14370, %r14369; + cvt.rn.bf16x2.f32 %r14434, %r14372, %r14371; + cvt.rn.bf16x2.f32 %r14435, %r14374, %r14373; + cvt.rn.bf16x2.f32 %r14436, %r14376, %r14375; + cvt.rn.bf16x2.f32 %r14437, %r14378, %r14377; + cvt.rn.bf16x2.f32 %r14438, %r14380, %r14379; + cvt.rn.bf16x2.f32 %r14439, %r14382, %r14381; + cvt.rn.bf16x2.f32 %r14440, %r14384, %r14383; + cvt.rn.bf16x2.f32 %r14441, %r14386, %r14385; + cvt.rn.bf16x2.f32 %r14442, %r14388, %r14387; + cvt.rn.bf16x2.f32 %r14443, %r14390, %r14389; + cvt.rn.bf16x2.f32 %r14444, %r14392, %r14391; + cvt.rn.bf16x2.f32 %r14445, %r14394, %r14393; + cvt.rn.bf16x2.f32 %r14446, %r14396, %r14395; + cvt.rn.bf16x2.f32 %r14447, %r14398, %r14397; + cvt.rn.bf16x2.f32 %r14448, %r14400, %r14399; + bar.sync 0; + st.shared.v4.b32 [%r14325], {%r14417, %r14419, %r14421, %r14423}; + st.shared.v4.b32 [%r14325+512], {%r14418, %r14420, %r14422, %r14424}; + st.shared.v4.b32 [%r14327], {%r14425, %r14427, %r14429, %r14431}; + st.shared.v4.b32 [%r14327+512], {%r14426, %r14428, %r14430, %r14432}; + st.shared.v4.b32 [%r14329], {%r14433, %r14435, %r14437, %r14439}; + st.shared.v4.b32 [%r14329+512], {%r14434, %r14436, %r14438, %r14440}; + st.shared.v4.b32 [%r14331], {%r14441, %r14443, %r14445, %r14447}; + st.shared.v4.b32 [%r14331+512], {%r14442, %r14444, %r14446, %r14448}; + bar.sync 0; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r14250, %r14251, %r14252, %r14253}, [%r14142]; + // end inline asm + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r14254, %r14255, %r14256, %r14257}, [%r14147]; + // end inline asm + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r14258, %r14259, %r14260, %r14261}, [%r14152]; + // end inline asm + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r14262, %r14263, %r14264, %r14265}, [%r14157]; + // end inline asm + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r14266, %r14267, %r14268, %r14269}, [%r14162]; + // end 
inline asm + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r14270, %r14271, %r14272, %r14273}, [%r14167]; + // end inline asm + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r14274, %r14275, %r14276, %r14277}, [%r14172]; + // end inline asm + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r14278, %r14279, %r14280, %r14281}, [%r14177]; + // end inline asm + // begin inline asm + @%p1163 st.global.v4.b32 [ %rd1144 + 0 ], { %r14250, %r14251, %r14252, %r14253 }; + // end inline asm + // begin inline asm + @%p1164 st.global.v4.b32 [ %rd1145 + 0 ], { %r14254, %r14255, %r14256, %r14257 }; + // end inline asm + // begin inline asm + @%p1165 st.global.v4.b32 [ %rd1146 + 0 ], { %r14258, %r14259, %r14260, %r14261 }; + // end inline asm + // begin inline asm + @%p1166 st.global.v4.b32 [ %rd1147 + 0 ], { %r14262, %r14263, %r14264, %r14265 }; + // end inline asm + // begin inline asm + @%p1167 st.global.v4.b32 [ %rd1148 + 0 ], { %r14266, %r14267, %r14268, %r14269 }; + // end inline asm + // begin inline asm + @%p1168 st.global.v4.b32 [ %rd1149 + 0 ], { %r14270, %r14271, %r14272, %r14273 }; + // end inline asm + // begin inline asm + @%p1169 st.global.v4.b32 [ %rd1150 + 0 ], { %r14274, %r14275, %r14276, %r14277 }; + // end inline asm + // begin inline asm + @%p1170 st.global.v4.b32 [ %rd1151 + 0 ], { %r14278, %r14279, %r14280, %r14281 }; + // end inline asm +$L__BB0_17: + .loc 1 139 4 // c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py:139:4 + ret; +$L__tmp36: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2v/c2vbm66z3map72ysgiduadjtps3nnrhjldngw5bzue3cm5xo44w5.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // 
DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 5 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 5 // DW_FORM_data2 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 6 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 452 // Length of Unit 
+.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x1bd DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 50 +.b8 118 +.b8 98 +.b8 109 +.b8 54 +.b8 54 +.b8 122 +.b8 51 +.b8 109 +.b8 97 +.b8 112 +.b8 55 +.b8 50 +.b8 121 +.b8 115 +.b8 103 +.b8 105 +.b8 100 +.b8 117 +.b8 97 +.b8 100 +.b8 106 +.b8 116 +.b8 112 +.b8 115 +.b8 51 +.b8 110 +.b8 110 +.b8 114 +.b8 104 +.b8 106 +.b8 108 +.b8 100 +.b8 110 +.b8 103 +.b8 119 +.b8 53 +.b8 98 +.b8 122 +.b8 117 +.b8 101 +.b8 51 +.b8 99 +.b8 109 +.b8 53 +.b8 120 +.b8 111 +.b8 52 +.b8 52 +.b8 119 +.b8 53 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 50 +.b8 118 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x1b DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 116 +.b8 101 +.b8 109 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 122 +.b8 101 +.b8 114 +.b8 111 +.b8 115 +.b8 95 +.b8 49 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0xa6:0x121 DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0xbb:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp2 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 112 // 
DW_AT_call_line +.b8 36 // DW_AT_call_column +.b8 5 // Abbrev [5] 0xd3:0x19 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp3 // DW_AT_low_pc +.b64 $L__tmp4 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 0 // DW_AT_call_line +.b8 1 +.b8 107 // DW_AT_call_column +.b8 5 // Abbrev [5] 0xec:0x19 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp4 // DW_AT_low_pc +.b64 $L__tmp5 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 1 // DW_AT_call_line +.b8 1 +.b8 107 // DW_AT_call_column +.b8 5 // Abbrev [5] 0x105:0x19 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp6 // DW_AT_low_pc +.b64 $L__tmp18 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 42 // DW_AT_call_line +.b8 1 +.b8 16 // DW_AT_call_column +.b8 5 // Abbrev [5] 0x11e:0x19 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp10 // DW_AT_low_pc +.b64 $L__tmp19 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 62 // DW_AT_call_line +.b8 1 +.b8 20 // DW_AT_call_column +.b8 4 // Abbrev [4] 0x137:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp20 // DW_AT_low_pc +.b64 $L__tmp21 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 113 // DW_AT_call_line +.b8 34 // DW_AT_call_column +.b8 4 // Abbrev [4] 0x14f:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp22 // DW_AT_low_pc +.b64 $L__tmp23 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 178 // DW_AT_call_line +.b8 107 // DW_AT_call_column +.b8 4 // Abbrev [4] 0x167:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp23 // DW_AT_low_pc +.b64 $L__tmp24 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 179 // DW_AT_call_line +.b8 111 // DW_AT_call_column +.b8 4 // Abbrev [4] 0x17f:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp25 // DW_AT_low_pc +.b64 $L__tmp31 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 207 // DW_AT_call_line +.b8 12 // DW_AT_call_column +.b8 6 
// Abbrev [6] 0x197:0x17 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp29 // DW_AT_low_pc +.b64 $L__tmp30 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 0 // DW_AT_call_line +.b8 4 // Abbrev [4] 0x1ae:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp32 // DW_AT_low_pc +.b64 $L__tmp35 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 226 // DW_AT_call_line +.b8 16 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/ISV24HMA4J4LKKVO22GYN5K43WAJJLKWYPR3LOEREI3KLJOVDZ5Q/triton_tem_fused_zeros_1.source b/SpecForge-ext/cache/compiled_kernels/triton/7/ISV24HMA4J4LKKVO22GYN5K43WAJJLKWYPR3LOEREI3KLJOVDZ5Q/triton_tem_fused_zeros_1.source new file mode 100644 index 0000000000000000000000000000000000000000..c3f8197fa09648d66e3156d61da94ea9649c6bb1 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/ISV24HMA4J4LKKVO22GYN5K43WAJJLKWYPR3LOEREI3KLJOVDZ5Q/triton_tem_fused_zeros_1.source @@ -0,0 +1,2294 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":18:0) +#loc227 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":32:0) +#loc237 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":812:0) +#loc249 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":348:0) +#loc279 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":423:0) +#loc353 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":797:0) +#loc357 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":781:0) +#loc378 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":559:0) +#loc408 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":634:0) +#loc486 = loc("arg_Q"(#loc)) +#loc487 = loc("arg_K"(#loc)) +#loc488 = loc("arg_V"(#loc)) +#loc489 = loc("arg_LSE"(#loc)) +#loc490 = loc("arg_DELTA"(#loc)) +#loc491 = loc("arg_DO"(#loc)) +#loc492 = loc("arg_DQ"(#loc)) +#loc493 = loc("arg_DV"(#loc)) +#loc494 = loc("arg_KV_NUM_BLKS"(#loc)) +#loc495 = loc("arg_KV_IDX"(#loc)) +#loc496 = loc("arg_Q_NUM_BLKS"(#loc)) +#loc497 = loc("arg_Q_IDX"(#loc)) +#loc498 = loc("arg_FULL_KV_NUM_BLKS"(#loc)) +#loc499 = loc("arg_FULL_KV_IDX"(#loc)) +#loc500 = loc("arg_FULL_Q_NUM_BLKS"(#loc)) +#loc501 = loc("arg_FULL_Q_IDX"(#loc)) +#loc502 = loc("in_ptr16"(#loc)) +#loc503 = loc("out_ptr0"(#loc)) +#loc504 = loc("ks0"(#loc)) +#loc505 = loc("ks1"(#loc)) +#loc506 = loc("ks2"(#loc)) +#loc507 = loc("ks3"(#loc)) +#loc508 = loc("ks4"(#loc)) +#loc509 = loc("ks5"(#loc)) +#loc510 = loc("ks6"(#loc)) +#loc511 = loc("ks7"(#loc)) +#loc512 = loc("ks8"(#loc)) +#loc695 = loc("x"(#loc227)) +#loc696 = loc("ptr"(#loc237)) +#loc697 = loc("offs_m"(#loc237)) +#loc698 = loc("offs_n"(#loc237)) +#loc699 = loc("stride_m"(#loc237)) +#loc700 = loc("stride_n"(#loc237)) +#loc701 = loc("M_LEN"(#loc237)) +#loc708 = loc("arg_Q"(#loc249)) +#loc709 = loc("arg_K"(#loc249)) +#loc710 = loc("arg_V"(#loc249)) +#loc711 = loc("arg_LSE"(#loc249)) +#loc712 = loc("arg_DELTA"(#loc249)) +#loc713 = loc("arg_DO"(#loc249)) +#loc714 = loc("arg_DQ"(#loc249)) +#loc715 = loc("arg_DV"(#loc249)) +#loc716 = loc("arg_KV_NUM_BLKS"(#loc249)) +#loc717 = loc("arg_KV_IDX"(#loc249)) +#loc718 = loc("arg_Q_NUM_BLKS"(#loc249)) +#loc719 = loc("arg_Q_IDX"(#loc249)) +#loc720 = loc("arg_FULL_KV_NUM_BLKS"(#loc249)) 
+#loc721 = loc("arg_FULL_KV_IDX"(#loc249)) +#loc722 = loc("arg_FULL_Q_NUM_BLKS"(#loc249)) +#loc723 = loc("arg_FULL_Q_IDX"(#loc249)) +#loc724 = loc("in_ptr16"(#loc249)) +#loc725 = loc("out_ptr0"(#loc249)) +#loc726 = loc("ks0"(#loc249)) +#loc727 = loc("ks1"(#loc249)) +#loc728 = loc("ks2"(#loc249)) +#loc729 = loc("ks3"(#loc249)) +#loc730 = loc("ks4"(#loc249)) +#loc731 = loc("ks5"(#loc249)) +#loc732 = loc("ks6"(#loc249)) +#loc733 = loc("ks7"(#loc249)) +#loc734 = loc("ks8"(#loc249)) +#loc735 = loc("K"(#loc249)) +#loc736 = loc("V"(#loc249)) +#loc737 = loc("dq"(#loc249)) +#loc738 = loc("q"(#loc249)) +#loc739 = loc("do"(#loc249)) +#loc740 = loc("Di"(#loc249)) +#loc741 = loc("lse"(#loc249)) +#loc742 = loc("off_z"(#loc249)) +#loc743 = loc("off_hq"(#loc249)) +#loc744 = loc("offs_m2"(#loc249)) +#loc745 = loc("offs_n2"(#loc249)) +#loc746 = loc("stride_kn"(#loc249)) +#loc747 = loc("stride_kd"(#loc249)) +#loc748 = loc("stride_vn"(#loc249)) +#loc749 = loc("stride_vd"(#loc249)) +#loc750 = loc("kv_indices"(#loc249)) +#loc751 = loc("sparse_kv_num_blocks"(#loc249)) +#loc778 = loc("arg_Q"(#loc279)) +#loc779 = loc("arg_K"(#loc279)) +#loc780 = loc("arg_V"(#loc279)) +#loc781 = loc("arg_LSE"(#loc279)) +#loc782 = loc("arg_DELTA"(#loc279)) +#loc783 = loc("arg_DO"(#loc279)) +#loc784 = loc("arg_DQ"(#loc279)) +#loc785 = loc("arg_DV"(#loc279)) +#loc786 = loc("arg_KV_NUM_BLKS"(#loc279)) +#loc787 = loc("arg_KV_IDX"(#loc279)) +#loc788 = loc("arg_Q_NUM_BLKS"(#loc279)) +#loc789 = loc("arg_Q_IDX"(#loc279)) +#loc790 = loc("arg_FULL_KV_NUM_BLKS"(#loc279)) +#loc791 = loc("arg_FULL_KV_IDX"(#loc279)) +#loc792 = loc("arg_FULL_Q_NUM_BLKS"(#loc279)) +#loc793 = loc("arg_FULL_Q_IDX"(#loc279)) +#loc794 = loc("in_ptr16"(#loc279)) +#loc795 = loc("out_ptr0"(#loc279)) +#loc796 = loc("ks0"(#loc279)) +#loc797 = loc("ks1"(#loc279)) +#loc798 = loc("ks2"(#loc279)) +#loc799 = loc("ks3"(#loc279)) +#loc800 = loc("ks4"(#loc279)) +#loc801 = loc("ks5"(#loc279)) +#loc802 = loc("ks6"(#loc279)) +#loc803 = loc("ks7"(#loc279)) 
+#loc804 = loc("ks8"(#loc279)) +#loc805 = loc("dq"(#loc279)) +#loc806 = loc("q"(#loc279)) +#loc807 = loc("kT_ptrs"(#loc279)) +#loc808 = loc("vT_ptrs"(#loc279)) +#loc809 = loc("do"(#loc279)) +#loc810 = loc("Di"(#loc279)) +#loc811 = loc("lse"(#loc279)) +#loc812 = loc("Q_LEN"(#loc279)) +#loc813 = loc("KV_LEN"(#loc279)) +#loc814 = loc("off_z"(#loc279)) +#loc815 = loc("off_hq"(#loc279)) +#loc816 = loc("offs_m2"(#loc279)) +#loc817 = loc("offs_n2"(#loc279)) +#loc818 = loc("offs_k"(#loc279)) +#loc819 = loc("offs_v"(#loc279)) +#loc820 = loc("stride_kn"(#loc279)) +#loc821 = loc("stride_kd"(#loc279)) +#loc822 = loc("stride_vn"(#loc279)) +#loc823 = loc("stride_vd"(#loc279)) +#loc824 = loc("kv_indices"(#loc279)) +#loc825 = loc("sparse_kv_num_blocks"(#loc279)) +#loc893 = loc("N_LEN"(#loc237)) +#loc894 = loc("indices"(#loc353)) +#loc895 = loc("max_len"(#loc353)) +#loc896 = loc("loop_iter"(#loc357)) +#loc897 = loc("col_indices"(#loc357)) +#loc898 = loc("total_blocks"(#loc357)) +#loc917 = loc("arg_Q"(#loc378)) +#loc918 = loc("arg_K"(#loc378)) +#loc919 = loc("arg_V"(#loc378)) +#loc920 = loc("arg_LSE"(#loc378)) +#loc921 = loc("arg_DELTA"(#loc378)) +#loc922 = loc("arg_DO"(#loc378)) +#loc923 = loc("arg_DQ"(#loc378)) +#loc924 = loc("arg_DV"(#loc378)) +#loc925 = loc("arg_KV_NUM_BLKS"(#loc378)) +#loc926 = loc("arg_KV_IDX"(#loc378)) +#loc927 = loc("arg_Q_NUM_BLKS"(#loc378)) +#loc928 = loc("arg_Q_IDX"(#loc378)) +#loc929 = loc("arg_FULL_KV_NUM_BLKS"(#loc378)) +#loc930 = loc("arg_FULL_KV_IDX"(#loc378)) +#loc931 = loc("arg_FULL_Q_NUM_BLKS"(#loc378)) +#loc932 = loc("arg_FULL_Q_IDX"(#loc378)) +#loc933 = loc("in_ptr16"(#loc378)) +#loc934 = loc("out_ptr0"(#loc378)) +#loc935 = loc("ks0"(#loc378)) +#loc936 = loc("ks1"(#loc378)) +#loc937 = loc("ks2"(#loc378)) +#loc938 = loc("ks3"(#loc378)) +#loc939 = loc("ks4"(#loc378)) +#loc940 = loc("ks5"(#loc378)) +#loc941 = loc("ks6"(#loc378)) +#loc942 = loc("ks7"(#loc378)) +#loc943 = loc("ks8"(#loc378)) +#loc944 = loc("Q"(#loc378)) +#loc945 = loc("DO"(#loc378)) 
+#loc946 = loc("DELTA"(#loc378)) +#loc947 = loc("LSE"(#loc378)) +#loc948 = loc("dk"(#loc378)) +#loc949 = loc("dv"(#loc378)) +#loc950 = loc("k"(#loc378)) +#loc951 = loc("v"(#loc378)) +#loc952 = loc("off_z"(#loc378)) +#loc953 = loc("off_hq"(#loc378)) +#loc954 = loc("offs_n1"(#loc378)) +#loc955 = loc("offs_m1"(#loc378)) +#loc956 = loc("stride_qm"(#loc378)) +#loc957 = loc("stride_qd"(#loc378)) +#loc958 = loc("stride_dom"(#loc378)) +#loc959 = loc("stride_dod"(#loc378)) +#loc960 = loc("q_indices"(#loc378)) +#loc961 = loc("sparse_q_num_blocks"(#loc378)) +#loc987 = loc("arg_Q"(#loc408)) +#loc988 = loc("arg_K"(#loc408)) +#loc989 = loc("arg_V"(#loc408)) +#loc990 = loc("arg_LSE"(#loc408)) +#loc991 = loc("arg_DELTA"(#loc408)) +#loc992 = loc("arg_DO"(#loc408)) +#loc993 = loc("arg_DQ"(#loc408)) +#loc994 = loc("arg_DV"(#loc408)) +#loc995 = loc("arg_KV_NUM_BLKS"(#loc408)) +#loc996 = loc("arg_KV_IDX"(#loc408)) +#loc997 = loc("arg_Q_NUM_BLKS"(#loc408)) +#loc998 = loc("arg_Q_IDX"(#loc408)) +#loc999 = loc("arg_FULL_KV_NUM_BLKS"(#loc408)) +#loc1000 = loc("arg_FULL_KV_IDX"(#loc408)) +#loc1001 = loc("arg_FULL_Q_NUM_BLKS"(#loc408)) +#loc1002 = loc("arg_FULL_Q_IDX"(#loc408)) +#loc1003 = loc("in_ptr16"(#loc408)) +#loc1004 = loc("out_ptr0"(#loc408)) +#loc1005 = loc("ks0"(#loc408)) +#loc1006 = loc("ks1"(#loc408)) +#loc1007 = loc("ks2"(#loc408)) +#loc1008 = loc("ks3"(#loc408)) +#loc1009 = loc("ks4"(#loc408)) +#loc1010 = loc("ks5"(#loc408)) +#loc1011 = loc("ks6"(#loc408)) +#loc1012 = loc("ks7"(#loc408)) +#loc1013 = loc("ks8"(#loc408)) +#loc1014 = loc("dk"(#loc408)) +#loc1015 = loc("dv"(#loc408)) +#loc1016 = loc("qT_ptrs"(#loc408)) +#loc1017 = loc("k"(#loc408)) +#loc1018 = loc("v"(#loc408)) +#loc1019 = loc("do_ptrs"(#loc408)) +#loc1020 = loc("DELTA"(#loc408)) +#loc1021 = loc("LSE"(#loc408)) +#loc1022 = loc("Q_LEN"(#loc408)) +#loc1023 = loc("KV_LEN"(#loc408)) +#loc1024 = loc("off_z"(#loc408)) +#loc1025 = loc("off_hq"(#loc408)) +#loc1026 = loc("offs_n1"(#loc408)) +#loc1027 = 
loc("offs_m1"(#loc408)) +#loc1028 = loc("offs_k"(#loc408)) +#loc1029 = loc("offs_v"(#loc408)) +#loc1030 = loc("stride_qm"(#loc408)) +#loc1031 = loc("stride_qd"(#loc408)) +#loc1032 = loc("stride_dom"(#loc408)) +#loc1033 = loc("stride_dod"(#loc408)) +#loc1034 = loc("q_indices"(#loc408)) +#loc1035 = loc("sparse_q_num_blocks"(#loc408)) +module { + tt.func public @triton_tem_fused_zeros_1(%arg_Q: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_Q"(#loc)), %arg_K: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_K"(#loc)), %arg_V: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_V"(#loc)), %arg_LSE: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_LSE"(#loc)), %arg_DELTA: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_DELTA"(#loc)), %arg_DO: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_DO"(#loc)), %arg_DQ: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_DQ"(#loc)), %arg_DV: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_DV"(#loc)), %arg_KV_NUM_BLKS: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_KV_NUM_BLKS"(#loc)), %arg_KV_IDX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_KV_IDX"(#loc)), %arg_Q_NUM_BLKS: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_Q_NUM_BLKS"(#loc)), %arg_Q_IDX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_Q_IDX"(#loc)), %arg_FULL_KV_NUM_BLKS: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_FULL_KV_NUM_BLKS"(#loc)), %arg_FULL_KV_IDX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_FULL_KV_IDX"(#loc)), %arg_FULL_Q_NUM_BLKS: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_FULL_Q_NUM_BLKS"(#loc)), %arg_FULL_Q_IDX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_FULL_Q_IDX"(#loc)), %in_ptr16: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr16"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i32 loc("ks0"(#loc)), %ks1: i32 loc("ks1"(#loc)), %ks2: i32 loc("ks2"(#loc)), %ks3: i32 loc("ks3"(#loc)), %ks4: i32 loc("ks4"(#loc)), %ks5: i32 loc("ks5"(#loc)), %ks6: i32 loc("ks6"(#loc)), %ks7: i32 loc("ks7"(#loc)), %ks8: i32 loc("ks8"(#loc))) 
attributes {noinline = false} { + %c4096_i32 = arith.constant 4096 : i32 loc(#loc1) + %c4096_i32_0 = arith.constant 4096 : i32 loc(#loc1) + %0 = arith.muli %c4096_i32_0, %ks0 : i32 loc(#loc1) + %c128_i32 = arith.constant 128 : i32 loc(#loc2) + %c4096_i32_1 = arith.constant 4096 : i32 loc(#loc2) + %c1_i32 = arith.constant 1 : i32 loc(#loc2) + %c1024_i32 = arith.constant 1024 : i32 loc(#loc3) + %c1024_i32_2 = arith.constant 1024 : i32 loc(#loc3) + %1 = arith.muli %c1024_i32_2, %ks1 : i32 loc(#loc3) + %c128_i32_3 = arith.constant 128 : i32 loc(#loc4) + %c128_i32_4 = arith.constant 128 : i32 loc(#loc4) + %2 = arith.muli %c128_i32_4, %ks1 : i32 loc(#loc4) + %c128_i32_5 = arith.constant 128 : i32 loc(#loc5) + %c1_i32_6 = arith.constant 1 : i32 loc(#loc5) + %c1024_i32_7 = arith.constant 1024 : i32 loc(#loc6) + %c1024_i32_8 = arith.constant 1024 : i32 loc(#loc6) + %3 = arith.muli %c1024_i32_8, %ks1 : i32 loc(#loc6) + %c128_i32_9 = arith.constant 128 : i32 loc(#loc7) + %c128_i32_10 = arith.constant 128 : i32 loc(#loc7) + %4 = arith.muli %c128_i32_10, %ks1 : i32 loc(#loc7) + %c128_i32_11 = arith.constant 128 : i32 loc(#loc8) + %c1_i32_12 = arith.constant 1 : i32 loc(#loc8) + %c1_i32_13 = arith.constant 1 : i32 loc(#loc9) + %5 = arith.cmpi sge, %c1_i32_13, %ks0 : i32 loc(#loc9) + %c1_i32_14 = arith.constant 1 : i32 loc(#loc10) + %c1_i32_15 = arith.constant 1 : i32 loc(#loc10) + %6 = arith.extui %5 : i1 to i32 loc(#loc10) + %7 = arith.muli %c1_i32_15, %6 : i32 loc(#loc10) + %c1_i32_16 = arith.constant 1 : i32 loc(#loc11) + %8 = arith.cmpi sgt, %ks0, %c1_i32_16 : i32 loc(#loc11) + %9 = arith.extui %8 : i1 to i32 loc(#loc12) + %10 = arith.muli %ks0, %9 : i32 loc(#loc12) + %11 = arith.addi %7, %10 : i32 loc(#loc13) + %c4096_i32_17 = arith.constant 4096 : i32 loc(#loc14) + %c4096_i32_18 = arith.constant 4096 : i32 loc(#loc14) + %12 = arith.muli %c4096_i32_18, %11 : i32 loc(#loc14) + %c1_i32_19 = arith.constant 1 : i32 loc(#loc15) + %13 = arith.cmpi sge, %c1_i32_19, %ks0 : i32 
loc(#loc15) + %c1_i32_20 = arith.constant 1 : i32 loc(#loc16) + %c1_i32_21 = arith.constant 1 : i32 loc(#loc16) + %14 = arith.extui %13 : i1 to i32 loc(#loc16) + %15 = arith.muli %c1_i32_21, %14 : i32 loc(#loc16) + %c1_i32_22 = arith.constant 1 : i32 loc(#loc17) + %16 = arith.cmpi sgt, %ks0, %c1_i32_22 : i32 loc(#loc17) + %17 = arith.extui %16 : i1 to i32 loc(#loc18) + %18 = arith.muli %ks0, %17 : i32 loc(#loc18) + %19 = arith.addi %15, %18 : i32 loc(#loc19) + %c128_i32_23 = arith.constant 128 : i32 loc(#loc20) + %c128_i32_24 = arith.constant 128 : i32 loc(#loc20) + %20 = arith.muli %c128_i32_24, %19 : i32 loc(#loc20) + %c128_i32_25 = arith.constant 128 : i32 loc(#loc21) + %c1_i32_26 = arith.constant 1 : i32 loc(#loc21) + %c4096_i32_27 = arith.constant 4096 : i32 loc(#loc22) + %c4096_i32_28 = arith.constant 4096 : i32 loc(#loc22) + %21 = arith.muli %c4096_i32_28, %ks0 : i32 loc(#loc22) + %c128_i32_29 = arith.constant 128 : i32 loc(#loc23) + %c4096_i32_30 = arith.constant 4096 : i32 loc(#loc23) + %c1_i32_31 = arith.constant 1 : i32 loc(#loc23) + %c1024_i32_32 = arith.constant 1024 : i32 loc(#loc24) + %c1024_i32_33 = arith.constant 1024 : i32 loc(#loc24) + %22 = arith.muli %c1024_i32_33, %ks1 : i32 loc(#loc24) + %c128_i32_34 = arith.constant 128 : i32 loc(#loc25) + %c128_i32_35 = arith.constant 128 : i32 loc(#loc25) + %23 = arith.muli %c128_i32_35, %ks1 : i32 loc(#loc25) + %c128_i32_36 = arith.constant 128 : i32 loc(#loc26) + %c1_i32_37 = arith.constant 1 : i32 loc(#loc26) + %ZQ = arith.constant 8 : i32 loc(#loc513) + %HQ = arith.constant 32 : i32 loc(#loc514) + %HKV = arith.constant 8 : i32 loc(#loc515) + %ZKV = arith.constant 8 : i32 loc(#loc516) + %pid = tt.get_program_id x : i32 loc(#loc517) + %NUM_KV_BLOCKS = tt.call @"triton.language.standard.cdiv__i32__(1,)cconstexpr_128_"(%ks1) : (i32) -> i32 loc(#loc518) + %NUM_Q_BLOCKS = tt.call @"triton.language.standard.cdiv__i32__(1,)cconstexpr_128_"(%ks0) : (i32) -> i32 loc(#loc519) + %off_zq = tt.get_program_id y : i32 
loc(#loc520) + %off_hkv = tt.get_program_id z : i32 loc(#loc521) + %off_zkv = arith.remsi %off_zq, %ZKV : i32 loc(#loc522) + %SPARSE_Z = arith.constant 8 : i32 loc(#loc523) + %SPARSE_HQ = arith.constant 1 : i32 loc(#loc524) + %sparse_idx_z = arith.remsi %off_zq, %SPARSE_Z : i32 loc(#loc525) + %k_adj = arith.muli %2, %off_hkv : i32 loc(#loc526) + %k_adj_38 = arith.muli %1, %off_zkv : i32 loc(#loc527) + %k_adj_39 = arith.addi %k_adj, %k_adj_38 : i32 loc(#loc528) + %k_adj_40 = arith.extsi %k_adj_39 : i32 to i64 loc(#loc529) + %v_adj = arith.muli %4, %off_hkv : i32 loc(#loc530) + %v_adj_41 = arith.muli %3, %off_zkv : i32 loc(#loc531) + %v_adj_42 = arith.addi %v_adj, %v_adj_41 : i32 loc(#loc532) + %v_adj_43 = arith.extsi %v_adj_42 : i32 to i64 loc(#loc533) + %dv_adj = arith.muli %23, %off_hkv : i32 loc(#loc534) + %dv_adj_44 = arith.muli %22, %off_zq : i32 loc(#loc535) + %dv_adj_45 = arith.addi %dv_adj, %dv_adj_44 : i32 loc(#loc536) + %dv_adj_46 = arith.extsi %dv_adj_45 : i32 to i64 loc(#loc537) + %K = tt.addptr %arg_K, %k_adj_40 : !tt.ptr, i64 loc(#loc538) + %V = tt.addptr %arg_V, %v_adj_43 : !tt.ptr, i64 loc(#loc539) + %DV = tt.addptr %arg_DV, %dv_adj_46 : !tt.ptr, i64 loc(#loc540) + %RCP_LN2 = arith.constant 1.44269502 : f32 loc(#loc541) + %offs_k = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc542) + %offs_v = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc543) + %24 = arith.cmpi sge, %pid, %NUM_KV_BLOCKS : i32 loc(#loc58) + %25:2 = scf.if %24 -> (i32, i32) { + %off_pid = arith.subi %pid, %NUM_KV_BLOCKS : i32 loc(#loc544) + %SPARSE_Q_MULTIPLE = arith.constant 1 : i32 loc(#loc1111) + %SPARSE_KV_MULTIPLE = arith.constant 2 : i32 loc(#loc1112) + %off_hq2 = arith.divsi %off_pid, %NUM_Q_BLOCKS : i32 loc(#loc547) + %off_hq2_47 = arith.constant 4 : i32 loc(#loc548) + %off_hq2_48 = arith.constant 4 : i32 loc(#loc548) + %off_hq2_49 = arith.muli %off_hkv, %off_hq2_48 : i32 loc(#loc548) + %off_hq2_50 = arith.addi 
%off_hq2, %off_hq2_49 : i32 loc(#loc549) + %start_m2_block = arith.remsi %off_pid, %NUM_Q_BLOCKS : i32 loc(#loc550) + %off_pid_mask = arith.divsi %start_m2_block, %SPARSE_Q_MULTIPLE : i32 loc(#loc551) + %stride_kv_idx_h = arith.muli %ks3, %ks4 : i32 loc(#loc552) + %sparse_idx_hq2 = arith.remsi %off_hq2_50, %SPARSE_HQ : i32 loc(#loc553) + %sparse_hz_offset = arith.muli %sparse_idx_z, %SPARSE_HQ : i32 loc(#loc554) + %sparse_hz_offset_51 = arith.addi %sparse_hz_offset, %sparse_idx_hq2 : i32 loc(#loc555) + %sparse_kv_num_blks_offset = arith.muli %sparse_hz_offset_51, %ks2 : i32 loc(#loc556) + %sparse_kv_num_blks_offset_52 = arith.addi %sparse_kv_num_blks_offset, %off_pid_mask : i32 loc(#loc557) + %sparse_kv_idx_offset = arith.muli %sparse_hz_offset_51, %stride_kv_idx_h : i32 loc(#loc558) + %sparse_kv_idx_offset_53 = arith.muli %off_pid_mask, %ks4 : i32 loc(#loc559) + %sparse_kv_idx_offset_54 = arith.addi %sparse_kv_idx_offset, %sparse_kv_idx_offset_53 : i32 loc(#loc560) + %q_adj2 = arith.muli %c128_i32, %off_hq2_50 : i32 loc(#loc561) + %q_adj2_55 = arith.muli %0, %off_zq : i32 loc(#loc562) + %q_adj2_56 = arith.addi %q_adj2, %q_adj2_55 : i32 loc(#loc563) + %q_adj2_57 = arith.extsi %q_adj2_56 : i32 to i64 loc(#loc564) + %do_adj2 = arith.muli %20, %off_hq2_50 : i32 loc(#loc565) + %do_adj2_58 = arith.muli %12, %off_zq : i32 loc(#loc566) + %do_adj2_59 = arith.addi %do_adj2, %do_adj2_58 : i32 loc(#loc567) + %do_adj2_60 = arith.extsi %do_adj2_59 : i32 to i64 loc(#loc568) + %dq_adj2 = arith.muli %c128_i32_29, %off_hq2_50 : i32 loc(#loc569) + %dq_adj2_61 = arith.muli %21, %off_zq : i32 loc(#loc570) + %dq_adj2_62 = arith.addi %dq_adj2, %dq_adj2_61 : i32 loc(#loc571) + %dq_adj2_63 = arith.extsi %dq_adj2_62 : i32 to i64 loc(#loc572) + %off_chz2 = arith.muli %off_zq, %HQ : i32 loc(#loc573) + %off_chz2_64 = arith.addi %off_chz2, %off_hq2_50 : i32 loc(#loc574) + %off_chz2_65 = arith.muli %off_chz2_64, %ks0 : i32 loc(#loc575) + %off_chz2_66 = arith.extsi %off_chz2_65 : i32 to i64 
loc(#loc576) + %Q2 = tt.addptr %arg_Q, %q_adj2_57 : !tt.ptr, i64 loc(#loc577) + %DO2 = tt.addptr %arg_DO, %do_adj2_60 : !tt.ptr, i64 loc(#loc578) + %DQ2 = tt.addptr %arg_DQ, %dq_adj2_63 : !tt.ptr, i64 loc(#loc579) + %LSE2 = tt.addptr %arg_LSE, %off_chz2_66 : !tt.ptr, i64 loc(#loc580) + %DELTA2 = tt.addptr %arg_DELTA, %off_chz2_66 : !tt.ptr, i64 loc(#loc581) + %dq = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_128__(0, 1)cconstexpr_128__(1,)cconstexpr_fp32_"() : () -> tensor<128x128xf32> loc(#loc582) + %start_m2 = arith.constant 128 : i32 loc(#loc583) + %start_m2_67 = arith.constant 128 : i32 loc(#loc583) + %start_m2_68 = arith.muli %start_m2_block, %start_m2_67 : i32 loc(#loc583) + %offs_m2 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc584) + %offs_m2_69 = tt.splat %start_m2_68 : i32 -> tensor<128xi32> loc(#loc585) + %offs_m2_70 = arith.addi %offs_m2_69, %offs_m2 : tensor<128xi32> loc(#loc585) + %q = tt.call @"torch._inductor.runtime.compile_tasks.ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.load_checked_2d__Pbf16_i32S128S_i32S128S_i32_i32_i32__(5,)cconstexpr_False__(6,)cconstexpr_True__(8,)cconstexpr_128_"(%Q2, %offs_m2_70, %offs_k, %c4096_i32_1, %c1_i32, %ks0) : (!tt.ptr, tensor<128xi32>, tensor<128xi32>, i32, i32, i32) -> tensor<128x128xbf16> loc(#loc586) + %do = tt.call @"torch._inductor.runtime.compile_tasks.ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.load_checked_2d__Pbf16_i32S128S_i32S128S_i32_i32_i32__(5,)cconstexpr_False__(6,)cconstexpr_True__(8,)cconstexpr_128_"(%DO2, %offs_m2_70, %offs_v, %c128_i32_25, %c1_i32_26, %ks0) : (!tt.ptr, tensor<128xi32>, tensor<128xi32>, i32, i32, i32) -> tensor<128x128xbf16> loc(#loc587) + %Di = tt.splat %ks0 : i32 -> tensor<128xi32> loc(#loc588) + %Di_71 = arith.cmpi slt, %offs_m2_70, %Di : tensor<128xi32> loc(#loc588) + %Di_72 = tt.splat %DELTA2 : !tt.ptr -> tensor<128x!tt.ptr> loc(#loc589) + %Di_73 = tt.addptr %Di_72, %offs_m2_70 : tensor<128x!tt.ptr>, 
tensor<128xi32> loc(#loc589) + %Di_74 = tt.load %Di_73, %Di_71 : tensor<128x!tt.ptr> loc(#loc590) + %lse = tt.splat %ks0 : i32 -> tensor<128xi32> loc(#loc591) + %lse_75 = arith.cmpi slt, %offs_m2_70, %lse : tensor<128xi32> loc(#loc591) + %lse_76 = tt.splat %LSE2 : !tt.ptr -> tensor<128x!tt.ptr> loc(#loc592) + %lse_77 = tt.addptr %lse_76, %offs_m2_70 : tensor<128x!tt.ptr>, tensor<128xi32> loc(#loc592) + %lse_78 = tt.load %lse_77, %lse_75 : tensor<128x!tt.ptr> loc(#loc593) + %lse_79 = arith.constant 0xFF800000 : f32 loc(#loc594) + %lse_80 = arith.constant dense<0xFF800000> : tensor<128xf32> loc(#loc594) + %lse_81 = arith.cmpf oeq, %lse_78, %lse_80 : tensor<128xf32> loc(#loc594) + %lse_82 = arith.constant 0.000000e+00 : f32 loc(#loc595) + %lse_83 = arith.constant 0.000000e+00 : f32 loc(#loc595) + %lse_84 = arith.constant dense<0.000000e+00> : tensor<128xf32> loc(#loc595) + %lse_85 = arith.select %lse_81, %lse_84, %lse_78 : tensor<128xi1>, tensor<128xf32> loc(#loc595) + %lse_86 = tt.expand_dims %lse_85 {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc596) + %kv_indices = tt.addptr %arg_KV_IDX, %sparse_kv_idx_offset_54 : !tt.ptr, i32 loc(#loc597) + %kv_start = tt.load %kv_indices : !tt.ptr loc(#loc598) + %kv_start_87 = arith.constant 128 : i32 loc(#loc599) + %kv_start_88 = arith.constant 128 : i32 loc(#loc599) + %kv_start_89 = arith.muli %kv_start, %kv_start_88 : i32 loc(#loc599) + %sparse_kv_num_blocks = tt.addptr %arg_KV_NUM_BLKS, %sparse_kv_num_blks_offset_52 : !tt.ptr, i32 loc(#loc600) + %sparse_kv_num_blocks_90 = tt.load %sparse_kv_num_blocks : !tt.ptr loc(#loc601) + %offs_n2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc602) + %offs_n2_91 = tt.splat %kv_start_89 : i32 -> tensor<64xi32> loc(#loc603) + %offs_n2_92 = arith.addi %offs_n2_91, %offs_n2 : tensor<64xi32> loc(#loc603) + %dq_93 = tt.call 
@"torch._inductor.runtime.compile_tasks.ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.bwd_dq_inner__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pbf16_Pbf16_Pbf16_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_i32_i32_i32_i32_i32_i32_i32_i32_i32_Pbf16_Pbf16_fp32S128_128S_bf16S128_128S_bf16S128_128S_fp32S128S_fp32S128_1S_i32_i32_i32S128S_i32S64S_i32_i32_i32_i32_Pi32_i32__(44,)cconstexpr_bf16__(45,)cconstexpr_False_"(%arg_Q, %arg_K, %arg_V, %arg_LSE, %arg_DELTA, %arg_DO, %arg_DQ, %arg_DV, %arg_KV_NUM_BLKS, %arg_KV_IDX, %arg_Q_NUM_BLKS, %arg_Q_IDX, %arg_FULL_KV_NUM_BLKS, %arg_FULL_KV_IDX, %arg_FULL_Q_NUM_BLKS, %arg_FULL_Q_IDX, %in_ptr16, %out_ptr0, %ks0, %ks1, %ks2, %ks3, %ks4, %ks5, %ks6, %ks7, %ks8, %K, %V, %dq, %q, %do, %Di_74, %lse_86, %off_zq, %off_hq2_50, %offs_m2_70, %offs_n2_92, %c128_i32_5, %c1_i32_6, %c128_i32_11, %c1_i32_12, %kv_indices, %sparse_kv_num_blocks_90) : (!tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, i32, i32, i32, i32, i32, i32, i32, i32, i32, !tt.ptr, !tt.ptr, tensor<128x128xf32>, tensor<128x128xbf16>, tensor<128x128xbf16>, tensor<128xf32>, tensor<128x1xf32>, i32, i32, tensor<128xi32>, tensor<64xi32>, i32, i32, i32, i32, !tt.ptr, i32) -> tensor<128x128xf32> loc(#loc604) + %kv_indices_94 = tt.addptr %arg_FULL_KV_IDX, %sparse_kv_idx_offset_54 : !tt.ptr, i32 loc(#loc605) + %kv_start_95 = tt.load %kv_indices_94 : !tt.ptr loc(#loc606) + %kv_start_96 = arith.constant 128 : i32 loc(#loc607) + %kv_start_97 = arith.constant 128 : i32 loc(#loc607) + %kv_start_98 = arith.muli %kv_start_95, %kv_start_97 : i32 loc(#loc607) + %sparse_kv_num_blocks_99 = tt.addptr %arg_FULL_KV_NUM_BLKS, %sparse_kv_num_blks_offset_52 : !tt.ptr, i32 loc(#loc608) + %sparse_kv_num_blocks_100 = tt.load %sparse_kv_num_blocks_99 : !tt.ptr loc(#loc609) + %offs_n2_101 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc610) + %offs_n2_102 = tt.splat 
%kv_start_98 : i32 -> tensor<64xi32> loc(#loc611) + %offs_n2_103 = arith.addi %offs_n2_102, %offs_n2_101 : tensor<64xi32> loc(#loc611) + %dq_104 = tt.call @"torch._inductor.runtime.compile_tasks.ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.bwd_dq_inner__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pbf16_Pbf16_Pbf16_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_i32_i32_i32_i32_i32_i32_i32_i32_i32_Pbf16_Pbf16_fp32S128_128S_bf16S128_128S_bf16S128_128S_fp32S128S_fp32S128_1S_i32_i32_i32S128S_i32S64S_i32_i32_i32_i32_Pi32_i32__(44,)cconstexpr_bf16__(45,)cconstexpr_True_"(%arg_Q, %arg_K, %arg_V, %arg_LSE, %arg_DELTA, %arg_DO, %arg_DQ, %arg_DV, %arg_KV_NUM_BLKS, %arg_KV_IDX, %arg_Q_NUM_BLKS, %arg_Q_IDX, %arg_FULL_KV_NUM_BLKS, %arg_FULL_KV_IDX, %arg_FULL_Q_NUM_BLKS, %arg_FULL_Q_IDX, %in_ptr16, %out_ptr0, %ks0, %ks1, %ks2, %ks3, %ks4, %ks5, %ks6, %ks7, %ks8, %K, %V, %dq_93, %q, %do, %Di_74, %lse_86, %off_zq, %off_hq2_50, %offs_m2_70, %offs_n2_103, %c128_i32_5, %c1_i32_6, %c128_i32_11, %c1_i32_12, %kv_indices_94, %sparse_kv_num_blocks_100) : (!tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, i32, i32, i32, i32, i32, i32, i32, i32, i32, !tt.ptr, !tt.ptr, tensor<128x128xf32>, tensor<128x128xbf16>, tensor<128x128xbf16>, tensor<128xf32>, tensor<128x1xf32>, i32, i32, tensor<128xi32>, tensor<64xi32>, i32, i32, i32, i32, !tt.ptr, i32) -> tensor<128x128xf32> loc(#loc612) + %dq_ptrs = tt.expand_dims %offs_m2_70 {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc613) + %dq_ptrs_105 = arith.constant dense<4096> : tensor<128x1xi32> loc(#loc614) + %dq_ptrs_106 = arith.muli %dq_ptrs, %dq_ptrs_105 : tensor<128x1xi32> loc(#loc614) + %dq_ptrs_107 = tt.splat %DQ2 : !tt.ptr -> tensor<128x1x!tt.ptr> loc(#loc615) + %dq_ptrs_108 = tt.addptr %dq_ptrs_107, %dq_ptrs_106 : tensor<128x1x!tt.ptr>, tensor<128x1xi32> loc(#loc615) + %dq_ptrs_109 = tt.expand_dims %offs_k {axis = 0 : i32} 
: tensor<128xi32> -> tensor<1x128xi32> loc(#loc616) + %dq_ptrs_110 = arith.constant dense<1> : tensor<1x128xi32> loc(#loc617) + %dq_ptrs_111 = arith.muli %dq_ptrs_109, %dq_ptrs_110 : tensor<1x128xi32> loc(#loc617) + %dq_ptrs_112 = tt.broadcast %dq_ptrs_108 : tensor<128x1x!tt.ptr> -> tensor<128x128x!tt.ptr> loc(#loc618) + %dq_ptrs_113 = tt.broadcast %dq_ptrs_111 : tensor<1x128xi32> -> tensor<128x128xi32> loc(#loc618) + %dq_ptrs_114 = tt.addptr %dq_ptrs_112, %dq_ptrs_113 : tensor<128x128x!tt.ptr>, tensor<128x128xi32> loc(#loc618) + %dq_115 = arith.constant 0.0883883461 : f32 loc(#loc619) + %dq_116 = arith.constant 0.0883883461 : f32 loc(#loc619) + %dq_117 = arith.constant dense<0.0883883461> : tensor<128x128xf32> loc(#loc619) + %dq_118 = arith.mulf %dq_104, %dq_117 : tensor<128x128xf32> loc(#loc619) + %26 = tt.expand_dims %offs_m2_70 {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc136) + %27 = tt.splat %ks0 : i32 -> tensor<128x1xi32> loc(#loc137) + %28 = arith.cmpi slt, %26, %27 : tensor<128x1xi32> loc(#loc137) + %29 = tt.expand_dims %offs_k {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc138) + %c128_i32_119 = arith.constant 128 : i32 loc(#loc139) + %cst = arith.constant dense<128> : tensor<1x128xi32> loc(#loc139) + %30 = arith.cmpi slt, %29, %cst : tensor<1x128xi32> loc(#loc139) + %31 = tt.broadcast %28 : tensor<128x1xi1> -> tensor<128x128xi1> loc(#loc140) + %32 = tt.broadcast %30 : tensor<1x128xi1> -> tensor<128x128xi1> loc(#loc140) + %33 = arith.andi %31, %32 : tensor<128x128xi1> loc(#loc140) + %34 = arith.truncf %dq_118 : tensor<128x128xf32> to tensor<128x128xbf16> loc(#loc141) + tt.store %dq_ptrs_114, %34, %33 : tensor<128x128x!tt.ptr> loc(#loc141) + scf.yield %SPARSE_KV_MULTIPLE, %SPARSE_Q_MULTIPLE : i32, i32 loc(#loc141) + } else { + %SPARSE_Q_MULTIPLE = arith.constant 2 : i32 loc(#loc1113) + %SPARSE_KV_MULTIPLE = arith.constant 1 : i32 loc(#loc1114) + %pid_mask = arith.divsi %pid, %SPARSE_KV_MULTIPLE : i32 loc(#loc622) + 
%stride_q_idx_h = arith.muli %ks6, %ks7 : i32 loc(#loc623) + %dv = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_128__(0, 1)cconstexpr_128__(1,)cconstexpr_fp32_"() : () -> tensor<128x128xf32> loc(#loc624) + %dk = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_128__(0, 1)cconstexpr_128__(1,)cconstexpr_fp32_"() : () -> tensor<128x128xf32> loc(#loc625) + %start_n1 = arith.constant 128 : i32 loc(#loc626) + %start_n1_47 = arith.constant 128 : i32 loc(#loc626) + %start_n1_48 = arith.muli %pid, %start_n1_47 : i32 loc(#loc626) + %offs_n1 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc627) + %offs_n1_49 = tt.splat %start_n1_48 : i32 -> tensor<128xi32> loc(#loc628) + %offs_n1_50 = arith.addi %offs_n1_49, %offs_n1 : tensor<128xi32> loc(#loc628) + %k = tt.call @"torch._inductor.runtime.compile_tasks.ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.load_checked_2d__Pbf16_i32S128S_i32S128S_i32_i32_i32__(5,)cconstexpr_False__(6,)cconstexpr_True__(8,)cconstexpr_128_"(%K, %offs_n1_50, %offs_k, %c128_i32_5, %c1_i32_6, %ks1) : (!tt.ptr, tensor<128xi32>, tensor<128xi32>, i32, i32, i32) -> tensor<128x128xbf16> loc(#loc629) + %v = tt.call @"torch._inductor.runtime.compile_tasks.ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.load_checked_2d__Pbf16_i32S128S_i32S128S_i32_i32_i32__(5,)cconstexpr_False__(6,)cconstexpr_True__(8,)cconstexpr_128_"(%V, %offs_n1_50, %offs_v, %c128_i32_11, %c1_i32_12, %ks1) : (!tt.ptr, tensor<128xi32>, tensor<128xi32>, i32, i32, i32) -> tensor<128x128xbf16> loc(#loc630) + %c0_i32 = arith.constant 0 : i32 loc(#loc153) + %c4_i32 = arith.constant 4 : i32 loc(#loc153) + %c1_i32_51 = arith.constant 1 : i32 loc(#loc153) + %26 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc153) + %27 = arith.bitcast %c4_i32 : i32 to i32 loc(#loc153) + %28 = arith.bitcast %c1_i32_51 : i32 to i32 loc(#loc153) + %29 = ub.poison : i32 loc(#loc153) + %dk_52:2 = scf.for %off_g = %26 to %27 step %28 iter_args(%dv_87 = %dv, 
%dk_88 = %dk) -> (tensor<128x128xf32>, tensor<128x128xf32>) : i32 { + %off_hq1 = arith.constant 4 : i32 loc(#loc632) + %off_hq1_89 = arith.constant 4 : i32 loc(#loc632) + %off_hq1_90 = arith.muli %off_hkv, %off_hq1_89 : i32 loc(#loc632) + %off_hq1_91 = arith.addi %off_hq1_90, %off_g : i32 loc(#loc633) + %q_adj1 = arith.muli %c128_i32, %off_hq1_91 : i32 loc(#loc634) + %q_adj1_92 = arith.muli %0, %off_zq : i32 loc(#loc635) + %q_adj1_93 = arith.addi %q_adj1, %q_adj1_92 : i32 loc(#loc636) + %q_adj1_94 = arith.extsi %q_adj1_93 : i32 to i64 loc(#loc637) + %do_adj1 = arith.muli %20, %off_hq1_91 : i32 loc(#loc638) + %do_adj1_95 = arith.muli %12, %off_zq : i32 loc(#loc639) + %do_adj1_96 = arith.addi %do_adj1, %do_adj1_95 : i32 loc(#loc640) + %do_adj1_97 = arith.extsi %do_adj1_96 : i32 to i64 loc(#loc641) + %dq_adj1 = arith.muli %c128_i32_29, %off_hq1_91 : i32 loc(#loc642) + %dq_adj1_98 = arith.muli %21, %off_zq : i32 loc(#loc643) + %dq_adj1_99 = arith.addi %dq_adj1, %dq_adj1_98 : i32 loc(#loc644) + %dq_adj1_100 = arith.extsi %dq_adj1_99 : i32 to i64 loc(#loc645) + %off_chz1 = arith.muli %off_zq, %HQ : i32 loc(#loc646) + %off_chz1_101 = arith.addi %off_chz1, %off_hq1_91 : i32 loc(#loc647) + %off_chz1_102 = arith.muli %off_chz1_101, %ks0 : i32 loc(#loc648) + %off_chz1_103 = arith.extsi %off_chz1_102 : i32 to i64 loc(#loc649) + %Q1 = tt.addptr %arg_Q, %q_adj1_94 : !tt.ptr, i64 loc(#loc650) + %DO1 = tt.addptr %arg_DO, %do_adj1_97 : !tt.ptr, i64 loc(#loc651) + %LSE1 = tt.addptr %arg_LSE, %off_chz1_103 : !tt.ptr, i64 loc(#loc652) + %DELTA1 = tt.addptr %arg_DELTA, %off_chz1_103 : !tt.ptr, i64 loc(#loc653) + %sparse_idx_hq1 = arith.remsi %off_hq1_91, %SPARSE_HQ : i32 loc(#loc654) + %sparse_hz_offset = arith.muli %sparse_idx_z, %SPARSE_HQ : i32 loc(#loc655) + %sparse_hz_offset_104 = arith.addi %sparse_hz_offset, %sparse_idx_hq1 : i32 loc(#loc656) + %sparse_q_num_blks_offset = arith.muli %sparse_hz_offset_104, %ks5 : i32 loc(#loc657) + %sparse_q_num_blks_offset_105 = arith.addi 
%sparse_q_num_blks_offset, %pid_mask : i32 loc(#loc658) + %sparse_q_idx_offset = arith.muli %sparse_hz_offset_104, %stride_q_idx_h : i32 loc(#loc659) + %sparse_q_idx_offset_106 = arith.muli %pid_mask, %ks6 : i32 loc(#loc660) + %sparse_q_idx_offset_107 = arith.addi %sparse_q_idx_offset, %sparse_q_idx_offset_106 : i32 loc(#loc661) + %q_indices = tt.addptr %arg_Q_IDX, %sparse_q_idx_offset_107 : !tt.ptr, i32 loc(#loc662) + %q_start = tt.load %q_indices : !tt.ptr loc(#loc663) + %q_start_108 = arith.constant 128 : i32 loc(#loc664) + %q_start_109 = arith.constant 128 : i32 loc(#loc664) + %q_start_110 = arith.muli %q_start, %q_start_109 : i32 loc(#loc664) + %sparse_q_num_blocks = tt.addptr %arg_Q_NUM_BLKS, %sparse_q_num_blks_offset_105 : !tt.ptr, i32 loc(#loc665) + %sparse_q_num_blocks_111 = tt.load %sparse_q_num_blocks : !tt.ptr loc(#loc666) + %offs_m1 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc667) + %offs_m1_112 = tt.splat %q_start_110 : i32 -> tensor<64xi32> loc(#loc668) + %offs_m1_113 = arith.addi %offs_m1_112, %offs_m1 : tensor<64xi32> loc(#loc668) + %41:2 = tt.call @"torch._inductor.runtime.compile_tasks.ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.bwd_dkdv_inner__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pbf16_Pbf16_Pbf16_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_i32_i32_i32_i32_i32_i32_i32_i32_i32_Pbf16_Pbf16_Pfp32_Pfp32_fp32S128_128S_fp32S128_128S_bf16S128_128S_bf16S128_128S_i32_i32_i32S128S_i32S64S_i32_i32_i32_i32_Pi32_i32__(45,)cconstexpr_bf16__(46,)cconstexpr_False_"(%arg_Q, %arg_K, %arg_V, %arg_LSE, %arg_DELTA, %arg_DO, %arg_DQ, %arg_DV, %arg_KV_NUM_BLKS, %arg_KV_IDX, %arg_Q_NUM_BLKS, %arg_Q_IDX, %arg_FULL_KV_NUM_BLKS, %arg_FULL_KV_IDX, %arg_FULL_Q_NUM_BLKS, %arg_FULL_Q_IDX, %in_ptr16, %out_ptr0, %ks0, %ks1, %ks2, %ks3, %ks4, %ks5, %ks6, %ks7, %ks8, %Q1, %DO1, %DELTA1, %LSE1, %dk_88, %dv_87, %k, %v, %off_zq, %off_hq1_91, %offs_n1_50, %offs_m1_113, %c4096_i32_1, %c1_i32, %c128_i32_25, %c1_i32_26, %q_indices, 
%sparse_q_num_blocks_111) : (!tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, i32, i32, i32, i32, i32, i32, i32, i32, i32, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, tensor<128x128xf32>, tensor<128x128xf32>, tensor<128x128xbf16>, tensor<128x128xbf16>, i32, i32, tensor<128xi32>, tensor<64xi32>, i32, i32, i32, i32, !tt.ptr, i32) -> (tensor<128x128xf32>, tensor<128x128xf32>) loc(#loc191) + %q_indices_114 = tt.addptr %arg_FULL_Q_IDX, %sparse_q_idx_offset_107 : !tt.ptr, i32 loc(#loc669) + %q_start_115 = tt.load %q_indices_114 : !tt.ptr loc(#loc670) + %q_start_116 = arith.constant 128 : i32 loc(#loc671) + %q_start_117 = arith.constant 128 : i32 loc(#loc671) + %q_start_118 = arith.muli %q_start_115, %q_start_117 : i32 loc(#loc671) + %sparse_q_num_blocks_119 = tt.addptr %arg_FULL_Q_NUM_BLKS, %sparse_q_num_blks_offset_105 : !tt.ptr, i32 loc(#loc672) + %sparse_q_num_blocks_120 = tt.load %sparse_q_num_blocks_119 : !tt.ptr loc(#loc673) + %offs_m1_121 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc674) + %offs_m1_122 = tt.splat %q_start_118 : i32 -> tensor<64xi32> loc(#loc675) + %offs_m1_123 = arith.addi %offs_m1_122, %offs_m1_121 : tensor<64xi32> loc(#loc675) + %42:2 = tt.call @"torch._inductor.runtime.compile_tasks.ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.bwd_dkdv_inner__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pbf16_Pbf16_Pbf16_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_i32_i32_i32_i32_i32_i32_i32_i32_i32_Pbf16_Pbf16_Pfp32_Pfp32_fp32S128_128S_fp32S128_128S_bf16S128_128S_bf16S128_128S_i32_i32_i32S128S_i32S64S_i32_i32_i32_i32_Pi32_i32__(45,)cconstexpr_bf16__(46,)cconstexpr_True_"(%arg_Q, %arg_K, %arg_V, %arg_LSE, %arg_DELTA, %arg_DO, %arg_DQ, %arg_DV, %arg_KV_NUM_BLKS, %arg_KV_IDX, %arg_Q_NUM_BLKS, %arg_Q_IDX, %arg_FULL_KV_NUM_BLKS, %arg_FULL_KV_IDX, %arg_FULL_Q_NUM_BLKS, %arg_FULL_Q_IDX, %in_ptr16, %out_ptr0, %ks0, %ks1, %ks2, %ks3, 
%ks4, %ks5, %ks6, %ks7, %ks8, %Q1, %DO1, %DELTA1, %LSE1, %41#0, %41#1, %k, %v, %off_zq, %off_hq1_91, %offs_n1_50, %offs_m1_123, %c4096_i32_1, %c1_i32, %c128_i32_25, %c1_i32_26, %q_indices_114, %sparse_q_num_blocks_120) : (!tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, i32, i32, i32, i32, i32, i32, i32, i32, i32, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, tensor<128x128xf32>, tensor<128x128xf32>, tensor<128x128xbf16>, tensor<128x128xbf16>, i32, i32, tensor<128xi32>, tensor<64xi32>, i32, i32, i32, i32, !tt.ptr, i32) -> (tensor<128x128xf32>, tensor<128x128xf32>) loc(#loc199) + scf.yield %42#1, %42#0 : tensor<128x128xf32>, tensor<128x128xf32> loc(#loc200) + } loc(#loc1115) + %dv_ptrs = tt.expand_dims %offs_n1_50 {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc676) + %dv_ptrs_53 = arith.constant dense<128> : tensor<128x1xi32> loc(#loc677) + %dv_ptrs_54 = arith.muli %dv_ptrs, %dv_ptrs_53 : tensor<128x1xi32> loc(#loc677) + %dv_ptrs_55 = tt.splat %DV : !tt.ptr -> tensor<128x1x!tt.ptr> loc(#loc678) + %dv_ptrs_56 = tt.addptr %dv_ptrs_55, %dv_ptrs_54 : tensor<128x1x!tt.ptr>, tensor<128x1xi32> loc(#loc678) + %dv_ptrs_57 = tt.expand_dims %offs_v {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc679) + %dv_ptrs_58 = arith.constant dense<1> : tensor<1x128xi32> loc(#loc680) + %dv_ptrs_59 = arith.muli %dv_ptrs_57, %dv_ptrs_58 : tensor<1x128xi32> loc(#loc680) + %dv_ptrs_60 = tt.broadcast %dv_ptrs_56 : tensor<128x1x!tt.ptr> -> tensor<128x128x!tt.ptr> loc(#loc681) + %dv_ptrs_61 = tt.broadcast %dv_ptrs_59 : tensor<1x128xi32> -> tensor<128x128xi32> loc(#loc681) + %dv_ptrs_62 = tt.addptr %dv_ptrs_60, %dv_ptrs_61 : tensor<128x128x!tt.ptr>, tensor<128x128xi32> loc(#loc681) + %index_n = tt.expand_dims %offs_n1_50 {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc682) + %index_k = tt.expand_dims %offs_k {axis = 0 : i32} : tensor<128xi32> -> 
tensor<1x128xi32> loc(#loc683) + %index_v = tt.expand_dims %offs_v {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc684) + %30 = tt.splat %ks1 : i32 -> tensor<128x1xi32> loc(#loc210) + %31 = arith.cmpi slt, %index_n, %30 : tensor<128x1xi32> loc(#loc210) + %c128_i32_63 = arith.constant 128 : i32 loc(#loc211) + %cst = arith.constant dense<128> : tensor<1x128xi32> loc(#loc211) + %32 = arith.cmpi slt, %index_v, %cst : tensor<1x128xi32> loc(#loc211) + %33 = tt.broadcast %31 : tensor<128x1xi1> -> tensor<128x128xi1> loc(#loc212) + %34 = tt.broadcast %32 : tensor<1x128xi1> -> tensor<128x128xi1> loc(#loc212) + %35 = arith.andi %33, %34 : tensor<128x128xi1> loc(#loc212) + %36 = arith.truncf %dk_52#0 : tensor<128x128xf32> to tensor<128x128xbf16> loc(#loc213) + tt.store %dv_ptrs_62, %36, %35 : tensor<128x128x!tt.ptr> loc(#loc213) + %dk_64 = arith.constant 0.0883883461 : f32 loc(#loc685) + %dk_65 = arith.constant 0.0883883461 : f32 loc(#loc685) + %dk_66 = arith.constant dense<0.0883883461> : tensor<128x128xf32> loc(#loc685) + %dk_67 = arith.mulf %dk_52#1, %dk_66 : tensor<128x128xf32> loc(#loc685) + %mask = tt.splat %ks1 : i32 -> tensor<128x1xi32> loc(#loc686) + %mask_68 = arith.cmpi slt, %index_n, %mask : tensor<128x1xi32> loc(#loc686) + %xindex = arith.constant 128 : i32 loc(#loc687) + %xindex_69 = arith.constant 128 : i32 loc(#loc687) + %xindex_70 = arith.constant dense<128> : tensor<128x1xi32> loc(#loc687) + %xindex_71 = arith.muli %xindex_70, %index_n : tensor<128x1xi32> loc(#loc687) + %xindex_72 = tt.broadcast %index_k : tensor<1x128xi32> -> tensor<128x128xi32> loc(#loc688) + %xindex_73 = tt.broadcast %xindex_71 : tensor<128x1xi32> -> tensor<128x128xi32> loc(#loc688) + %xindex_74 = arith.addi %xindex_72, %xindex_73 : tensor<128x128xi32> loc(#loc688) + %xindex_75 = arith.constant 128 : i32 loc(#loc689) + %xindex_76 = arith.constant 128 : i32 loc(#loc689) + %xindex_77 = arith.muli %xindex_76, %off_hkv : i32 loc(#loc689) + %xindex_78 = arith.muli %xindex_77, 
%ks1 : i32 loc(#loc690) + %xindex_79 = tt.splat %xindex_78 : i32 -> tensor<128x128xi32> loc(#loc691) + %xindex_80 = arith.addi %xindex_74, %xindex_79 : tensor<128x128xi32> loc(#loc691) + %xindex_81 = arith.constant 1024 : i32 loc(#loc692) + %xindex_82 = arith.constant 1024 : i32 loc(#loc692) + %xindex_83 = arith.muli %xindex_82, %off_zq : i32 loc(#loc692) + %xindex_84 = arith.muli %xindex_83, %ks1 : i32 loc(#loc693) + %xindex_85 = tt.splat %xindex_84 : i32 -> tensor<128x128xi32> loc(#loc694) + %xindex_86 = arith.addi %xindex_80, %xindex_85 : tensor<128x128xi32> loc(#loc694) + %37 = tt.splat %out_ptr0 : !tt.ptr -> tensor<128x128x!tt.ptr> loc(#loc224) + %38 = tt.addptr %37, %xindex_86 : tensor<128x128x!tt.ptr>, tensor<128x128xi32> loc(#loc224) + %39 = tt.broadcast %mask_68 : tensor<128x1xi1> -> tensor<128x128xi1> loc(#loc225) + %40 = arith.truncf %dk_67 : tensor<128x128xf32> to tensor<128x128xbf16> loc(#loc225) + tt.store %38, %40, %39 : tensor<128x128x!tt.ptr> loc(#loc225) + scf.yield %SPARSE_KV_MULTIPLE, %SPARSE_Q_MULTIPLE : i32, i32 loc(#loc225) + } loc(#loc59) + tt.return loc(#loc226) + } loc(#loc) + tt.func private @"triton.language.standard.cdiv__i32__(1,)cconstexpr_128_"(%x: i32 loc("x"(#loc227))) -> i32 attributes {noinline = false} { + %c128_i32 = arith.constant 128 : i32 loc(#loc228) + %c128_i32_0 = arith.constant 128 : i32 loc(#loc228) + %0 = arith.addi %x, %c128_i32_0 : i32 loc(#loc228) + %c1_i32 = arith.constant 1 : i32 loc(#loc229) + %c1_i32_1 = arith.constant 1 : i32 loc(#loc229) + %1 = arith.subi %0, %c1_i32_1 : i32 loc(#loc229) + %c128_i32_2 = arith.constant 128 : i32 loc(#loc230) + %c128_i32_3 = arith.constant 128 : i32 loc(#loc230) + %2 = arith.divsi %1, %c128_i32_3 : i32 loc(#loc230) + tt.return %2 : i32 loc(#loc231) + ^bb1: // no predecessors + %3 = ub.poison : i32 loc(#loc232) + tt.return %3 : i32 loc(#loc232) + } loc(#loc227) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_128__(0, 
1)cconstexpr_128__(1,)cconstexpr_fp32_"() -> tensor<128x128xf32> attributes {noinline = false} { + %cst = arith.constant 0.000000e+00 : f32 loc(#loc234) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<128x128xf32> loc(#loc234) + tt.return %cst_0 : tensor<128x128xf32> loc(#loc235) + ^bb1: // no predecessors + %0 = ub.poison : tensor<128x128xf32> loc(#loc236) + tt.return %0 : tensor<128x128xf32> loc(#loc236) + } loc(#loc233) + tt.func private @"torch._inductor.runtime.compile_tasks.ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.load_checked_2d__Pbf16_i32S128S_i32S128S_i32_i32_i32__(5,)cconstexpr_False__(6,)cconstexpr_True__(8,)cconstexpr_128_"(%ptr: !tt.ptr loc("ptr"(#loc237)), %offs_m: tensor<128xi32> loc("offs_m"(#loc237)), %offs_n: tensor<128xi32> loc("offs_n"(#loc237)), %stride_m: i32 loc("stride_m"(#loc237)), %stride_n: i32 loc("stride_n"(#loc237)), %M_LEN: i32 loc("M_LEN"(#loc237))) -> tensor<128x128xbf16> attributes {noinline = false} { + %ptr_0 = tt.expand_dims %offs_m {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc702) + %ptr_1 = tt.splat %stride_m : i32 -> tensor<128x1xi32> loc(#loc703) + %ptr_2 = arith.muli %ptr_0, %ptr_1 : tensor<128x1xi32> loc(#loc703) + %ptr_3 = tt.splat %ptr : !tt.ptr -> tensor<128x1x!tt.ptr> loc(#loc704) + %ptr_4 = tt.addptr %ptr_3, %ptr_2 : tensor<128x1x!tt.ptr>, tensor<128x1xi32> loc(#loc704) + %ptr_5 = tt.expand_dims %offs_n {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc705) + %ptr_6 = tt.splat %stride_n : i32 -> tensor<1x128xi32> loc(#loc706) + %ptr_7 = arith.muli %ptr_5, %ptr_6 : tensor<1x128xi32> loc(#loc706) + %ptr_8 = tt.broadcast %ptr_4 : tensor<128x1x!tt.ptr> -> tensor<128x128x!tt.ptr> loc(#loc707) + %ptr_9 = tt.broadcast %ptr_7 : tensor<1x128xi32> -> tensor<128x128xi32> loc(#loc707) + %ptr_10 = tt.addptr %ptr_8, %ptr_9 : tensor<128x128x!tt.ptr>, tensor<128x128xi32> loc(#loc707) + %0 = tt.expand_dims %offs_m {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc244) + 
%1 = tt.splat %M_LEN : i32 -> tensor<128x1xi32> loc(#loc245) + %2 = arith.cmpi slt, %0, %1 : tensor<128x1xi32> loc(#loc245) + %cst = arith.constant 0.000000e+00 : f32 loc(#loc246) + %3 = tt.broadcast %2 : tensor<128x1xi1> -> tensor<128x128xi1> loc(#loc246) + %cst_11 = arith.constant dense<0.000000e+00> : tensor<128x128xf32> loc(#loc246) + %4 = arith.truncf %cst_11 : tensor<128x128xf32> to tensor<128x128xbf16> loc(#loc246) + %5 = tt.load %ptr_10, %3, %4 : tensor<128x128x!tt.ptr> loc(#loc246) + tt.return %5 : tensor<128x128xbf16> loc(#loc247) + ^bb1: // no predecessors + %6 = ub.poison : tensor<128x128xbf16> loc(#loc248) + tt.return %6 : tensor<128x128xbf16> loc(#loc248) + } loc(#loc237) + tt.func private @"torch._inductor.runtime.compile_tasks.ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.bwd_dq_inner__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pbf16_Pbf16_Pbf16_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_i32_i32_i32_i32_i32_i32_i32_i32_i32_Pbf16_Pbf16_fp32S128_128S_bf16S128_128S_bf16S128_128S_fp32S128S_fp32S128_1S_i32_i32_i32S128S_i32S64S_i32_i32_i32_i32_Pi32_i32__(44,)cconstexpr_bf16__(45,)cconstexpr_False_"(%arg_Q: !tt.ptr loc("arg_Q"(#loc249)), %arg_K: !tt.ptr loc("arg_K"(#loc249)), %arg_V: !tt.ptr loc("arg_V"(#loc249)), %arg_LSE: !tt.ptr loc("arg_LSE"(#loc249)), %arg_DELTA: !tt.ptr loc("arg_DELTA"(#loc249)), %arg_DO: !tt.ptr loc("arg_DO"(#loc249)), %arg_DQ: !tt.ptr loc("arg_DQ"(#loc249)), %arg_DV: !tt.ptr loc("arg_DV"(#loc249)), %arg_KV_NUM_BLKS: !tt.ptr loc("arg_KV_NUM_BLKS"(#loc249)), %arg_KV_IDX: !tt.ptr loc("arg_KV_IDX"(#loc249)), %arg_Q_NUM_BLKS: !tt.ptr loc("arg_Q_NUM_BLKS"(#loc249)), %arg_Q_IDX: !tt.ptr loc("arg_Q_IDX"(#loc249)), %arg_FULL_KV_NUM_BLKS: !tt.ptr loc("arg_FULL_KV_NUM_BLKS"(#loc249)), %arg_FULL_KV_IDX: !tt.ptr loc("arg_FULL_KV_IDX"(#loc249)), %arg_FULL_Q_NUM_BLKS: !tt.ptr loc("arg_FULL_Q_NUM_BLKS"(#loc249)), %arg_FULL_Q_IDX: !tt.ptr loc("arg_FULL_Q_IDX"(#loc249)), %in_ptr16: !tt.ptr loc("in_ptr16"(#loc249)), %out_ptr0: !tt.ptr 
loc("out_ptr0"(#loc249)), %ks0: i32 loc("ks0"(#loc249)), %ks1: i32 loc("ks1"(#loc249)), %ks2: i32 loc("ks2"(#loc249)), %ks3: i32 loc("ks3"(#loc249)), %ks4: i32 loc("ks4"(#loc249)), %ks5: i32 loc("ks5"(#loc249)), %ks6: i32 loc("ks6"(#loc249)), %ks7: i32 loc("ks7"(#loc249)), %ks8: i32 loc("ks8"(#loc249)), %K: !tt.ptr loc("K"(#loc249)), %V: !tt.ptr loc("V"(#loc249)), %dq: tensor<128x128xf32> loc("dq"(#loc249)), %q: tensor<128x128xbf16> loc("q"(#loc249)), %do: tensor<128x128xbf16> loc("do"(#loc249)), %Di: tensor<128xf32> loc("Di"(#loc249)), %lse: tensor<128x1xf32> loc("lse"(#loc249)), %off_z: i32 loc("off_z"(#loc249)), %off_hq: i32 loc("off_hq"(#loc249)), %offs_m2: tensor<128xi32> loc("offs_m2"(#loc249)), %offs_n2: tensor<64xi32> loc("offs_n2"(#loc249)), %stride_kn: i32 loc("stride_kn"(#loc249)), %stride_kd: i32 loc("stride_kd"(#loc249)), %stride_vn: i32 loc("stride_vn"(#loc249)), %stride_vd: i32 loc("stride_vd"(#loc249)), %kv_indices: !tt.ptr loc("kv_indices"(#loc249)), %sparse_kv_num_blocks: i32 loc("sparse_kv_num_blocks"(#loc249))) -> tensor<128x128xf32> attributes {noinline = false} { + %offs_k = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc752) + %offs_v = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc753) + %kT_ptrs = tt.expand_dims %offs_n2 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc754) + %kT_ptrs_0 = tt.splat %stride_kn : i32 -> tensor<1x64xi32> loc(#loc755) + %kT_ptrs_1 = arith.muli %kT_ptrs, %kT_ptrs_0 : tensor<1x64xi32> loc(#loc755) + %kT_ptrs_2 = tt.splat %K : !tt.ptr -> tensor<1x64x!tt.ptr> loc(#loc756) + %kT_ptrs_3 = tt.addptr %kT_ptrs_2, %kT_ptrs_1 : tensor<1x64x!tt.ptr>, tensor<1x64xi32> loc(#loc756) + %kT_ptrs_4 = tt.expand_dims %offs_k {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc757) + %kT_ptrs_5 = tt.splat %stride_kd : i32 -> tensor<128x1xi32> loc(#loc758) + %kT_ptrs_6 = arith.muli %kT_ptrs_4, %kT_ptrs_5 : tensor<128x1xi32> loc(#loc758) + %kT_ptrs_7 = 
tt.broadcast %kT_ptrs_3 : tensor<1x64x!tt.ptr> -> tensor<128x64x!tt.ptr> loc(#loc759) + %kT_ptrs_8 = tt.broadcast %kT_ptrs_6 : tensor<128x1xi32> -> tensor<128x64xi32> loc(#loc759) + %kT_ptrs_9 = tt.addptr %kT_ptrs_7, %kT_ptrs_8 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc759) + %vT_ptrs = tt.expand_dims %offs_n2 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc760) + %vT_ptrs_10 = tt.splat %stride_vn : i32 -> tensor<1x64xi32> loc(#loc761) + %vT_ptrs_11 = arith.muli %vT_ptrs, %vT_ptrs_10 : tensor<1x64xi32> loc(#loc761) + %vT_ptrs_12 = tt.splat %V : !tt.ptr -> tensor<1x64x!tt.ptr> loc(#loc762) + %vT_ptrs_13 = tt.addptr %vT_ptrs_12, %vT_ptrs_11 : tensor<1x64x!tt.ptr>, tensor<1x64xi32> loc(#loc762) + %vT_ptrs_14 = tt.expand_dims %offs_v {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc763) + %vT_ptrs_15 = tt.splat %stride_vd : i32 -> tensor<128x1xi32> loc(#loc764) + %vT_ptrs_16 = arith.muli %vT_ptrs_14, %vT_ptrs_15 : tensor<128x1xi32> loc(#loc764) + %vT_ptrs_17 = tt.broadcast %vT_ptrs_13 : tensor<1x64x!tt.ptr> -> tensor<128x64x!tt.ptr> loc(#loc765) + %vT_ptrs_18 = tt.broadcast %vT_ptrs_16 : tensor<128x1xi32> -> tensor<128x64xi32> loc(#loc765) + %vT_ptrs_19 = tt.addptr %vT_ptrs_17, %vT_ptrs_18 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc765) + %hi = arith.constant 2 : i32 loc(#loc766) + %hi_20 = arith.constant 2 : i32 loc(#loc766) + %hi_21 = arith.muli %sparse_kv_num_blocks, %hi_20 : i32 loc(#loc766) + %hi_22 = tt.call @"triton.language.standard.cdiv__i32__(1,)cconstexpr_64_"(%ks1) : (i32) -> i32 loc(#loc767) + %hi_23 = arith.constant 1 : i32 loc(#loc768) + %hi_24 = arith.maxsi %hi_22, %hi_23 : i32 loc(#loc768) + %hi_25 = arith.minsi %hi_21, %hi_24 : i32 loc(#loc769) + %c0_i32 = arith.constant 0 : i32 loc(#loc268) + %c1_i32 = arith.constant 1 : i32 loc(#loc268) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc268) + %1 = arith.bitcast %hi_25 : i32 to i32 loc(#loc268) + %2 = arith.bitcast %c1_i32 : i32 to i32 loc(#loc268) + 
%3 = ub.poison : i32 loc(#loc268) + %vT_ptrs_26:4 = scf.for %start_n = %0 to %1 step %2 iter_args(%dq_27 = %dq, %offs_n2_28 = %offs_n2, %kT_ptrs_29 = %kT_ptrs_9, %vT_ptrs_30 = %vT_ptrs_19) -> (tensor<128x128xf32>, tensor<64xi32>, tensor<128x64x!tt.ptr>, tensor<128x64x!tt.ptr>) : i32 { + %dq_31 = tt.call @"torch._inductor.runtime.compile_tasks.ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.bwd_dq_block_mn__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pbf16_Pbf16_Pbf16_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_i32_i32_i32_i32_i32_i32_i32_i32_i32_fp32S128_128S_bf16S128_128S_Pbf16S128_64S_Pbf16S128_64S_bf16S128_128S_fp32S128S_fp32S128_1S_i32_i32_i32_i32_i32S128S_i32S64S_i32S128S_i32S128S_i32_i32_i32_i32_Pi32_i32__(48,)cconstexpr_bf16__(49,)cconstexpr_1_d_44269504__(50,)cconstexpr_False_"(%arg_Q, %arg_K, %arg_V, %arg_LSE, %arg_DELTA, %arg_DO, %arg_DQ, %arg_DV, %arg_KV_NUM_BLKS, %arg_KV_IDX, %arg_Q_NUM_BLKS, %arg_Q_IDX, %arg_FULL_KV_NUM_BLKS, %arg_FULL_KV_IDX, %arg_FULL_Q_NUM_BLKS, %arg_FULL_Q_IDX, %in_ptr16, %out_ptr0, %ks0, %ks1, %ks2, %ks3, %ks4, %ks5, %ks6, %ks7, %ks8, %dq_27, %q, %kT_ptrs_29, %vT_ptrs_30, %do, %Di, %lse, %ks0, %ks1, %off_z, %off_hq, %offs_m2, %offs_n2_28, %offs_k, %offs_v, %stride_kn, %stride_kd, %stride_vn, %stride_vd, %kv_indices, %sparse_kv_num_blocks) : (!tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, i32, i32, i32, i32, i32, i32, i32, i32, i32, tensor<128x128xf32>, tensor<128x128xbf16>, tensor<128x64x!tt.ptr>, tensor<128x64x!tt.ptr>, tensor<128x128xbf16>, tensor<128xf32>, tensor<128x1xf32>, i32, i32, i32, i32, tensor<128xi32>, tensor<64xi32>, tensor<128xi32>, tensor<128xi32>, i32, i32, i32, i32, !tt.ptr, i32) -> tensor<128x128xf32> loc(#loc771) + %offset = tt.call 
@"torch._inductor.runtime.compile_tasks.ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.get_offset_for_next_block__i32_Pi32_i32__(3,)cconstexpr_128__(4,)cconstexpr_2__(5,)cconstexpr_64__(6,)cconstexpr_False_"(%start_n, %kv_indices, %sparse_kv_num_blocks) : (i32, !tt.ptr, i32) -> i32 loc(#loc772) + %kT_ptrs_32 = arith.muli %offset, %stride_kn : i32 loc(#loc773) + %kT_ptrs_33 = tt.splat %kT_ptrs_32 : i32 -> tensor<128x64xi32> loc(#loc774) + %kT_ptrs_34 = tt.addptr %kT_ptrs_29, %kT_ptrs_33 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc774) + %vT_ptrs_35 = arith.muli %offset, %stride_vn : i32 loc(#loc775) + %vT_ptrs_36 = tt.splat %vT_ptrs_35 : i32 -> tensor<128x64xi32> loc(#loc776) + %vT_ptrs_37 = tt.addptr %vT_ptrs_30, %vT_ptrs_36 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc776) + %offs_n2_38 = tt.splat %offset : i32 -> tensor<64xi32> loc(#loc777) + %offs_n2_39 = arith.addi %offs_n2_28, %offs_n2_38 : tensor<64xi32> loc(#loc777) + scf.yield %dq_31, %offs_n2_39, %kT_ptrs_34, %vT_ptrs_37 : tensor<128x128xf32>, tensor<64xi32>, tensor<128x64x!tt.ptr>, tensor<128x64x!tt.ptr> loc(#loc276) + } loc(#loc1120) + tt.return %vT_ptrs_26#0 : tensor<128x128xf32> loc(#loc277) + ^bb1: // no predecessors + %4 = ub.poison : tensor<128x128xf32> loc(#loc278) + tt.return %4 : tensor<128x128xf32> loc(#loc278) + } loc(#loc249) + tt.func private @"triton.language.standard.cdiv__i32__(1,)cconstexpr_64_"(%x: i32 loc("x"(#loc227))) -> i32 attributes {noinline = false} { + %c64_i32 = arith.constant 64 : i32 loc(#loc228) + %c64_i32_0 = arith.constant 64 : i32 loc(#loc228) + %0 = arith.addi %x, %c64_i32_0 : i32 loc(#loc228) + %c1_i32 = arith.constant 1 : i32 loc(#loc229) + %c1_i32_1 = arith.constant 1 : i32 loc(#loc229) + %1 = arith.subi %0, %c1_i32_1 : i32 loc(#loc229) + %c64_i32_2 = arith.constant 64 : i32 loc(#loc230) + %c64_i32_3 = arith.constant 64 : i32 loc(#loc230) + %2 = arith.divsi %1, %c64_i32_3 : i32 loc(#loc230) + tt.return %2 : i32 loc(#loc231) + ^bb1: // no 
predecessors + %3 = ub.poison : i32 loc(#loc232) + tt.return %3 : i32 loc(#loc232) + } loc(#loc227) + tt.func private @"torch._inductor.runtime.compile_tasks.ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.bwd_dq_block_mn__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pbf16_Pbf16_Pbf16_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_i32_i32_i32_i32_i32_i32_i32_i32_i32_fp32S128_128S_bf16S128_128S_Pbf16S128_64S_Pbf16S128_64S_bf16S128_128S_fp32S128S_fp32S128_1S_i32_i32_i32_i32_i32S128S_i32S64S_i32S128S_i32S128S_i32_i32_i32_i32_Pi32_i32__(48,)cconstexpr_bf16__(49,)cconstexpr_1_d_44269504__(50,)cconstexpr_False_"(%arg_Q: !tt.ptr loc("arg_Q"(#loc279)), %arg_K: !tt.ptr loc("arg_K"(#loc279)), %arg_V: !tt.ptr loc("arg_V"(#loc279)), %arg_LSE: !tt.ptr loc("arg_LSE"(#loc279)), %arg_DELTA: !tt.ptr loc("arg_DELTA"(#loc279)), %arg_DO: !tt.ptr loc("arg_DO"(#loc279)), %arg_DQ: !tt.ptr loc("arg_DQ"(#loc279)), %arg_DV: !tt.ptr loc("arg_DV"(#loc279)), %arg_KV_NUM_BLKS: !tt.ptr loc("arg_KV_NUM_BLKS"(#loc279)), %arg_KV_IDX: !tt.ptr loc("arg_KV_IDX"(#loc279)), %arg_Q_NUM_BLKS: !tt.ptr loc("arg_Q_NUM_BLKS"(#loc279)), %arg_Q_IDX: !tt.ptr loc("arg_Q_IDX"(#loc279)), %arg_FULL_KV_NUM_BLKS: !tt.ptr loc("arg_FULL_KV_NUM_BLKS"(#loc279)), %arg_FULL_KV_IDX: !tt.ptr loc("arg_FULL_KV_IDX"(#loc279)), %arg_FULL_Q_NUM_BLKS: !tt.ptr loc("arg_FULL_Q_NUM_BLKS"(#loc279)), %arg_FULL_Q_IDX: !tt.ptr loc("arg_FULL_Q_IDX"(#loc279)), %in_ptr16: !tt.ptr loc("in_ptr16"(#loc279)), %out_ptr0: !tt.ptr loc("out_ptr0"(#loc279)), %ks0: i32 loc("ks0"(#loc279)), %ks1: i32 loc("ks1"(#loc279)), %ks2: i32 loc("ks2"(#loc279)), %ks3: i32 loc("ks3"(#loc279)), %ks4: i32 loc("ks4"(#loc279)), %ks5: i32 loc("ks5"(#loc279)), %ks6: i32 loc("ks6"(#loc279)), %ks7: i32 loc("ks7"(#loc279)), %ks8: i32 loc("ks8"(#loc279)), %dq: tensor<128x128xf32> loc("dq"(#loc279)), %q: tensor<128x128xbf16> loc("q"(#loc279)), %kT_ptrs: tensor<128x64x!tt.ptr> loc("kT_ptrs"(#loc279)), %vT_ptrs: tensor<128x64x!tt.ptr> loc("vT_ptrs"(#loc279)), %do: 
tensor<128x128xbf16> loc("do"(#loc279)), %Di: tensor<128xf32> loc("Di"(#loc279)), %lse: tensor<128x1xf32> loc("lse"(#loc279)), %Q_LEN: i32 loc("Q_LEN"(#loc279)), %KV_LEN: i32 loc("KV_LEN"(#loc279)), %off_z: i32 loc("off_z"(#loc279)), %off_hq: i32 loc("off_hq"(#loc279)), %offs_m2: tensor<128xi32> loc("offs_m2"(#loc279)), %offs_n2: tensor<64xi32> loc("offs_n2"(#loc279)), %offs_k: tensor<128xi32> loc("offs_k"(#loc279)), %offs_v: tensor<128xi32> loc("offs_v"(#loc279)), %stride_kn: i32 loc("stride_kn"(#loc279)), %stride_kd: i32 loc("stride_kd"(#loc279)), %stride_vn: i32 loc("stride_vn"(#loc279)), %stride_vd: i32 loc("stride_vd"(#loc279)), %kv_indices: !tt.ptr loc("kv_indices"(#loc279)), %sparse_kv_num_blocks: i32 loc("sparse_kv_num_blocks"(#loc279))) -> tensor<128x128xf32> attributes {noinline = false} { + %kT = tt.call @"torch._inductor.runtime.compile_tasks.ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.load_checked_2d__Pbf16S128_64S_i32S128S_i32S64S_i32__(3,)cconstexpr_None__(4,)cconstexpr_None__(5,)cconstexpr_True__(6,)cconstexpr_False__(7,)cconstexpr_128_"(%kT_ptrs, %offs_k, %offs_n2, %KV_LEN) : (tensor<128x64x!tt.ptr>, tensor<128xi32>, tensor<64xi32>, i32) -> tensor<128x64xbf16> loc(#loc826) + %qk = arith.constant 0.000000e+00 : f32 loc(#loc827) + %qk_0 = arith.constant dense<0.000000e+00> : tensor<128x64xf32> loc(#loc827) + %qk_1 = tt.dot %q, %kT, %qk_0, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc827) + %qk_2 = arith.constant 0.0883883461 : f32 loc(#loc828) + %qk_3 = arith.constant 0.0883883461 : f32 loc(#loc828) + %qk_4 = arith.constant dense<0.0883883461> : tensor<128x64xf32> loc(#loc828) + %qk_5 = arith.mulf %qk_1, %qk_4 : tensor<128x64xf32> loc(#loc828) + %n = tt.expand_dims %offs_n2 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc829) + %n_6 = tt.call @torch._inductor.runtime.compile_tasks.ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.get_bounded_indices__i32S1_64S_i32__(%n, 
%KV_LEN) : (tensor<1x64xi32>, i32) -> tensor<1x64xi32> loc(#loc830) + %m = tt.expand_dims %offs_m2 {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc831) + %m_7 = tt.call @torch._inductor.runtime.compile_tasks.ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.get_bounded_indices__i32S128_1S_i32__(%m, %Q_LEN) : (tensor<128x1xi32>, i32) -> tensor<128x1xi32> loc(#loc832) + %post_mod_scores = tt.expand_dims %offs_n2 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc833) + %post_mod_scores_8 = tt.splat %KV_LEN : i32 -> tensor<1x64xi32> loc(#loc834) + %post_mod_scores_9 = arith.cmpi slt, %post_mod_scores, %post_mod_scores_8 : tensor<1x64xi32> loc(#loc834) + %post_mod_scores_10 = arith.constant 0xFF800000 : f32 loc(#loc835) + %post_mod_scores_11 = arith.constant 0xFF800000 : f32 loc(#loc835) + %post_mod_scores_12 = arith.constant dense<0xFF800000> : tensor<128x64xf32> loc(#loc835) + %post_mod_scores_13 = tt.broadcast %post_mod_scores_9 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc835) + %post_mod_scores_14 = arith.select %post_mod_scores_13, %qk_5, %post_mod_scores_12 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc835) + %tmp1 = arith.constant false loc(#loc836) + %tmp1_15 = arith.constant dense : tensor<1xi1> loc(#loc836) + %tmp4 = tt.broadcast %m_7 : tensor<128x1xi32> -> tensor<128x64xi32> loc(#loc837) + %tmp4_16 = tt.broadcast %n_6 : tensor<1x64xi32> -> tensor<128x64xi32> loc(#loc837) + %tmp4_17 = arith.cmpi sge, %tmp4, %tmp4_16 : tensor<128x64xi32> loc(#loc837) + %tmp5 = arith.extsi %n_6 : tensor<1x64xi32> to tensor<1x64xi64> loc(#loc838) + %tmp7 = tt.addptr %in_ptr16, %off_z : !tt.ptr, i32 loc(#loc839) + %tmp7_18 = tt.load %tmp7 : !tt.ptr loc(#loc840) + %tmp8 = tt.splat %tmp7_18 : i64 -> tensor<1x64xi64> loc(#loc841) + %tmp8_19 = arith.cmpi slt, %tmp5, %tmp8 : tensor<1x64xi64> loc(#loc841) + %tmp9 = arith.extsi %m_7 : tensor<128x1xi32> to tensor<128x1xi64> loc(#loc842) + %tmp10 = tt.splat %tmp7_18 : i64 -> tensor<128x1xi64> 
loc(#loc843) + %tmp10_20 = arith.cmpi slt, %tmp9, %tmp10 : tensor<128x1xi64> loc(#loc843) + %tmp11 = tt.broadcast %tmp8_19 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc844) + %tmp11_21 = tt.broadcast %tmp10_20 : tensor<128x1xi1> -> tensor<128x64xi1> loc(#loc844) + %tmp11_22 = arith.andi %tmp11, %tmp11_21 : tensor<128x64xi1> loc(#loc844) + %tmp12 = arith.andi %tmp4_17, %tmp11_22 : tensor<128x64xi1> loc(#loc845) + %tmp13 = tt.expand_dims %tmp1_15 {axis = 0 : i32} : tensor<1xi1> -> tensor<1x1xi1> loc(#loc846) + %tmp13_23 = tt.broadcast %tmp13 : tensor<1x1xi1> -> tensor<128x64xi1> loc(#loc846) + %tmp13_24 = arith.ori %tmp13_23, %tmp12 : tensor<128x64xi1> loc(#loc846) + %tmp15 = tt.splat %ks8 : i32 -> tensor<1x64xi32> loc(#loc847) + %tmp15_25 = arith.cmpi sge, %n_6, %tmp15 : tensor<1x64xi32> loc(#loc847) + %tmp16 = tt.splat %ks8 : i32 -> tensor<1x64xi32> loc(#loc848) + %tmp16_26 = arith.remsi %n_6, %tmp16 : tensor<1x64xi32> loc(#loc848) + %tmp17 = arith.constant 0 : i32 loc(#loc849) + %tmp17_27 = arith.constant dense<0> : tensor<1xi32> loc(#loc849) + %tmp18 = tt.expand_dims %tmp17_27 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc850) + %tmp18_28 = tt.broadcast %tmp18 : tensor<1x1xi32> -> tensor<1x64xi32> loc(#loc850) + %tmp18_29 = arith.cmpi ne, %tmp16_26, %tmp18_28 : tensor<1x64xi32> loc(#loc850) + %tmp19 = arith.constant 0 : i32 loc(#loc851) + %tmp19_30 = arith.constant dense<0> : tensor<1x64xi32> loc(#loc851) + %tmp19_31 = arith.cmpi slt, %tmp16_26, %tmp19_30 : tensor<1x64xi32> loc(#loc851) + %tmp20 = arith.constant 0 : i32 loc(#loc852) + %tmp20_32 = arith.cmpi slt, %ks8, %tmp20 : i32 loc(#loc852) + %tmp21 = tt.splat %tmp20_32 : i1 -> tensor<1x64xi1> loc(#loc853) + %tmp21_33 = arith.cmpi ne, %tmp19_31, %tmp21 : tensor<1x64xi1> loc(#loc853) + %tmp22 = arith.andi %tmp18_29, %tmp21_33 : tensor<1x64xi1> loc(#loc854) + %tmp23 = tt.splat %ks8 : i32 -> tensor<1x64xi32> loc(#loc855) + %tmp23_34 = arith.addi %tmp16_26, %tmp23 : tensor<1x64xi32> loc(#loc855) + 
%tmp24 = arith.select %tmp22, %tmp23_34, %tmp16_26 : tensor<1x64xi1>, tensor<1x64xi32> loc(#loc856) + %tmp25 = arith.extsi %tmp24 : tensor<1x64xi32> to tensor<1x64xi64> loc(#loc857) + %tmp26 = tt.splat %tmp7_18 : i64 -> tensor<1x64xi64> loc(#loc858) + %tmp26_35 = arith.cmpi slt, %tmp25, %tmp26 : tensor<1x64xi64> loc(#loc858) + %tmp27 = arith.andi %tmp15_25, %tmp26_35 : tensor<1x64xi1> loc(#loc859) + %tmp28 = tt.broadcast %n_6 : tensor<1x64xi32> -> tensor<128x64xi32> loc(#loc860) + %tmp28_36 = tt.broadcast %m_7 : tensor<128x1xi32> -> tensor<128x64xi32> loc(#loc860) + %tmp28_37 = arith.subi %tmp28, %tmp28_36 : tensor<128x64xi32> loc(#loc860) + %tmp29 = tt.splat %ks8 : i32 -> tensor<128x64xi32> loc(#loc861) + %tmp29_38 = arith.remsi %tmp28_37, %tmp29 : tensor<128x64xi32> loc(#loc861) + %tmp30 = tt.expand_dims %tmp17_27 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc862) + %tmp30_39 = tt.broadcast %tmp30 : tensor<1x1xi32> -> tensor<128x64xi32> loc(#loc862) + %tmp30_40 = arith.cmpi ne, %tmp29_38, %tmp30_39 : tensor<128x64xi32> loc(#loc862) + %tmp31 = arith.constant 0 : i32 loc(#loc863) + %tmp31_41 = arith.constant dense<0> : tensor<128x64xi32> loc(#loc863) + %tmp31_42 = arith.cmpi slt, %tmp29_38, %tmp31_41 : tensor<128x64xi32> loc(#loc863) + %tmp32 = tt.splat %tmp20_32 : i1 -> tensor<128x64xi1> loc(#loc864) + %tmp32_43 = arith.cmpi ne, %tmp31_42, %tmp32 : tensor<128x64xi1> loc(#loc864) + %tmp33 = arith.andi %tmp30_40, %tmp32_43 : tensor<128x64xi1> loc(#loc865) + %tmp34 = tt.splat %ks8 : i32 -> tensor<128x64xi32> loc(#loc866) + %tmp34_44 = arith.addi %tmp29_38, %tmp34 : tensor<128x64xi32> loc(#loc866) + %tmp35 = arith.select %tmp33, %tmp34_44, %tmp29_38 : tensor<128x64xi1>, tensor<128x64xi32> loc(#loc867) + %tmp36 = tt.expand_dims %tmp17_27 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc868) + %tmp36_45 = tt.broadcast %tmp36 : tensor<1x1xi32> -> tensor<128x64xi32> loc(#loc868) + %tmp36_46 = arith.cmpi eq, %tmp35, %tmp36_45 : tensor<128x64xi32> 
loc(#loc868) + %tmp37 = tt.broadcast %tmp27 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc869) + %tmp37_47 = arith.andi %tmp37, %tmp36_46 : tensor<128x64xi1> loc(#loc869) + %tmp38 = arith.ori %tmp13_24, %tmp37_47 : tensor<128x64xi1> loc(#loc870) + %post_mod_scores_48 = arith.constant 0xFF800000 : f32 loc(#loc871) + %post_mod_scores_49 = arith.constant 0xFF800000 : f32 loc(#loc871) + %post_mod_scores_50 = arith.constant dense<0xFF800000> : tensor<128x64xf32> loc(#loc871) + %post_mod_scores_51 = arith.select %tmp38, %post_mod_scores_14, %post_mod_scores_50 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc871) + %post_mod_scores_52 = arith.constant 1.44269502 : f32 loc(#loc872) + %post_mod_scores_53 = arith.constant 1.44269502 : f32 loc(#loc872) + %post_mod_scores_54 = arith.constant dense<1.44269502> : tensor<128x64xf32> loc(#loc872) + %post_mod_scores_55 = arith.mulf %post_mod_scores_51, %post_mod_scores_54 : tensor<128x64xf32> loc(#loc872) + %p = tt.broadcast %lse : tensor<128x1xf32> -> tensor<128x64xf32> loc(#loc873) + %p_56 = arith.subf %post_mod_scores_55, %p : tensor<128x64xf32> loc(#loc873) + %p_57 = math.exp2 %p_56 : tensor<128x64xf32> loc(#loc874) + %vT = tt.call @"torch._inductor.runtime.compile_tasks.ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.load_checked_2d__Pbf16S128_64S_i32S128S_i32S64S_i32__(3,)cconstexpr_None__(4,)cconstexpr_None__(5,)cconstexpr_True__(6,)cconstexpr_False__(7,)cconstexpr_128_"(%vT_ptrs, %offs_v, %offs_n2, %KV_LEN) : (tensor<128x64x!tt.ptr>, tensor<128xi32>, tensor<64xi32>, i32) -> tensor<128x64xbf16> loc(#loc875) + %dp = arith.constant 0.000000e+00 : f32 loc(#loc876) + %dp_58 = arith.constant dense<0.000000e+00> : tensor<128x64xf32> loc(#loc876) + %dp_59 = tt.dot %do, %vT, %dp_58, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc876) + %ds = tt.expand_dims %Di {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc877) + %ds_60 = tt.broadcast %ds : tensor<128x1xf32> -> 
tensor<128x64xf32> loc(#loc878) + %ds_61 = arith.subf %dp_59, %ds_60 : tensor<128x64xf32> loc(#loc878) + %ds_62 = arith.mulf %p_57, %ds_61 : tensor<128x64xf32> loc(#loc879) + %grad_scores = tt.expand_dims %offs_n2 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc880) + %grad_scores_63 = tt.splat %KV_LEN : i32 -> tensor<1x64xi32> loc(#loc881) + %grad_scores_64 = arith.cmpi slt, %grad_scores, %grad_scores_63 : tensor<1x64xi32> loc(#loc881) + %grad_scores_65 = arith.constant 0.000000e+00 : f32 loc(#loc882) + %grad_scores_66 = arith.constant 0.000000e+00 : f32 loc(#loc882) + %grad_scores_67 = arith.constant dense<0.000000e+00> : tensor<128x64xf32> loc(#loc882) + %grad_scores_68 = tt.broadcast %grad_scores_64 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc882) + %grad_scores_69 = arith.select %grad_scores_68, %ds_62, %grad_scores_67 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc882) + %scatter_mask = tt.expand_dims %offs_m2 {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc883) + %scatter_mask_70 = tt.splat %Q_LEN : i32 -> tensor<128x1xi32> loc(#loc884) + %scatter_mask_71 = arith.cmpi slt, %scatter_mask, %scatter_mask_70 : tensor<128x1xi32> loc(#loc884) + %scatter_mask_72 = tt.expand_dims %offs_n2 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc885) + %scatter_mask_73 = tt.splat %KV_LEN : i32 -> tensor<1x64xi32> loc(#loc886) + %scatter_mask_74 = arith.cmpi slt, %scatter_mask_72, %scatter_mask_73 : tensor<1x64xi32> loc(#loc886) + %scatter_mask_75 = tt.broadcast %scatter_mask_71 : tensor<128x1xi1> -> tensor<128x64xi1> loc(#loc887) + %scatter_mask_76 = tt.broadcast %scatter_mask_74 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc887) + %scatter_mask_77 = arith.andi %scatter_mask_75, %scatter_mask_76 : tensor<128x64xi1> loc(#loc887) + %ds_78 = arith.constant 0.000000e+00 : f32 loc(#loc888) + %ds_79 = arith.constant 0.000000e+00 : f32 loc(#loc888) + %ds_80 = arith.constant dense<0.000000e+00> : tensor<128x64xf32> loc(#loc888) + 
%ds_81 = arith.select %tmp38, %grad_scores_69, %ds_80 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc888) + %ds_82 = arith.truncf %ds_81 : tensor<128x64xf32> to tensor<128x64xbf16> loc(#loc889) + %dq_83 = tt.trans %kT {order = array} : tensor<128x64xbf16> -> tensor<64x128xbf16> loc(#loc890) + %dq_84 = arith.constant 0.000000e+00 : f32 loc(#loc891) + %dq_85 = arith.constant dense<0.000000e+00> : tensor<128x128xf32> loc(#loc891) + %dq_86 = tt.dot %ds_82, %dq_83, %dq_85, inputPrecision = tf32 : tensor<128x64xbf16> * tensor<64x128xbf16> -> tensor<128x128xf32> loc(#loc891) + %dq_87 = arith.addf %dq, %dq_86 : tensor<128x128xf32> loc(#loc892) + tt.return %dq_87 : tensor<128x128xf32> loc(#loc347) + ^bb1: // no predecessors + %0 = ub.poison : tensor<128x128xf32> loc(#loc348) + tt.return %0 : tensor<128x128xf32> loc(#loc348) + } loc(#loc279) + tt.func private @"torch._inductor.runtime.compile_tasks.ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.load_checked_2d__Pbf16S128_64S_i32S128S_i32S64S_i32__(3,)cconstexpr_None__(4,)cconstexpr_None__(5,)cconstexpr_True__(6,)cconstexpr_False__(7,)cconstexpr_128_"(%ptr: tensor<128x64x!tt.ptr> loc("ptr"(#loc237)), %offs_m: tensor<128xi32> loc("offs_m"(#loc237)), %offs_n: tensor<64xi32> loc("offs_n"(#loc237)), %N_LEN: i32 loc("N_LEN"(#loc237))) -> tensor<128x64xbf16> attributes {noinline = false} { + %0 = tt.expand_dims %offs_n {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc349) + %1 = tt.splat %N_LEN : i32 -> tensor<1x64xi32> loc(#loc350) + %2 = arith.cmpi slt, %0, %1 : tensor<1x64xi32> loc(#loc350) + %cst = arith.constant 0.000000e+00 : f32 loc(#loc351) + %3 = tt.broadcast %2 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc351) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<128x64xf32> loc(#loc351) + %4 = arith.truncf %cst_0 : tensor<128x64xf32> to tensor<128x64xbf16> loc(#loc351) + %5 = tt.load %ptr, %3, %4 : tensor<128x64x!tt.ptr> loc(#loc351) + tt.return %5 : tensor<128x64xbf16> loc(#loc352) + ^bb1: // no 
predecessors + %6 = ub.poison : tensor<128x64xbf16> loc(#loc248) + tt.return %6 : tensor<128x64xbf16> loc(#loc248) + } loc(#loc237) + tt.func private @torch._inductor.runtime.compile_tasks.ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.get_bounded_indices__i32S1_64S_i32__(%indices: tensor<1x64xi32> loc("indices"(#loc353)), %max_len: i32 loc("max_len"(#loc353))) -> tensor<1x64xi32> attributes {noinline = false} { + %0 = tt.splat %max_len : i32 -> tensor<1x64xi32> loc(#loc354) + %1 = arith.remsi %indices, %0 : tensor<1x64xi32> loc(#loc354) + tt.return %1 : tensor<1x64xi32> loc(#loc355) + ^bb1: // no predecessors + %2 = ub.poison : tensor<1x64xi32> loc(#loc356) + tt.return %2 : tensor<1x64xi32> loc(#loc356) + } loc(#loc353) + tt.func private @torch._inductor.runtime.compile_tasks.ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.get_bounded_indices__i32S128_1S_i32__(%indices: tensor<128x1xi32> loc("indices"(#loc353)), %max_len: i32 loc("max_len"(#loc353))) -> tensor<128x1xi32> attributes {noinline = false} { + %0 = tt.splat %max_len : i32 -> tensor<128x1xi32> loc(#loc354) + %1 = arith.remsi %indices, %0 : tensor<128x1xi32> loc(#loc354) + tt.return %1 : tensor<128x1xi32> loc(#loc355) + ^bb1: // no predecessors + %2 = ub.poison : tensor<128x1xi32> loc(#loc356) + tt.return %2 : tensor<128x1xi32> loc(#loc356) + } loc(#loc353) + tt.func private @"torch._inductor.runtime.compile_tasks.ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.get_offset_for_next_block__i32_Pi32_i32__(3,)cconstexpr_128__(4,)cconstexpr_2__(5,)cconstexpr_64__(6,)cconstexpr_False_"(%loop_iter: i32 loc("loop_iter"(#loc357)), %col_indices: !tt.ptr loc("col_indices"(#loc357)), %total_blocks: i32 loc("total_blocks"(#loc357))) -> i32 attributes {noinline = false} { + %cur_block_idx = arith.constant 2 : i32 loc(#loc899) + %cur_block_idx_0 = arith.constant 2 : i32 loc(#loc899) + %cur_block_idx_1 = arith.divsi %loop_iter, %cur_block_idx_0 : i32 loc(#loc899) + %cur_block = tt.addptr %col_indices, 
%cur_block_idx_1 : !tt.ptr, i32 loc(#loc900) + %cur_block_2 = tt.load %cur_block evictionPolicy = evict_last : !tt.ptr loc(#loc901) + %next_block = arith.constant 1 : i32 loc(#loc902) + %next_block_3 = arith.constant 1 : i32 loc(#loc902) + %next_block_4 = arith.addi %cur_block_idx_1, %next_block_3 : i32 loc(#loc902) + %next_block_5 = arith.cmpi slt, %next_block_4, %total_blocks : i32 loc(#loc903) + %next_block_6 = tt.addptr %col_indices, %cur_block_idx_1 : !tt.ptr, i32 loc(#loc904) + %next_block_7 = arith.constant 1 : i32 loc(#loc905) + %next_block_8 = tt.addptr %next_block_6, %next_block_7 : !tt.ptr, i32 loc(#loc905) + %next_block_9 = tt.load %next_block_8, %next_block_5 evictionPolicy = evict_last : !tt.ptr loc(#loc906) + %needs_jump = arith.constant 1 : i32 loc(#loc907) + %needs_jump_10 = arith.constant 1 : i32 loc(#loc907) + %needs_jump_11 = arith.addi %loop_iter, %needs_jump_10 : i32 loc(#loc907) + %needs_jump_12 = arith.constant 2 : i32 loc(#loc908) + %needs_jump_13 = arith.constant 2 : i32 loc(#loc908) + %needs_jump_14 = arith.remsi %needs_jump_11, %needs_jump_13 : i32 loc(#loc908) + %needs_jump_15 = arith.constant 0 : i32 loc(#loc909) + %needs_jump_16 = arith.cmpi eq, %needs_jump_14, %needs_jump_15 : i32 loc(#loc909) + %jump_to_block = arith.subi %next_block_9, %cur_block_2 : i32 loc(#loc910) + %jump_to_block_17 = arith.constant 128 : i32 loc(#loc911) + %jump_to_block_18 = arith.constant 128 : i32 loc(#loc911) + %jump_to_block_19 = arith.muli %jump_to_block, %jump_to_block_18 : i32 loc(#loc911) + %jump_to_block_20 = arith.constant 64 : i32 loc(#loc912) + %jump_to_block_21 = arith.constant 64 : i32 loc(#loc912) + %jump_to_block_22 = arith.subi %jump_to_block_19, %jump_to_block_21 : i32 loc(#loc912) + %offset = arith.extui %needs_jump_16 : i1 to i32 loc(#loc913) + %offset_23 = arith.muli %jump_to_block_22, %offset : i32 loc(#loc913) + %offset_24 = arith.constant 1 : i32 loc(#loc914) + %offset_25 = arith.constant 1 : i32 loc(#loc914) + %offset_26 = arith.extui 
%needs_jump_16 : i1 to i32 loc(#loc914) + %offset_27 = arith.subi %offset_25, %offset_26 : i32 loc(#loc914) + %offset_28 = arith.constant 64 : i32 loc(#loc915) + %offset_29 = arith.constant 64 : i32 loc(#loc915) + %offset_30 = arith.muli %offset_27, %offset_29 : i32 loc(#loc915) + %offset_31 = arith.addi %offset_23, %offset_30 : i32 loc(#loc916) + tt.return %offset_31 : i32 loc(#loc376) + ^bb1: // no predecessors + %0 = ub.poison : i32 loc(#loc377) + tt.return %0 : i32 loc(#loc377) + } loc(#loc357) + tt.func private @"torch._inductor.runtime.compile_tasks.ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.bwd_dq_inner__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pbf16_Pbf16_Pbf16_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_i32_i32_i32_i32_i32_i32_i32_i32_i32_Pbf16_Pbf16_fp32S128_128S_bf16S128_128S_bf16S128_128S_fp32S128S_fp32S128_1S_i32_i32_i32S128S_i32S64S_i32_i32_i32_i32_Pi32_i32__(44,)cconstexpr_bf16__(45,)cconstexpr_True_"(%arg_Q: !tt.ptr loc("arg_Q"(#loc249)), %arg_K: !tt.ptr loc("arg_K"(#loc249)), %arg_V: !tt.ptr loc("arg_V"(#loc249)), %arg_LSE: !tt.ptr loc("arg_LSE"(#loc249)), %arg_DELTA: !tt.ptr loc("arg_DELTA"(#loc249)), %arg_DO: !tt.ptr loc("arg_DO"(#loc249)), %arg_DQ: !tt.ptr loc("arg_DQ"(#loc249)), %arg_DV: !tt.ptr loc("arg_DV"(#loc249)), %arg_KV_NUM_BLKS: !tt.ptr loc("arg_KV_NUM_BLKS"(#loc249)), %arg_KV_IDX: !tt.ptr loc("arg_KV_IDX"(#loc249)), %arg_Q_NUM_BLKS: !tt.ptr loc("arg_Q_NUM_BLKS"(#loc249)), %arg_Q_IDX: !tt.ptr loc("arg_Q_IDX"(#loc249)), %arg_FULL_KV_NUM_BLKS: !tt.ptr loc("arg_FULL_KV_NUM_BLKS"(#loc249)), %arg_FULL_KV_IDX: !tt.ptr loc("arg_FULL_KV_IDX"(#loc249)), %arg_FULL_Q_NUM_BLKS: !tt.ptr loc("arg_FULL_Q_NUM_BLKS"(#loc249)), %arg_FULL_Q_IDX: !tt.ptr loc("arg_FULL_Q_IDX"(#loc249)), %in_ptr16: !tt.ptr loc("in_ptr16"(#loc249)), %out_ptr0: !tt.ptr loc("out_ptr0"(#loc249)), %ks0: i32 loc("ks0"(#loc249)), %ks1: i32 loc("ks1"(#loc249)), %ks2: i32 loc("ks2"(#loc249)), %ks3: i32 loc("ks3"(#loc249)), %ks4: i32 loc("ks4"(#loc249)), %ks5: i32 
loc("ks5"(#loc249)), %ks6: i32 loc("ks6"(#loc249)), %ks7: i32 loc("ks7"(#loc249)), %ks8: i32 loc("ks8"(#loc249)), %K: !tt.ptr loc("K"(#loc249)), %V: !tt.ptr loc("V"(#loc249)), %dq: tensor<128x128xf32> loc("dq"(#loc249)), %q: tensor<128x128xbf16> loc("q"(#loc249)), %do: tensor<128x128xbf16> loc("do"(#loc249)), %Di: tensor<128xf32> loc("Di"(#loc249)), %lse: tensor<128x1xf32> loc("lse"(#loc249)), %off_z: i32 loc("off_z"(#loc249)), %off_hq: i32 loc("off_hq"(#loc249)), %offs_m2: tensor<128xi32> loc("offs_m2"(#loc249)), %offs_n2: tensor<64xi32> loc("offs_n2"(#loc249)), %stride_kn: i32 loc("stride_kn"(#loc249)), %stride_kd: i32 loc("stride_kd"(#loc249)), %stride_vn: i32 loc("stride_vn"(#loc249)), %stride_vd: i32 loc("stride_vd"(#loc249)), %kv_indices: !tt.ptr loc("kv_indices"(#loc249)), %sparse_kv_num_blocks: i32 loc("sparse_kv_num_blocks"(#loc249))) -> tensor<128x128xf32> attributes {noinline = false} { + %offs_k = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc752) + %offs_v = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc753) + %kT_ptrs = tt.expand_dims %offs_n2 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc754) + %kT_ptrs_0 = tt.splat %stride_kn : i32 -> tensor<1x64xi32> loc(#loc755) + %kT_ptrs_1 = arith.muli %kT_ptrs, %kT_ptrs_0 : tensor<1x64xi32> loc(#loc755) + %kT_ptrs_2 = tt.splat %K : !tt.ptr -> tensor<1x64x!tt.ptr> loc(#loc756) + %kT_ptrs_3 = tt.addptr %kT_ptrs_2, %kT_ptrs_1 : tensor<1x64x!tt.ptr>, tensor<1x64xi32> loc(#loc756) + %kT_ptrs_4 = tt.expand_dims %offs_k {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc757) + %kT_ptrs_5 = tt.splat %stride_kd : i32 -> tensor<128x1xi32> loc(#loc758) + %kT_ptrs_6 = arith.muli %kT_ptrs_4, %kT_ptrs_5 : tensor<128x1xi32> loc(#loc758) + %kT_ptrs_7 = tt.broadcast %kT_ptrs_3 : tensor<1x64x!tt.ptr> -> tensor<128x64x!tt.ptr> loc(#loc759) + %kT_ptrs_8 = tt.broadcast %kT_ptrs_6 : tensor<128x1xi32> -> tensor<128x64xi32> loc(#loc759) + %kT_ptrs_9 
= tt.addptr %kT_ptrs_7, %kT_ptrs_8 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc759) + %vT_ptrs = tt.expand_dims %offs_n2 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc760) + %vT_ptrs_10 = tt.splat %stride_vn : i32 -> tensor<1x64xi32> loc(#loc761) + %vT_ptrs_11 = arith.muli %vT_ptrs, %vT_ptrs_10 : tensor<1x64xi32> loc(#loc761) + %vT_ptrs_12 = tt.splat %V : !tt.ptr -> tensor<1x64x!tt.ptr> loc(#loc762) + %vT_ptrs_13 = tt.addptr %vT_ptrs_12, %vT_ptrs_11 : tensor<1x64x!tt.ptr>, tensor<1x64xi32> loc(#loc762) + %vT_ptrs_14 = tt.expand_dims %offs_v {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc763) + %vT_ptrs_15 = tt.splat %stride_vd : i32 -> tensor<128x1xi32> loc(#loc764) + %vT_ptrs_16 = arith.muli %vT_ptrs_14, %vT_ptrs_15 : tensor<128x1xi32> loc(#loc764) + %vT_ptrs_17 = tt.broadcast %vT_ptrs_13 : tensor<1x64x!tt.ptr> -> tensor<128x64x!tt.ptr> loc(#loc765) + %vT_ptrs_18 = tt.broadcast %vT_ptrs_16 : tensor<128x1xi32> -> tensor<128x64xi32> loc(#loc765) + %vT_ptrs_19 = tt.addptr %vT_ptrs_17, %vT_ptrs_18 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc765) + %hi = arith.constant 2 : i32 loc(#loc766) + %hi_20 = arith.constant 2 : i32 loc(#loc766) + %hi_21 = arith.muli %sparse_kv_num_blocks, %hi_20 : i32 loc(#loc766) + %hi_22 = tt.call @"triton.language.standard.cdiv__i32__(1,)cconstexpr_64_"(%ks1) : (i32) -> i32 loc(#loc767) + %hi_23 = arith.constant 1 : i32 loc(#loc768) + %hi_24 = arith.maxsi %hi_22, %hi_23 : i32 loc(#loc768) + %hi_25 = arith.minsi %hi_21, %hi_24 : i32 loc(#loc769) + %c0_i32 = arith.constant 0 : i32 loc(#loc268) + %c1_i32 = arith.constant 1 : i32 loc(#loc268) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc268) + %1 = arith.bitcast %hi_25 : i32 to i32 loc(#loc268) + %2 = arith.bitcast %c1_i32 : i32 to i32 loc(#loc268) + %3 = ub.poison : i32 loc(#loc268) + %vT_ptrs_26:4 = scf.for %start_n = %0 to %1 step %2 iter_args(%dq_27 = %dq, %offs_n2_28 = %offs_n2, %kT_ptrs_29 = %kT_ptrs_9, %vT_ptrs_30 = %vT_ptrs_19) -> 
(tensor<128x128xf32>, tensor<64xi32>, tensor<128x64x!tt.ptr>, tensor<128x64x!tt.ptr>) : i32 { + %dq_31 = tt.call @"torch._inductor.runtime.compile_tasks.ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.bwd_dq_block_mn__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pbf16_Pbf16_Pbf16_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_i32_i32_i32_i32_i32_i32_i32_i32_i32_fp32S128_128S_bf16S128_128S_Pbf16S128_64S_Pbf16S128_64S_bf16S128_128S_fp32S128S_fp32S128_1S_i32_i32_i32_i32_i32S128S_i32S64S_i32S128S_i32S128S_i32_i32_i32_i32_Pi32_i32__(48,)cconstexpr_bf16__(49,)cconstexpr_1_d_44269504__(50,)cconstexpr_True_"(%arg_Q, %arg_K, %arg_V, %arg_LSE, %arg_DELTA, %arg_DO, %arg_DQ, %arg_DV, %arg_KV_NUM_BLKS, %arg_KV_IDX, %arg_Q_NUM_BLKS, %arg_Q_IDX, %arg_FULL_KV_NUM_BLKS, %arg_FULL_KV_IDX, %arg_FULL_Q_NUM_BLKS, %arg_FULL_Q_IDX, %in_ptr16, %out_ptr0, %ks0, %ks1, %ks2, %ks3, %ks4, %ks5, %ks6, %ks7, %ks8, %dq_27, %q, %kT_ptrs_29, %vT_ptrs_30, %do, %Di, %lse, %ks0, %ks1, %off_z, %off_hq, %offs_m2, %offs_n2_28, %offs_k, %offs_v, %stride_kn, %stride_kd, %stride_vn, %stride_vd, %kv_indices, %sparse_kv_num_blocks) : (!tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, i32, i32, i32, i32, i32, i32, i32, i32, i32, tensor<128x128xf32>, tensor<128x128xbf16>, tensor<128x64x!tt.ptr>, tensor<128x64x!tt.ptr>, tensor<128x128xbf16>, tensor<128xf32>, tensor<128x1xf32>, i32, i32, i32, i32, tensor<128xi32>, tensor<64xi32>, tensor<128xi32>, tensor<128xi32>, i32, i32, i32, i32, !tt.ptr, i32) -> tensor<128x128xf32> loc(#loc771) + %offset = tt.call @"torch._inductor.runtime.compile_tasks.ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.get_offset_for_next_block__i32_Pi32_i32__(3,)cconstexpr_128__(4,)cconstexpr_2__(5,)cconstexpr_64__(6,)cconstexpr_False_"(%start_n, %kv_indices, %sparse_kv_num_blocks) : (i32, !tt.ptr, i32) -> i32 loc(#loc772) + %kT_ptrs_32 = arith.muli %offset, %stride_kn : i32 
loc(#loc773) + %kT_ptrs_33 = tt.splat %kT_ptrs_32 : i32 -> tensor<128x64xi32> loc(#loc774) + %kT_ptrs_34 = tt.addptr %kT_ptrs_29, %kT_ptrs_33 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc774) + %vT_ptrs_35 = arith.muli %offset, %stride_vn : i32 loc(#loc775) + %vT_ptrs_36 = tt.splat %vT_ptrs_35 : i32 -> tensor<128x64xi32> loc(#loc776) + %vT_ptrs_37 = tt.addptr %vT_ptrs_30, %vT_ptrs_36 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc776) + %offs_n2_38 = tt.splat %offset : i32 -> tensor<64xi32> loc(#loc777) + %offs_n2_39 = arith.addi %offs_n2_28, %offs_n2_38 : tensor<64xi32> loc(#loc777) + scf.yield %dq_31, %offs_n2_39, %kT_ptrs_34, %vT_ptrs_37 : tensor<128x128xf32>, tensor<64xi32>, tensor<128x64x!tt.ptr>, tensor<128x64x!tt.ptr> loc(#loc276) + } loc(#loc1120) + tt.return %vT_ptrs_26#0 : tensor<128x128xf32> loc(#loc277) + ^bb1: // no predecessors + %4 = ub.poison : tensor<128x128xf32> loc(#loc278) + tt.return %4 : tensor<128x128xf32> loc(#loc278) + } loc(#loc249) + tt.func private @"torch._inductor.runtime.compile_tasks.ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.bwd_dq_block_mn__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pbf16_Pbf16_Pbf16_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_i32_i32_i32_i32_i32_i32_i32_i32_i32_fp32S128_128S_bf16S128_128S_Pbf16S128_64S_Pbf16S128_64S_bf16S128_128S_fp32S128S_fp32S128_1S_i32_i32_i32_i32_i32S128S_i32S64S_i32S128S_i32S128S_i32_i32_i32_i32_Pi32_i32__(48,)cconstexpr_bf16__(49,)cconstexpr_1_d_44269504__(50,)cconstexpr_True_"(%arg_Q: !tt.ptr loc("arg_Q"(#loc279)), %arg_K: !tt.ptr loc("arg_K"(#loc279)), %arg_V: !tt.ptr loc("arg_V"(#loc279)), %arg_LSE: !tt.ptr loc("arg_LSE"(#loc279)), %arg_DELTA: !tt.ptr loc("arg_DELTA"(#loc279)), %arg_DO: !tt.ptr loc("arg_DO"(#loc279)), %arg_DQ: !tt.ptr loc("arg_DQ"(#loc279)), %arg_DV: !tt.ptr loc("arg_DV"(#loc279)), %arg_KV_NUM_BLKS: !tt.ptr loc("arg_KV_NUM_BLKS"(#loc279)), %arg_KV_IDX: !tt.ptr loc("arg_KV_IDX"(#loc279)), %arg_Q_NUM_BLKS: !tt.ptr loc("arg_Q_NUM_BLKS"(#loc279)), 
%arg_Q_IDX: !tt.ptr loc("arg_Q_IDX"(#loc279)), %arg_FULL_KV_NUM_BLKS: !tt.ptr loc("arg_FULL_KV_NUM_BLKS"(#loc279)), %arg_FULL_KV_IDX: !tt.ptr loc("arg_FULL_KV_IDX"(#loc279)), %arg_FULL_Q_NUM_BLKS: !tt.ptr loc("arg_FULL_Q_NUM_BLKS"(#loc279)), %arg_FULL_Q_IDX: !tt.ptr loc("arg_FULL_Q_IDX"(#loc279)), %in_ptr16: !tt.ptr loc("in_ptr16"(#loc279)), %out_ptr0: !tt.ptr loc("out_ptr0"(#loc279)), %ks0: i32 loc("ks0"(#loc279)), %ks1: i32 loc("ks1"(#loc279)), %ks2: i32 loc("ks2"(#loc279)), %ks3: i32 loc("ks3"(#loc279)), %ks4: i32 loc("ks4"(#loc279)), %ks5: i32 loc("ks5"(#loc279)), %ks6: i32 loc("ks6"(#loc279)), %ks7: i32 loc("ks7"(#loc279)), %ks8: i32 loc("ks8"(#loc279)), %dq: tensor<128x128xf32> loc("dq"(#loc279)), %q: tensor<128x128xbf16> loc("q"(#loc279)), %kT_ptrs: tensor<128x64x!tt.ptr> loc("kT_ptrs"(#loc279)), %vT_ptrs: tensor<128x64x!tt.ptr> loc("vT_ptrs"(#loc279)), %do: tensor<128x128xbf16> loc("do"(#loc279)), %Di: tensor<128xf32> loc("Di"(#loc279)), %lse: tensor<128x1xf32> loc("lse"(#loc279)), %Q_LEN: i32 loc("Q_LEN"(#loc279)), %KV_LEN: i32 loc("KV_LEN"(#loc279)), %off_z: i32 loc("off_z"(#loc279)), %off_hq: i32 loc("off_hq"(#loc279)), %offs_m2: tensor<128xi32> loc("offs_m2"(#loc279)), %offs_n2: tensor<64xi32> loc("offs_n2"(#loc279)), %offs_k: tensor<128xi32> loc("offs_k"(#loc279)), %offs_v: tensor<128xi32> loc("offs_v"(#loc279)), %stride_kn: i32 loc("stride_kn"(#loc279)), %stride_kd: i32 loc("stride_kd"(#loc279)), %stride_vn: i32 loc("stride_vn"(#loc279)), %stride_vd: i32 loc("stride_vd"(#loc279)), %kv_indices: !tt.ptr loc("kv_indices"(#loc279)), %sparse_kv_num_blocks: i32 loc("sparse_kv_num_blocks"(#loc279))) -> tensor<128x128xf32> attributes {noinline = false} { + %kT = tt.call @"torch._inductor.runtime.compile_tasks.ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.load_checked_2d__Pbf16S128_64S_i32S128S_i32S64S_i32__(3,)cconstexpr_None__(4,)cconstexpr_None__(5,)cconstexpr_True__(6,)cconstexpr_False__(7,)cconstexpr_128_"(%kT_ptrs, %offs_k, %offs_n2, %KV_LEN) : 
(tensor<128x64x!tt.ptr>, tensor<128xi32>, tensor<64xi32>, i32) -> tensor<128x64xbf16> loc(#loc826) + %qk = arith.constant 0.000000e+00 : f32 loc(#loc827) + %qk_0 = arith.constant dense<0.000000e+00> : tensor<128x64xf32> loc(#loc827) + %qk_1 = tt.dot %q, %kT, %qk_0, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc827) + %qk_2 = arith.constant 0.0883883461 : f32 loc(#loc828) + %qk_3 = arith.constant 0.0883883461 : f32 loc(#loc828) + %qk_4 = arith.constant dense<0.0883883461> : tensor<128x64xf32> loc(#loc828) + %qk_5 = arith.mulf %qk_1, %qk_4 : tensor<128x64xf32> loc(#loc828) + %n = tt.expand_dims %offs_n2 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc829) + %n_6 = tt.call @torch._inductor.runtime.compile_tasks.ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.get_bounded_indices__i32S1_64S_i32__(%n, %KV_LEN) : (tensor<1x64xi32>, i32) -> tensor<1x64xi32> loc(#loc830) + %m = tt.expand_dims %offs_m2 {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc831) + %m_7 = tt.call @torch._inductor.runtime.compile_tasks.ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.get_bounded_indices__i32S128_1S_i32__(%m, %Q_LEN) : (tensor<128x1xi32>, i32) -> tensor<128x1xi32> loc(#loc832) + %post_mod_scores = tt.expand_dims %offs_n2 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc833) + %post_mod_scores_8 = tt.splat %KV_LEN : i32 -> tensor<1x64xi32> loc(#loc834) + %post_mod_scores_9 = arith.cmpi slt, %post_mod_scores, %post_mod_scores_8 : tensor<1x64xi32> loc(#loc834) + %post_mod_scores_10 = arith.constant 0xFF800000 : f32 loc(#loc835) + %post_mod_scores_11 = arith.constant 0xFF800000 : f32 loc(#loc835) + %post_mod_scores_12 = arith.constant dense<0xFF800000> : tensor<128x64xf32> loc(#loc835) + %post_mod_scores_13 = tt.broadcast %post_mod_scores_9 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc835) + %post_mod_scores_14 = arith.select %post_mod_scores_13, %qk_5, %post_mod_scores_12 : 
tensor<128x64xi1>, tensor<128x64xf32> loc(#loc835) + %post_mod_scores_15 = arith.constant 1.44269502 : f32 loc(#loc872) + %post_mod_scores_16 = arith.constant 1.44269502 : f32 loc(#loc872) + %post_mod_scores_17 = arith.constant dense<1.44269502> : tensor<128x64xf32> loc(#loc872) + %post_mod_scores_18 = arith.mulf %post_mod_scores_14, %post_mod_scores_17 : tensor<128x64xf32> loc(#loc872) + %p = tt.broadcast %lse : tensor<128x1xf32> -> tensor<128x64xf32> loc(#loc873) + %p_19 = arith.subf %post_mod_scores_18, %p : tensor<128x64xf32> loc(#loc873) + %p_20 = math.exp2 %p_19 : tensor<128x64xf32> loc(#loc874) + %vT = tt.call @"torch._inductor.runtime.compile_tasks.ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.load_checked_2d__Pbf16S128_64S_i32S128S_i32S64S_i32__(3,)cconstexpr_None__(4,)cconstexpr_None__(5,)cconstexpr_True__(6,)cconstexpr_False__(7,)cconstexpr_128_"(%vT_ptrs, %offs_v, %offs_n2, %KV_LEN) : (tensor<128x64x!tt.ptr>, tensor<128xi32>, tensor<64xi32>, i32) -> tensor<128x64xbf16> loc(#loc875) + %dp = arith.constant 0.000000e+00 : f32 loc(#loc876) + %dp_21 = arith.constant dense<0.000000e+00> : tensor<128x64xf32> loc(#loc876) + %dp_22 = tt.dot %do, %vT, %dp_21, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc876) + %ds = tt.expand_dims %Di {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc877) + %ds_23 = tt.broadcast %ds : tensor<128x1xf32> -> tensor<128x64xf32> loc(#loc878) + %ds_24 = arith.subf %dp_22, %ds_23 : tensor<128x64xf32> loc(#loc878) + %ds_25 = arith.mulf %p_20, %ds_24 : tensor<128x64xf32> loc(#loc879) + %grad_scores = tt.expand_dims %offs_n2 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc880) + %grad_scores_26 = tt.splat %KV_LEN : i32 -> tensor<1x64xi32> loc(#loc881) + %grad_scores_27 = arith.cmpi slt, %grad_scores, %grad_scores_26 : tensor<1x64xi32> loc(#loc881) + %grad_scores_28 = arith.constant 0.000000e+00 : f32 loc(#loc882) + %grad_scores_29 = arith.constant 
0.000000e+00 : f32 loc(#loc882) + %grad_scores_30 = arith.constant dense<0.000000e+00> : tensor<128x64xf32> loc(#loc882) + %grad_scores_31 = tt.broadcast %grad_scores_27 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc882) + %grad_scores_32 = arith.select %grad_scores_31, %ds_25, %grad_scores_30 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc882) + %scatter_mask = tt.expand_dims %offs_m2 {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc883) + %scatter_mask_33 = tt.splat %Q_LEN : i32 -> tensor<128x1xi32> loc(#loc884) + %scatter_mask_34 = arith.cmpi slt, %scatter_mask, %scatter_mask_33 : tensor<128x1xi32> loc(#loc884) + %scatter_mask_35 = tt.expand_dims %offs_n2 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc885) + %scatter_mask_36 = tt.splat %KV_LEN : i32 -> tensor<1x64xi32> loc(#loc886) + %scatter_mask_37 = arith.cmpi slt, %scatter_mask_35, %scatter_mask_36 : tensor<1x64xi32> loc(#loc886) + %scatter_mask_38 = tt.broadcast %scatter_mask_34 : tensor<128x1xi1> -> tensor<128x64xi1> loc(#loc887) + %scatter_mask_39 = tt.broadcast %scatter_mask_37 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc887) + %scatter_mask_40 = arith.andi %scatter_mask_38, %scatter_mask_39 : tensor<128x64xi1> loc(#loc887) + %ds_41 = arith.truncf %grad_scores_32 : tensor<128x64xf32> to tensor<128x64xbf16> loc(#loc889) + %dq_42 = tt.trans %kT {order = array} : tensor<128x64xbf16> -> tensor<64x128xbf16> loc(#loc890) + %dq_43 = arith.constant 0.000000e+00 : f32 loc(#loc891) + %dq_44 = arith.constant dense<0.000000e+00> : tensor<128x128xf32> loc(#loc891) + %dq_45 = tt.dot %ds_41, %dq_42, %dq_44, inputPrecision = tf32 : tensor<128x64xbf16> * tensor<64x128xbf16> -> tensor<128x128xf32> loc(#loc891) + %dq_46 = arith.addf %dq, %dq_45 : tensor<128x128xf32> loc(#loc892) + tt.return %dq_46 : tensor<128x128xf32> loc(#loc347) + ^bb1: // no predecessors + %0 = ub.poison : tensor<128x128xf32> loc(#loc348) + tt.return %0 : tensor<128x128xf32> loc(#loc348) + } loc(#loc279) + tt.func 
private @"torch._inductor.runtime.compile_tasks.ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.bwd_dkdv_inner__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pbf16_Pbf16_Pbf16_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_i32_i32_i32_i32_i32_i32_i32_i32_i32_Pbf16_Pbf16_Pfp32_Pfp32_fp32S128_128S_fp32S128_128S_bf16S128_128S_bf16S128_128S_i32_i32_i32S128S_i32S64S_i32_i32_i32_i32_Pi32_i32__(45,)cconstexpr_bf16__(46,)cconstexpr_False_"(%arg_Q: !tt.ptr loc("arg_Q"(#loc378)), %arg_K: !tt.ptr loc("arg_K"(#loc378)), %arg_V: !tt.ptr loc("arg_V"(#loc378)), %arg_LSE: !tt.ptr loc("arg_LSE"(#loc378)), %arg_DELTA: !tt.ptr loc("arg_DELTA"(#loc378)), %arg_DO: !tt.ptr loc("arg_DO"(#loc378)), %arg_DQ: !tt.ptr loc("arg_DQ"(#loc378)), %arg_DV: !tt.ptr loc("arg_DV"(#loc378)), %arg_KV_NUM_BLKS: !tt.ptr loc("arg_KV_NUM_BLKS"(#loc378)), %arg_KV_IDX: !tt.ptr loc("arg_KV_IDX"(#loc378)), %arg_Q_NUM_BLKS: !tt.ptr loc("arg_Q_NUM_BLKS"(#loc378)), %arg_Q_IDX: !tt.ptr loc("arg_Q_IDX"(#loc378)), %arg_FULL_KV_NUM_BLKS: !tt.ptr loc("arg_FULL_KV_NUM_BLKS"(#loc378)), %arg_FULL_KV_IDX: !tt.ptr loc("arg_FULL_KV_IDX"(#loc378)), %arg_FULL_Q_NUM_BLKS: !tt.ptr loc("arg_FULL_Q_NUM_BLKS"(#loc378)), %arg_FULL_Q_IDX: !tt.ptr loc("arg_FULL_Q_IDX"(#loc378)), %in_ptr16: !tt.ptr loc("in_ptr16"(#loc378)), %out_ptr0: !tt.ptr loc("out_ptr0"(#loc378)), %ks0: i32 loc("ks0"(#loc378)), %ks1: i32 loc("ks1"(#loc378)), %ks2: i32 loc("ks2"(#loc378)), %ks3: i32 loc("ks3"(#loc378)), %ks4: i32 loc("ks4"(#loc378)), %ks5: i32 loc("ks5"(#loc378)), %ks6: i32 loc("ks6"(#loc378)), %ks7: i32 loc("ks7"(#loc378)), %ks8: i32 loc("ks8"(#loc378)), %Q: !tt.ptr loc("Q"(#loc378)), %DO: !tt.ptr loc("DO"(#loc378)), %DELTA: !tt.ptr loc("DELTA"(#loc378)), %LSE: !tt.ptr loc("LSE"(#loc378)), %dk: tensor<128x128xf32> loc("dk"(#loc378)), %dv: tensor<128x128xf32> loc("dv"(#loc378)), %k: tensor<128x128xbf16> loc("k"(#loc378)), %v: tensor<128x128xbf16> loc("v"(#loc378)), %off_z: i32 loc("off_z"(#loc378)), %off_hq: i32 loc("off_hq"(#loc378)), %offs_n1: 
tensor<128xi32> loc("offs_n1"(#loc378)), %offs_m1: tensor<64xi32> loc("offs_m1"(#loc378)), %stride_qm: i32 loc("stride_qm"(#loc378)), %stride_qd: i32 loc("stride_qd"(#loc378)), %stride_dom: i32 loc("stride_dom"(#loc378)), %stride_dod: i32 loc("stride_dod"(#loc378)), %q_indices: !tt.ptr loc("q_indices"(#loc378)), %sparse_q_num_blocks: i32 loc("sparse_q_num_blocks"(#loc378))) -> (tensor<128x128xf32>, tensor<128x128xf32>) attributes {noinline = false} { + %offs_k = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc962) + %offs_v = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc963) + %qT_ptrs = tt.expand_dims %offs_m1 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc964) + %qT_ptrs_0 = tt.splat %stride_qm : i32 -> tensor<1x64xi32> loc(#loc965) + %qT_ptrs_1 = arith.muli %qT_ptrs, %qT_ptrs_0 : tensor<1x64xi32> loc(#loc965) + %qT_ptrs_2 = tt.splat %Q : !tt.ptr -> tensor<1x64x!tt.ptr> loc(#loc966) + %qT_ptrs_3 = tt.addptr %qT_ptrs_2, %qT_ptrs_1 : tensor<1x64x!tt.ptr>, tensor<1x64xi32> loc(#loc966) + %qT_ptrs_4 = tt.expand_dims %offs_k {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc967) + %qT_ptrs_5 = tt.splat %stride_qd : i32 -> tensor<128x1xi32> loc(#loc968) + %qT_ptrs_6 = arith.muli %qT_ptrs_4, %qT_ptrs_5 : tensor<128x1xi32> loc(#loc968) + %qT_ptrs_7 = tt.broadcast %qT_ptrs_3 : tensor<1x64x!tt.ptr> -> tensor<128x64x!tt.ptr> loc(#loc969) + %qT_ptrs_8 = tt.broadcast %qT_ptrs_6 : tensor<128x1xi32> -> tensor<128x64xi32> loc(#loc969) + %qT_ptrs_9 = tt.addptr %qT_ptrs_7, %qT_ptrs_8 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc969) + %do_ptrs = tt.expand_dims %offs_m1 {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc970) + %do_ptrs_10 = tt.splat %stride_dom : i32 -> tensor<64x1xi32> loc(#loc971) + %do_ptrs_11 = arith.muli %do_ptrs, %do_ptrs_10 : tensor<64x1xi32> loc(#loc971) + %do_ptrs_12 = tt.splat %DO : !tt.ptr -> tensor<64x1x!tt.ptr> loc(#loc972) + %do_ptrs_13 = tt.addptr 
%do_ptrs_12, %do_ptrs_11 : tensor<64x1x!tt.ptr>, tensor<64x1xi32> loc(#loc972) + %do_ptrs_14 = tt.expand_dims %offs_v {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc973) + %do_ptrs_15 = tt.splat %stride_dod : i32 -> tensor<1x128xi32> loc(#loc974) + %do_ptrs_16 = arith.muli %do_ptrs_14, %do_ptrs_15 : tensor<1x128xi32> loc(#loc974) + %do_ptrs_17 = tt.broadcast %do_ptrs_13 : tensor<64x1x!tt.ptr> -> tensor<64x128x!tt.ptr> loc(#loc975) + %do_ptrs_18 = tt.broadcast %do_ptrs_16 : tensor<1x128xi32> -> tensor<64x128xi32> loc(#loc975) + %do_ptrs_19 = tt.addptr %do_ptrs_17, %do_ptrs_18 : tensor<64x128x!tt.ptr>, tensor<64x128xi32> loc(#loc975) + %hi = arith.constant 2 : i32 loc(#loc976) + %hi_20 = arith.constant 2 : i32 loc(#loc976) + %hi_21 = arith.muli %sparse_q_num_blocks, %hi_20 : i32 loc(#loc976) + %hi_22 = tt.call @"triton.language.standard.cdiv__i32__(1,)cconstexpr_64_"(%ks0) : (i32) -> i32 loc(#loc977) + %hi_23 = arith.constant 1 : i32 loc(#loc978) + %hi_24 = arith.maxsi %hi_22, %hi_23 : i32 loc(#loc978) + %hi_25 = arith.minsi %hi_21, %hi_24 : i32 loc(#loc979) + %c0_i32 = arith.constant 0 : i32 loc(#loc397) + %c1_i32 = arith.constant 1 : i32 loc(#loc397) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc397) + %1 = arith.bitcast %hi_25 : i32 to i32 loc(#loc397) + %2 = arith.bitcast %c1_i32 : i32 to i32 loc(#loc397) + %3 = ub.poison : i32 loc(#loc397) + %do_ptrs_26:5 = scf.for %start_m = %0 to %1 step %2 iter_args(%dk_27 = %dk, %dv_28 = %dv, %offs_m1_29 = %offs_m1, %qT_ptrs_30 = %qT_ptrs_9, %do_ptrs_31 = %do_ptrs_19) -> (tensor<128x128xf32>, tensor<128x128xf32>, tensor<64xi32>, tensor<128x64x!tt.ptr>, tensor<64x128x!tt.ptr>) : i32 { + %6:2 = tt.call 
@"torch._inductor.runtime.compile_tasks.ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.bwd_dkdv_block_mn__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pbf16_Pbf16_Pbf16_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_i32_i32_i32_i32_i32_i32_i32_i32_i32_fp32S128_128S_fp32S128_128S_Pbf16S128_64S_bf16S128_128S_bf16S128_128S_Pbf16S64_128S_Pfp32_Pfp32_i32_i32_i32_i32_i32S128S_i32S64S_i32S128S_i32S128S_i32_i32_i32_i32_Pi32_i32__(49,)cconstexpr_bf16__(50,)cconstexpr_1_d_44269504__(51,)cconstexpr_False_"(%arg_Q, %arg_K, %arg_V, %arg_LSE, %arg_DELTA, %arg_DO, %arg_DQ, %arg_DV, %arg_KV_NUM_BLKS, %arg_KV_IDX, %arg_Q_NUM_BLKS, %arg_Q_IDX, %arg_FULL_KV_NUM_BLKS, %arg_FULL_KV_IDX, %arg_FULL_Q_NUM_BLKS, %arg_FULL_Q_IDX, %in_ptr16, %out_ptr0, %ks0, %ks1, %ks2, %ks3, %ks4, %ks5, %ks6, %ks7, %ks8, %dk_27, %dv_28, %qT_ptrs_30, %k, %v, %do_ptrs_31, %DELTA, %LSE, %ks0, %ks1, %off_z, %off_hq, %offs_n1, %offs_m1_29, %offs_k, %offs_v, %stride_qm, %stride_qd, %stride_dom, %stride_dod, %q_indices, %sparse_q_num_blocks) : (!tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, i32, i32, i32, i32, i32, i32, i32, i32, i32, tensor<128x128xf32>, tensor<128x128xf32>, tensor<128x64x!tt.ptr>, tensor<128x128xbf16>, tensor<128x128xbf16>, tensor<64x128x!tt.ptr>, !tt.ptr, !tt.ptr, i32, i32, i32, i32, tensor<128xi32>, tensor<64xi32>, tensor<128xi32>, tensor<128xi32>, i32, i32, i32, i32, !tt.ptr, i32) -> (tensor<128x128xf32>, tensor<128x128xf32>) loc(#loc398) + %offset = tt.call @"torch._inductor.runtime.compile_tasks.ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.get_offset_for_next_block__i32_Pi32_i32__(3,)cconstexpr_128__(4,)cconstexpr_2__(5,)cconstexpr_64__(6,)cconstexpr_False_"(%start_m, %q_indices, %sparse_q_num_blocks) : (i32, !tt.ptr, i32) -> i32 loc(#loc981) + %qT_ptrs_32 = arith.muli %offset, %stride_qm : i32 loc(#loc982) + %qT_ptrs_33 = tt.splat %qT_ptrs_32 : i32 -> tensor<128x64xi32> 
loc(#loc983) + %qT_ptrs_34 = tt.addptr %qT_ptrs_30, %qT_ptrs_33 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc983) + %do_ptrs_35 = arith.muli %offset, %stride_dom : i32 loc(#loc984) + %do_ptrs_36 = tt.splat %do_ptrs_35 : i32 -> tensor<64x128xi32> loc(#loc985) + %do_ptrs_37 = tt.addptr %do_ptrs_31, %do_ptrs_36 : tensor<64x128x!tt.ptr>, tensor<64x128xi32> loc(#loc985) + %offs_m1_38 = tt.splat %offset : i32 -> tensor<64xi32> loc(#loc986) + %offs_m1_39 = arith.addi %offs_m1_29, %offs_m1_38 : tensor<64xi32> loc(#loc986) + scf.yield %6#0, %6#1, %offs_m1_39, %qT_ptrs_34, %do_ptrs_37 : tensor<128x128xf32>, tensor<128x128xf32>, tensor<64xi32>, tensor<128x64x!tt.ptr>, tensor<64x128x!tt.ptr> loc(#loc405) + } loc(#loc1122) + tt.return %do_ptrs_26#0, %do_ptrs_26#1 : tensor<128x128xf32>, tensor<128x128xf32> loc(#loc406) + ^bb1: // no predecessors + %4 = ub.poison : tensor<128x128xf32> loc(#loc407) + %5 = ub.poison : tensor<128x128xf32> loc(#loc407) + tt.return %4, %5 : tensor<128x128xf32>, tensor<128x128xf32> loc(#loc407) + } loc(#loc378) + tt.func private @"torch._inductor.runtime.compile_tasks.ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.bwd_dkdv_block_mn__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pbf16_Pbf16_Pbf16_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_i32_i32_i32_i32_i32_i32_i32_i32_i32_fp32S128_128S_fp32S128_128S_Pbf16S128_64S_bf16S128_128S_bf16S128_128S_Pbf16S64_128S_Pfp32_Pfp32_i32_i32_i32_i32_i32S128S_i32S64S_i32S128S_i32S128S_i32_i32_i32_i32_Pi32_i32__(49,)cconstexpr_bf16__(50,)cconstexpr_1_d_44269504__(51,)cconstexpr_False_"(%arg_Q: !tt.ptr loc("arg_Q"(#loc408)), %arg_K: !tt.ptr loc("arg_K"(#loc408)), %arg_V: !tt.ptr loc("arg_V"(#loc408)), %arg_LSE: !tt.ptr loc("arg_LSE"(#loc408)), %arg_DELTA: !tt.ptr loc("arg_DELTA"(#loc408)), %arg_DO: !tt.ptr loc("arg_DO"(#loc408)), %arg_DQ: !tt.ptr loc("arg_DQ"(#loc408)), %arg_DV: !tt.ptr loc("arg_DV"(#loc408)), %arg_KV_NUM_BLKS: !tt.ptr loc("arg_KV_NUM_BLKS"(#loc408)), %arg_KV_IDX: !tt.ptr loc("arg_KV_IDX"(#loc408)), 
%arg_Q_NUM_BLKS: !tt.ptr loc("arg_Q_NUM_BLKS"(#loc408)), %arg_Q_IDX: !tt.ptr loc("arg_Q_IDX"(#loc408)), %arg_FULL_KV_NUM_BLKS: !tt.ptr loc("arg_FULL_KV_NUM_BLKS"(#loc408)), %arg_FULL_KV_IDX: !tt.ptr loc("arg_FULL_KV_IDX"(#loc408)), %arg_FULL_Q_NUM_BLKS: !tt.ptr loc("arg_FULL_Q_NUM_BLKS"(#loc408)), %arg_FULL_Q_IDX: !tt.ptr loc("arg_FULL_Q_IDX"(#loc408)), %in_ptr16: !tt.ptr loc("in_ptr16"(#loc408)), %out_ptr0: !tt.ptr loc("out_ptr0"(#loc408)), %ks0: i32 loc("ks0"(#loc408)), %ks1: i32 loc("ks1"(#loc408)), %ks2: i32 loc("ks2"(#loc408)), %ks3: i32 loc("ks3"(#loc408)), %ks4: i32 loc("ks4"(#loc408)), %ks5: i32 loc("ks5"(#loc408)), %ks6: i32 loc("ks6"(#loc408)), %ks7: i32 loc("ks7"(#loc408)), %ks8: i32 loc("ks8"(#loc408)), %dk: tensor<128x128xf32> loc("dk"(#loc408)), %dv: tensor<128x128xf32> loc("dv"(#loc408)), %qT_ptrs: tensor<128x64x!tt.ptr> loc("qT_ptrs"(#loc408)), %k: tensor<128x128xbf16> loc("k"(#loc408)), %v: tensor<128x128xbf16> loc("v"(#loc408)), %do_ptrs: tensor<64x128x!tt.ptr> loc("do_ptrs"(#loc408)), %DELTA: !tt.ptr loc("DELTA"(#loc408)), %LSE: !tt.ptr loc("LSE"(#loc408)), %Q_LEN: i32 loc("Q_LEN"(#loc408)), %KV_LEN: i32 loc("KV_LEN"(#loc408)), %off_z: i32 loc("off_z"(#loc408)), %off_hq: i32 loc("off_hq"(#loc408)), %offs_n1: tensor<128xi32> loc("offs_n1"(#loc408)), %offs_m1: tensor<64xi32> loc("offs_m1"(#loc408)), %offs_k: tensor<128xi32> loc("offs_k"(#loc408)), %offs_v: tensor<128xi32> loc("offs_v"(#loc408)), %stride_qm: i32 loc("stride_qm"(#loc408)), %stride_qd: i32 loc("stride_qd"(#loc408)), %stride_dom: i32 loc("stride_dom"(#loc408)), %stride_dod: i32 loc("stride_dod"(#loc408)), %q_indices: !tt.ptr loc("q_indices"(#loc408)), %sparse_q_num_blocks: i32 loc("sparse_q_num_blocks"(#loc408))) -> (tensor<128x128xf32>, tensor<128x128xf32>) attributes {noinline = false} { + %qT = tt.call 
@"torch._inductor.runtime.compile_tasks.ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.load_checked_2d__Pbf16S128_64S_i32S128S_i32S64S_i32__(3,)cconstexpr_None__(4,)cconstexpr_None__(5,)cconstexpr_True__(6,)cconstexpr_False__(7,)cconstexpr_128_"(%qT_ptrs, %offs_k, %offs_m1, %Q_LEN) : (tensor<128x64x!tt.ptr>, tensor<128xi32>, tensor<64xi32>, i32) -> tensor<128x64xbf16> loc(#loc1036) + %lse = tt.splat %Q_LEN : i32 -> tensor<64xi32> loc(#loc1037) + %lse_0 = arith.cmpi slt, %offs_m1, %lse : tensor<64xi32> loc(#loc1037) + %lse_1 = tt.splat %LSE : !tt.ptr -> tensor<64x!tt.ptr> loc(#loc1038) + %lse_2 = tt.addptr %lse_1, %offs_m1 : tensor<64x!tt.ptr>, tensor<64xi32> loc(#loc1038) + %lse_3 = tt.load %lse_2, %lse_0 : tensor<64x!tt.ptr> loc(#loc1039) + %lse_4 = arith.constant 0xFF800000 : f32 loc(#loc1040) + %lse_5 = arith.constant dense<0xFF800000> : tensor<64xf32> loc(#loc1040) + %lse_6 = arith.cmpf oeq, %lse_3, %lse_5 : tensor<64xf32> loc(#loc1040) + %lse_7 = arith.constant 0.000000e+00 : f32 loc(#loc1041) + %lse_8 = arith.constant 0.000000e+00 : f32 loc(#loc1041) + %lse_9 = arith.constant dense<0.000000e+00> : tensor<64xf32> loc(#loc1041) + %lse_10 = arith.select %lse_6, %lse_9, %lse_3 : tensor<64xi1>, tensor<64xf32> loc(#loc1041) + %qkT = arith.constant 0.000000e+00 : f32 loc(#loc1042) + %qkT_11 = arith.constant dense<0.000000e+00> : tensor<128x64xf32> loc(#loc1042) + %qkT_12 = tt.dot %k, %qT, %qkT_11, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc1042) + %qkT_13 = arith.constant 0.0883883461 : f32 loc(#loc1043) + %qkT_14 = arith.constant 0.0883883461 : f32 loc(#loc1043) + %qkT_15 = arith.constant dense<0.0883883461> : tensor<128x64xf32> loc(#loc1043) + %qkT_16 = arith.mulf %qkT_12, %qkT_15 : tensor<128x64xf32> loc(#loc1043) + %m = tt.expand_dims %offs_m1 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc1044) + %m_17 = tt.call 
@torch._inductor.runtime.compile_tasks.ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.get_bounded_indices__i32S1_64S_i32__(%m, %Q_LEN) : (tensor<1x64xi32>, i32) -> tensor<1x64xi32> loc(#loc1045) + %n = tt.expand_dims %offs_n1 {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc1046) + %n_18 = tt.call @torch._inductor.runtime.compile_tasks.ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.get_bounded_indices__i32S128_1S_i32__(%n, %KV_LEN) : (tensor<128x1xi32>, i32) -> tensor<128x1xi32> loc(#loc1047) + %post_mod_scores = tt.expand_dims %offs_m1 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc1048) + %post_mod_scores_19 = tt.splat %Q_LEN : i32 -> tensor<1x64xi32> loc(#loc1049) + %post_mod_scores_20 = arith.cmpi slt, %post_mod_scores, %post_mod_scores_19 : tensor<1x64xi32> loc(#loc1049) + %post_mod_scores_21 = arith.constant 0xFF800000 : f32 loc(#loc1050) + %post_mod_scores_22 = arith.constant 0xFF800000 : f32 loc(#loc1050) + %post_mod_scores_23 = arith.constant dense<0xFF800000> : tensor<128x64xf32> loc(#loc1050) + %post_mod_scores_24 = tt.broadcast %post_mod_scores_20 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc1050) + %post_mod_scores_25 = arith.select %post_mod_scores_24, %qkT_16, %post_mod_scores_23 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc1050) + %tmp41 = arith.constant false loc(#loc1051) + %tmp41_26 = arith.constant dense : tensor<1xi1> loc(#loc1051) + %tmp44 = tt.broadcast %m_17 : tensor<1x64xi32> -> tensor<128x64xi32> loc(#loc1052) + %tmp44_27 = tt.broadcast %n_18 : tensor<128x1xi32> -> tensor<128x64xi32> loc(#loc1052) + %tmp44_28 = arith.cmpi sge, %tmp44, %tmp44_27 : tensor<128x64xi32> loc(#loc1052) + %tmp45 = arith.extsi %n_18 : tensor<128x1xi32> to tensor<128x1xi64> loc(#loc1053) + %tmp47 = tt.addptr %in_ptr16, %off_z : !tt.ptr, i32 loc(#loc1054) + %tmp47_29 = tt.load %tmp47 : !tt.ptr loc(#loc1055) + %tmp48 = tt.splat %tmp47_29 : i64 -> tensor<128x1xi64> loc(#loc1056) + %tmp48_30 = arith.cmpi slt, %tmp45, %tmp48 
: tensor<128x1xi64> loc(#loc1056) + %tmp49 = arith.extsi %m_17 : tensor<1x64xi32> to tensor<1x64xi64> loc(#loc1057) + %tmp50 = tt.splat %tmp47_29 : i64 -> tensor<1x64xi64> loc(#loc1058) + %tmp50_31 = arith.cmpi slt, %tmp49, %tmp50 : tensor<1x64xi64> loc(#loc1058) + %tmp51 = tt.broadcast %tmp48_30 : tensor<128x1xi1> -> tensor<128x64xi1> loc(#loc1059) + %tmp51_32 = tt.broadcast %tmp50_31 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc1059) + %tmp51_33 = arith.andi %tmp51, %tmp51_32 : tensor<128x64xi1> loc(#loc1059) + %tmp52 = arith.andi %tmp44_28, %tmp51_33 : tensor<128x64xi1> loc(#loc1060) + %tmp53 = tt.expand_dims %tmp41_26 {axis = 0 : i32} : tensor<1xi1> -> tensor<1x1xi1> loc(#loc1061) + %tmp53_34 = tt.broadcast %tmp53 : tensor<1x1xi1> -> tensor<128x64xi1> loc(#loc1061) + %tmp53_35 = arith.ori %tmp53_34, %tmp52 : tensor<128x64xi1> loc(#loc1061) + %tmp55 = tt.splat %ks8 : i32 -> tensor<128x1xi32> loc(#loc1062) + %tmp55_36 = arith.cmpi sge, %n_18, %tmp55 : tensor<128x1xi32> loc(#loc1062) + %tmp56 = tt.splat %ks8 : i32 -> tensor<128x1xi32> loc(#loc1063) + %tmp56_37 = arith.remsi %n_18, %tmp56 : tensor<128x1xi32> loc(#loc1063) + %tmp57 = arith.constant 0 : i32 loc(#loc1064) + %tmp57_38 = arith.constant dense<0> : tensor<1xi32> loc(#loc1064) + %tmp58 = tt.expand_dims %tmp57_38 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc1065) + %tmp58_39 = tt.broadcast %tmp58 : tensor<1x1xi32> -> tensor<128x1xi32> loc(#loc1065) + %tmp58_40 = arith.cmpi ne, %tmp56_37, %tmp58_39 : tensor<128x1xi32> loc(#loc1065) + %tmp59 = arith.constant 0 : i32 loc(#loc1066) + %tmp59_41 = arith.constant dense<0> : tensor<128x1xi32> loc(#loc1066) + %tmp59_42 = arith.cmpi slt, %tmp56_37, %tmp59_41 : tensor<128x1xi32> loc(#loc1066) + %tmp60 = arith.constant 0 : i32 loc(#loc1067) + %tmp60_43 = arith.cmpi slt, %ks8, %tmp60 : i32 loc(#loc1067) + %tmp61 = tt.splat %tmp60_43 : i1 -> tensor<128x1xi1> loc(#loc1068) + %tmp61_44 = arith.cmpi ne, %tmp59_42, %tmp61 : tensor<128x1xi1> loc(#loc1068) + 
%tmp62 = arith.andi %tmp58_40, %tmp61_44 : tensor<128x1xi1> loc(#loc1069) + %tmp63 = tt.splat %ks8 : i32 -> tensor<128x1xi32> loc(#loc1070) + %tmp63_45 = arith.addi %tmp56_37, %tmp63 : tensor<128x1xi32> loc(#loc1070) + %tmp64 = arith.select %tmp62, %tmp63_45, %tmp56_37 : tensor<128x1xi1>, tensor<128x1xi32> loc(#loc1071) + %tmp65 = arith.extsi %tmp64 : tensor<128x1xi32> to tensor<128x1xi64> loc(#loc1072) + %tmp66 = tt.splat %tmp47_29 : i64 -> tensor<128x1xi64> loc(#loc1073) + %tmp66_46 = arith.cmpi slt, %tmp65, %tmp66 : tensor<128x1xi64> loc(#loc1073) + %tmp67 = arith.andi %tmp55_36, %tmp66_46 : tensor<128x1xi1> loc(#loc1074) + %tmp68 = tt.broadcast %n_18 : tensor<128x1xi32> -> tensor<128x64xi32> loc(#loc1075) + %tmp68_47 = tt.broadcast %m_17 : tensor<1x64xi32> -> tensor<128x64xi32> loc(#loc1075) + %tmp68_48 = arith.subi %tmp68, %tmp68_47 : tensor<128x64xi32> loc(#loc1075) + %tmp69 = tt.splat %ks8 : i32 -> tensor<128x64xi32> loc(#loc1076) + %tmp69_49 = arith.remsi %tmp68_48, %tmp69 : tensor<128x64xi32> loc(#loc1076) + %tmp70 = tt.expand_dims %tmp57_38 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc1077) + %tmp70_50 = tt.broadcast %tmp70 : tensor<1x1xi32> -> tensor<128x64xi32> loc(#loc1077) + %tmp70_51 = arith.cmpi ne, %tmp69_49, %tmp70_50 : tensor<128x64xi32> loc(#loc1077) + %tmp71 = arith.constant 0 : i32 loc(#loc1078) + %tmp71_52 = arith.constant dense<0> : tensor<128x64xi32> loc(#loc1078) + %tmp71_53 = arith.cmpi slt, %tmp69_49, %tmp71_52 : tensor<128x64xi32> loc(#loc1078) + %tmp72 = tt.splat %tmp60_43 : i1 -> tensor<128x64xi1> loc(#loc1079) + %tmp72_54 = arith.cmpi ne, %tmp71_53, %tmp72 : tensor<128x64xi1> loc(#loc1079) + %tmp73 = arith.andi %tmp70_51, %tmp72_54 : tensor<128x64xi1> loc(#loc1080) + %tmp74 = tt.splat %ks8 : i32 -> tensor<128x64xi32> loc(#loc1081) + %tmp74_55 = arith.addi %tmp69_49, %tmp74 : tensor<128x64xi32> loc(#loc1081) + %tmp75 = arith.select %tmp73, %tmp74_55, %tmp69_49 : tensor<128x64xi1>, tensor<128x64xi32> loc(#loc1082) + 
%tmp76 = tt.expand_dims %tmp57_38 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc1083) + %tmp76_56 = tt.broadcast %tmp76 : tensor<1x1xi32> -> tensor<128x64xi32> loc(#loc1083) + %tmp76_57 = arith.cmpi eq, %tmp75, %tmp76_56 : tensor<128x64xi32> loc(#loc1083) + %tmp77 = tt.broadcast %tmp67 : tensor<128x1xi1> -> tensor<128x64xi1> loc(#loc1084) + %tmp77_58 = arith.andi %tmp77, %tmp76_57 : tensor<128x64xi1> loc(#loc1084) + %tmp78 = arith.ori %tmp53_35, %tmp77_58 : tensor<128x64xi1> loc(#loc1085) + %post_mod_scores_59 = arith.constant 0xFF800000 : f32 loc(#loc1086) + %post_mod_scores_60 = arith.constant 0xFF800000 : f32 loc(#loc1086) + %post_mod_scores_61 = arith.constant dense<0xFF800000> : tensor<128x64xf32> loc(#loc1086) + %post_mod_scores_62 = arith.select %tmp78, %post_mod_scores_25, %post_mod_scores_61 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc1086) + %post_mod_scores_63 = arith.constant 1.44269502 : f32 loc(#loc1087) + %post_mod_scores_64 = arith.constant 1.44269502 : f32 loc(#loc1087) + %post_mod_scores_65 = arith.constant dense<1.44269502> : tensor<128x64xf32> loc(#loc1087) + %post_mod_scores_66 = arith.mulf %post_mod_scores_62, %post_mod_scores_65 : tensor<128x64xf32> loc(#loc1087) + %pT = tt.expand_dims %lse_10 {axis = 0 : i32} : tensor<64xf32> -> tensor<1x64xf32> loc(#loc1088) + %pT_67 = tt.broadcast %pT : tensor<1x64xf32> -> tensor<128x64xf32> loc(#loc1089) + %pT_68 = arith.subf %post_mod_scores_66, %pT_67 : tensor<128x64xf32> loc(#loc1089) + %pT_69 = math.exp2 %pT_68 : tensor<128x64xf32> loc(#loc1090) + %do = tt.call @"torch._inductor.runtime.compile_tasks.ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.load_checked_2d__Pbf16S64_128S_i32S64S_i32S128S_i32__(3,)cconstexpr_None__(4,)cconstexpr_None__(5,)cconstexpr_False__(6,)cconstexpr_True__(8,)cconstexpr_128_"(%do_ptrs, %offs_m1, %offs_v, %Q_LEN) : (tensor<64x128x!tt.ptr>, tensor<64xi32>, tensor<128xi32>, i32) -> tensor<64x128xbf16> loc(#loc1091) + %dv_70 = arith.truncf %pT_69 : 
tensor<128x64xf32> to tensor<128x64xbf16> loc(#loc1092) + %dv_71 = arith.constant 0.000000e+00 : f32 loc(#loc1093) + %dv_72 = arith.constant dense<0.000000e+00> : tensor<128x128xf32> loc(#loc1093) + %dv_73 = tt.dot %dv_70, %do, %dv_72, inputPrecision = tf32 : tensor<128x64xbf16> * tensor<64x128xbf16> -> tensor<128x128xf32> loc(#loc1093) + %dv_74 = arith.addf %dv, %dv_73 : tensor<128x128xf32> loc(#loc1094) + %Di = tt.splat %Q_LEN : i32 -> tensor<64xi32> loc(#loc1095) + %Di_75 = arith.cmpi slt, %offs_m1, %Di : tensor<64xi32> loc(#loc1095) + %Di_76 = tt.splat %DELTA : !tt.ptr -> tensor<64x!tt.ptr> loc(#loc1096) + %Di_77 = tt.addptr %Di_76, %offs_m1 : tensor<64x!tt.ptr>, tensor<64xi32> loc(#loc1096) + %Di_78 = tt.load %Di_77, %Di_75 : tensor<64x!tt.ptr> loc(#loc1097) + %dpT = tt.trans %do {order = array} : tensor<64x128xbf16> -> tensor<128x64xbf16> loc(#loc1098) + %dpT_79 = arith.constant 0.000000e+00 : f32 loc(#loc1099) + %dpT_80 = arith.constant dense<0.000000e+00> : tensor<128x64xf32> loc(#loc1099) + %dpT_81 = tt.dot %v, %dpT, %dpT_80, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc1099) + %dsT = tt.expand_dims %Di_78 {axis = 0 : i32} : tensor<64xf32> -> tensor<1x64xf32> loc(#loc1100) + %dsT_82 = tt.broadcast %dsT : tensor<1x64xf32> -> tensor<128x64xf32> loc(#loc1101) + %dsT_83 = arith.subf %dpT_81, %dsT_82 : tensor<128x64xf32> loc(#loc1101) + %dsT_84 = arith.mulf %pT_69, %dsT_83 : tensor<128x64xf32> loc(#loc1102) + %grad_scores = tt.expand_dims %offs_m1 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc1103) + %grad_scores_85 = tt.splat %Q_LEN : i32 -> tensor<1x64xi32> loc(#loc1104) + %grad_scores_86 = arith.cmpi slt, %grad_scores, %grad_scores_85 : tensor<1x64xi32> loc(#loc1104) + %grad_scores_87 = arith.constant 0.000000e+00 : f32 loc(#loc1105) + %grad_scores_88 = arith.constant 0.000000e+00 : f32 loc(#loc1105) + %grad_scores_89 = arith.constant dense<0.000000e+00> : tensor<128x64xf32> loc(#loc1105) + 
%grad_scores_90 = tt.broadcast %grad_scores_86 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc1105) + %grad_scores_91 = arith.select %grad_scores_90, %dsT_84, %grad_scores_89 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc1105) + %dsT_92 = arith.constant 0.000000e+00 : f32 loc(#loc1106) + %dsT_93 = arith.constant 0.000000e+00 : f32 loc(#loc1106) + %dsT_94 = arith.constant dense<0.000000e+00> : tensor<128x64xf32> loc(#loc1106) + %dsT_95 = arith.select %tmp78, %grad_scores_91, %dsT_94 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc1106) + %dk_96 = arith.truncf %dsT_95 : tensor<128x64xf32> to tensor<128x64xbf16> loc(#loc1107) + %dk_97 = tt.trans %qT {order = array} : tensor<128x64xbf16> -> tensor<64x128xbf16> loc(#loc1108) + %dk_98 = arith.constant 0.000000e+00 : f32 loc(#loc1109) + %dk_99 = arith.constant dense<0.000000e+00> : tensor<128x128xf32> loc(#loc1109) + %dk_100 = tt.dot %dk_96, %dk_97, %dk_99, inputPrecision = tf32 : tensor<128x64xbf16> * tensor<64x128xbf16> -> tensor<128x128xf32> loc(#loc1109) + %dk_101 = arith.addf %dk, %dk_100 : tensor<128x128xf32> loc(#loc1110) + tt.return %dk_101, %dv_74 : tensor<128x128xf32>, tensor<128x128xf32> loc(#loc484) + ^bb1: // no predecessors + %0 = ub.poison : tensor<128x128xf32> loc(#loc485) + %1 = ub.poison : tensor<128x128xf32> loc(#loc485) + tt.return %0, %1 : tensor<128x128xf32>, tensor<128x128xf32> loc(#loc485) + } loc(#loc408) + tt.func private @"torch._inductor.runtime.compile_tasks.ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.load_checked_2d__Pbf16S64_128S_i32S64S_i32S128S_i32__(3,)cconstexpr_None__(4,)cconstexpr_None__(5,)cconstexpr_False__(6,)cconstexpr_True__(8,)cconstexpr_128_"(%ptr: tensor<64x128x!tt.ptr> loc("ptr"(#loc237)), %offs_m: tensor<64xi32> loc("offs_m"(#loc237)), %offs_n: tensor<128xi32> loc("offs_n"(#loc237)), %M_LEN: i32 loc("M_LEN"(#loc237))) -> tensor<64x128xbf16> attributes {noinline = false} { + %0 = tt.expand_dims %offs_m {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> 
loc(#loc244) + %1 = tt.splat %M_LEN : i32 -> tensor<64x1xi32> loc(#loc245) + %2 = arith.cmpi slt, %0, %1 : tensor<64x1xi32> loc(#loc245) + %cst = arith.constant 0.000000e+00 : f32 loc(#loc246) + %3 = tt.broadcast %2 : tensor<64x1xi1> -> tensor<64x128xi1> loc(#loc246) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<64x128xf32> loc(#loc246) + %4 = arith.truncf %cst_0 : tensor<64x128xf32> to tensor<64x128xbf16> loc(#loc246) + %5 = tt.load %ptr, %3, %4 : tensor<64x128x!tt.ptr> loc(#loc246) + tt.return %5 : tensor<64x128xbf16> loc(#loc247) + ^bb1: // no predecessors + %6 = ub.poison : tensor<64x128xbf16> loc(#loc248) + tt.return %6 : tensor<64x128xbf16> loc(#loc248) + } loc(#loc237) + tt.func private @"torch._inductor.runtime.compile_tasks.ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.bwd_dkdv_inner__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pbf16_Pbf16_Pbf16_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_i32_i32_i32_i32_i32_i32_i32_i32_i32_Pbf16_Pbf16_Pfp32_Pfp32_fp32S128_128S_fp32S128_128S_bf16S128_128S_bf16S128_128S_i32_i32_i32S128S_i32S64S_i32_i32_i32_i32_Pi32_i32__(45,)cconstexpr_bf16__(46,)cconstexpr_True_"(%arg_Q: !tt.ptr loc("arg_Q"(#loc378)), %arg_K: !tt.ptr loc("arg_K"(#loc378)), %arg_V: !tt.ptr loc("arg_V"(#loc378)), %arg_LSE: !tt.ptr loc("arg_LSE"(#loc378)), %arg_DELTA: !tt.ptr loc("arg_DELTA"(#loc378)), %arg_DO: !tt.ptr loc("arg_DO"(#loc378)), %arg_DQ: !tt.ptr loc("arg_DQ"(#loc378)), %arg_DV: !tt.ptr loc("arg_DV"(#loc378)), %arg_KV_NUM_BLKS: !tt.ptr loc("arg_KV_NUM_BLKS"(#loc378)), %arg_KV_IDX: !tt.ptr loc("arg_KV_IDX"(#loc378)), %arg_Q_NUM_BLKS: !tt.ptr loc("arg_Q_NUM_BLKS"(#loc378)), %arg_Q_IDX: !tt.ptr loc("arg_Q_IDX"(#loc378)), %arg_FULL_KV_NUM_BLKS: !tt.ptr loc("arg_FULL_KV_NUM_BLKS"(#loc378)), %arg_FULL_KV_IDX: !tt.ptr loc("arg_FULL_KV_IDX"(#loc378)), %arg_FULL_Q_NUM_BLKS: !tt.ptr loc("arg_FULL_Q_NUM_BLKS"(#loc378)), %arg_FULL_Q_IDX: !tt.ptr loc("arg_FULL_Q_IDX"(#loc378)), %in_ptr16: !tt.ptr loc("in_ptr16"(#loc378)), %out_ptr0: !tt.ptr 
loc("out_ptr0"(#loc378)), %ks0: i32 loc("ks0"(#loc378)), %ks1: i32 loc("ks1"(#loc378)), %ks2: i32 loc("ks2"(#loc378)), %ks3: i32 loc("ks3"(#loc378)), %ks4: i32 loc("ks4"(#loc378)), %ks5: i32 loc("ks5"(#loc378)), %ks6: i32 loc("ks6"(#loc378)), %ks7: i32 loc("ks7"(#loc378)), %ks8: i32 loc("ks8"(#loc378)), %Q: !tt.ptr loc("Q"(#loc378)), %DO: !tt.ptr loc("DO"(#loc378)), %DELTA: !tt.ptr loc("DELTA"(#loc378)), %LSE: !tt.ptr loc("LSE"(#loc378)), %dk: tensor<128x128xf32> loc("dk"(#loc378)), %dv: tensor<128x128xf32> loc("dv"(#loc378)), %k: tensor<128x128xbf16> loc("k"(#loc378)), %v: tensor<128x128xbf16> loc("v"(#loc378)), %off_z: i32 loc("off_z"(#loc378)), %off_hq: i32 loc("off_hq"(#loc378)), %offs_n1: tensor<128xi32> loc("offs_n1"(#loc378)), %offs_m1: tensor<64xi32> loc("offs_m1"(#loc378)), %stride_qm: i32 loc("stride_qm"(#loc378)), %stride_qd: i32 loc("stride_qd"(#loc378)), %stride_dom: i32 loc("stride_dom"(#loc378)), %stride_dod: i32 loc("stride_dod"(#loc378)), %q_indices: !tt.ptr loc("q_indices"(#loc378)), %sparse_q_num_blocks: i32 loc("sparse_q_num_blocks"(#loc378))) -> (tensor<128x128xf32>, tensor<128x128xf32>) attributes {noinline = false} { + %offs_k = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc962) + %offs_v = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc963) + %qT_ptrs = tt.expand_dims %offs_m1 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc964) + %qT_ptrs_0 = tt.splat %stride_qm : i32 -> tensor<1x64xi32> loc(#loc965) + %qT_ptrs_1 = arith.muli %qT_ptrs, %qT_ptrs_0 : tensor<1x64xi32> loc(#loc965) + %qT_ptrs_2 = tt.splat %Q : !tt.ptr -> tensor<1x64x!tt.ptr> loc(#loc966) + %qT_ptrs_3 = tt.addptr %qT_ptrs_2, %qT_ptrs_1 : tensor<1x64x!tt.ptr>, tensor<1x64xi32> loc(#loc966) + %qT_ptrs_4 = tt.expand_dims %offs_k {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc967) + %qT_ptrs_5 = tt.splat %stride_qd : i32 -> tensor<128x1xi32> loc(#loc968) + %qT_ptrs_6 = arith.muli %qT_ptrs_4, 
%qT_ptrs_5 : tensor<128x1xi32> loc(#loc968) + %qT_ptrs_7 = tt.broadcast %qT_ptrs_3 : tensor<1x64x!tt.ptr> -> tensor<128x64x!tt.ptr> loc(#loc969) + %qT_ptrs_8 = tt.broadcast %qT_ptrs_6 : tensor<128x1xi32> -> tensor<128x64xi32> loc(#loc969) + %qT_ptrs_9 = tt.addptr %qT_ptrs_7, %qT_ptrs_8 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc969) + %do_ptrs = tt.expand_dims %offs_m1 {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc970) + %do_ptrs_10 = tt.splat %stride_dom : i32 -> tensor<64x1xi32> loc(#loc971) + %do_ptrs_11 = arith.muli %do_ptrs, %do_ptrs_10 : tensor<64x1xi32> loc(#loc971) + %do_ptrs_12 = tt.splat %DO : !tt.ptr -> tensor<64x1x!tt.ptr> loc(#loc972) + %do_ptrs_13 = tt.addptr %do_ptrs_12, %do_ptrs_11 : tensor<64x1x!tt.ptr>, tensor<64x1xi32> loc(#loc972) + %do_ptrs_14 = tt.expand_dims %offs_v {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc973) + %do_ptrs_15 = tt.splat %stride_dod : i32 -> tensor<1x128xi32> loc(#loc974) + %do_ptrs_16 = arith.muli %do_ptrs_14, %do_ptrs_15 : tensor<1x128xi32> loc(#loc974) + %do_ptrs_17 = tt.broadcast %do_ptrs_13 : tensor<64x1x!tt.ptr> -> tensor<64x128x!tt.ptr> loc(#loc975) + %do_ptrs_18 = tt.broadcast %do_ptrs_16 : tensor<1x128xi32> -> tensor<64x128xi32> loc(#loc975) + %do_ptrs_19 = tt.addptr %do_ptrs_17, %do_ptrs_18 : tensor<64x128x!tt.ptr>, tensor<64x128xi32> loc(#loc975) + %hi = arith.constant 2 : i32 loc(#loc976) + %hi_20 = arith.constant 2 : i32 loc(#loc976) + %hi_21 = arith.muli %sparse_q_num_blocks, %hi_20 : i32 loc(#loc976) + %hi_22 = tt.call @"triton.language.standard.cdiv__i32__(1,)cconstexpr_64_"(%ks0) : (i32) -> i32 loc(#loc977) + %hi_23 = arith.constant 1 : i32 loc(#loc978) + %hi_24 = arith.maxsi %hi_22, %hi_23 : i32 loc(#loc978) + %hi_25 = arith.minsi %hi_21, %hi_24 : i32 loc(#loc979) + %c0_i32 = arith.constant 0 : i32 loc(#loc397) + %c1_i32 = arith.constant 1 : i32 loc(#loc397) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc397) + %1 = arith.bitcast %hi_25 : i32 to i32 
loc(#loc397) + %2 = arith.bitcast %c1_i32 : i32 to i32 loc(#loc397) + %3 = ub.poison : i32 loc(#loc397) + %do_ptrs_26:5 = scf.for %start_m = %0 to %1 step %2 iter_args(%dk_27 = %dk, %dv_28 = %dv, %offs_m1_29 = %offs_m1, %qT_ptrs_30 = %qT_ptrs_9, %do_ptrs_31 = %do_ptrs_19) -> (tensor<128x128xf32>, tensor<128x128xf32>, tensor<64xi32>, tensor<128x64x!tt.ptr>, tensor<64x128x!tt.ptr>) : i32 { + %6:2 = tt.call @"torch._inductor.runtime.compile_tasks.ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.bwd_dkdv_block_mn__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pbf16_Pbf16_Pbf16_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_i32_i32_i32_i32_i32_i32_i32_i32_i32_fp32S128_128S_fp32S128_128S_Pbf16S128_64S_bf16S128_128S_bf16S128_128S_Pbf16S64_128S_Pfp32_Pfp32_i32_i32_i32_i32_i32S128S_i32S64S_i32S128S_i32S128S_i32_i32_i32_i32_Pi32_i32__(49,)cconstexpr_bf16__(50,)cconstexpr_1_d_44269504__(51,)cconstexpr_True_"(%arg_Q, %arg_K, %arg_V, %arg_LSE, %arg_DELTA, %arg_DO, %arg_DQ, %arg_DV, %arg_KV_NUM_BLKS, %arg_KV_IDX, %arg_Q_NUM_BLKS, %arg_Q_IDX, %arg_FULL_KV_NUM_BLKS, %arg_FULL_KV_IDX, %arg_FULL_Q_NUM_BLKS, %arg_FULL_Q_IDX, %in_ptr16, %out_ptr0, %ks0, %ks1, %ks2, %ks3, %ks4, %ks5, %ks6, %ks7, %ks8, %dk_27, %dv_28, %qT_ptrs_30, %k, %v, %do_ptrs_31, %DELTA, %LSE, %ks0, %ks1, %off_z, %off_hq, %offs_n1, %offs_m1_29, %offs_k, %offs_v, %stride_qm, %stride_qd, %stride_dom, %stride_dod, %q_indices, %sparse_q_num_blocks) : (!tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, i32, i32, i32, i32, i32, i32, i32, i32, i32, tensor<128x128xf32>, tensor<128x128xf32>, tensor<128x64x!tt.ptr>, tensor<128x128xbf16>, tensor<128x128xbf16>, tensor<64x128x!tt.ptr>, !tt.ptr, !tt.ptr, i32, i32, i32, i32, tensor<128xi32>, tensor<64xi32>, tensor<128xi32>, tensor<128xi32>, i32, i32, i32, i32, !tt.ptr, i32) -> (tensor<128x128xf32>, tensor<128x128xf32>) loc(#loc398) + %offset = tt.call 
@"torch._inductor.runtime.compile_tasks.ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.get_offset_for_next_block__i32_Pi32_i32__(3,)cconstexpr_128__(4,)cconstexpr_2__(5,)cconstexpr_64__(6,)cconstexpr_False_"(%start_m, %q_indices, %sparse_q_num_blocks) : (i32, !tt.ptr, i32) -> i32 loc(#loc981) + %qT_ptrs_32 = arith.muli %offset, %stride_qm : i32 loc(#loc982) + %qT_ptrs_33 = tt.splat %qT_ptrs_32 : i32 -> tensor<128x64xi32> loc(#loc983) + %qT_ptrs_34 = tt.addptr %qT_ptrs_30, %qT_ptrs_33 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc983) + %do_ptrs_35 = arith.muli %offset, %stride_dom : i32 loc(#loc984) + %do_ptrs_36 = tt.splat %do_ptrs_35 : i32 -> tensor<64x128xi32> loc(#loc985) + %do_ptrs_37 = tt.addptr %do_ptrs_31, %do_ptrs_36 : tensor<64x128x!tt.ptr>, tensor<64x128xi32> loc(#loc985) + %offs_m1_38 = tt.splat %offset : i32 -> tensor<64xi32> loc(#loc986) + %offs_m1_39 = arith.addi %offs_m1_29, %offs_m1_38 : tensor<64xi32> loc(#loc986) + scf.yield %6#0, %6#1, %offs_m1_39, %qT_ptrs_34, %do_ptrs_37 : tensor<128x128xf32>, tensor<128x128xf32>, tensor<64xi32>, tensor<128x64x!tt.ptr>, tensor<64x128x!tt.ptr> loc(#loc405) + } loc(#loc1122) + tt.return %do_ptrs_26#0, %do_ptrs_26#1 : tensor<128x128xf32>, tensor<128x128xf32> loc(#loc406) + ^bb1: // no predecessors + %4 = ub.poison : tensor<128x128xf32> loc(#loc407) + %5 = ub.poison : tensor<128x128xf32> loc(#loc407) + tt.return %4, %5 : tensor<128x128xf32>, tensor<128x128xf32> loc(#loc407) + } loc(#loc378) + tt.func private @"torch._inductor.runtime.compile_tasks.ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.bwd_dkdv_block_mn__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pbf16_Pbf16_Pbf16_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_i32_i32_i32_i32_i32_i32_i32_i32_i32_fp32S128_128S_fp32S128_128S_Pbf16S128_64S_bf16S128_128S_bf16S128_128S_Pbf16S64_128S_Pfp32_Pfp32_i32_i32_i32_i32_i32S128S_i32S64S_i32S128S_i32S128S_i32_i32_i32_i32_Pi32_i32__(49,)cconstexpr_bf16__(50,)cconstexpr_1_d_44269504__(51,)cconstexpr_True_"(%arg_Q: 
!tt.ptr loc("arg_Q"(#loc408)), %arg_K: !tt.ptr loc("arg_K"(#loc408)), %arg_V: !tt.ptr loc("arg_V"(#loc408)), %arg_LSE: !tt.ptr loc("arg_LSE"(#loc408)), %arg_DELTA: !tt.ptr loc("arg_DELTA"(#loc408)), %arg_DO: !tt.ptr loc("arg_DO"(#loc408)), %arg_DQ: !tt.ptr loc("arg_DQ"(#loc408)), %arg_DV: !tt.ptr loc("arg_DV"(#loc408)), %arg_KV_NUM_BLKS: !tt.ptr loc("arg_KV_NUM_BLKS"(#loc408)), %arg_KV_IDX: !tt.ptr loc("arg_KV_IDX"(#loc408)), %arg_Q_NUM_BLKS: !tt.ptr loc("arg_Q_NUM_BLKS"(#loc408)), %arg_Q_IDX: !tt.ptr loc("arg_Q_IDX"(#loc408)), %arg_FULL_KV_NUM_BLKS: !tt.ptr loc("arg_FULL_KV_NUM_BLKS"(#loc408)), %arg_FULL_KV_IDX: !tt.ptr loc("arg_FULL_KV_IDX"(#loc408)), %arg_FULL_Q_NUM_BLKS: !tt.ptr loc("arg_FULL_Q_NUM_BLKS"(#loc408)), %arg_FULL_Q_IDX: !tt.ptr loc("arg_FULL_Q_IDX"(#loc408)), %in_ptr16: !tt.ptr loc("in_ptr16"(#loc408)), %out_ptr0: !tt.ptr loc("out_ptr0"(#loc408)), %ks0: i32 loc("ks0"(#loc408)), %ks1: i32 loc("ks1"(#loc408)), %ks2: i32 loc("ks2"(#loc408)), %ks3: i32 loc("ks3"(#loc408)), %ks4: i32 loc("ks4"(#loc408)), %ks5: i32 loc("ks5"(#loc408)), %ks6: i32 loc("ks6"(#loc408)), %ks7: i32 loc("ks7"(#loc408)), %ks8: i32 loc("ks8"(#loc408)), %dk: tensor<128x128xf32> loc("dk"(#loc408)), %dv: tensor<128x128xf32> loc("dv"(#loc408)), %qT_ptrs: tensor<128x64x!tt.ptr> loc("qT_ptrs"(#loc408)), %k: tensor<128x128xbf16> loc("k"(#loc408)), %v: tensor<128x128xbf16> loc("v"(#loc408)), %do_ptrs: tensor<64x128x!tt.ptr> loc("do_ptrs"(#loc408)), %DELTA: !tt.ptr loc("DELTA"(#loc408)), %LSE: !tt.ptr loc("LSE"(#loc408)), %Q_LEN: i32 loc("Q_LEN"(#loc408)), %KV_LEN: i32 loc("KV_LEN"(#loc408)), %off_z: i32 loc("off_z"(#loc408)), %off_hq: i32 loc("off_hq"(#loc408)), %offs_n1: tensor<128xi32> loc("offs_n1"(#loc408)), %offs_m1: tensor<64xi32> loc("offs_m1"(#loc408)), %offs_k: tensor<128xi32> loc("offs_k"(#loc408)), %offs_v: tensor<128xi32> loc("offs_v"(#loc408)), %stride_qm: i32 loc("stride_qm"(#loc408)), %stride_qd: i32 loc("stride_qd"(#loc408)), %stride_dom: i32 loc("stride_dom"(#loc408)), 
%stride_dod: i32 loc("stride_dod"(#loc408)), %q_indices: !tt.ptr loc("q_indices"(#loc408)), %sparse_q_num_blocks: i32 loc("sparse_q_num_blocks"(#loc408))) -> (tensor<128x128xf32>, tensor<128x128xf32>) attributes {noinline = false} { + %qT = tt.call @"torch._inductor.runtime.compile_tasks.ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.load_checked_2d__Pbf16S128_64S_i32S128S_i32S64S_i32__(3,)cconstexpr_None__(4,)cconstexpr_None__(5,)cconstexpr_True__(6,)cconstexpr_False__(7,)cconstexpr_128_"(%qT_ptrs, %offs_k, %offs_m1, %Q_LEN) : (tensor<128x64x!tt.ptr>, tensor<128xi32>, tensor<64xi32>, i32) -> tensor<128x64xbf16> loc(#loc1036) + %lse = tt.splat %Q_LEN : i32 -> tensor<64xi32> loc(#loc1037) + %lse_0 = arith.cmpi slt, %offs_m1, %lse : tensor<64xi32> loc(#loc1037) + %lse_1 = tt.splat %LSE : !tt.ptr -> tensor<64x!tt.ptr> loc(#loc1038) + %lse_2 = tt.addptr %lse_1, %offs_m1 : tensor<64x!tt.ptr>, tensor<64xi32> loc(#loc1038) + %lse_3 = tt.load %lse_2, %lse_0 : tensor<64x!tt.ptr> loc(#loc1039) + %lse_4 = arith.constant 0xFF800000 : f32 loc(#loc1040) + %lse_5 = arith.constant dense<0xFF800000> : tensor<64xf32> loc(#loc1040) + %lse_6 = arith.cmpf oeq, %lse_3, %lse_5 : tensor<64xf32> loc(#loc1040) + %lse_7 = arith.constant 0.000000e+00 : f32 loc(#loc1041) + %lse_8 = arith.constant 0.000000e+00 : f32 loc(#loc1041) + %lse_9 = arith.constant dense<0.000000e+00> : tensor<64xf32> loc(#loc1041) + %lse_10 = arith.select %lse_6, %lse_9, %lse_3 : tensor<64xi1>, tensor<64xf32> loc(#loc1041) + %qkT = arith.constant 0.000000e+00 : f32 loc(#loc1042) + %qkT_11 = arith.constant dense<0.000000e+00> : tensor<128x64xf32> loc(#loc1042) + %qkT_12 = tt.dot %k, %qT, %qkT_11, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc1042) + %qkT_13 = arith.constant 0.0883883461 : f32 loc(#loc1043) + %qkT_14 = arith.constant 0.0883883461 : f32 loc(#loc1043) + %qkT_15 = arith.constant dense<0.0883883461> : tensor<128x64xf32> loc(#loc1043) + %qkT_16 = 
arith.mulf %qkT_12, %qkT_15 : tensor<128x64xf32> loc(#loc1043) + %m = tt.expand_dims %offs_m1 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc1044) + %m_17 = tt.call @torch._inductor.runtime.compile_tasks.ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.get_bounded_indices__i32S1_64S_i32__(%m, %Q_LEN) : (tensor<1x64xi32>, i32) -> tensor<1x64xi32> loc(#loc1045) + %n = tt.expand_dims %offs_n1 {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc1046) + %n_18 = tt.call @torch._inductor.runtime.compile_tasks.ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.get_bounded_indices__i32S128_1S_i32__(%n, %KV_LEN) : (tensor<128x1xi32>, i32) -> tensor<128x1xi32> loc(#loc1047) + %post_mod_scores = tt.expand_dims %offs_m1 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc1048) + %post_mod_scores_19 = tt.splat %Q_LEN : i32 -> tensor<1x64xi32> loc(#loc1049) + %post_mod_scores_20 = arith.cmpi slt, %post_mod_scores, %post_mod_scores_19 : tensor<1x64xi32> loc(#loc1049) + %post_mod_scores_21 = arith.constant 0xFF800000 : f32 loc(#loc1050) + %post_mod_scores_22 = arith.constant 0xFF800000 : f32 loc(#loc1050) + %post_mod_scores_23 = arith.constant dense<0xFF800000> : tensor<128x64xf32> loc(#loc1050) + %post_mod_scores_24 = tt.broadcast %post_mod_scores_20 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc1050) + %post_mod_scores_25 = arith.select %post_mod_scores_24, %qkT_16, %post_mod_scores_23 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc1050) + %post_mod_scores_26 = arith.constant 1.44269502 : f32 loc(#loc1087) + %post_mod_scores_27 = arith.constant 1.44269502 : f32 loc(#loc1087) + %post_mod_scores_28 = arith.constant dense<1.44269502> : tensor<128x64xf32> loc(#loc1087) + %post_mod_scores_29 = arith.mulf %post_mod_scores_25, %post_mod_scores_28 : tensor<128x64xf32> loc(#loc1087) + %pT = tt.expand_dims %lse_10 {axis = 0 : i32} : tensor<64xf32> -> tensor<1x64xf32> loc(#loc1088) + %pT_30 = tt.broadcast %pT : tensor<1x64xf32> -> 
tensor<128x64xf32> loc(#loc1089) + %pT_31 = arith.subf %post_mod_scores_29, %pT_30 : tensor<128x64xf32> loc(#loc1089) + %pT_32 = math.exp2 %pT_31 : tensor<128x64xf32> loc(#loc1090) + %do = tt.call @"torch._inductor.runtime.compile_tasks.ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.load_checked_2d__Pbf16S64_128S_i32S64S_i32S128S_i32__(3,)cconstexpr_None__(4,)cconstexpr_None__(5,)cconstexpr_False__(6,)cconstexpr_True__(8,)cconstexpr_128_"(%do_ptrs, %offs_m1, %offs_v, %Q_LEN) : (tensor<64x128x!tt.ptr>, tensor<64xi32>, tensor<128xi32>, i32) -> tensor<64x128xbf16> loc(#loc1091) + %dv_33 = arith.truncf %pT_32 : tensor<128x64xf32> to tensor<128x64xbf16> loc(#loc1092) + %dv_34 = arith.constant 0.000000e+00 : f32 loc(#loc1093) + %dv_35 = arith.constant dense<0.000000e+00> : tensor<128x128xf32> loc(#loc1093) + %dv_36 = tt.dot %dv_33, %do, %dv_35, inputPrecision = tf32 : tensor<128x64xbf16> * tensor<64x128xbf16> -> tensor<128x128xf32> loc(#loc1093) + %dv_37 = arith.addf %dv, %dv_36 : tensor<128x128xf32> loc(#loc1094) + %Di = tt.splat %Q_LEN : i32 -> tensor<64xi32> loc(#loc1095) + %Di_38 = arith.cmpi slt, %offs_m1, %Di : tensor<64xi32> loc(#loc1095) + %Di_39 = tt.splat %DELTA : !tt.ptr -> tensor<64x!tt.ptr> loc(#loc1096) + %Di_40 = tt.addptr %Di_39, %offs_m1 : tensor<64x!tt.ptr>, tensor<64xi32> loc(#loc1096) + %Di_41 = tt.load %Di_40, %Di_38 : tensor<64x!tt.ptr> loc(#loc1097) + %dpT = tt.trans %do {order = array} : tensor<64x128xbf16> -> tensor<128x64xbf16> loc(#loc1098) + %dpT_42 = arith.constant 0.000000e+00 : f32 loc(#loc1099) + %dpT_43 = arith.constant dense<0.000000e+00> : tensor<128x64xf32> loc(#loc1099) + %dpT_44 = tt.dot %v, %dpT, %dpT_43, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc1099) + %dsT = tt.expand_dims %Di_41 {axis = 0 : i32} : tensor<64xf32> -> tensor<1x64xf32> loc(#loc1100) + %dsT_45 = tt.broadcast %dsT : tensor<1x64xf32> -> tensor<128x64xf32> loc(#loc1101) + %dsT_46 = arith.subf %dpT_44, %dsT_45 
: tensor<128x64xf32> loc(#loc1101) + %dsT_47 = arith.mulf %pT_32, %dsT_46 : tensor<128x64xf32> loc(#loc1102) + %grad_scores = tt.expand_dims %offs_m1 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc1103) + %grad_scores_48 = tt.splat %Q_LEN : i32 -> tensor<1x64xi32> loc(#loc1104) + %grad_scores_49 = arith.cmpi slt, %grad_scores, %grad_scores_48 : tensor<1x64xi32> loc(#loc1104) + %grad_scores_50 = arith.constant 0.000000e+00 : f32 loc(#loc1105) + %grad_scores_51 = arith.constant 0.000000e+00 : f32 loc(#loc1105) + %grad_scores_52 = arith.constant dense<0.000000e+00> : tensor<128x64xf32> loc(#loc1105) + %grad_scores_53 = tt.broadcast %grad_scores_49 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc1105) + %grad_scores_54 = arith.select %grad_scores_53, %dsT_47, %grad_scores_52 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc1105) + %dk_55 = arith.truncf %grad_scores_54 : tensor<128x64xf32> to tensor<128x64xbf16> loc(#loc1107) + %dk_56 = tt.trans %qT {order = array} : tensor<128x64xbf16> -> tensor<64x128xbf16> loc(#loc1108) + %dk_57 = arith.constant 0.000000e+00 : f32 loc(#loc1109) + %dk_58 = arith.constant dense<0.000000e+00> : tensor<128x128xf32> loc(#loc1109) + %dk_59 = tt.dot %dk_55, %dk_56, %dk_58, inputPrecision = tf32 : tensor<128x64xbf16> * tensor<64x128xbf16> -> tensor<128x128xf32> loc(#loc1109) + %dk_60 = arith.addf %dk, %dk_59 : tensor<128x128xf32> loc(#loc1110) + tt.return %dk_60, %dv_37 : tensor<128x128xf32>, tensor<128x128xf32> loc(#loc484) + ^bb1: // no predecessors + %0 = ub.poison : tensor<128x128xf32> loc(#loc485) + %1 = ub.poison : tensor<128x128xf32> loc(#loc485) + tt.return %0, %1 : tensor<128x128xf32>, tensor<128x128xf32> loc(#loc485) + } loc(#loc408) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":94:54) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":94:49) 
+#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":95:54) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":95:63) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":95:49) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":96:54) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":96:63) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":96:49) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":97:74) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":97:66) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":97:100) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":97:91) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":97:82) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":97:59) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":97:126) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":97:118) +#loc17 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":97:152) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":97:143) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":97:134) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":97:111) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":97:53) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":99:58) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":99:53) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":100:58) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":100:67) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":100:53) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":102:9) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":103:9) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":104:10) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":106:10) +#loc31 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":111:24) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":112:36) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":113:34) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":115:27) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":116:28) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":117:23) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":119:15) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":120:16) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":122:28) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":124:25) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":124:47) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":124:35) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":124:59) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":125:25) +#loc45 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":125:47) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":125:35) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":125:59) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":128:27) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":128:50) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":128:37) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":128:61) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":131:9) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":132:9) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":133:10) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":135:14) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":136:26) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":137:26) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":139:14) +#loc59 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":139:7) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":140:24) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":142:29) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":143:30) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":144:29) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":144:54) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":144:44) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":145:35) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":146:41) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":148:30) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":151:35) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":152:42) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":152:54) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":154:55) +#loc73 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":154:78) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":155:50) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":155:83) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":155:68) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":158:30) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":158:52) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":158:40) +#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":158:63) +#loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":159:32) +#loc82 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":159:55) +#loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":159:42) +#loc84 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":159:66) +#loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":160:32) +#loc86 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":160:55) +#loc87 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":160:42) +#loc88 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":160:66) +#loc89 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":161:30) +#loc90 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":161:35) +#loc91 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":161:46) +#loc92 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":161:56) +#loc93 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":163:17) +#loc94 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":164:19) +#loc95 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":167:19) +#loc96 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":168:21) +#loc97 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":169:25) +#loc98 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":172:22) +#loc99 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":174:36) +#loc100 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":175:42) +#loc101 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":175:29) +#loc102 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":178:107) +#loc103 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":179:111) +#loc104 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":188:58) +#loc105 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":188:34) +#loc106 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":188:25) +#loc107 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":189:57) +#loc108 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":189:33) +#loc109 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":189:26) +#loc110 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":190:30) +#loc111 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":190:50) +#loc112 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":191:18) +#loc113 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":195:30) +#loc114 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":196:27) +#loc115 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":196:41) +#loc116 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":197:53) +#loc117 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":197:39) +#loc118 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":199:42) +#loc119 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":199:29) +#loc120 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":207:12) +#loc121 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":214:39) +#loc122 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":215:31) +#loc123 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":215:45) +#loc124 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":216:62) +#loc125 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":216:43) +#loc126 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":218:46) +#loc127 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":218:33) +#loc128 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":226:16) +#loc129 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":231:32) +#loc130 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":231:43) +#loc131 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":231:24) +#loc132 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":231:63) +#loc133 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":231:74) +#loc134 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":231:56) +#loc135 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":232:14) +#loc136 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":236:48) +#loc137 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":236:59) +#loc138 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":236:76) +#loc139 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":236:87) +#loc140 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":236:69) +#loc141 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":236:30) +#loc142 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":239:29) +#loc143 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":240:30) +#loc144 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":242:26) +#loc145 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":245:29) +#loc146 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":249:22) +#loc147 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":250:22) +#loc148 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":252:25) +#loc149 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":253:42) +#loc150 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":253:29) +#loc151 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":256:107) +#loc152 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":257:107) +#loc153 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":262:30) +#loc154 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":263:32) +#loc155 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":263:51) +#loc156 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":266:34) +#loc157 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":266:56) +#loc158 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":266:44) +#loc159 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":266:67) +#loc160 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":267:36) +#loc161 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":267:59) +#loc162 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":267:46) +#loc163 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":267:70) +#loc164 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":268:36) +#loc165 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":268:59) +#loc166 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":268:46) +#loc167 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":268:70) +#loc168 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":269:34) +#loc169 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":269:39) +#loc170 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":269:50) +#loc171 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":269:60) +#loc172 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":271:21) +#loc173 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":272:23) +#loc174 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":275:25) +#loc175 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":276:29) +#loc176 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":278:39) +#loc177 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":279:46) +#loc178 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":279:58) +#loc179 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":281:58) +#loc180 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":281:80) +#loc181 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":282:53) +#loc182 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":282:81) +#loc183 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":282:70) +#loc184 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":286:32) +#loc185 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":287:30) +#loc186 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":287:43) +#loc187 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":288:55) +#loc188 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":288:42) +#loc189 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":290:45) +#loc190 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":290:32) +#loc191 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":298:16) +#loc192 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":306:41) +#loc193 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":307:34) +#loc194 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":307:47) +#loc195 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":308:64) +#loc196 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":308:46) +#loc197 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":310:49) +#loc198 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":310:36) +#loc199 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":318:20) +#loc200 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":303:12) +#loc201 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":323:31) +#loc202 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":323:42) +#loc203 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":323:23) +#loc204 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":323:62) +#loc205 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":323:73) +#loc206 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":323:55) +#loc207 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":325:26) +#loc208 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":326:25) +#loc209 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":327:25) +#loc210 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":332:50) +#loc211 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":332:71) +#loc212 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":332:61) +#loc213 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":332:30) +#loc214 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":334:14) +#loc215 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":337:29) +#loc216 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":344:31) +#loc217 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":344:27) +#loc218 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":344:45) +#loc219 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":344:53) +#loc220 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":344:41) +#loc221 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":344:64) +#loc222 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":344:71) +#loc223 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":344:59) +#loc224 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":345:29) +#loc225 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":345:69) +#loc226 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":139:4) +#loc228 = 
loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":41:16) +#loc229 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":41:22) +#loc230 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":41:28) +#loc231 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":41:11) +#loc232 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":41:4) +#loc233 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":118:0) +#loc234 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:31) +#loc235 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:11) +#loc236 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:4) +#loc238 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":825:27) +#loc239 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":825:38) +#loc240 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":825:20) +#loc241 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":825:56) +#loc242 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":825:67) +#loc243 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":825:49) +#loc244 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":833:41) +#loc245 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":833:52) +#loc246 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":833:23) +#loc247 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":833:15) +#loc248 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":828:4) +#loc250 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":387:26) +#loc251 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":388:26) +#loc252 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":390:26) +#loc253 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":390:37) +#loc254 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":390:18) +#loc255 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":390:56) +#loc256 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":390:67) +#loc257 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":390:49) +#loc258 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":391:26) +#loc259 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":391:37) +#loc260 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":391:18) +#loc261 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":391:56) +#loc262 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":391:67) +#loc263 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":391:49) +#loc264 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":395:43) +#loc265 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":395:90) +#loc266 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":395:101) +#loc267 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":395:63) +#loc268 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":397:28) +#loc269 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":405:12) +#loc270 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":411:64) +#loc271 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":414:28) +#loc272 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":414:19) +#loc273 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":415:28) +#loc274 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":415:19) +#loc275 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":417:19) +#loc276 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":417:8) +#loc277 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":419:11) +#loc278 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":419:4) +#loc280 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":458:105) +#loc281 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":459:19) +#loc282 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":461:14) +#loc283 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":464:36) +#loc284 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":464:46) +#loc285 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":467:36) +#loc286 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":467:46) +#loc287 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":476:43) +#loc288 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":476:54) +#loc289 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":476:79) +#loc290 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":479:35) +#loc291 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":482:23) +#loc292 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":483:23) +#loc293 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":485:34) +#loc294 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":485:23) +#loc295 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":486:22) +#loc296 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":487:23) +#loc297 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":488:23) +#loc298 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":489:23) +#loc299 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":490:23) +#loc300 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":491:23) +#loc301 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":493:24) +#loc302 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":494:24) +#loc303 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":495:32) +#loc304 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":496:25) +#loc305 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":497:92) +#loc306 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":498:92) +#loc307 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":499:25) +#loc308 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":500:24) +#loc309 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":501:24) +#loc310 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":502:39) +#loc311 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":503:25) +#loc312 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":504:24) +#loc313 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":505:24) +#loc314 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":506:23) +#loc315 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":507:25) +#loc316 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":508:25) +#loc317 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":509:92) +#loc318 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":510:25) +#loc319 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":511:24) +#loc320 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":512:24) +#loc321 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":513:39) +#loc322 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":514:25) +#loc323 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":515:24) +#loc324 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":516:24) +#loc325 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":521:69) +#loc326 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":524:27) +#loc327 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":525:39) +#loc328 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":525:21) +#loc329 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":528:104) +#loc330 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":530:20) +#loc331 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":531:22) +#loc332 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":531:19) +#loc333 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":531:14) +#loc334 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":538:39) +#loc335 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":538:50) +#loc336 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":538:71) +#loc337 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":542:32) +#loc338 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":542:43) +#loc339 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":542:62) +#loc340 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":542:73) +#loc341 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":542:54) +#loc342 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":549:43) +#loc343 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":551:15) +#loc344 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":553:30) +#loc345 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":553:21) +#loc346 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":553:10) +#loc347 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":555:11) +#loc348 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":555:4) +#loc349 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":831:41) +#loc350 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":831:52) +#loc351 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":831:23) +#loc352 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":831:15) +#loc354 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":798:21) +#loc355 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":798:11) +#loc356 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":798:4) +#loc358 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":788:33) +#loc359 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":789:38) +#loc360 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":789:24) +#loc361 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":790:109) +#loc362 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":790:113) +#loc363 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":790:39) +#loc364 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":790:55) +#loc365 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":790:25) +#loc366 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":791:30) +#loc367 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":791:35) +#loc368 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":791:60) +#loc369 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":792:34) +#loc370 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":792:48) +#loc371 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":792:63) +#loc372 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":793:29) +#loc373 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":793:47) +#loc374 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":793:61) +#loc375 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":793:42) +#loc376 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":794:11) +#loc377 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":794:4) +#loc379 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":598:26) +#loc380 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":599:26) +#loc381 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":601:26) +#loc382 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":601:37) +#loc383 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":601:18) +#loc384 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":601:56) +#loc385 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":601:67) +#loc386 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":601:49) +#loc387 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":602:27) +#loc388 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":602:38) +#loc389 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":602:19) +#loc390 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":602:58) +#loc391 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":602:69) +#loc392 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":602:51) +#loc393 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":608:42) +#loc394 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":608:87) +#loc395 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":608:98) +#loc396 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":608:61) +#loc397 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":610:28) +#loc398 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":618:12) +#loc399 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":623:62) +#loc400 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":626:28) +#loc401 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":626:19) +#loc402 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":627:28) +#loc403 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":627:19) +#loc404 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":628:19) +#loc405 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":628:8) +#loc406 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":630:11) +#loc407 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":630:4) +#loc409 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":669:105) +#loc410 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":674:52) +#loc411 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":674:28) +#loc412 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":674:22) +#loc413 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":675:26) +#loc414 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":675:46) +#loc415 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":676:20) +#loc416 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":678:15) +#loc417 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":680:36) +#loc418 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":680:46) +#loc419 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":683:36) +#loc420 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":683:46) +#loc421 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":692:43) +#loc422 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":692:54) +#loc423 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":692:78) +#loc424 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":695:36) +#loc425 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":698:25) +#loc426 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":699:25) +#loc427 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":701:35) +#loc428 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":701:24) +#loc429 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":702:24) +#loc430 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":703:25) +#loc431 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":704:24) +#loc432 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":705:24) +#loc433 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":706:24) +#loc434 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":707:24) +#loc435 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":709:25) +#loc436 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":710:25) +#loc437 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":711:32) +#loc438 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":712:25) +#loc439 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":713:92) +#loc440 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":714:92) +#loc441 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":715:25) +#loc442 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":716:24) +#loc443 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":717:24) +#loc444 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":718:39) +#loc445 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":719:25) +#loc446 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":720:24) +#loc447 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":721:24) +#loc448 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":722:24) +#loc449 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":723:25) +#loc450 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":724:25) +#loc451 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":725:92) +#loc452 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":726:25) +#loc453 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":727:24) +#loc454 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":728:24) +#loc455 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":729:39) +#loc456 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":730:25) +#loc457 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":731:24) +#loc458 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":732:24) +#loc459 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":736:69) +#loc460 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":739:27) +#loc461 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":740:44) +#loc462 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":740:40) +#loc463 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":740:22) +#loc464 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":741:99) +#loc465 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":744:24) +#loc466 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":744:43) +#loc467 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":744:10) +#loc468 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":748:53) +#loc469 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":748:29) +#loc470 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":748:21) +#loc471 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":750:29) +#loc472 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":750:20) +#loc473 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":751:25) +#loc474 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":751:22) +#loc475 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":751:16) +#loc476 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":759:39) +#loc477 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":759:50) +#loc478 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":759:70) +#loc479 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":773:45) +#loc480 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":775:24) +#loc481 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":775:52) +#loc482 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":775:43) +#loc483 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":775:10) +#loc484 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":777:11) +#loc485 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":777:4) +#loc513 = loc("ZQ"(#loc27)) +#loc514 = loc("HQ"(#loc28)) +#loc515 = loc("HKV"(#loc29)) +#loc516 = loc("ZKV"(#loc30)) +#loc517 = loc("pid"(#loc31)) +#loc518 = loc("NUM_KV_BLOCKS"(#loc32)) +#loc519 = loc("NUM_Q_BLOCKS"(#loc33)) +#loc520 = loc("off_zq"(#loc34)) +#loc521 = loc("off_hkv"(#loc35)) +#loc522 = loc("off_zkv"(#loc36)) +#loc523 = loc("SPARSE_Z"(#loc37)) +#loc524 = loc("SPARSE_HQ"(#loc38)) +#loc525 = loc("sparse_idx_z"(#loc39)) +#loc526 = loc("k_adj"(#loc40)) +#loc527 = 
loc("k_adj"(#loc41)) +#loc528 = loc("k_adj"(#loc42)) +#loc529 = loc("k_adj"(#loc43)) +#loc530 = loc("v_adj"(#loc44)) +#loc531 = loc("v_adj"(#loc45)) +#loc532 = loc("v_adj"(#loc46)) +#loc533 = loc("v_adj"(#loc47)) +#loc534 = loc("dv_adj"(#loc48)) +#loc535 = loc("dv_adj"(#loc49)) +#loc536 = loc("dv_adj"(#loc50)) +#loc537 = loc("dv_adj"(#loc51)) +#loc538 = loc("K"(#loc52)) +#loc539 = loc("V"(#loc53)) +#loc540 = loc("DV"(#loc54)) +#loc541 = loc("RCP_LN2"(#loc55)) +#loc542 = loc("offs_k"(#loc56)) +#loc543 = loc("offs_v"(#loc57)) +#loc544 = loc("off_pid"(#loc60)) +#loc545 = loc("SPARSE_Q_MULTIPLE"(#loc61)) +#loc546 = loc("SPARSE_KV_MULTIPLE"(#loc62)) +#loc547 = loc("off_hq2"(#loc63)) +#loc548 = loc("off_hq2"(#loc64)) +#loc549 = loc("off_hq2"(#loc65)) +#loc550 = loc("start_m2_block"(#loc66)) +#loc551 = loc("off_pid_mask"(#loc67)) +#loc552 = loc("stride_kv_idx_h"(#loc68)) +#loc553 = loc("sparse_idx_hq2"(#loc69)) +#loc554 = loc("sparse_hz_offset"(#loc70)) +#loc555 = loc("sparse_hz_offset"(#loc71)) +#loc556 = loc("sparse_kv_num_blks_offset"(#loc72)) +#loc557 = loc("sparse_kv_num_blks_offset"(#loc73)) +#loc558 = loc("sparse_kv_idx_offset"(#loc74)) +#loc559 = loc("sparse_kv_idx_offset"(#loc75)) +#loc560 = loc("sparse_kv_idx_offset"(#loc76)) +#loc561 = loc("q_adj2"(#loc77)) +#loc562 = loc("q_adj2"(#loc78)) +#loc563 = loc("q_adj2"(#loc79)) +#loc564 = loc("q_adj2"(#loc80)) +#loc565 = loc("do_adj2"(#loc81)) +#loc566 = loc("do_adj2"(#loc82)) +#loc567 = loc("do_adj2"(#loc83)) +#loc568 = loc("do_adj2"(#loc84)) +#loc569 = loc("dq_adj2"(#loc85)) +#loc570 = loc("dq_adj2"(#loc86)) +#loc571 = loc("dq_adj2"(#loc87)) +#loc572 = loc("dq_adj2"(#loc88)) +#loc573 = loc("off_chz2"(#loc89)) +#loc574 = loc("off_chz2"(#loc90)) +#loc575 = loc("off_chz2"(#loc91)) +#loc576 = loc("off_chz2"(#loc92)) +#loc577 = loc("Q2"(#loc93)) +#loc578 = loc("DO2"(#loc94)) +#loc579 = loc("DQ2"(#loc95)) +#loc580 = loc("LSE2"(#loc96)) +#loc581 = loc("DELTA2"(#loc97)) +#loc582 = loc("dq"(#loc98)) +#loc583 = 
loc("start_m2"(#loc99)) +#loc584 = loc("offs_m2"(#loc100)) +#loc585 = loc("offs_m2"(#loc101)) +#loc586 = loc("q"(#loc102)) +#loc587 = loc("do"(#loc103)) +#loc588 = loc("Di"(#loc104)) +#loc589 = loc("Di"(#loc105)) +#loc590 = loc("Di"(#loc106)) +#loc591 = loc("lse"(#loc107)) +#loc592 = loc("lse"(#loc108)) +#loc593 = loc("lse"(#loc109)) +#loc594 = loc("lse"(#loc110)) +#loc595 = loc("lse"(#loc111)) +#loc596 = loc("lse"(#loc112)) +#loc597 = loc("kv_indices"(#loc113)) +#loc598 = loc("kv_start"(#loc114)) +#loc599 = loc("kv_start"(#loc115)) +#loc600 = loc("sparse_kv_num_blocks"(#loc116)) +#loc601 = loc("sparse_kv_num_blocks"(#loc117)) +#loc602 = loc("offs_n2"(#loc118)) +#loc603 = loc("offs_n2"(#loc119)) +#loc604 = loc("dq"(#loc120)) +#loc605 = loc("kv_indices"(#loc121)) +#loc606 = loc("kv_start"(#loc122)) +#loc607 = loc("kv_start"(#loc123)) +#loc608 = loc("sparse_kv_num_blocks"(#loc124)) +#loc609 = loc("sparse_kv_num_blocks"(#loc125)) +#loc610 = loc("offs_n2"(#loc126)) +#loc611 = loc("offs_n2"(#loc127)) +#loc612 = loc("dq"(#loc128)) +#loc613 = loc("dq_ptrs"(#loc129)) +#loc614 = loc("dq_ptrs"(#loc130)) +#loc615 = loc("dq_ptrs"(#loc131)) +#loc616 = loc("dq_ptrs"(#loc132)) +#loc617 = loc("dq_ptrs"(#loc133)) +#loc618 = loc("dq_ptrs"(#loc134)) +#loc619 = loc("dq"(#loc135)) +#loc620 = loc("SPARSE_Q_MULTIPLE"(#loc142)) +#loc621 = loc("SPARSE_KV_MULTIPLE"(#loc143)) +#loc622 = loc("pid_mask"(#loc144)) +#loc623 = loc("stride_q_idx_h"(#loc145)) +#loc624 = loc("dv"(#loc146)) +#loc625 = loc("dk"(#loc147)) +#loc626 = loc("start_n1"(#loc148)) +#loc627 = loc("offs_n1"(#loc149)) +#loc628 = loc("offs_n1"(#loc150)) +#loc629 = loc("k"(#loc151)) +#loc630 = loc("v"(#loc152)) +#loc631 = loc("dv"(#loc153)) +#loc632 = loc("off_hq1"(#loc154)) +#loc633 = loc("off_hq1"(#loc155)) +#loc634 = loc("q_adj1"(#loc156)) +#loc635 = loc("q_adj1"(#loc157)) +#loc636 = loc("q_adj1"(#loc158)) +#loc637 = loc("q_adj1"(#loc159)) +#loc638 = loc("do_adj1"(#loc160)) +#loc639 = loc("do_adj1"(#loc161)) +#loc640 = 
loc("do_adj1"(#loc162)) +#loc641 = loc("do_adj1"(#loc163)) +#loc642 = loc("dq_adj1"(#loc164)) +#loc643 = loc("dq_adj1"(#loc165)) +#loc644 = loc("dq_adj1"(#loc166)) +#loc645 = loc("dq_adj1"(#loc167)) +#loc646 = loc("off_chz1"(#loc168)) +#loc647 = loc("off_chz1"(#loc169)) +#loc648 = loc("off_chz1"(#loc170)) +#loc649 = loc("off_chz1"(#loc171)) +#loc650 = loc("Q1"(#loc172)) +#loc651 = loc("DO1"(#loc173)) +#loc652 = loc("LSE1"(#loc174)) +#loc653 = loc("DELTA1"(#loc175)) +#loc654 = loc("sparse_idx_hq1"(#loc176)) +#loc655 = loc("sparse_hz_offset"(#loc177)) +#loc656 = loc("sparse_hz_offset"(#loc178)) +#loc657 = loc("sparse_q_num_blks_offset"(#loc179)) +#loc658 = loc("sparse_q_num_blks_offset"(#loc180)) +#loc659 = loc("sparse_q_idx_offset"(#loc181)) +#loc660 = loc("sparse_q_idx_offset"(#loc182)) +#loc661 = loc("sparse_q_idx_offset"(#loc183)) +#loc662 = loc("q_indices"(#loc184)) +#loc663 = loc("q_start"(#loc185)) +#loc664 = loc("q_start"(#loc186)) +#loc665 = loc("sparse_q_num_blocks"(#loc187)) +#loc666 = loc("sparse_q_num_blocks"(#loc188)) +#loc667 = loc("offs_m1"(#loc189)) +#loc668 = loc("offs_m1"(#loc190)) +#loc669 = loc("q_indices"(#loc192)) +#loc670 = loc("q_start"(#loc193)) +#loc671 = loc("q_start"(#loc194)) +#loc672 = loc("sparse_q_num_blocks"(#loc195)) +#loc673 = loc("sparse_q_num_blocks"(#loc196)) +#loc674 = loc("offs_m1"(#loc197)) +#loc675 = loc("offs_m1"(#loc198)) +#loc676 = loc("dv_ptrs"(#loc201)) +#loc677 = loc("dv_ptrs"(#loc202)) +#loc678 = loc("dv_ptrs"(#loc203)) +#loc679 = loc("dv_ptrs"(#loc204)) +#loc680 = loc("dv_ptrs"(#loc205)) +#loc681 = loc("dv_ptrs"(#loc206)) +#loc682 = loc("index_n"(#loc207)) +#loc683 = loc("index_k"(#loc208)) +#loc684 = loc("index_v"(#loc209)) +#loc685 = loc("dk"(#loc214)) +#loc686 = loc("mask"(#loc215)) +#loc687 = loc("xindex"(#loc216)) +#loc688 = loc("xindex"(#loc217)) +#loc689 = loc("xindex"(#loc218)) +#loc690 = loc("xindex"(#loc219)) +#loc691 = loc("xindex"(#loc220)) +#loc692 = loc("xindex"(#loc221)) +#loc693 = 
loc("xindex"(#loc222)) +#loc694 = loc("xindex"(#loc223)) +#loc702 = loc("ptr"(#loc238)) +#loc703 = loc("ptr"(#loc239)) +#loc704 = loc("ptr"(#loc240)) +#loc705 = loc("ptr"(#loc241)) +#loc706 = loc("ptr"(#loc242)) +#loc707 = loc("ptr"(#loc243)) +#loc752 = loc("offs_k"(#loc250)) +#loc753 = loc("offs_v"(#loc251)) +#loc754 = loc("kT_ptrs"(#loc252)) +#loc755 = loc("kT_ptrs"(#loc253)) +#loc756 = loc("kT_ptrs"(#loc254)) +#loc757 = loc("kT_ptrs"(#loc255)) +#loc758 = loc("kT_ptrs"(#loc256)) +#loc759 = loc("kT_ptrs"(#loc257)) +#loc760 = loc("vT_ptrs"(#loc258)) +#loc761 = loc("vT_ptrs"(#loc259)) +#loc762 = loc("vT_ptrs"(#loc260)) +#loc763 = loc("vT_ptrs"(#loc261)) +#loc764 = loc("vT_ptrs"(#loc262)) +#loc765 = loc("vT_ptrs"(#loc263)) +#loc766 = loc("hi"(#loc264)) +#loc767 = loc("hi"(#loc265)) +#loc768 = loc("hi"(#loc266)) +#loc769 = loc("hi"(#loc267)) +#loc770 = loc("dq"(#loc268)) +#loc771 = loc("dq"(#loc269)) +#loc772 = loc("offset"(#loc270)) +#loc773 = loc("kT_ptrs"(#loc271)) +#loc774 = loc("kT_ptrs"(#loc272)) +#loc775 = loc("vT_ptrs"(#loc273)) +#loc776 = loc("vT_ptrs"(#loc274)) +#loc777 = loc("offs_n2"(#loc275)) +#loc826 = loc("kT"(#loc280)) +#loc827 = loc("qk"(#loc281)) +#loc828 = loc("qk"(#loc282)) +#loc829 = loc("n"(#loc283)) +#loc830 = loc("n"(#loc284)) +#loc831 = loc("m"(#loc285)) +#loc832 = loc("m"(#loc286)) +#loc833 = loc("post_mod_scores"(#loc287)) +#loc834 = loc("post_mod_scores"(#loc288)) +#loc835 = loc("post_mod_scores"(#loc289)) +#loc836 = loc("tmp1"(#loc290)) +#loc837 = loc("tmp4"(#loc291)) +#loc838 = loc("tmp5"(#loc292)) +#loc839 = loc("tmp7"(#loc293)) +#loc840 = loc("tmp7"(#loc294)) +#loc841 = loc("tmp8"(#loc295)) +#loc842 = loc("tmp9"(#loc296)) +#loc843 = loc("tmp10"(#loc297)) +#loc844 = loc("tmp11"(#loc298)) +#loc845 = loc("tmp12"(#loc299)) +#loc846 = loc("tmp13"(#loc300)) +#loc847 = loc("tmp15"(#loc301)) +#loc848 = loc("tmp16"(#loc302)) +#loc849 = loc("tmp17"(#loc303)) +#loc850 = loc("tmp18"(#loc304)) +#loc851 = loc("tmp19"(#loc305)) +#loc852 = 
loc("tmp20"(#loc306)) +#loc853 = loc("tmp21"(#loc307)) +#loc854 = loc("tmp22"(#loc308)) +#loc855 = loc("tmp23"(#loc309)) +#loc856 = loc("tmp24"(#loc310)) +#loc857 = loc("tmp25"(#loc311)) +#loc858 = loc("tmp26"(#loc312)) +#loc859 = loc("tmp27"(#loc313)) +#loc860 = loc("tmp28"(#loc314)) +#loc861 = loc("tmp29"(#loc315)) +#loc862 = loc("tmp30"(#loc316)) +#loc863 = loc("tmp31"(#loc317)) +#loc864 = loc("tmp32"(#loc318)) +#loc865 = loc("tmp33"(#loc319)) +#loc866 = loc("tmp34"(#loc320)) +#loc867 = loc("tmp35"(#loc321)) +#loc868 = loc("tmp36"(#loc322)) +#loc869 = loc("tmp37"(#loc323)) +#loc870 = loc("tmp38"(#loc324)) +#loc871 = loc("post_mod_scores"(#loc325)) +#loc872 = loc("post_mod_scores"(#loc326)) +#loc873 = loc("p"(#loc327)) +#loc874 = loc("p"(#loc328)) +#loc875 = loc("vT"(#loc329)) +#loc876 = loc("dp"(#loc330)) +#loc877 = loc("ds"(#loc331)) +#loc878 = loc("ds"(#loc332)) +#loc879 = loc("ds"(#loc333)) +#loc880 = loc("grad_scores"(#loc334)) +#loc881 = loc("grad_scores"(#loc335)) +#loc882 = loc("grad_scores"(#loc336)) +#loc883 = loc("scatter_mask"(#loc337)) +#loc884 = loc("scatter_mask"(#loc338)) +#loc885 = loc("scatter_mask"(#loc339)) +#loc886 = loc("scatter_mask"(#loc340)) +#loc887 = loc("scatter_mask"(#loc341)) +#loc888 = loc("ds"(#loc342)) +#loc889 = loc("ds"(#loc343)) +#loc890 = loc("dq"(#loc344)) +#loc891 = loc("dq"(#loc345)) +#loc892 = loc("dq"(#loc346)) +#loc899 = loc("cur_block_idx"(#loc358)) +#loc900 = loc("cur_block"(#loc359)) +#loc901 = loc("cur_block"(#loc360)) +#loc902 = loc("next_block"(#loc361)) +#loc903 = loc("next_block"(#loc362)) +#loc904 = loc("next_block"(#loc363)) +#loc905 = loc("next_block"(#loc364)) +#loc906 = loc("next_block"(#loc365)) +#loc907 = loc("needs_jump"(#loc366)) +#loc908 = loc("needs_jump"(#loc367)) +#loc909 = loc("needs_jump"(#loc368)) +#loc910 = loc("jump_to_block"(#loc369)) +#loc911 = loc("jump_to_block"(#loc370)) +#loc912 = loc("jump_to_block"(#loc371)) +#loc913 = loc("offset"(#loc372)) +#loc914 = loc("offset"(#loc373)) +#loc915 = 
loc("offset"(#loc374)) +#loc916 = loc("offset"(#loc375)) +#loc962 = loc("offs_k"(#loc379)) +#loc963 = loc("offs_v"(#loc380)) +#loc964 = loc("qT_ptrs"(#loc381)) +#loc965 = loc("qT_ptrs"(#loc382)) +#loc966 = loc("qT_ptrs"(#loc383)) +#loc967 = loc("qT_ptrs"(#loc384)) +#loc968 = loc("qT_ptrs"(#loc385)) +#loc969 = loc("qT_ptrs"(#loc386)) +#loc970 = loc("do_ptrs"(#loc387)) +#loc971 = loc("do_ptrs"(#loc388)) +#loc972 = loc("do_ptrs"(#loc389)) +#loc973 = loc("do_ptrs"(#loc390)) +#loc974 = loc("do_ptrs"(#loc391)) +#loc975 = loc("do_ptrs"(#loc392)) +#loc976 = loc("hi"(#loc393)) +#loc977 = loc("hi"(#loc394)) +#loc978 = loc("hi"(#loc395)) +#loc979 = loc("hi"(#loc396)) +#loc980 = loc("dk"(#loc397)) +#loc981 = loc("offset"(#loc399)) +#loc982 = loc("qT_ptrs"(#loc400)) +#loc983 = loc("qT_ptrs"(#loc401)) +#loc984 = loc("do_ptrs"(#loc402)) +#loc985 = loc("do_ptrs"(#loc403)) +#loc986 = loc("offs_m1"(#loc404)) +#loc1036 = loc("qT"(#loc409)) +#loc1037 = loc("lse"(#loc410)) +#loc1038 = loc("lse"(#loc411)) +#loc1039 = loc("lse"(#loc412)) +#loc1040 = loc("lse"(#loc413)) +#loc1041 = loc("lse"(#loc414)) +#loc1042 = loc("qkT"(#loc415)) +#loc1043 = loc("qkT"(#loc416)) +#loc1044 = loc("m"(#loc417)) +#loc1045 = loc("m"(#loc418)) +#loc1046 = loc("n"(#loc419)) +#loc1047 = loc("n"(#loc420)) +#loc1048 = loc("post_mod_scores"(#loc421)) +#loc1049 = loc("post_mod_scores"(#loc422)) +#loc1050 = loc("post_mod_scores"(#loc423)) +#loc1051 = loc("tmp41"(#loc424)) +#loc1052 = loc("tmp44"(#loc425)) +#loc1053 = loc("tmp45"(#loc426)) +#loc1054 = loc("tmp47"(#loc427)) +#loc1055 = loc("tmp47"(#loc428)) +#loc1056 = loc("tmp48"(#loc429)) +#loc1057 = loc("tmp49"(#loc430)) +#loc1058 = loc("tmp50"(#loc431)) +#loc1059 = loc("tmp51"(#loc432)) +#loc1060 = loc("tmp52"(#loc433)) +#loc1061 = loc("tmp53"(#loc434)) +#loc1062 = loc("tmp55"(#loc435)) +#loc1063 = loc("tmp56"(#loc436)) +#loc1064 = loc("tmp57"(#loc437)) +#loc1065 = loc("tmp58"(#loc438)) +#loc1066 = loc("tmp59"(#loc439)) +#loc1067 = loc("tmp60"(#loc440)) +#loc1068 
= loc("tmp61"(#loc441)) +#loc1069 = loc("tmp62"(#loc442)) +#loc1070 = loc("tmp63"(#loc443)) +#loc1071 = loc("tmp64"(#loc444)) +#loc1072 = loc("tmp65"(#loc445)) +#loc1073 = loc("tmp66"(#loc446)) +#loc1074 = loc("tmp67"(#loc447)) +#loc1075 = loc("tmp68"(#loc448)) +#loc1076 = loc("tmp69"(#loc449)) +#loc1077 = loc("tmp70"(#loc450)) +#loc1078 = loc("tmp71"(#loc451)) +#loc1079 = loc("tmp72"(#loc452)) +#loc1080 = loc("tmp73"(#loc453)) +#loc1081 = loc("tmp74"(#loc454)) +#loc1082 = loc("tmp75"(#loc455)) +#loc1083 = loc("tmp76"(#loc456)) +#loc1084 = loc("tmp77"(#loc457)) +#loc1085 = loc("tmp78"(#loc458)) +#loc1086 = loc("post_mod_scores"(#loc459)) +#loc1087 = loc("post_mod_scores"(#loc460)) +#loc1088 = loc("pT"(#loc461)) +#loc1089 = loc("pT"(#loc462)) +#loc1090 = loc("pT"(#loc463)) +#loc1091 = loc("do"(#loc464)) +#loc1092 = loc("dv"(#loc465)) +#loc1093 = loc("dv"(#loc466)) +#loc1094 = loc("dv"(#loc467)) +#loc1095 = loc("Di"(#loc468)) +#loc1096 = loc("Di"(#loc469)) +#loc1097 = loc("Di"(#loc470)) +#loc1098 = loc("dpT"(#loc471)) +#loc1099 = loc("dpT"(#loc472)) +#loc1100 = loc("dsT"(#loc473)) +#loc1101 = loc("dsT"(#loc474)) +#loc1102 = loc("dsT"(#loc475)) +#loc1103 = loc("grad_scores"(#loc476)) +#loc1104 = loc("grad_scores"(#loc477)) +#loc1105 = loc("grad_scores"(#loc478)) +#loc1106 = loc("dsT"(#loc479)) +#loc1107 = loc("dk"(#loc480)) +#loc1108 = loc("dk"(#loc481)) +#loc1109 = loc("dk"(#loc482)) +#loc1110 = loc("dk"(#loc483)) +#loc1111 = loc("SPARSE_Q_MULTIPLE"(#loc545)) +#loc1112 = loc("SPARSE_KV_MULTIPLE"(#loc546)) +#loc1113 = loc("SPARSE_Q_MULTIPLE"(#loc620)) +#loc1114 = loc("SPARSE_KV_MULTIPLE"(#loc621)) +#loc1115 = loc("dk"(#loc631)) +#loc1116 = loc("offs_n2"(#loc770)) +#loc1117 = loc("dv"(#loc980)) +#loc1118 = loc("kT_ptrs"(#loc1116)) +#loc1119 = loc("offs_m1"(#loc1117)) +#loc1120 = loc("vT_ptrs"(#loc1118)) +#loc1121 = loc("qT_ptrs"(#loc1119)) +#loc1122 = loc("do_ptrs"(#loc1121)) diff --git 
a/SpecForge-ext/cache/compiled_kernels/triton/7/ISV24HMA4J4LKKVO22GYN5K43WAJJLKWYPR3LOEREI3KLJOVDZ5Q/triton_tem_fused_zeros_1.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/7/ISV24HMA4J4LKKVO22GYN5K43WAJJLKWYPR3LOEREI3KLJOVDZ5Q/triton_tem_fused_zeros_1.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..bbedc4af0caab753b1e650673230f1b02e7080bd --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/ISV24HMA4J4LKKVO22GYN5K43WAJJLKWYPR3LOEREI3KLJOVDZ5Q/triton_tem_fused_zeros_1.ttgir @@ -0,0 +1,2069 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [2, 16], warpsPerCTA = [8, 1], order = [1, 0]}> +#blocked1 = #ttg.blocked<{sizePerThread = [8, 1], threadsPerWarp = [16, 2], warpsPerCTA = [1, 8], order = [0, 1]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":18:0) +#mma = #ttg.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [8, 1], instrShape = [16, 64, 16]}> +#mma1 = #ttg.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [8, 1], instrShape = [16, 128, 16]}> +#shared = #ttg.nvmma_shared<{swizzlingByteWidth = 128, transposed = false, elementBitWidth = 16}> +#shared1 = #ttg.nvmma_shared<{swizzlingByteWidth = 128, transposed = true, elementBitWidth = 16}> +#shared2 = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [0]}> +#smem = #ttg.shared_memory +#loc338 = loc("arg_Q"(#loc)) +#loc339 = loc("arg_K"(#loc)) +#loc340 = loc("arg_V"(#loc)) +#loc341 = loc("arg_LSE"(#loc)) +#loc342 = loc("arg_DELTA"(#loc)) +#loc343 = loc("arg_DO"(#loc)) +#loc344 = loc("arg_DQ"(#loc)) +#loc345 = loc("arg_DV"(#loc)) +#loc346 = loc("arg_KV_NUM_BLKS"(#loc)) +#loc347 = loc("arg_KV_IDX"(#loc)) +#loc348 = loc("arg_Q_NUM_BLKS"(#loc)) +#loc349 = loc("arg_Q_IDX"(#loc)) +#loc350 = loc("arg_FULL_KV_NUM_BLKS"(#loc)) +#loc351 = loc("arg_FULL_KV_IDX"(#loc)) +#loc352 = loc("arg_FULL_Q_NUM_BLKS"(#loc)) +#loc353 = 
loc("arg_FULL_Q_IDX"(#loc)) +#loc354 = loc("in_ptr16"(#loc)) +#loc355 = loc("out_ptr0"(#loc)) +#loc356 = loc("ks0"(#loc)) +#loc357 = loc("ks1"(#loc)) +#loc358 = loc("ks2"(#loc)) +#loc359 = loc("ks3"(#loc)) +#loc360 = loc("ks4"(#loc)) +#loc361 = loc("ks5"(#loc)) +#loc362 = loc("ks6"(#loc)) +#loc363 = loc("ks7"(#loc)) +#loc364 = loc("ks8"(#loc)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_tem_fused_zeros_1(%arg_Q: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_Q"(#loc)), %arg_K: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_K"(#loc)), %arg_V: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_V"(#loc)), %arg_LSE: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_LSE"(#loc)), %arg_DELTA: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_DELTA"(#loc)), %arg_DO: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_DO"(#loc)), %arg_DQ: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_DQ"(#loc)), %arg_DV: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_DV"(#loc)), %arg_KV_NUM_BLKS: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_KV_NUM_BLKS"(#loc)), %arg_KV_IDX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_KV_IDX"(#loc)), %arg_Q_NUM_BLKS: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_Q_NUM_BLKS"(#loc)), %arg_Q_IDX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_Q_IDX"(#loc)), %arg_FULL_KV_NUM_BLKS: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_FULL_KV_NUM_BLKS"(#loc)), %arg_FULL_KV_IDX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_FULL_KV_IDX"(#loc)), %arg_FULL_Q_NUM_BLKS: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_FULL_Q_NUM_BLKS"(#loc)), %arg_FULL_Q_IDX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_FULL_Q_IDX"(#loc)), %in_ptr16: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr16"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i32 loc("ks0"(#loc)), %ks1: i32 loc("ks1"(#loc)), %ks2: i32 loc("ks2"(#loc)), %ks3: i32 loc("ks3"(#loc)), %ks4: i32 
loc("ks4"(#loc)), %ks5: i32 loc("ks5"(#loc)), %ks6: i32 loc("ks6"(#loc)), %ks7: i32 loc("ks7"(#loc)), %ks8: i32 loc("ks8"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0> : tensor<128x1xi32, #mma> loc(#loc1) + %cst_0 = arith.constant dense<128> : tensor<128x1xi32, #blocked> loc(#loc1) + %cst_1 = arith.constant dense<128> : tensor<1x128xi32, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<4096> : tensor<128x1xi32, #blocked> loc(#loc1) + %c4_i32 = arith.constant 4 : i32 loc(#loc1) + %c32_i32 = arith.constant 32 : i32 loc(#loc1) + %c8_i32 = arith.constant 8 : i32 loc(#loc1) + %c1024_i32 = arith.constant 1024 : i32 loc(#loc1) + %c1_i32 = arith.constant 1 : i32 loc(#loc1) + %c128_i32 = arith.constant 128 : i32 loc(#loc1) + %c4096_i32 = arith.constant 4096 : i32 loc(#loc1) + %cst_3 = arith.constant dense<0.000000e+00> : tensor<128x128xbf16, #blocked> loc(#loc1) + %cst_4 = arith.constant dense<0.000000e+00> : tensor<128x64xbf16, #blocked1> loc(#loc1) + %cst_5 = arith.constant dense<0.000000e+00> : tensor<64x128xbf16, #blocked> loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %c127_i32 = arith.constant 127 : i32 loc(#loc1) + %c2_i32 = arith.constant 2 : i32 loc(#loc1) + %c64_i32 = arith.constant 64 : i32 loc(#loc1) + %c63_i32 = arith.constant 63 : i32 loc(#loc1) + %cst_6 = arith.constant dense<0.0883883461> : tensor<128x128xf32, #mma1> loc(#loc1) + %cst_7 = arith.constant dense<0.000000e+00> : tensor<128x128xf32, #mma1> loc(#loc1) + %cst_8 = arith.constant dense<0.000000e+00> : tensor<128x64xf32, #mma> loc(#loc1) + %cst_9 = arith.constant dense<0.0883883461> : tensor<128x64xf32, #mma> loc(#loc1) + %cst_10 = arith.constant dense<0xFF800000> : tensor<128x64xf32, #mma> loc(#loc1) + %cst_11 = arith.constant dense<1.44269502> : tensor<128x64xf32, #mma> loc(#loc1) + %true = arith.constant true loc(#loc1) + %c-1_i32 = arith.constant -1 : i32 loc(#loc1) + %c3_i32 = arith.constant 3 : i32 loc(#loc1) + %cst_12 = arith.constant dense<8192> : 
tensor<128x64xi32, #blocked1> loc(#loc1) + %cst_13 = arith.constant dense<64> : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc1) + %cst_14 = arith.constant dense<262144> : tensor<128x64xi32, #blocked1> loc(#loc1) + %cst_15 = arith.constant dense<8192> : tensor<64x128xi32, #blocked> loc(#loc1) + %cst_16 = arith.constant dense<64> : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc1) + %cst_17 = arith.constant dense<64> : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc1) + %cst_18 = arith.constant dense<128> : tensor<64x1xi32, #blocked> loc(#loc1) + %cst_19 = arith.constant dense<4096> : tensor<1x64xi32, #blocked1> loc(#loc1) + %cst_20 = arith.constant dense<128> : tensor<1x64xi32, #blocked1> loc(#loc1) + %cst_21 = arith.constant dense<0xFF800000> : tensor<64xf32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc1) + %cst_22 = arith.constant dense<0.000000e+00> : tensor<64xf32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc1) + %cst_23 = arith.constant dense<0> : tensor<128x64xi32, #mma> loc(#loc1) + %cst_24 = arith.constant dense<0> : tensor<1x64xi32, #mma> loc(#loc1) + %cst_25 = arith.constant dense<0.000000e+00> : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc1) + %cst_26 = arith.constant dense<0xFF800000> : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc1) + %0 = arith.muli %ks0, %c4096_i32 : i32 loc(#loc2) + %1 = arith.muli %ks1, %c1024_i32 : i32 loc(#loc3) + %2 = arith.muli %ks1, %c128_i32 : i32 loc(#loc4) + %3 = arith.cmpi sle, %ks0, %c1_i32 : i32 loc(#loc5) + %4 = arith.extui %3 : i1 to i32 loc(#loc6) + %5 = arith.cmpi sgt, %ks0, %c1_i32 : i32 loc(#loc7) + %6 = arith.extui %5 : i1 to i32 loc(#loc8) + %7 = arith.muli %ks0, %6 : i32 loc(#loc8) + %8 = arith.addi %4, %7 : i32 loc(#loc9) + %9 = arith.muli %8, %c4096_i32 : i32 loc(#loc10) + %10 = arith.muli %8, %c128_i32 : i32 loc(#loc11) + %pid = tt.get_program_id x : i32 loc(#loc365) + %NUM_KV_BLOCKS = arith.addi %ks1, %c127_i32 : i32 
loc(#loc669) + %NUM_KV_BLOCKS_27 = arith.divsi %NUM_KV_BLOCKS, %c128_i32 : i32 loc(#loc670) + %NUM_Q_BLOCKS = arith.addi %ks0, %c127_i32 : i32 loc(#loc671) + %NUM_Q_BLOCKS_28 = arith.divsi %NUM_Q_BLOCKS, %c128_i32 : i32 loc(#loc672) + %off_zq = tt.get_program_id y : i32 loc(#loc368) + %off_hkv = tt.get_program_id z : i32 loc(#loc369) + %off_zkv = arith.remsi %off_zq, %c8_i32 : i32 loc(#loc370) + %k_adj = arith.muli %2, %off_hkv : i32 loc(#loc371) + %k_adj_29 = arith.muli %1, %off_zkv : i32 loc(#loc372) + %k_adj_30 = arith.addi %k_adj, %k_adj_29 : i32 loc(#loc373) + %k_adj_31 = arith.extsi %k_adj_30 : i32 to i64 loc(#loc374) + %dv_adj = arith.muli %1, %off_zq : i32 loc(#loc375) + %dv_adj_32 = arith.addi %k_adj, %dv_adj : i32 loc(#loc376) + %dv_adj_33 = arith.extsi %dv_adj_32 : i32 to i64 loc(#loc377) + %K = tt.addptr %arg_K, %k_adj_31 : !tt.ptr, i64 loc(#loc378) + %V = tt.addptr %arg_V, %k_adj_31 : !tt.ptr, i64 loc(#loc379) + %DV = tt.addptr %arg_DV, %dv_adj_33 : !tt.ptr, i64 loc(#loc380) + %offs_k = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc381) + %offs_k_34 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc381) + %11 = arith.cmpi sge, %pid, %NUM_KV_BLOCKS_27 : i32 loc(#loc31) + scf.if %11 { + %off_pid = arith.subi %pid, %NUM_KV_BLOCKS_27 : i32 loc(#loc382) + %off_hq2 = arith.divsi %off_pid, %NUM_Q_BLOCKS_28 : i32 loc(#loc383) + %off_hq2_35 = arith.muli %off_hkv, %c4_i32 : i32 loc(#loc384) + %off_hq2_36 = arith.addi %off_hq2, %off_hq2_35 : i32 loc(#loc385) + %start_m2_block = arith.remsi %off_pid, %NUM_Q_BLOCKS_28 : i32 loc(#loc386) + %stride_kv_idx_h = arith.muli %ks3, %ks4 : i32 loc(#loc387) + %sparse_kv_num_blks_offset = arith.muli %off_zkv, %ks2 : i32 loc(#loc388) + %sparse_kv_num_blks_offset_37 = arith.addi %sparse_kv_num_blks_offset, %start_m2_block : i32 loc(#loc389) + %sparse_kv_idx_offset = arith.muli %off_zkv, 
%stride_kv_idx_h : i32 loc(#loc390) + %sparse_kv_idx_offset_38 = arith.muli %start_m2_block, %ks4 : i32 loc(#loc391) + %sparse_kv_idx_offset_39 = arith.addi %sparse_kv_idx_offset, %sparse_kv_idx_offset_38 : i32 loc(#loc392) + %q_adj2 = arith.muli %off_hq2_36, %c128_i32 : i32 loc(#loc393) + %q_adj2_40 = arith.muli %0, %off_zq : i32 loc(#loc394) + %q_adj2_41 = arith.addi %q_adj2, %q_adj2_40 : i32 loc(#loc395) + %q_adj2_42 = arith.extsi %q_adj2_41 : i32 to i64 loc(#loc396) + %do_adj2 = arith.muli %10, %off_hq2_36 : i32 loc(#loc397) + %do_adj2_43 = arith.muli %9, %off_zq : i32 loc(#loc398) + %do_adj2_44 = arith.addi %do_adj2, %do_adj2_43 : i32 loc(#loc399) + %do_adj2_45 = arith.extsi %do_adj2_44 : i32 to i64 loc(#loc400) + %off_chz2 = arith.muli %off_zq, %c32_i32 : i32 loc(#loc401) + %off_chz2_46 = arith.addi %off_chz2, %off_hq2_36 : i32 loc(#loc402) + %off_chz2_47 = arith.muli %off_chz2_46, %ks0 : i32 loc(#loc403) + %off_chz2_48 = arith.extsi %off_chz2_47 : i32 to i64 loc(#loc404) + %Q2 = tt.addptr %arg_Q, %q_adj2_42 : !tt.ptr, i64 loc(#loc405) + %DO2 = tt.addptr %arg_DO, %do_adj2_45 : !tt.ptr, i64 loc(#loc406) + %DQ2 = tt.addptr %arg_DQ, %q_adj2_42 : !tt.ptr, i64 loc(#loc407) + %LSE2 = tt.addptr %arg_LSE, %off_chz2_48 : !tt.ptr, i64 loc(#loc408) + %DELTA2 = tt.addptr %arg_DELTA, %off_chz2_48 : !tt.ptr, i64 loc(#loc409) + %start_m2 = arith.muli %start_m2_block, %c128_i32 : i32 loc(#loc410) + %offs_m2 = tt.splat %start_m2 : i32 -> tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc411) + %offs_m2_49 = tt.splat %start_m2 : i32 -> tensor<128xi32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc411) + %offs_m2_50 = arith.addi %offs_m2, %offs_k : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc411) + %offs_m2_51 = arith.addi %offs_m2_49, %offs_k_34 : tensor<128xi32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc411) + %ptr = tt.expand_dims %offs_m2_50 {axis = 1 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> 
tensor<128x1xi32, #blocked> loc(#loc673) + %ptr_52 = tt.expand_dims %offs_m2_51 {axis = 1 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #mma}>> -> tensor<128x1xi32, #mma> loc(#loc673) + %ptr_53 = arith.muli %ptr, %cst_2 : tensor<128x1xi32, #blocked> loc(#loc674) + %ptr_54 = tt.splat %Q2 : !tt.ptr -> tensor<128x1x!tt.ptr, #blocked> loc(#loc675) + %ptr_55 = tt.addptr %ptr_54, %ptr_53 : tensor<128x1x!tt.ptr, #blocked>, tensor<128x1xi32, #blocked> loc(#loc675) + %ptr_56 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc676) + %ptr_57 = tt.expand_dims %ptr_56 {axis = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x128xi32, #blocked> loc(#loc676) + %ptr_58 = tt.broadcast %ptr_55 : tensor<128x1x!tt.ptr, #blocked> -> tensor<128x128x!tt.ptr, #blocked> loc(#loc677) + %ptr_59 = tt.broadcast %ptr_57 : tensor<1x128xi32, #blocked> -> tensor<128x128xi32, #blocked> loc(#loc677) + %ptr_60 = tt.addptr %ptr_58, %ptr_59 : tensor<128x128x!tt.ptr, #blocked>, tensor<128x128xi32, #blocked> loc(#loc677) + %q = tt.splat %ks0 : i32 -> tensor<128x1xi32, #blocked> loc(#loc678) + %q_61 = tt.splat %ks0 : i32 -> tensor<128x1xi32, #mma> loc(#loc678) + %q_62 = arith.cmpi slt, %ptr, %q : tensor<128x1xi32, #blocked> loc(#loc678) + %q_63 = tt.broadcast %q_62 : tensor<128x1xi1, #blocked> -> tensor<128x128xi1, #blocked> loc(#loc679) + %q_64 = tt.load %ptr_60, %q_63, %cst_3 : tensor<128x128x!tt.ptr, #blocked> loc(#loc679) + %q_65 = ttg.local_alloc %q_64 : (tensor<128x128xbf16, #blocked>) -> !ttg.memdesc<128x128xbf16, #shared, #smem> loc(#loc679) + %ptr_66 = arith.muli %ptr, %cst_0 : tensor<128x1xi32, #blocked> loc(#loc680) + %ptr_67 = tt.splat %DO2 : !tt.ptr -> tensor<128x1x!tt.ptr, #blocked> loc(#loc681) + %ptr_68 = tt.addptr %ptr_67, %ptr_66 : tensor<128x1x!tt.ptr, #blocked>, tensor<128x1xi32, #blocked> loc(#loc681) + %ptr_69 = tt.broadcast %ptr_68 : tensor<128x1x!tt.ptr, #blocked> -> 
tensor<128x128x!tt.ptr, #blocked> loc(#loc682) + %ptr_70 = tt.addptr %ptr_69, %ptr_59 : tensor<128x128x!tt.ptr, #blocked>, tensor<128x128xi32, #blocked> loc(#loc682) + %do = tt.load %ptr_70, %q_63, %cst_3 : tensor<128x128x!tt.ptr, #blocked> loc(#loc683) + %do_71 = ttg.local_alloc %do : (tensor<128x128xbf16, #blocked>) -> !ttg.memdesc<128x128xbf16, #shared, #smem> loc(#loc683) + %Di = tt.splat %ks0 : i32 -> tensor<128xi32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc419) + %Di_72 = arith.cmpi slt, %offs_m2_51, %Di : tensor<128xi32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc419) + %Di_73 = tt.splat %DELTA2 : !tt.ptr -> tensor<128x!tt.ptr, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc420) + %Di_74 = tt.addptr %Di_73, %offs_m2_51 : tensor<128x!tt.ptr, #ttg.slice<{dim = 1, parent = #mma}>>, tensor<128xi32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc420) + %Di_75 = tt.load %Di_74, %Di_72 : tensor<128x!tt.ptr, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc421) + %lse = tt.splat %LSE2 : !tt.ptr -> tensor<128x!tt.ptr, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc422) + %lse_76 = tt.addptr %lse, %offs_m2_51 : tensor<128x!tt.ptr, #ttg.slice<{dim = 1, parent = #mma}>>, tensor<128xi32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc422) + %lse_77 = tt.load %lse_76, %Di_72 : tensor<128x!tt.ptr, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc423) + %lse_78 = arith.cmpf oeq, %lse_77, %cst_26 : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc424) + %lse_79 = arith.select %lse_78, %cst_25, %lse_77 : tensor<128xi1, #ttg.slice<{dim = 1, parent = #mma}>>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc425) + %lse_80 = tt.expand_dims %lse_79 {axis = 1 : i32} : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> -> tensor<128x1xf32, #mma> loc(#loc426) + %kv_indices = tt.addptr %arg_KV_IDX, %sparse_kv_idx_offset_39 : !tt.ptr, i32 loc(#loc427) + %kv_start = tt.load %kv_indices : !tt.ptr loc(#loc428) + %kv_start_81 = arith.muli %kv_start, %c128_i32 : 
i32 loc(#loc429) + %sparse_kv_num_blocks = tt.addptr %arg_KV_NUM_BLKS, %sparse_kv_num_blks_offset_37 : !tt.ptr, i32 loc(#loc430) + %sparse_kv_num_blocks_82 = tt.load %sparse_kv_num_blocks : !tt.ptr loc(#loc431) + %offs_n2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc432) + %offs_n2_83 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc432) + %offs_n2_84 = tt.splat %kv_start_81 : i32 -> tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc433) + %offs_n2_85 = tt.splat %kv_start_81 : i32 -> tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc433) + %offs_n2_86 = arith.addi %offs_n2_84, %offs_n2 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc433) + %offs_n2_87 = arith.addi %offs_n2_85, %offs_n2_83 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc433) + %kT_ptrs = tt.expand_dims %offs_n2_87 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x64xi32, #blocked1> loc(#loc684) + %kT_ptrs_88 = arith.muli %kT_ptrs, %cst_20 : tensor<1x64xi32, #blocked1> loc(#loc685) + %kT_ptrs_89 = tt.splat %K : !tt.ptr -> tensor<1x64x!tt.ptr, #blocked1> loc(#loc686) + %kT_ptrs_90 = tt.addptr %kT_ptrs_89, %kT_ptrs_88 : tensor<1x64x!tt.ptr, #blocked1>, tensor<1x64xi32, #blocked1> loc(#loc686) + %kT_ptrs_91 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc687) + %kT_ptrs_92 = tt.expand_dims %kT_ptrs_91 {axis = 1 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<128x1xi32, #blocked1> loc(#loc687) + %kT_ptrs_93 = tt.broadcast %kT_ptrs_90 : tensor<1x64x!tt.ptr, #blocked1> -> tensor<128x64x!tt.ptr, #blocked1> loc(#loc688) + %kT_ptrs_94 = tt.broadcast %kT_ptrs_92 : tensor<128x1xi32, #blocked1> -> tensor<128x64xi32, #blocked1> loc(#loc688) + %kT_ptrs_95 = tt.addptr %kT_ptrs_93, 
%kT_ptrs_94 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> loc(#loc688) + %vT_ptrs = tt.splat %V : !tt.ptr -> tensor<1x64x!tt.ptr, #blocked1> loc(#loc689) + %vT_ptrs_96 = tt.addptr %vT_ptrs, %kT_ptrs_88 : tensor<1x64x!tt.ptr, #blocked1>, tensor<1x64xi32, #blocked1> loc(#loc689) + %vT_ptrs_97 = tt.broadcast %vT_ptrs_96 : tensor<1x64x!tt.ptr, #blocked1> -> tensor<128x64x!tt.ptr, #blocked1> loc(#loc690) + %vT_ptrs_98 = tt.addptr %vT_ptrs_97, %kT_ptrs_94 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> loc(#loc690) + %hi = arith.muli %sparse_kv_num_blocks_82, %c2_i32 : i32 loc(#loc691) + %hi_99 = arith.addi %ks1, %c63_i32 : i32 loc(#loc853) + %hi_100 = arith.divsi %hi_99, %c64_i32 : i32 loc(#loc854) + %hi_101 = arith.maxsi %hi_100, %c1_i32 : i32 loc(#loc693) + %hi_102 = arith.minsi %hi, %hi_101 : i32 loc(#loc694) + %kT = tt.splat %ks1 : i32 -> tensor<1x64xi32, #mma> loc(#loc1008) + %kT_103 = tt.splat %ks1 : i32 -> tensor<1x64xi32, #blocked1> loc(#loc1008) + %m = arith.remsi %ptr_52, %q_61 : tensor<128x1xi32, #mma> loc(#loc1009) + %tmp4 = tt.broadcast %m : tensor<128x1xi32, #mma> -> tensor<128x64xi32, #mma> loc(#loc857) + %tmp7 = tt.addptr %in_ptr16, %off_zq : !tt.ptr, i32 loc(#loc858) + %vT_ptrs_104 = arith.cmpi sgt, %hi_102, %c0_i32 : i32 loc(#loc1020) + %tmp7_105 = tt.load %tmp7, %vT_ptrs_104 : !tt.ptr loc(#loc860) + %tmp8 = tt.splat %tmp7_105 : i64 -> tensor<1x64xi64, #mma> loc(#loc861) + %tmp9 = arith.extsi %m : tensor<128x1xi32, #mma> to tensor<128x1xi64, #mma> loc(#loc862) + %tmp10 = tt.splat %tmp7_105 : i64 -> tensor<128x1xi64, #mma> loc(#loc863) + %tmp10_106 = arith.cmpi slt, %tmp9, %tmp10 : tensor<128x1xi64, #mma> loc(#loc863) + %tmp11 = tt.broadcast %tmp10_106 : tensor<128x1xi1, #mma> -> tensor<128x64xi1, #mma> loc(#loc864) + %tmp15 = tt.splat %ks8 : i32 -> tensor<1x64xi32, #mma> loc(#loc865) + %tmp20 = arith.cmpi slt, %ks8, %c0_i32 : i32 loc(#loc866) + %tmp21 = tt.splat %tmp20 : i1 -> tensor<1x64xi1, #mma> loc(#loc867) + 
%tmp29 = tt.splat %ks8 : i32 -> tensor<128x64xi32, #mma> loc(#loc868) + %tmp32 = tt.splat %tmp20 : i1 -> tensor<128x64xi1, #mma> loc(#loc869) + %p = tt.broadcast %lse_80 : tensor<128x1xf32, #mma> -> tensor<128x64xf32, #mma> loc(#loc870) + %ds = tt.expand_dims %Di_75 {axis = 1 : i32} : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> -> tensor<128x1xf32, #mma> loc(#loc871) + %ds_107 = tt.broadcast %ds : tensor<128x1xf32, #mma> -> tensor<128x64xf32, #mma> loc(#loc872) + %kT_108 = ttg.local_alloc : () -> !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> loc(#loc1011) + %vT = ttg.local_alloc : () -> !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> loc(#loc1012) + %kT_109 = arith.cmpi slt, %kT_ptrs, %kT_103 : tensor<1x64xi32, #blocked1> loc(#loc1008) + %kT_110 = tt.broadcast %kT_109 : tensor<1x64xi1, #blocked1> -> tensor<128x64xi1, #blocked1> loc(#loc1011) + %kT_111 = ttg.memdesc_index %kT_108[%c0_i32] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc1011) + %vT_ptrs_112 = tt.splat %vT_ptrs_104 : i1 -> tensor<128x64xi1, #blocked1> loc(#loc1020) + %vT_ptrs_113 = arith.andi %vT_ptrs_112, %kT_110 : tensor<128x64xi1, #blocked1> loc(#loc1020) + %kT_114 = ttg.async_copy_global_to_local %kT_ptrs_95, %kT_111 mask %vT_ptrs_113 other %cst_4 : tensor<128x64x!tt.ptr, #blocked1> -> <128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc1011) + %kT_115 = ttg.async_commit_group tokens %kT_114 loc(#loc1011) + %vT_116 = ttg.memdesc_index %vT[%c0_i32] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc1012) + %vT_117 = ttg.async_copy_global_to_local %vT_ptrs_98, %vT_116 mask %vT_ptrs_113 other %cst_4 : tensor<128x64x!tt.ptr, #blocked1> -> <128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc1012) + %vT_118 = ttg.async_commit_group tokens %vT_117 loc(#loc1012) + %vT_ptrs_119 = arith.cmpi sgt, %hi_102, %c1_i32 : 
i32 loc(#loc1020) + %kT_ptrs_120 = tt.addptr %kT_ptrs_95, %cst_12 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> loc(#loc697) + %vT_ptrs_121 = tt.addptr %vT_ptrs_98, %cst_12 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> loc(#loc698) + %offs_n2_122 = arith.addi %offs_n2_87, %cst_13 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc699) + %kT_123 = tt.expand_dims %offs_n2_122 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x64xi32, #blocked1> loc(#loc1013) + %kT_124 = arith.cmpi slt, %kT_123, %kT_103 : tensor<1x64xi32, #blocked1> loc(#loc1008) + %kT_125 = tt.broadcast %kT_124 : tensor<1x64xi1, #blocked1> -> tensor<128x64xi1, #blocked1> loc(#loc1011) + %kT_126 = ttg.memdesc_index %kT_108[%c1_i32] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc1011) + %vT_ptrs_127 = tt.splat %vT_ptrs_119 : i1 -> tensor<128x64xi1, #blocked1> loc(#loc1020) + %vT_ptrs_128 = arith.andi %vT_ptrs_127, %kT_125 : tensor<128x64xi1, #blocked1> loc(#loc1020) + %kT_129 = ttg.async_copy_global_to_local %kT_ptrs_120, %kT_126 mask %vT_ptrs_128 other %cst_4 : tensor<128x64x!tt.ptr, #blocked1> -> <128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc1011) + %kT_130 = ttg.async_commit_group tokens %kT_129 loc(#loc1011) + %vT_131 = ttg.memdesc_index %vT[%c1_i32] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc1012) + %vT_132 = ttg.async_copy_global_to_local %vT_ptrs_121, %vT_131 mask %vT_ptrs_128 other %cst_4 : tensor<128x64x!tt.ptr, #blocked1> -> <128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc1012) + %vT_133 = ttg.async_commit_group tokens %vT_132 loc(#loc1012) + ttng.fence_async_shared {bCluster = false} loc(#loc874) + %vT_ptrs_134:12 = scf.for %vT_ptrs_190 = %c0_i32 to %hi_102 step %c1_i32 iter_args(%arg28 = %cst_7, %kT_ptrs_191 
= %kT_ptrs_120, %offs_n2_192 = %offs_n2_122, %vT_ptrs_193 = %vT_ptrs_121, %offs_n2_194 = %offs_n2_86, %arg33 = %c1_i32, %arg34 = %c-1_i32, %kT_195 = %kT_115, %kT_196 = %kT_130, %vT_197 = %vT_118, %vT_198 = %vT_133, %arg39 = %c64_i32) -> (tensor<128x128xf32, #mma1>, tensor<128x64x!tt.ptr, #blocked1>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>>, tensor<128x64x!tt.ptr, #blocked1>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>>, i32, i32, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, i32) : i32 { + %vT_ptrs_199 = arith.subi %hi_102, %c2_i32 : i32 loc(#loc1020) + %vT_ptrs_200 = arith.cmpi slt, %vT_ptrs_190, %vT_ptrs_199 : i32 loc(#loc1020) + %vT_ptrs_201 = arith.subi %hi_102, %c1_i32 : i32 loc(#loc1020) + %vT_ptrs_202 = arith.cmpi slt, %vT_ptrs_190, %vT_ptrs_201 : i32 loc(#loc1020) + %vT_ptrs_203 = arith.addi %arg34, %c1_i32 : i32 loc(#loc1020) + %vT_ptrs_204 = arith.cmpi sge, %vT_ptrs_203, %c3_i32 : i32 loc(#loc1020) + %vT_ptrs_205 = arith.select %vT_ptrs_204, %c0_i32, %vT_ptrs_203 : i32 loc(#loc1020) + %kT_206 = tt.expand_dims %offs_n2_194 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> -> tensor<1x64xi32, #mma> loc(#loc1013) + %kT_207 = arith.cmpi slt, %kT_206, %kT : tensor<1x64xi32, #mma> loc(#loc1008) + %kT_208 = tt.broadcast %kT_207 : tensor<1x64xi1, #mma> -> tensor<128x64xi1, #mma> loc(#loc1011) + %kT_209 = ttg.async_wait %kT_195, %vT_197 {num = 2 : i32} loc(#loc1011) + %kT_210 = ttg.memdesc_index %kT_108[%vT_ptrs_205] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc1011) + %dq_211 = ttg.memdesc_trans %kT_210 {order = array} : !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc875) + %qk = ttng.warp_group_dot %q_65, %kT_210, %cst_8 {inputPrecision = 0 : i32, isAsync = true} : !ttg.memdesc<128x128xbf16, #shared, #smem> * 
!ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> -> tensor<128x64xf32, #mma> loc(#loc874) + %qk_212:3 = ttng.warp_group_dot_wait %qk, %q_65, %kT_210 {pendings = 0 : i32} : tensor<128x64xf32, #mma>, !ttg.memdesc<128x128xbf16, #shared, #smem>, !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc874) + %qk_213 = arith.mulf %qk_212#0, %cst_9 : tensor<128x64xf32, #mma> loc(#loc876) + %n = arith.remsi %kT_206, %kT : tensor<1x64xi32, #mma> loc(#loc1014) + %post_mod_scores = arith.select %kT_208, %qk_213, %cst_10 : tensor<128x64xi1, #mma>, tensor<128x64xf32, #mma> loc(#loc878) + %tmp4_214 = tt.broadcast %n : tensor<1x64xi32, #mma> -> tensor<128x64xi32, #mma> loc(#loc857) + %tmp4_215 = arith.cmpi sge, %tmp4, %tmp4_214 : tensor<128x64xi32, #mma> loc(#loc857) + %tmp5 = arith.extsi %n : tensor<1x64xi32, #mma> to tensor<1x64xi64, #mma> loc(#loc879) + %tmp8_216 = arith.cmpi slt, %tmp5, %tmp8 : tensor<1x64xi64, #mma> loc(#loc861) + %tmp11_217 = tt.broadcast %tmp8_216 : tensor<1x64xi1, #mma> -> tensor<128x64xi1, #mma> loc(#loc864) + %tmp11_218 = arith.andi %tmp11_217, %tmp11 : tensor<128x64xi1, #mma> loc(#loc864) + %tmp12 = arith.andi %tmp4_215, %tmp11_218 : tensor<128x64xi1, #mma> loc(#loc880) + %tmp15_219 = arith.cmpi sge, %n, %tmp15 : tensor<1x64xi32, #mma> loc(#loc865) + %tmp16 = arith.remsi %n, %tmp15 : tensor<1x64xi32, #mma> loc(#loc881) + %tmp18 = arith.cmpi ne, %tmp16, %cst_24 : tensor<1x64xi32, #mma> loc(#loc882) + %tmp19 = arith.cmpi slt, %tmp16, %cst_24 : tensor<1x64xi32, #mma> loc(#loc883) + %tmp21_220 = arith.cmpi ne, %tmp19, %tmp21 : tensor<1x64xi1, #mma> loc(#loc867) + %tmp22 = arith.andi %tmp18, %tmp21_220 : tensor<1x64xi1, #mma> loc(#loc884) + %tmp23 = arith.addi %tmp16, %tmp15 : tensor<1x64xi32, #mma> loc(#loc885) + %tmp24 = arith.select %tmp22, %tmp23, %tmp16 : tensor<1x64xi1, #mma>, tensor<1x64xi32, #mma> loc(#loc886) + %tmp25 = arith.extsi %tmp24 : tensor<1x64xi32, #mma> to tensor<1x64xi64, #mma> loc(#loc887) + %tmp26 = 
arith.cmpi slt, %tmp25, %tmp8 : tensor<1x64xi64, #mma> loc(#loc888) + %tmp27 = arith.andi %tmp15_219, %tmp26 : tensor<1x64xi1, #mma> loc(#loc889) + %tmp28 = arith.subi %tmp4_214, %tmp4 : tensor<128x64xi32, #mma> loc(#loc890) + %tmp29_221 = arith.remsi %tmp28, %tmp29 : tensor<128x64xi32, #mma> loc(#loc868) + %tmp30 = arith.cmpi ne, %tmp29_221, %cst_23 : tensor<128x64xi32, #mma> loc(#loc891) + %tmp31 = arith.cmpi slt, %tmp29_221, %cst_23 : tensor<128x64xi32, #mma> loc(#loc892) + %tmp32_222 = arith.cmpi ne, %tmp31, %tmp32 : tensor<128x64xi1, #mma> loc(#loc869) + %tmp33 = arith.andi %tmp30, %tmp32_222 : tensor<128x64xi1, #mma> loc(#loc893) + %tmp34 = arith.addi %tmp29_221, %tmp29 : tensor<128x64xi32, #mma> loc(#loc894) + %tmp35 = arith.select %tmp33, %tmp34, %tmp29_221 : tensor<128x64xi1, #mma>, tensor<128x64xi32, #mma> loc(#loc895) + %tmp36 = arith.cmpi eq, %tmp35, %cst_23 : tensor<128x64xi32, #mma> loc(#loc896) + %tmp37 = tt.broadcast %tmp27 : tensor<1x64xi1, #mma> -> tensor<128x64xi1, #mma> loc(#loc897) + %tmp37_223 = arith.andi %tmp37, %tmp36 : tensor<128x64xi1, #mma> loc(#loc897) + %tmp38 = arith.ori %tmp12, %tmp37_223 : tensor<128x64xi1, #mma> loc(#loc898) + %post_mod_scores_224 = arith.select %tmp38, %post_mod_scores, %cst_10 : tensor<128x64xi1, #mma>, tensor<128x64xf32, #mma> loc(#loc899) + %post_mod_scores_225 = arith.mulf %post_mod_scores_224, %cst_11 : tensor<128x64xf32, #mma> loc(#loc900) + %p_226 = arith.subf %post_mod_scores_225, %p : tensor<128x64xf32, #mma> loc(#loc870) + %p_227 = math.exp2 %p_226 : tensor<128x64xf32, #mma> loc(#loc901) + %vT_228 = ttg.memdesc_index %vT[%vT_ptrs_205] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc1012) + %dp = ttng.warp_group_dot %do_71, %vT_228, %cst_8 {inputPrecision = 0 : i32, isAsync = true} : !ttg.memdesc<128x128xbf16, #shared, #smem> * !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> -> tensor<128x64xf32, #mma> 
loc(#loc902) + %dp_229:3 = ttng.warp_group_dot_wait %dp, %do_71, %vT_228 {pendings = 0 : i32} : tensor<128x64xf32, #mma>, !ttg.memdesc<128x128xbf16, #shared, #smem>, !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc902) + %ds_230 = arith.subf %dp_229#0, %ds_107 : tensor<128x64xf32, #mma> loc(#loc872) + %ds_231 = arith.mulf %p_227, %ds_230 : tensor<128x64xf32, #mma> loc(#loc903) + %grad_scores = arith.select %kT_208, %ds_231, %cst_8 : tensor<128x64xi1, #mma>, tensor<128x64xf32, #mma> loc(#loc904) + %ds_232 = arith.select %tmp38, %grad_scores, %cst_8 : tensor<128x64xi1, #mma>, tensor<128x64xf32, #mma> loc(#loc905) + %ds_233 = arith.truncf %ds_232 : tensor<128x64xf32, #mma> to tensor<128x64xbf16, #mma> loc(#loc906) + %ds_234 = ttg.convert_layout %ds_233 : tensor<128x64xbf16, #mma> -> tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 2}>> loc(#loc906) + %dq_235 = ttng.warp_group_dot %ds_234, %dq_211, %arg28 {inputPrecision = 0 : i32, isAsync = true} : tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 2}>> * !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> -> tensor<128x128xf32, #mma1> loc(#loc907) + %offs_n2_236 = tt.splat %arg39 : i32 -> tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc699) + %offs_n2_237 = arith.addi %offs_n2_194, %offs_n2_236 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc699) + %vT_ptrs_238 = arith.addi %vT_ptrs_190, %c1_i32 : i32 loc(#loc1020) + %cur_block_idx = arith.divsi %vT_ptrs_238, %c2_i32 : i32 loc(#loc908) + %cur_block = tt.addptr %kv_indices, %cur_block_idx : !tt.ptr, i32 loc(#loc909) + %cur_block_239 = tt.load %cur_block, %vT_ptrs_202 evictionPolicy = evict_last : !tt.ptr loc(#loc910) + %next_block = arith.addi %cur_block_idx, %c1_i32 : i32 loc(#loc911) + %next_block_240 = arith.cmpi slt, %next_block, %sparse_kv_num_blocks_82 : i32 loc(#loc912) + %next_block_241 = tt.addptr %cur_block, %c1_i32 : !tt.ptr, i32 loc(#loc913) + %vT_ptrs_242 = 
arith.andi %vT_ptrs_202, %next_block_240 : i1 loc(#loc1020) + %next_block_243 = tt.load %next_block_241, %vT_ptrs_242 evictionPolicy = evict_last : !tt.ptr loc(#loc914) + %needs_jump = arith.addi %vT_ptrs_190, %c2_i32 : i32 loc(#loc915) + %needs_jump_244 = arith.remsi %needs_jump, %c2_i32 : i32 loc(#loc916) + %needs_jump_245 = arith.cmpi eq, %needs_jump_244, %c0_i32 : i32 loc(#loc917) + %jump_to_block = arith.subi %next_block_243, %cur_block_239 : i32 loc(#loc918) + %jump_to_block_246 = arith.muli %jump_to_block, %c128_i32 : i32 loc(#loc919) + %jump_to_block_247 = arith.subi %jump_to_block_246, %c64_i32 : i32 loc(#loc920) + %offset = arith.extui %needs_jump_245 : i1 to i32 loc(#loc921) + %offset_248 = arith.muli %jump_to_block_247, %offset : i32 loc(#loc921) + %offset_249 = arith.subi %c1_i32, %offset : i32 loc(#loc922) + %offset_250 = arith.muli %offset_249, %c64_i32 : i32 loc(#loc923) + %offset_251 = arith.addi %offset_248, %offset_250 : i32 loc(#loc924) + %kT_ptrs_252 = arith.muli %offset_251, %c128_i32 : i32 loc(#loc701) + %kT_ptrs_253 = tt.splat %kT_ptrs_252 : i32 -> tensor<128x64xi32, #blocked1> loc(#loc697) + %kT_ptrs_254 = tt.addptr %kT_ptrs_191, %kT_ptrs_253 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> loc(#loc697) + %vT_ptrs_255 = tt.addptr %vT_ptrs_193, %kT_ptrs_253 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> loc(#loc698) + %offs_n2_256 = tt.splat %offset_251 : i32 -> tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc699) + %offs_n2_257 = arith.addi %offs_n2_192, %offs_n2_256 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc699) + %vT_ptrs_258 = arith.addi %arg33, %c1_i32 : i32 loc(#loc1020) + %vT_ptrs_259 = arith.cmpi sge, %vT_ptrs_258, %c3_i32 : i32 loc(#loc1020) + %vT_ptrs_260 = arith.select %vT_ptrs_259, %c0_i32, %vT_ptrs_258 : i32 loc(#loc1020) + %kT_261 = tt.expand_dims %offs_n2_257 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> 
tensor<1x64xi32, #blocked1> loc(#loc1013) + %kT_262 = arith.cmpi slt, %kT_261, %kT_103 : tensor<1x64xi32, #blocked1> loc(#loc1008) + %kT_263 = tt.broadcast %kT_262 : tensor<1x64xi1, #blocked1> -> tensor<128x64xi1, #blocked1> loc(#loc1011) + %kT_264 = ttg.memdesc_index %kT_108[%vT_ptrs_260] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc1011) + %vT_ptrs_265 = tt.splat %vT_ptrs_200 : i1 -> tensor<128x64xi1, #blocked1> loc(#loc1020) + %vT_ptrs_266 = arith.andi %vT_ptrs_265, %kT_263 : tensor<128x64xi1, #blocked1> loc(#loc1020) + %kT_267 = ttg.async_copy_global_to_local %kT_ptrs_254, %kT_264 mask %vT_ptrs_266 other %cst_4 : tensor<128x64x!tt.ptr, #blocked1> -> <128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc1011) + %kT_268 = ttg.async_commit_group tokens %kT_267 loc(#loc1011) + %vT_269 = ttg.memdesc_index %vT[%vT_ptrs_260] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc1012) + %vT_270 = ttg.async_copy_global_to_local %vT_ptrs_255, %vT_269 mask %vT_ptrs_266 other %cst_4 : tensor<128x64x!tt.ptr, #blocked1> -> <128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc1012) + %vT_271 = ttg.async_commit_group tokens %vT_270 loc(#loc1012) + scf.yield %dq_235, %kT_ptrs_254, %offs_n2_257, %vT_ptrs_255, %offs_n2_237, %vT_ptrs_260, %vT_ptrs_205, %kT_196, %kT_268, %vT_198, %vT_271, %offset_251 : tensor<128x128xf32, #mma1>, tensor<128x64x!tt.ptr, #blocked1>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>>, tensor<128x64x!tt.ptr, #blocked1>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>>, i32, i32, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, i32 loc(#loc1020) + } loc(#loc1020) + %vT_ptrs_135 = ttng.warp_group_dot_wait %vT_ptrs_134#0 {pendings = 0 : i32} : tensor<128x128xf32, #mma1> loc(#loc1020) + %vT_ptrs_136 = ttg.async_wait {num = 0 : i32} loc(#loc1020) + 
ttg.local_dealloc %vT : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> loc(#loc1020) + ttg.local_dealloc %kT_108 : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> loc(#loc1020) + %kv_indices_137 = tt.addptr %arg_FULL_KV_IDX, %sparse_kv_idx_offset_39 : !tt.ptr, i32 loc(#loc522) + %kv_start_138 = tt.load %kv_indices_137 : !tt.ptr loc(#loc523) + %kv_start_139 = arith.muli %kv_start_138, %c128_i32 : i32 loc(#loc524) + %sparse_kv_num_blocks_140 = tt.addptr %arg_FULL_KV_NUM_BLKS, %sparse_kv_num_blks_offset_37 : !tt.ptr, i32 loc(#loc525) + %sparse_kv_num_blocks_141 = tt.load %sparse_kv_num_blocks_140 : !tt.ptr loc(#loc526) + %offs_n2_142 = tt.splat %kv_start_139 : i32 -> tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc527) + %offs_n2_143 = tt.splat %kv_start_139 : i32 -> tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc527) + %offs_n2_144 = arith.addi %offs_n2_142, %offs_n2 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc527) + %offs_n2_145 = arith.addi %offs_n2_143, %offs_n2_83 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc527) + %kT_ptrs_146 = tt.expand_dims %offs_n2_145 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x64xi32, #blocked1> loc(#loc702) + %kT_ptrs_147 = arith.muli %kT_ptrs_146, %cst_20 : tensor<1x64xi32, #blocked1> loc(#loc703) + %kT_ptrs_148 = tt.addptr %kT_ptrs_89, %kT_ptrs_147 : tensor<1x64x!tt.ptr, #blocked1>, tensor<1x64xi32, #blocked1> loc(#loc704) + %kT_ptrs_149 = tt.broadcast %kT_ptrs_148 : tensor<1x64x!tt.ptr, #blocked1> -> tensor<128x64x!tt.ptr, #blocked1> loc(#loc705) + %kT_ptrs_150 = tt.addptr %kT_ptrs_149, %kT_ptrs_94 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> loc(#loc705) + %vT_ptrs_151 = tt.addptr %vT_ptrs, %kT_ptrs_147 : tensor<1x64x!tt.ptr, #blocked1>, tensor<1x64xi32, #blocked1> loc(#loc706) + %vT_ptrs_152 = tt.broadcast %vT_ptrs_151 : tensor<1x64x!tt.ptr, #blocked1> -> tensor<128x64x!tt.ptr, 
#blocked1> loc(#loc707) + %vT_ptrs_153 = tt.addptr %vT_ptrs_152, %kT_ptrs_94 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> loc(#loc707) + %hi_154 = arith.muli %sparse_kv_num_blocks_141, %c2_i32 : i32 loc(#loc708) + %hi_155 = arith.minsi %hi_154, %hi_101 : i32 loc(#loc709) + %kT_156 = ttg.local_alloc : () -> !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> loc(#loc1015) + %vT_157 = ttg.local_alloc : () -> !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> loc(#loc1016) + %vT_ptrs_158 = arith.cmpi sgt, %hi_155, %c0_i32 : i32 loc(#loc1021) + %kT_159 = arith.cmpi slt, %kT_ptrs_146, %kT_103 : tensor<1x64xi32, #blocked1> loc(#loc1017) + %kT_160 = tt.broadcast %kT_159 : tensor<1x64xi1, #blocked1> -> tensor<128x64xi1, #blocked1> loc(#loc1015) + %kT_161 = ttg.memdesc_index %kT_156[%c0_i32] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc1015) + %vT_ptrs_162 = tt.splat %vT_ptrs_158 : i1 -> tensor<128x64xi1, #blocked1> loc(#loc1021) + %vT_ptrs_163 = arith.andi %vT_ptrs_162, %kT_160 : tensor<128x64xi1, #blocked1> loc(#loc1021) + %kT_164 = ttg.async_copy_global_to_local %kT_ptrs_150, %kT_161 mask %vT_ptrs_163 other %cst_4 : tensor<128x64x!tt.ptr, #blocked1> -> <128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc1015) + %kT_165 = ttg.async_commit_group tokens %kT_164 loc(#loc1015) + %vT_166 = ttg.memdesc_index %vT_157[%c0_i32] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc1016) + %vT_167 = ttg.async_copy_global_to_local %vT_ptrs_153, %vT_166 mask %vT_ptrs_163 other %cst_4 : tensor<128x64x!tt.ptr, #blocked1> -> <128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc1016) + %vT_168 = ttg.async_commit_group tokens %vT_167 loc(#loc1016) + %vT_ptrs_169 = arith.cmpi sgt, %hi_155, %c1_i32 : i32 loc(#loc1021) + %kT_ptrs_170 = tt.addptr %kT_ptrs_150, %cst_12 : tensor<128x64x!tt.ptr, 
#blocked1>, tensor<128x64xi32, #blocked1> loc(#loc711) + %vT_ptrs_171 = tt.addptr %vT_ptrs_153, %cst_12 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> loc(#loc712) + %offs_n2_172 = arith.addi %offs_n2_145, %cst_13 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc713) + %kT_173 = tt.expand_dims %offs_n2_172 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x64xi32, #blocked1> loc(#loc1018) + %kT_174 = arith.cmpi slt, %kT_173, %kT_103 : tensor<1x64xi32, #blocked1> loc(#loc1017) + %kT_175 = tt.broadcast %kT_174 : tensor<1x64xi1, #blocked1> -> tensor<128x64xi1, #blocked1> loc(#loc1015) + %kT_176 = ttg.memdesc_index %kT_156[%c1_i32] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc1015) + %vT_ptrs_177 = tt.splat %vT_ptrs_169 : i1 -> tensor<128x64xi1, #blocked1> loc(#loc1021) + %vT_ptrs_178 = arith.andi %vT_ptrs_177, %kT_175 : tensor<128x64xi1, #blocked1> loc(#loc1021) + %kT_179 = ttg.async_copy_global_to_local %kT_ptrs_170, %kT_176 mask %vT_ptrs_178 other %cst_4 : tensor<128x64x!tt.ptr, #blocked1> -> <128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc1015) + %kT_180 = ttg.async_commit_group tokens %kT_179 loc(#loc1015) + %vT_181 = ttg.memdesc_index %vT_157[%c1_i32] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc1016) + %vT_182 = ttg.async_copy_global_to_local %vT_ptrs_171, %vT_181 mask %vT_ptrs_178 other %cst_4 : tensor<128x64x!tt.ptr, #blocked1> -> <128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc1016) + %vT_183 = ttg.async_commit_group tokens %vT_182 loc(#loc1016) + ttng.fence_async_shared {bCluster = false} loc(#loc927) + %vT_ptrs_184:12 = scf.for %vT_ptrs_190 = %c0_i32 to %hi_155 step %c1_i32 iter_args(%vT_ptrs_191 = %vT_ptrs_135, %kT_ptrs_192 = %kT_ptrs_170, %offs_n2_193 = %offs_n2_172, %vT_ptrs_194 = %vT_ptrs_171, 
%offs_n2_195 = %offs_n2_144, %arg33 = %c1_i32, %arg34 = %c-1_i32, %kT_196 = %kT_165, %kT_197 = %kT_180, %vT_198 = %vT_168, %vT_199 = %vT_183, %arg39 = %c64_i32) -> (tensor<128x128xf32, #mma1>, tensor<128x64x!tt.ptr, #blocked1>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>>, tensor<128x64x!tt.ptr, #blocked1>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>>, i32, i32, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, i32) : i32 { + %vT_ptrs_200 = arith.subi %hi_155, %c2_i32 : i32 loc(#loc1021) + %vT_ptrs_201 = arith.cmpi slt, %vT_ptrs_190, %vT_ptrs_200 : i32 loc(#loc1021) + %vT_ptrs_202 = arith.subi %hi_155, %c1_i32 : i32 loc(#loc1021) + %vT_ptrs_203 = arith.cmpi slt, %vT_ptrs_190, %vT_ptrs_202 : i32 loc(#loc1021) + %vT_ptrs_204 = arith.addi %arg34, %c1_i32 : i32 loc(#loc1021) + %vT_ptrs_205 = arith.cmpi sge, %vT_ptrs_204, %c3_i32 : i32 loc(#loc1021) + %vT_ptrs_206 = arith.select %vT_ptrs_205, %c0_i32, %vT_ptrs_204 : i32 loc(#loc1021) + %kT_207 = tt.expand_dims %offs_n2_195 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> -> tensor<1x64xi32, #mma> loc(#loc1018) + %kT_208 = arith.cmpi slt, %kT_207, %kT : tensor<1x64xi32, #mma> loc(#loc1017) + %kT_209 = tt.broadcast %kT_208 : tensor<1x64xi1, #mma> -> tensor<128x64xi1, #mma> loc(#loc1015) + %kT_210 = ttg.async_wait %kT_196, %vT_198 {num = 2 : i32} loc(#loc1015) + %kT_211 = ttg.memdesc_index %kT_156[%vT_ptrs_206] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc1015) + %dq_212 = ttg.memdesc_trans %kT_211 {order = array} : !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc928) + %qk = ttng.warp_group_dot %q_65, %kT_211, %cst_8 {inputPrecision = 0 : i32, isAsync = true} : !ttg.memdesc<128x128xbf16, #shared, #smem> * !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> -> tensor<128x64xf32, 
#mma> loc(#loc927) + %qk_213:3 = ttng.warp_group_dot_wait %qk, %q_65, %kT_211 {pendings = 0 : i32} : tensor<128x64xf32, #mma>, !ttg.memdesc<128x128xbf16, #shared, #smem>, !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc927) + %qk_214 = arith.mulf %qk_213#0, %cst_9 : tensor<128x64xf32, #mma> loc(#loc929) + %post_mod_scores = arith.select %kT_209, %qk_214, %cst_10 : tensor<128x64xi1, #mma>, tensor<128x64xf32, #mma> loc(#loc930) + %post_mod_scores_215 = arith.mulf %post_mod_scores, %cst_11 : tensor<128x64xf32, #mma> loc(#loc931) + %p_216 = arith.subf %post_mod_scores_215, %p : tensor<128x64xf32, #mma> loc(#loc932) + %p_217 = math.exp2 %p_216 : tensor<128x64xf32, #mma> loc(#loc933) + %vT_218 = ttg.memdesc_index %vT_157[%vT_ptrs_206] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc1016) + %dp = ttng.warp_group_dot %do_71, %vT_218, %cst_8 {inputPrecision = 0 : i32, isAsync = true} : !ttg.memdesc<128x128xbf16, #shared, #smem> * !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> -> tensor<128x64xf32, #mma> loc(#loc934) + %dp_219:3 = ttng.warp_group_dot_wait %dp, %do_71, %vT_218 {pendings = 0 : i32} : tensor<128x64xf32, #mma>, !ttg.memdesc<128x128xbf16, #shared, #smem>, !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc934) + %ds_220 = arith.subf %dp_219#0, %ds_107 : tensor<128x64xf32, #mma> loc(#loc935) + %ds_221 = arith.mulf %p_217, %ds_220 : tensor<128x64xf32, #mma> loc(#loc936) + %grad_scores = arith.select %kT_209, %ds_221, %cst_8 : tensor<128x64xi1, #mma>, tensor<128x64xf32, #mma> loc(#loc937) + %ds_222 = arith.truncf %grad_scores : tensor<128x64xf32, #mma> to tensor<128x64xbf16, #mma> loc(#loc938) + %ds_223 = ttg.convert_layout %ds_222 : tensor<128x64xbf16, #mma> -> tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 2}>> loc(#loc938) + %dq_224 = ttng.warp_group_dot %ds_223, %dq_212, %vT_ptrs_191 {inputPrecision = 0 : 
i32, isAsync = true} : tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 2}>> * !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> -> tensor<128x128xf32, #mma1> loc(#loc939) + %offs_n2_225 = tt.splat %arg39 : i32 -> tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc713) + %offs_n2_226 = arith.addi %offs_n2_195, %offs_n2_225 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc713) + %vT_ptrs_227 = arith.addi %vT_ptrs_190, %c1_i32 : i32 loc(#loc1021) + %cur_block_idx = arith.divsi %vT_ptrs_227, %c2_i32 : i32 loc(#loc940) + %cur_block = tt.addptr %kv_indices_137, %cur_block_idx : !tt.ptr, i32 loc(#loc941) + %cur_block_228 = tt.load %cur_block, %vT_ptrs_203 evictionPolicy = evict_last : !tt.ptr loc(#loc942) + %next_block = arith.addi %cur_block_idx, %c1_i32 : i32 loc(#loc943) + %next_block_229 = arith.cmpi slt, %next_block, %sparse_kv_num_blocks_141 : i32 loc(#loc944) + %next_block_230 = tt.addptr %cur_block, %c1_i32 : !tt.ptr, i32 loc(#loc945) + %vT_ptrs_231 = arith.andi %vT_ptrs_203, %next_block_229 : i1 loc(#loc1021) + %next_block_232 = tt.load %next_block_230, %vT_ptrs_231 evictionPolicy = evict_last : !tt.ptr loc(#loc946) + %needs_jump = arith.addi %vT_ptrs_190, %c2_i32 : i32 loc(#loc947) + %needs_jump_233 = arith.remsi %needs_jump, %c2_i32 : i32 loc(#loc948) + %needs_jump_234 = arith.cmpi eq, %needs_jump_233, %c0_i32 : i32 loc(#loc949) + %jump_to_block = arith.subi %next_block_232, %cur_block_228 : i32 loc(#loc950) + %jump_to_block_235 = arith.muli %jump_to_block, %c128_i32 : i32 loc(#loc951) + %jump_to_block_236 = arith.subi %jump_to_block_235, %c64_i32 : i32 loc(#loc952) + %offset = arith.extui %needs_jump_234 : i1 to i32 loc(#loc953) + %offset_237 = arith.muli %jump_to_block_236, %offset : i32 loc(#loc953) + %offset_238 = arith.subi %c1_i32, %offset : i32 loc(#loc954) + %offset_239 = arith.muli %offset_238, %c64_i32 : i32 loc(#loc955) + %offset_240 = arith.addi %offset_237, %offset_239 : i32 loc(#loc956) + 
%kT_ptrs_241 = arith.muli %offset_240, %c128_i32 : i32 loc(#loc715) + %kT_ptrs_242 = tt.splat %kT_ptrs_241 : i32 -> tensor<128x64xi32, #blocked1> loc(#loc711) + %kT_ptrs_243 = tt.addptr %kT_ptrs_192, %kT_ptrs_242 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> loc(#loc711) + %vT_ptrs_244 = tt.addptr %vT_ptrs_194, %kT_ptrs_242 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> loc(#loc712) + %offs_n2_245 = tt.splat %offset_240 : i32 -> tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc713) + %offs_n2_246 = arith.addi %offs_n2_193, %offs_n2_245 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc713) + %vT_ptrs_247 = arith.addi %arg33, %c1_i32 : i32 loc(#loc1021) + %vT_ptrs_248 = arith.cmpi sge, %vT_ptrs_247, %c3_i32 : i32 loc(#loc1021) + %vT_ptrs_249 = arith.select %vT_ptrs_248, %c0_i32, %vT_ptrs_247 : i32 loc(#loc1021) + %kT_250 = tt.expand_dims %offs_n2_246 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x64xi32, #blocked1> loc(#loc1018) + %kT_251 = arith.cmpi slt, %kT_250, %kT_103 : tensor<1x64xi32, #blocked1> loc(#loc1017) + %kT_252 = tt.broadcast %kT_251 : tensor<1x64xi1, #blocked1> -> tensor<128x64xi1, #blocked1> loc(#loc1015) + %kT_253 = ttg.memdesc_index %kT_156[%vT_ptrs_249] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc1015) + %vT_ptrs_254 = tt.splat %vT_ptrs_201 : i1 -> tensor<128x64xi1, #blocked1> loc(#loc1021) + %vT_ptrs_255 = arith.andi %vT_ptrs_254, %kT_252 : tensor<128x64xi1, #blocked1> loc(#loc1021) + %kT_256 = ttg.async_copy_global_to_local %kT_ptrs_243, %kT_253 mask %vT_ptrs_255 other %cst_4 : tensor<128x64x!tt.ptr, #blocked1> -> <128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc1015) + %kT_257 = ttg.async_commit_group tokens %kT_256 loc(#loc1015) + %vT_258 = ttg.memdesc_index %vT_157[%vT_ptrs_249] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> 
-> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc1016) + %vT_259 = ttg.async_copy_global_to_local %vT_ptrs_244, %vT_258 mask %vT_ptrs_255 other %cst_4 : tensor<128x64x!tt.ptr, #blocked1> -> <128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc1016) + %vT_260 = ttg.async_commit_group tokens %vT_259 loc(#loc1016) + scf.yield %dq_224, %kT_ptrs_243, %offs_n2_246, %vT_ptrs_244, %offs_n2_226, %vT_ptrs_249, %vT_ptrs_206, %kT_197, %kT_257, %vT_199, %vT_260, %offset_240 : tensor<128x128xf32, #mma1>, tensor<128x64x!tt.ptr, #blocked1>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>>, tensor<128x64x!tt.ptr, #blocked1>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>>, i32, i32, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, i32 loc(#loc1021) + } loc(#loc1021) + %vT_ptrs_185 = ttng.warp_group_dot_wait %vT_ptrs_184#0 {pendings = 0 : i32} : tensor<128x128xf32, #mma1> loc(#loc1021) + %vT_ptrs_186 = ttg.async_wait {num = 0 : i32} loc(#loc1021) + ttg.local_dealloc %vT_157 : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> loc(#loc1021) + ttg.local_dealloc %kT_156 : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> loc(#loc1021) + %dq_ptrs = tt.splat %DQ2 : !tt.ptr -> tensor<128x1x!tt.ptr, #blocked> loc(#loc529) + %dq_ptrs_187 = tt.addptr %dq_ptrs, %ptr_53 : tensor<128x1x!tt.ptr, #blocked>, tensor<128x1xi32, #blocked> loc(#loc529) + %dq_ptrs_188 = tt.broadcast %dq_ptrs_187 : tensor<128x1x!tt.ptr, #blocked> -> tensor<128x128x!tt.ptr, #blocked> loc(#loc530) + %dq_ptrs_189 = tt.addptr %dq_ptrs_188, %ptr_59 : tensor<128x128x!tt.ptr, #blocked>, tensor<128x128xi32, #blocked> loc(#loc530) + %dq = arith.mulf %vT_ptrs_185, %cst_6 : tensor<128x128xf32, #mma1> loc(#loc531) + %12 = arith.cmpi slt, %ptr_57, %cst_1 : tensor<1x128xi32, #blocked> loc(#loc189) + %13 = tt.broadcast %12 : tensor<1x128xi1, #blocked> -> tensor<128x128xi1, #blocked> loc(#loc190) + %14 = arith.andi %q_63, %13 : tensor<128x128xi1, #blocked> 
loc(#loc190) + %15 = arith.truncf %dq : tensor<128x128xf32, #mma1> to tensor<128x128xbf16, #mma1> loc(#loc191) + %16 = ttg.convert_layout %15 : tensor<128x128xbf16, #mma1> -> tensor<128x128xbf16, #blocked> loc(#loc191) + tt.store %dq_ptrs_189, %16, %14 : tensor<128x128x!tt.ptr, #blocked> loc(#loc191) + } else { + %stride_q_idx_h = arith.muli %ks6, %ks7 : i32 loc(#loc532) + %start_n1 = arith.muli %pid, %c128_i32 : i32 loc(#loc533) + %offs_n1 = tt.splat %start_n1 : i32 -> tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc534) + %offs_n1_35 = tt.splat %start_n1 : i32 -> tensor<128xi32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc534) + %offs_n1_36 = arith.addi %offs_n1, %offs_k : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc534) + %offs_n1_37 = arith.addi %offs_n1_35, %offs_k_34 : tensor<128xi32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc534) + %ptr = tt.expand_dims %offs_n1_36 {axis = 1 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<128x1xi32, #blocked> loc(#loc716) + %ptr_38 = tt.expand_dims %offs_n1_37 {axis = 1 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #mma}>> -> tensor<128x1xi32, #mma> loc(#loc716) + %ptr_39 = arith.muli %ptr, %cst_0 : tensor<128x1xi32, #blocked> loc(#loc717) + %ptr_40 = tt.splat %K : !tt.ptr -> tensor<128x1x!tt.ptr, #blocked> loc(#loc718) + %ptr_41 = tt.addptr %ptr_40, %ptr_39 : tensor<128x1x!tt.ptr, #blocked>, tensor<128x1xi32, #blocked> loc(#loc718) + %ptr_42 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc719) + %ptr_43 = tt.expand_dims %ptr_42 {axis = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x128xi32, #blocked> loc(#loc719) + %ptr_44 = tt.broadcast %ptr_41 : tensor<128x1x!tt.ptr, #blocked> -> tensor<128x128x!tt.ptr, #blocked> loc(#loc720) + %ptr_45 = tt.broadcast %ptr_43 : tensor<1x128xi32, #blocked> -> tensor<128x128xi32, #blocked> loc(#loc720) + 
%ptr_46 = tt.addptr %ptr_44, %ptr_45 : tensor<128x128x!tt.ptr, #blocked>, tensor<128x128xi32, #blocked> loc(#loc720) + %k = tt.splat %ks1 : i32 -> tensor<128x1xi32, #blocked> loc(#loc721) + %k_47 = tt.splat %ks1 : i32 -> tensor<128x1xi32, #mma> loc(#loc721) + %k_48 = arith.cmpi slt, %ptr, %k : tensor<128x1xi32, #blocked> loc(#loc721) + %k_49 = tt.broadcast %k_48 : tensor<128x1xi1, #blocked> -> tensor<128x128xi1, #blocked> loc(#loc722) + %k_50 = tt.load %ptr_46, %k_49, %cst_3 : tensor<128x128x!tt.ptr, #blocked> loc(#loc722) + %k_51 = ttg.local_alloc %k_50 : (tensor<128x128xbf16, #blocked>) -> !ttg.memdesc<128x128xbf16, #shared, #smem> loc(#loc722) + %ptr_52 = tt.splat %V : !tt.ptr -> tensor<128x1x!tt.ptr, #blocked> loc(#loc723) + %ptr_53 = tt.addptr %ptr_52, %ptr_39 : tensor<128x1x!tt.ptr, #blocked>, tensor<128x1xi32, #blocked> loc(#loc723) + %ptr_54 = tt.broadcast %ptr_53 : tensor<128x1x!tt.ptr, #blocked> -> tensor<128x128x!tt.ptr, #blocked> loc(#loc724) + %ptr_55 = tt.addptr %ptr_54, %ptr_45 : tensor<128x128x!tt.ptr, #blocked>, tensor<128x128xi32, #blocked> loc(#loc724) + %v = tt.load %ptr_55, %k_49, %cst_3 : tensor<128x128x!tt.ptr, #blocked> loc(#loc725) + %v_56 = ttg.local_alloc %v : (tensor<128x128xbf16, #blocked>) -> !ttg.memdesc<128x128xbf16, #shared, #smem> loc(#loc725) + %off_hq1 = arith.muli %off_hkv, %c4_i32 : i32 loc(#loc537) + %q_adj1 = arith.muli %0, %off_zq : i32 loc(#loc538) + %do_adj1 = arith.muli %9, %off_zq : i32 loc(#loc539) + %off_chz1 = arith.muli %off_zq, %c32_i32 : i32 loc(#loc540) + %sparse_q_num_blks_offset = arith.muli %off_zkv, %ks5 : i32 loc(#loc541) + %sparse_q_num_blks_offset_57 = arith.addi %sparse_q_num_blks_offset, %pid : i32 loc(#loc542) + %sparse_q_idx_offset = arith.muli %off_zkv, %stride_q_idx_h : i32 loc(#loc543) + %sparse_q_idx_offset_58 = arith.muli %pid, %ks6 : i32 loc(#loc544) + %sparse_q_idx_offset_59 = arith.addi %sparse_q_idx_offset, %sparse_q_idx_offset_58 : i32 loc(#loc545) + %q_indices = tt.addptr %arg_Q_IDX, 
%sparse_q_idx_offset_59 : !tt.ptr, i32 loc(#loc546) + %q_start = tt.load %q_indices, %true : !tt.ptr loc(#loc547) + %q_start_60 = arith.muli %q_start, %c128_i32 : i32 loc(#loc548) + %sparse_q_num_blocks = tt.addptr %arg_Q_NUM_BLKS, %sparse_q_num_blks_offset_57 : !tt.ptr, i32 loc(#loc549) + %sparse_q_num_blocks_61 = tt.load %sparse_q_num_blocks, %true : !tt.ptr loc(#loc550) + %offs_m1 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc551) + %offs_m1_62 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc551) + %offs_m1_63 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc551) + %offs_m1_64 = tt.splat %q_start_60 : i32 -> tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc552) + %offs_m1_65 = tt.splat %q_start_60 : i32 -> tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc552) + %offs_m1_66 = tt.splat %q_start_60 : i32 -> tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc552) + %offs_m1_67 = arith.addi %offs_m1_64, %offs_m1 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc552) + %offs_m1_68 = arith.addi %offs_m1_65, %offs_m1_62 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc552) + %offs_m1_69 = arith.addi %offs_m1_66, %offs_m1_63 : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc552) + %qT_ptrs = tt.expand_dims %offs_m1_68 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x64xi32, #blocked1> loc(#loc726) + %qT_ptrs_70 = arith.muli %qT_ptrs, %cst_19 : tensor<1x64xi32, #blocked1> loc(#loc727) + %qT_ptrs_71 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc728) + %qT_ptrs_72 = tt.expand_dims %qT_ptrs_71 {axis = 1 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> 
-> tensor<128x1xi32, #blocked1> loc(#loc728) + %qT_ptrs_73 = tt.broadcast %qT_ptrs_72 : tensor<128x1xi32, #blocked1> -> tensor<128x64xi32, #blocked1> loc(#loc729) + %do_ptrs = tt.expand_dims %offs_m1_69 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc730) + %do_ptrs_74 = arith.muli %do_ptrs, %cst_18 : tensor<64x1xi32, #blocked> loc(#loc731) + %do_ptrs_75 = tt.broadcast %ptr_43 : tensor<1x128xi32, #blocked> -> tensor<64x128xi32, #blocked> loc(#loc732) + %hi = arith.muli %sparse_q_num_blocks_61, %c2_i32 : i32 loc(#loc733) + %hi_76 = arith.addi %ks0, %c63_i32 : i32 loc(#loc957) + %hi_77 = arith.divsi %hi_76, %c64_i32 : i32 loc(#loc958) + %hi_78 = arith.maxsi %hi_77, %c1_i32 : i32 loc(#loc735) + %hi_79 = arith.minsi %hi, %hi_78 : i32 loc(#loc736) + %qT = tt.splat %ks0 : i32 -> tensor<1x64xi32, #mma> loc(#loc959) + %qT_80 = tt.splat %ks0 : i32 -> tensor<1x64xi32, #blocked1> loc(#loc959) + %lse = tt.splat %ks0 : i32 -> tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc738) + %n = arith.remsi %ptr_38, %k_47 : tensor<128x1xi32, #mma> loc(#loc960) + %tmp44 = tt.broadcast %n : tensor<128x1xi32, #mma> -> tensor<128x64xi32, #mma> loc(#loc740) + %tmp45 = arith.extsi %n : tensor<128x1xi32, #mma> to tensor<128x1xi64, #mma> loc(#loc741) + %tmp47 = tt.addptr %in_ptr16, %off_zq : !tt.ptr, i32 loc(#loc742) + %do_ptrs_81 = arith.cmpi sgt, %hi_79, %c0_i32 : i32 loc(#loc1023) + %tmp47_82 = tt.load %tmp47, %do_ptrs_81 : !tt.ptr loc(#loc744) + %tmp48 = tt.splat %tmp47_82 : i64 -> tensor<128x1xi64, #mma> loc(#loc745) + %tmp48_83 = arith.cmpi slt, %tmp45, %tmp48 : tensor<128x1xi64, #mma> loc(#loc745) + %tmp50 = tt.splat %tmp47_82 : i64 -> tensor<1x64xi64, #mma> loc(#loc746) + %tmp51 = tt.broadcast %tmp48_83 : tensor<128x1xi1, #mma> -> tensor<128x64xi1, #mma> loc(#loc747) + %tmp55 = tt.splat %ks8 : i32 -> tensor<128x1xi32, #mma> loc(#loc748) + %tmp55_84 = arith.cmpi sge, %n, %tmp55 : tensor<128x1xi32, #mma> 
loc(#loc748) + %tmp56 = arith.remsi %n, %tmp55 : tensor<128x1xi32, #mma> loc(#loc749) + %tmp58 = arith.cmpi ne, %tmp56, %cst : tensor<128x1xi32, #mma> loc(#loc750) + %tmp59 = arith.cmpi slt, %tmp56, %cst : tensor<128x1xi32, #mma> loc(#loc751) + %tmp60 = arith.cmpi slt, %ks8, %c0_i32 : i32 loc(#loc752) + %tmp61 = tt.splat %tmp60 : i1 -> tensor<128x1xi1, #mma> loc(#loc753) + %tmp61_85 = arith.cmpi ne, %tmp59, %tmp61 : tensor<128x1xi1, #mma> loc(#loc753) + %tmp62 = arith.andi %tmp58, %tmp61_85 : tensor<128x1xi1, #mma> loc(#loc754) + %tmp63 = arith.addi %tmp56, %tmp55 : tensor<128x1xi32, #mma> loc(#loc755) + %tmp64 = arith.select %tmp62, %tmp63, %tmp56 : tensor<128x1xi1, #mma>, tensor<128x1xi32, #mma> loc(#loc756) + %tmp65 = arith.extsi %tmp64 : tensor<128x1xi32, #mma> to tensor<128x1xi64, #mma> loc(#loc757) + %tmp66 = arith.cmpi slt, %tmp65, %tmp48 : tensor<128x1xi64, #mma> loc(#loc758) + %tmp67 = arith.andi %tmp55_84, %tmp66 : tensor<128x1xi1, #mma> loc(#loc759) + %tmp69 = tt.splat %ks8 : i32 -> tensor<128x64xi32, #mma> loc(#loc760) + %tmp72 = tt.splat %tmp60 : i1 -> tensor<128x64xi1, #mma> loc(#loc761) + %tmp77 = tt.broadcast %tmp67 : tensor<128x1xi1, #mma> -> tensor<128x64xi1, #mma> loc(#loc762) + %do = tt.splat %ks0 : i32 -> tensor<64x1xi32, #blocked> loc(#loc962) + %q_indices_86 = tt.addptr %arg_FULL_Q_IDX, %sparse_q_idx_offset_59 : !tt.ptr, i32 loc(#loc592) + %q_start_87 = tt.load %q_indices_86, %true : !tt.ptr loc(#loc593) + %q_start_88 = arith.muli %q_start_87, %c128_i32 : i32 loc(#loc594) + %sparse_q_num_blocks_89 = tt.addptr %arg_FULL_Q_NUM_BLKS, %sparse_q_num_blks_offset_57 : !tt.ptr, i32 loc(#loc595) + %sparse_q_num_blocks_90 = tt.load %sparse_q_num_blocks_89, %true : !tt.ptr loc(#loc596) + %offs_m1_91 = tt.splat %q_start_88 : i32 -> tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc597) + %offs_m1_92 = tt.splat %q_start_88 : i32 -> tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc597) + %offs_m1_93 = tt.splat %q_start_88 : i32 
-> tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc597) + %offs_m1_94 = arith.addi %offs_m1_91, %offs_m1 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc597) + %offs_m1_95 = arith.addi %offs_m1_92, %offs_m1_62 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc597) + %offs_m1_96 = arith.addi %offs_m1_93, %offs_m1_63 : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc597) + %qT_ptrs_97 = tt.expand_dims %offs_m1_95 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x64xi32, #blocked1> loc(#loc764) + %qT_ptrs_98 = arith.muli %qT_ptrs_97, %cst_19 : tensor<1x64xi32, #blocked1> loc(#loc765) + %do_ptrs_99 = tt.expand_dims %offs_m1_96 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc766) + %do_ptrs_100 = arith.muli %do_ptrs_99, %cst_18 : tensor<64x1xi32, #blocked> loc(#loc767) + %hi_101 = arith.muli %sparse_q_num_blocks_90, %c2_i32 : i32 loc(#loc768) + %hi_102 = arith.minsi %hi_101, %hi_78 : i32 loc(#loc769) + ttng.fence_async_shared {bCluster = false} loc(#loc770) + %dk:2 = scf.for %dk_116 = %c0_i32 to %c4_i32 step %c1_i32 iter_args(%arg28 = %cst_7, %arg29 = %cst_7) -> (tensor<128x128xf32, #mma1>, tensor<128x128xf32, #mma1>) : i32 { + %off_hq1_117 = arith.addi %off_hq1, %dk_116 : i32 loc(#loc600) + %q_adj1_118 = arith.muli %off_hq1_117, %c128_i32 : i32 loc(#loc601) + %q_adj1_119 = arith.addi %q_adj1_118, %q_adj1 : i32 loc(#loc602) + %q_adj1_120 = arith.extsi %q_adj1_119 : i32 to i64 loc(#loc603) + %do_adj1_121 = arith.muli %10, %off_hq1_117 : i32 loc(#loc604) + %do_adj1_122 = arith.addi %do_adj1_121, %do_adj1 : i32 loc(#loc605) + %do_adj1_123 = arith.extsi %do_adj1_122 : i32 to i64 loc(#loc606) + %off_chz1_124 = arith.addi %off_chz1, %off_hq1_117 : i32 loc(#loc607) + %off_chz1_125 = arith.muli %off_chz1_124, %ks0 : i32 loc(#loc608) + %off_chz1_126 = arith.extsi %off_chz1_125 : i32 to i64 loc(#loc609) + %Q1 = 
tt.addptr %arg_Q, %q_adj1_120 : !tt.ptr, i64 loc(#loc610) + %DO1 = tt.addptr %arg_DO, %do_adj1_123 : !tt.ptr, i64 loc(#loc611) + %LSE1 = tt.addptr %arg_LSE, %off_chz1_126 : !tt.ptr, i64 loc(#loc612) + %DELTA1 = tt.addptr %arg_DELTA, %off_chz1_126 : !tt.ptr, i64 loc(#loc613) + %qT_ptrs_127 = tt.splat %Q1 : !tt.ptr -> tensor<1x64x!tt.ptr, #blocked1> loc(#loc772) + %qT_ptrs_128 = tt.addptr %qT_ptrs_127, %qT_ptrs_70 : tensor<1x64x!tt.ptr, #blocked1>, tensor<1x64xi32, #blocked1> loc(#loc772) + %qT_ptrs_129 = tt.broadcast %qT_ptrs_128 : tensor<1x64x!tt.ptr, #blocked1> -> tensor<128x64x!tt.ptr, #blocked1> loc(#loc729) + %qT_ptrs_130 = tt.addptr %qT_ptrs_129, %qT_ptrs_73 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> loc(#loc729) + %do_ptrs_131 = tt.splat %DO1 : !tt.ptr -> tensor<64x1x!tt.ptr, #blocked> loc(#loc773) + %do_ptrs_132 = tt.addptr %do_ptrs_131, %do_ptrs_74 : tensor<64x1x!tt.ptr, #blocked>, tensor<64x1xi32, #blocked> loc(#loc773) + %do_ptrs_133 = tt.broadcast %do_ptrs_132 : tensor<64x1x!tt.ptr, #blocked> -> tensor<64x128x!tt.ptr, #blocked> loc(#loc732) + %do_ptrs_134 = tt.addptr %do_ptrs_133, %do_ptrs_75 : tensor<64x128x!tt.ptr, #blocked>, tensor<64x128xi32, #blocked> loc(#loc732) + %lse_135 = tt.splat %LSE1 : !tt.ptr -> tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc774) + %Di = tt.splat %DELTA1 : !tt.ptr -> tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc775) + %qT_136 = ttg.local_alloc : () -> !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> loc(#loc963) + %lse_137 = ttg.local_alloc : () -> !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> loc(#loc776) + %do_138 = ttg.local_alloc : () -> !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> loc(#loc964) + %Di_139 = ttg.local_alloc : () -> !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> loc(#loc777) + %qT_140 = arith.cmpi slt, %qT_ptrs, %qT_80 : tensor<1x64xi32, #blocked1> loc(#loc959) + %qT_141 = tt.broadcast %qT_140 : tensor<1x64xi1, #blocked1> -> 
tensor<128x64xi1, #blocked1> loc(#loc963) + %qT_142 = ttg.memdesc_index %qT_136[%c0_i32] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc963) + %do_ptrs_143 = tt.splat %do_ptrs_81 : i1 -> tensor<128x64xi1, #blocked1> loc(#loc1023) + %do_ptrs_144 = arith.andi %do_ptrs_143, %qT_141 : tensor<128x64xi1, #blocked1> loc(#loc1023) + %qT_145 = ttg.async_copy_global_to_local %qT_ptrs_130, %qT_142 mask %do_ptrs_144 other %cst_4 : tensor<128x64x!tt.ptr, #blocked1> -> <128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc963) + %qT_146 = ttg.async_commit_group tokens %qT_145 loc(#loc963) + %lse_147 = arith.cmpi slt, %offs_m1_67, %lse : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc738) + %lse_148 = tt.addptr %lse_135, %offs_m1_67 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc774) + %lse_149 = ttg.memdesc_index %lse_137[%c0_i32] : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> -> !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> loc(#loc776) + %do_ptrs_150 = tt.splat %do_ptrs_81 : i1 -> tensor<64xi1, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc1023) + %do_ptrs_151 = arith.andi %do_ptrs_150, %lse_147 : tensor<64xi1, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc1023) + %lse_152 = ttg.async_copy_global_to_local %lse_148, %lse_149 mask %do_ptrs_151 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>> -> <64xf32, #shared2, #smem, mutable, 2x64> loc(#loc776) + %lse_153 = ttg.async_commit_group tokens %lse_152 loc(#loc776) + %do_154 = arith.cmpi slt, %do_ptrs, %do : tensor<64x1xi32, #blocked> loc(#loc962) + %do_155 = tt.broadcast %do_154 : tensor<64x1xi1, #blocked> -> tensor<64x128xi1, #blocked> loc(#loc964) + %do_156 = ttg.memdesc_index %do_138[%c0_i32] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc964) + %do_ptrs_157 
= tt.splat %do_ptrs_81 : i1 -> tensor<64x128xi1, #blocked> loc(#loc1023) + %do_ptrs_158 = arith.andi %do_ptrs_157, %do_155 : tensor<64x128xi1, #blocked> loc(#loc1023) + %do_159 = ttg.async_copy_global_to_local %do_ptrs_134, %do_156 mask %do_ptrs_158 other %cst_5 : tensor<64x128x!tt.ptr, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc964) + %do_160 = ttg.async_commit_group tokens %do_159 loc(#loc964) + %Di_161 = tt.addptr %Di, %offs_m1_67 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc775) + %Di_162 = ttg.memdesc_index %Di_139[%c0_i32] : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> -> !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> loc(#loc777) + %Di_163 = ttg.async_copy_global_to_local %Di_161, %Di_162 mask %do_ptrs_151 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>> -> <64xf32, #shared2, #smem, mutable, 2x64> loc(#loc777) + %Di_164 = ttg.async_commit_group tokens %Di_163 loc(#loc777) + %do_ptrs_165 = arith.cmpi sgt, %hi_79, %c1_i32 : i32 loc(#loc1023) + %qT_ptrs_166 = tt.addptr %qT_ptrs_130, %cst_14 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> loc(#loc778) + %do_ptrs_167 = tt.addptr %do_ptrs_134, %cst_15 : tensor<64x128x!tt.ptr, #blocked>, tensor<64x128xi32, #blocked> loc(#loc779) + %offs_m1_168 = arith.addi %offs_m1_67, %cst_16 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc780) + %offs_m1_169 = arith.addi %offs_m1_68, %cst_13 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc780) + %offs_m1_170 = arith.addi %offs_m1_69, %cst_17 : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc780) + %qT_171 = tt.expand_dims %offs_m1_169 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x64xi32, #blocked1> loc(#loc965) + %qT_172 = arith.cmpi slt, %qT_171, %qT_80 : tensor<1x64xi32, #blocked1> loc(#loc959) + %qT_173 = tt.broadcast %qT_172 : tensor<1x64xi1, 
#blocked1> -> tensor<128x64xi1, #blocked1> loc(#loc963) + %qT_174 = ttg.memdesc_index %qT_136[%c1_i32] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc963) + %do_ptrs_175 = tt.splat %do_ptrs_165 : i1 -> tensor<128x64xi1, #blocked1> loc(#loc1023) + %do_ptrs_176 = arith.andi %do_ptrs_175, %qT_173 : tensor<128x64xi1, #blocked1> loc(#loc1023) + %qT_177 = ttg.async_copy_global_to_local %qT_ptrs_166, %qT_174 mask %do_ptrs_176 other %cst_4 : tensor<128x64x!tt.ptr, #blocked1> -> <128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc963) + %qT_178 = ttg.async_commit_group tokens %qT_177 loc(#loc963) + %lse_179 = arith.cmpi slt, %offs_m1_168, %lse : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc738) + %lse_180 = tt.addptr %lse_135, %offs_m1_168 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc774) + %lse_181 = ttg.memdesc_index %lse_137[%c1_i32] : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> -> !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> loc(#loc776) + %do_ptrs_182 = tt.splat %do_ptrs_165 : i1 -> tensor<64xi1, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc1023) + %do_ptrs_183 = arith.andi %do_ptrs_182, %lse_179 : tensor<64xi1, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc1023) + %lse_184 = ttg.async_copy_global_to_local %lse_180, %lse_181 mask %do_ptrs_183 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>> -> <64xf32, #shared2, #smem, mutable, 2x64> loc(#loc776) + %lse_185 = ttg.async_commit_group tokens %lse_184 loc(#loc776) + %do_186 = tt.expand_dims %offs_m1_170 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc966) + %do_187 = arith.cmpi slt, %do_186, %do : tensor<64x1xi32, #blocked> loc(#loc962) + %do_188 = tt.broadcast %do_187 : tensor<64x1xi1, #blocked> -> tensor<64x128xi1, #blocked> loc(#loc964) + %do_189 = 
ttg.memdesc_index %do_138[%c1_i32] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc964) + %do_ptrs_190 = tt.splat %do_ptrs_165 : i1 -> tensor<64x128xi1, #blocked> loc(#loc1023) + %do_ptrs_191 = arith.andi %do_ptrs_190, %do_188 : tensor<64x128xi1, #blocked> loc(#loc1023) + %do_192 = ttg.async_copy_global_to_local %do_ptrs_167, %do_189 mask %do_ptrs_191 other %cst_5 : tensor<64x128x!tt.ptr, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc964) + %do_193 = ttg.async_commit_group tokens %do_192 loc(#loc964) + %Di_194 = tt.addptr %Di, %offs_m1_168 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc775) + %Di_195 = ttg.memdesc_index %Di_139[%c1_i32] : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> -> !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> loc(#loc777) + %Di_196 = ttg.async_copy_global_to_local %Di_194, %Di_195 mask %do_ptrs_183 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>> -> <64xf32, #shared2, #smem, mutable, 2x64> loc(#loc777) + %Di_197 = ttg.async_commit_group tokens %Di_196 loc(#loc777) + %do_ptrs_198:22 = scf.for %do_ptrs_273 = %c0_i32 to %hi_79 step %c1_i32 iter_args(%arg31 = %arg29, %arg32 = %arg28, %qT_ptrs_274 = %qT_ptrs_166, %offs_m1_275 = %offs_m1_169, %do_ptrs_276 = %do_ptrs_167, %offs_m1_277 = %offs_m1_170, %offs_m1_278 = %offs_m1_168, %arg38 = %c1_i32, %arg39 = %c-1_i32, %arg40 = %c1_i32, %arg41 = %c-1_i32, %offs_m1_279 = %offs_m1_67, %qT_280 = %qT_146, %qT_281 = %qT_178, %lse_282 = %lse_153, %lse_283 = %lse_185, %do_284 = %do_160, %do_285 = %do_193, %Di_286 = %Di_164, %Di_287 = %Di_197, %arg51 = %c64_i32, %offs_m1_288 = %offs_m1_67) -> (tensor<128x128xf32, #mma1>, tensor<128x128xf32, #mma1>, tensor<128x64x!tt.ptr, #blocked1>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>>, tensor<64x128x!tt.ptr, #blocked>, tensor<64xi32, #ttg.slice<{dim = 1, parent = 
#blocked}>>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>>, i32, i32, i32, i32, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>>, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, i32, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>>) : i32 { + %do_ptrs_289 = arith.subi %hi_79, %c2_i32 : i32 loc(#loc1023) + %do_ptrs_290 = arith.cmpi slt, %do_ptrs_273, %do_ptrs_289 : i32 loc(#loc1023) + %do_ptrs_291 = arith.subi %hi_79, %c1_i32 : i32 loc(#loc1023) + %do_ptrs_292 = arith.cmpi slt, %do_ptrs_273, %do_ptrs_291 : i32 loc(#loc1023) + %do_ptrs_293 = arith.addi %arg41, %c1_i32 : i32 loc(#loc1023) + %do_ptrs_294 = arith.cmpi sge, %do_ptrs_293, %c2_i32 : i32 loc(#loc1023) + %do_ptrs_295 = arith.select %do_ptrs_294, %c0_i32, %do_ptrs_293 : i32 loc(#loc1023) + %do_ptrs_296 = arith.addi %arg39, %c1_i32 : i32 loc(#loc1023) + %do_ptrs_297 = arith.cmpi sge, %do_ptrs_296, %c3_i32 : i32 loc(#loc1023) + %do_ptrs_298 = arith.select %do_ptrs_297, %c0_i32, %do_ptrs_296 : i32 loc(#loc1023) + %qT_299 = tt.expand_dims %offs_m1_279 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> -> tensor<1x64xi32, #mma> loc(#loc965) + %qT_300 = tt.expand_dims %offs_m1_288 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> -> tensor<1x64xi32, #mma> loc(#loc965) + %qT_301 = arith.cmpi slt, %qT_299, %qT : tensor<1x64xi32, #mma> loc(#loc959) + %qT_302 = tt.broadcast %qT_301 : tensor<1x64xi1, #mma> -> tensor<128x64xi1, #mma> loc(#loc963) + %qT_303 = ttg.async_wait %qT_280, %lse_282, %do_284, %Di_286 {num = 4 : i32} loc(#loc963) + %qT_304 = ttg.memdesc_index %qT_136[%do_ptrs_298] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc963) + %dk_305 = ttg.memdesc_trans %qT_304 {order = array} : !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> -> !ttg.memdesc<64x128xbf16, 
#shared, #smem, mutable, 3x64x128> loc(#loc781) + %lse_306 = ttg.memdesc_index %lse_137[%do_ptrs_295] : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> -> !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> loc(#loc776) + %lse_307 = ttg.local_load %lse_306 token %qT_303 : !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> -> tensor<64xf32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc776) + %lse_308 = arith.cmpf oeq, %lse_307, %cst_21 : tensor<64xf32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc782) + %lse_309 = arith.select %lse_308, %cst_22, %lse_307 : tensor<64xi1, #ttg.slice<{dim = 0, parent = #mma}>>, tensor<64xf32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc783) + %qkT = ttng.warp_group_dot %k_51, %qT_304, %cst_8 {inputPrecision = 0 : i32, isAsync = true} : !ttg.memdesc<128x128xbf16, #shared, #smem> * !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> -> tensor<128x64xf32, #mma> loc(#loc770) + %qkT_310:3 = ttng.warp_group_dot_wait %qkT, %k_51, %qT_304 {pendings = 0 : i32} : tensor<128x64xf32, #mma>, !ttg.memdesc<128x128xbf16, #shared, #smem>, !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc770) + %qkT_311 = arith.mulf %qkT_310#0, %cst_9 : tensor<128x64xf32, #mma> loc(#loc784) + %m = arith.remsi %qT_300, %qT : tensor<1x64xi32, #mma> loc(#loc967) + %post_mod_scores = arith.select %qT_302, %qkT_311, %cst_10 : tensor<128x64xi1, #mma>, tensor<128x64xf32, #mma> loc(#loc786) + %tmp44_312 = tt.broadcast %m : tensor<1x64xi32, #mma> -> tensor<128x64xi32, #mma> loc(#loc740) + %tmp44_313 = arith.cmpi sge, %tmp44_312, %tmp44 : tensor<128x64xi32, #mma> loc(#loc740) + %tmp49 = arith.extsi %m : tensor<1x64xi32, #mma> to tensor<1x64xi64, #mma> loc(#loc787) + %tmp50_314 = arith.cmpi slt, %tmp49, %tmp50 : tensor<1x64xi64, #mma> loc(#loc746) + %tmp51_315 = tt.broadcast %tmp50_314 : tensor<1x64xi1, #mma> -> tensor<128x64xi1, #mma> loc(#loc747) + %tmp51_316 = arith.andi %tmp51, %tmp51_315 : tensor<128x64xi1, #mma> loc(#loc747) + %tmp52 = 
arith.andi %tmp44_313, %tmp51_316 : tensor<128x64xi1, #mma> loc(#loc788) + %tmp68 = arith.subi %tmp44, %tmp44_312 : tensor<128x64xi32, #mma> loc(#loc789) + %tmp69_317 = arith.remsi %tmp68, %tmp69 : tensor<128x64xi32, #mma> loc(#loc760) + %tmp70 = arith.cmpi ne, %tmp69_317, %cst_23 : tensor<128x64xi32, #mma> loc(#loc790) + %tmp71 = arith.cmpi slt, %tmp69_317, %cst_23 : tensor<128x64xi32, #mma> loc(#loc791) + %tmp72_318 = arith.cmpi ne, %tmp71, %tmp72 : tensor<128x64xi1, #mma> loc(#loc761) + %tmp73 = arith.andi %tmp70, %tmp72_318 : tensor<128x64xi1, #mma> loc(#loc792) + %tmp74 = arith.addi %tmp69_317, %tmp69 : tensor<128x64xi32, #mma> loc(#loc793) + %tmp75 = arith.select %tmp73, %tmp74, %tmp69_317 : tensor<128x64xi1, #mma>, tensor<128x64xi32, #mma> loc(#loc794) + %tmp76 = arith.cmpi eq, %tmp75, %cst_23 : tensor<128x64xi32, #mma> loc(#loc795) + %tmp77_319 = arith.andi %tmp77, %tmp76 : tensor<128x64xi1, #mma> loc(#loc762) + %tmp78 = arith.ori %tmp52, %tmp77_319 : tensor<128x64xi1, #mma> loc(#loc796) + %post_mod_scores_320 = arith.select %tmp78, %post_mod_scores, %cst_10 : tensor<128x64xi1, #mma>, tensor<128x64xf32, #mma> loc(#loc797) + %post_mod_scores_321 = arith.mulf %post_mod_scores_320, %cst_11 : tensor<128x64xf32, #mma> loc(#loc798) + %pT = tt.expand_dims %lse_309 {axis = 0 : i32} : tensor<64xf32, #ttg.slice<{dim = 0, parent = #mma}>> -> tensor<1x64xf32, #mma> loc(#loc799) + %pT_322 = tt.broadcast %pT : tensor<1x64xf32, #mma> -> tensor<128x64xf32, #mma> loc(#loc800) + %pT_323 = arith.subf %post_mod_scores_321, %pT_322 : tensor<128x64xf32, #mma> loc(#loc800) + %pT_324 = math.exp2 %pT_323 : tensor<128x64xf32, #mma> loc(#loc801) + %do_325 = ttg.memdesc_index %do_138[%do_ptrs_298] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc964) + %dpT = ttg.memdesc_trans %do_325 {order = array} : !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, 
mutable, 3x128x64> loc(#loc802) + %dv = arith.truncf %pT_324 : tensor<128x64xf32, #mma> to tensor<128x64xbf16, #mma> loc(#loc803) + %dv_326 = ttg.convert_layout %dv : tensor<128x64xbf16, #mma> -> tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 2}>> loc(#loc803) + %dv_327 = ttng.warp_group_dot %dv_326, %do_325, %arg32 {inputPrecision = 0 : i32, isAsync = true} : tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 2}>> * !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> -> tensor<128x128xf32, #mma1> loc(#loc804) + %Di_328 = ttg.memdesc_index %Di_139[%do_ptrs_295] : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> -> !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> loc(#loc777) + %Di_329 = ttg.local_load %Di_328 token %qT_303 : !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> -> tensor<64xf32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc777) + %dpT_330 = ttng.warp_group_dot %v_56, %dpT, %cst_8 {inputPrecision = 0 : i32, isAsync = true} : !ttg.memdesc<128x128xbf16, #shared, #smem> * !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> -> tensor<128x64xf32, #mma> loc(#loc805) + %dpT_331:3 = ttng.warp_group_dot_wait %dpT_330, %v_56, %dpT {pendings = 0 : i32} : tensor<128x64xf32, #mma>, !ttg.memdesc<128x128xbf16, #shared, #smem>, !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc805) + %dsT = tt.expand_dims %Di_329 {axis = 0 : i32} : tensor<64xf32, #ttg.slice<{dim = 0, parent = #mma}>> -> tensor<1x64xf32, #mma> loc(#loc806) + %dsT_332 = tt.broadcast %dsT : tensor<1x64xf32, #mma> -> tensor<128x64xf32, #mma> loc(#loc807) + %dsT_333 = arith.subf %dpT_331#0, %dsT_332 : tensor<128x64xf32, #mma> loc(#loc807) + %dsT_334 = arith.mulf %pT_324, %dsT_333 : tensor<128x64xf32, #mma> loc(#loc808) + %grad_scores = arith.select %qT_302, %dsT_334, %cst_8 : tensor<128x64xi1, #mma>, tensor<128x64xf32, #mma> loc(#loc809) + %dsT_335 = arith.select %tmp78, %grad_scores, %cst_8 : tensor<128x64xi1, #mma>, 
tensor<128x64xf32, #mma> loc(#loc810) + %dk_336 = arith.truncf %dsT_335 : tensor<128x64xf32, #mma> to tensor<128x64xbf16, #mma> loc(#loc811) + %dk_337 = ttg.convert_layout %dk_336 : tensor<128x64xbf16, #mma> -> tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 2}>> loc(#loc811) + %dk_338 = ttng.warp_group_dot %dk_337, %dk_305, %arg31 {inputPrecision = 0 : i32, isAsync = true} : tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 2}>> * !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> -> tensor<128x128xf32, #mma1> loc(#loc812) + %offs_m1_339 = tt.splat %arg51 : i32 -> tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc780) + %offs_m1_340 = arith.addi %offs_m1_288, %offs_m1_339 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc780) + %do_ptrs_341 = arith.addi %do_ptrs_273, %c1_i32 : i32 loc(#loc1023) + %cur_block_idx = arith.divsi %do_ptrs_341, %c2_i32 : i32 loc(#loc968) + %cur_block = tt.addptr %q_indices, %cur_block_idx : !tt.ptr, i32 loc(#loc969) + %cur_block_342 = tt.load %cur_block, %do_ptrs_292 evictionPolicy = evict_last : !tt.ptr loc(#loc970) + %next_block = arith.addi %cur_block_idx, %c1_i32 : i32 loc(#loc971) + %next_block_343 = arith.cmpi slt, %next_block, %sparse_q_num_blocks_61 : i32 loc(#loc972) + %next_block_344 = tt.addptr %cur_block, %c1_i32 : !tt.ptr, i32 loc(#loc973) + %do_ptrs_345 = arith.andi %do_ptrs_292, %next_block_343 : i1 loc(#loc1023) + %next_block_346 = tt.load %next_block_344, %do_ptrs_345 evictionPolicy = evict_last : !tt.ptr loc(#loc974) + %needs_jump = arith.addi %do_ptrs_273, %c2_i32 : i32 loc(#loc975) + %needs_jump_347 = arith.remsi %needs_jump, %c2_i32 : i32 loc(#loc976) + %needs_jump_348 = arith.cmpi eq, %needs_jump_347, %c0_i32 : i32 loc(#loc977) + %jump_to_block = arith.subi %next_block_346, %cur_block_342 : i32 loc(#loc978) + %jump_to_block_349 = arith.muli %jump_to_block, %c128_i32 : i32 loc(#loc979) + %jump_to_block_350 = arith.subi %jump_to_block_349, 
%c64_i32 : i32 loc(#loc980) + %offset = arith.extui %needs_jump_348 : i1 to i32 loc(#loc981) + %offset_351 = arith.muli %jump_to_block_350, %offset : i32 loc(#loc981) + %offset_352 = arith.subi %c1_i32, %offset : i32 loc(#loc982) + %offset_353 = arith.muli %offset_352, %c64_i32 : i32 loc(#loc983) + %offset_354 = arith.addi %offset_351, %offset_353 : i32 loc(#loc984) + %qT_ptrs_355 = arith.muli %offset_354, %c4096_i32 : i32 loc(#loc814) + %qT_ptrs_356 = tt.splat %qT_ptrs_355 : i32 -> tensor<128x64xi32, #blocked1> loc(#loc778) + %qT_ptrs_357 = tt.addptr %qT_ptrs_274, %qT_ptrs_356 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> loc(#loc778) + %do_ptrs_358 = arith.muli %offset_354, %c128_i32 : i32 loc(#loc815) + %do_ptrs_359 = tt.splat %do_ptrs_358 : i32 -> tensor<64x128xi32, #blocked> loc(#loc779) + %do_ptrs_360 = tt.addptr %do_ptrs_276, %do_ptrs_359 : tensor<64x128x!tt.ptr, #blocked>, tensor<64x128xi32, #blocked> loc(#loc779) + %offs_m1_361 = tt.splat %offset_354 : i32 -> tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc780) + %offs_m1_362 = tt.splat %offset_354 : i32 -> tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc780) + %offs_m1_363 = tt.splat %offset_354 : i32 -> tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc780) + %offs_m1_364 = arith.addi %offs_m1_278, %offs_m1_361 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc780) + %offs_m1_365 = arith.addi %offs_m1_275, %offs_m1_362 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc780) + %offs_m1_366 = arith.addi %offs_m1_277, %offs_m1_363 : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc780) + %do_ptrs_367 = arith.addi %arg40, %c1_i32 : i32 loc(#loc1023) + %do_ptrs_368 = arith.cmpi sge, %do_ptrs_367, %c2_i32 : i32 loc(#loc1023) + %do_ptrs_369 = arith.select %do_ptrs_368, %c0_i32, %do_ptrs_367 : i32 loc(#loc1023) + %do_ptrs_370 = arith.addi %arg38, %c1_i32 : i32 loc(#loc1023) + %do_ptrs_371 = arith.cmpi 
sge, %do_ptrs_370, %c3_i32 : i32 loc(#loc1023) + %do_ptrs_372 = arith.select %do_ptrs_371, %c0_i32, %do_ptrs_370 : i32 loc(#loc1023) + %qT_373 = tt.expand_dims %offs_m1_365 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x64xi32, #blocked1> loc(#loc965) + %qT_374 = arith.cmpi slt, %qT_373, %qT_80 : tensor<1x64xi32, #blocked1> loc(#loc959) + %qT_375 = tt.broadcast %qT_374 : tensor<1x64xi1, #blocked1> -> tensor<128x64xi1, #blocked1> loc(#loc963) + %qT_376 = ttg.memdesc_index %qT_136[%do_ptrs_372] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc963) + %do_ptrs_377 = tt.splat %do_ptrs_290 : i1 -> tensor<128x64xi1, #blocked1> loc(#loc1023) + %do_ptrs_378 = arith.andi %do_ptrs_377, %qT_375 : tensor<128x64xi1, #blocked1> loc(#loc1023) + %qT_379 = ttg.async_copy_global_to_local %qT_ptrs_357, %qT_376 mask %do_ptrs_378 other %cst_4 : tensor<128x64x!tt.ptr, #blocked1> -> <128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc963) + %qT_380 = ttg.async_commit_group tokens %qT_379 loc(#loc963) + %lse_381 = arith.cmpi slt, %offs_m1_364, %lse : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc738) + %lse_382 = tt.addptr %lse_135, %offs_m1_364 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc774) + %lse_383 = ttg.memdesc_index %lse_137[%do_ptrs_369] : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> -> !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> loc(#loc776) + %do_ptrs_384 = tt.splat %do_ptrs_290 : i1 -> tensor<64xi1, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc1023) + %do_ptrs_385 = arith.andi %do_ptrs_384, %lse_381 : tensor<64xi1, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc1023) + %lse_386 = ttg.async_copy_global_to_local %lse_382, %lse_383 mask %do_ptrs_385 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>> -> <64xf32, #shared2, #smem, mutable, 2x64> 
loc(#loc776) + %lse_387 = ttg.async_commit_group tokens %lse_386 loc(#loc776) + %do_388 = tt.expand_dims %offs_m1_366 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc966) + %do_389 = arith.cmpi slt, %do_388, %do : tensor<64x1xi32, #blocked> loc(#loc962) + %do_390 = tt.broadcast %do_389 : tensor<64x1xi1, #blocked> -> tensor<64x128xi1, #blocked> loc(#loc964) + %do_391 = ttg.memdesc_index %do_138[%do_ptrs_372] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc964) + %do_ptrs_392 = tt.splat %do_ptrs_290 : i1 -> tensor<64x128xi1, #blocked> loc(#loc1023) + %do_ptrs_393 = arith.andi %do_ptrs_392, %do_390 : tensor<64x128xi1, #blocked> loc(#loc1023) + %do_394 = ttg.async_copy_global_to_local %do_ptrs_360, %do_391 mask %do_ptrs_393 other %cst_5 : tensor<64x128x!tt.ptr, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc964) + %do_395 = ttg.async_commit_group tokens %do_394 loc(#loc964) + %Di_396 = tt.addptr %Di, %offs_m1_364 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc775) + %Di_397 = ttg.memdesc_index %Di_139[%do_ptrs_369] : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> -> !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> loc(#loc777) + %Di_398 = ttg.async_copy_global_to_local %Di_396, %Di_397 mask %do_ptrs_385 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>> -> <64xf32, #shared2, #smem, mutable, 2x64> loc(#loc777) + %Di_399 = ttg.async_commit_group tokens %Di_398 loc(#loc777) + scf.yield %dk_338, %dv_327, %qT_ptrs_357, %offs_m1_365, %do_ptrs_360, %offs_m1_366, %offs_m1_364, %do_ptrs_372, %do_ptrs_298, %do_ptrs_369, %do_ptrs_295, %offs_m1_278, %qT_281, %qT_380, %lse_283, %lse_387, %do_285, %do_395, %Di_287, %Di_399, %offset_354, %offs_m1_340 : tensor<128x128xf32, #mma1>, tensor<128x128xf32, #mma1>, tensor<128x64x!tt.ptr, 
#blocked1>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>>, tensor<64x128x!tt.ptr, #blocked>, tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>>, i32, i32, i32, i32, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>>, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, i32, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc1023) + } loc(#loc1023) + %do_ptrs_199:2 = ttng.warp_group_dot_wait %do_ptrs_198#1, %do_ptrs_198#0 {pendings = 0 : i32} : tensor<128x128xf32, #mma1>, tensor<128x128xf32, #mma1> loc(#loc1023) + %do_ptrs_200 = ttg.async_wait {num = 0 : i32} loc(#loc1023) + ttg.local_dealloc %Di_139 : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> loc(#loc1023) + ttg.local_dealloc %do_138 : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> loc(#loc1023) + ttg.local_dealloc %lse_137 : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> loc(#loc1023) + ttg.local_dealloc %qT_136 : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> loc(#loc1023) + %qT_ptrs_201 = tt.addptr %qT_ptrs_127, %qT_ptrs_98 : tensor<1x64x!tt.ptr, #blocked1>, tensor<1x64xi32, #blocked1> loc(#loc816) + %qT_ptrs_202 = tt.broadcast %qT_ptrs_201 : tensor<1x64x!tt.ptr, #blocked1> -> tensor<128x64x!tt.ptr, #blocked1> loc(#loc817) + %qT_ptrs_203 = tt.addptr %qT_ptrs_202, %qT_ptrs_73 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> loc(#loc817) + %do_ptrs_204 = tt.addptr %do_ptrs_131, %do_ptrs_100 : tensor<64x1x!tt.ptr, #blocked>, tensor<64x1xi32, #blocked> loc(#loc818) + %do_ptrs_205 = tt.broadcast %do_ptrs_204 : tensor<64x1x!tt.ptr, #blocked> -> tensor<64x128x!tt.ptr, #blocked> loc(#loc819) + %do_ptrs_206 = tt.addptr %do_ptrs_205, %do_ptrs_75 : tensor<64x128x!tt.ptr, #blocked>, tensor<64x128xi32, #blocked> loc(#loc819) + %qT_207 = ttg.local_alloc : () -> !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> 
loc(#loc985) + %lse_208 = ttg.local_alloc : () -> !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> loc(#loc821) + %do_209 = ttg.local_alloc : () -> !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> loc(#loc986) + %Di_210 = ttg.local_alloc : () -> !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> loc(#loc823) + %do_ptrs_211 = arith.cmpi sgt, %hi_102, %c0_i32 : i32 loc(#loc1024) + %qT_212 = arith.cmpi slt, %qT_ptrs_97, %qT_80 : tensor<1x64xi32, #blocked1> loc(#loc987) + %qT_213 = tt.broadcast %qT_212 : tensor<1x64xi1, #blocked1> -> tensor<128x64xi1, #blocked1> loc(#loc985) + %qT_214 = ttg.memdesc_index %qT_207[%c0_i32] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc985) + %do_ptrs_215 = tt.splat %do_ptrs_211 : i1 -> tensor<128x64xi1, #blocked1> loc(#loc1024) + %do_ptrs_216 = arith.andi %do_ptrs_215, %qT_213 : tensor<128x64xi1, #blocked1> loc(#loc1024) + %qT_217 = ttg.async_copy_global_to_local %qT_ptrs_203, %qT_214 mask %do_ptrs_216 other %cst_4 : tensor<128x64x!tt.ptr, #blocked1> -> <128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc985) + %qT_218 = ttg.async_commit_group tokens %qT_217 loc(#loc985) + %lse_219 = arith.cmpi slt, %offs_m1_94, %lse : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc824) + %lse_220 = tt.addptr %lse_135, %offs_m1_94 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc825) + %lse_221 = ttg.memdesc_index %lse_208[%c0_i32] : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> -> !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> loc(#loc821) + %do_ptrs_222 = tt.splat %do_ptrs_211 : i1 -> tensor<64xi1, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc1024) + %do_ptrs_223 = arith.andi %do_ptrs_222, %lse_219 : tensor<64xi1, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc1024) + %lse_224 = ttg.async_copy_global_to_local %lse_220, %lse_221 mask %do_ptrs_223 : tensor<64x!tt.ptr, 
#ttg.slice<{dim = 0, parent = #mma}>> -> <64xf32, #shared2, #smem, mutable, 2x64> loc(#loc821) + %lse_225 = ttg.async_commit_group tokens %lse_224 loc(#loc821) + %do_226 = arith.cmpi slt, %do_ptrs_99, %do : tensor<64x1xi32, #blocked> loc(#loc988) + %do_227 = tt.broadcast %do_226 : tensor<64x1xi1, #blocked> -> tensor<64x128xi1, #blocked> loc(#loc986) + %do_228 = ttg.memdesc_index %do_209[%c0_i32] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc986) + %do_ptrs_229 = tt.splat %do_ptrs_211 : i1 -> tensor<64x128xi1, #blocked> loc(#loc1024) + %do_ptrs_230 = arith.andi %do_ptrs_229, %do_227 : tensor<64x128xi1, #blocked> loc(#loc1024) + %do_231 = ttg.async_copy_global_to_local %do_ptrs_206, %do_228 mask %do_ptrs_230 other %cst_5 : tensor<64x128x!tt.ptr, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc986) + %do_232 = ttg.async_commit_group tokens %do_231 loc(#loc986) + %Di_233 = tt.addptr %Di, %offs_m1_94 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc826) + %Di_234 = ttg.memdesc_index %Di_210[%c0_i32] : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> -> !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> loc(#loc823) + %Di_235 = ttg.async_copy_global_to_local %Di_233, %Di_234 mask %do_ptrs_223 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>> -> <64xf32, #shared2, #smem, mutable, 2x64> loc(#loc823) + %Di_236 = ttg.async_commit_group tokens %Di_235 loc(#loc823) + %do_ptrs_237 = arith.cmpi sgt, %hi_102, %c1_i32 : i32 loc(#loc1024) + %qT_ptrs_238 = tt.addptr %qT_ptrs_203, %cst_14 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> loc(#loc827) + %do_ptrs_239 = tt.addptr %do_ptrs_206, %cst_15 : tensor<64x128x!tt.ptr, #blocked>, tensor<64x128xi32, #blocked> loc(#loc828) + %offs_m1_240 = arith.addi %offs_m1_94, %cst_16 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> 
loc(#loc829) + %offs_m1_241 = arith.addi %offs_m1_95, %cst_13 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc829) + %offs_m1_242 = arith.addi %offs_m1_96, %cst_17 : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc829) + %qT_243 = tt.expand_dims %offs_m1_241 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x64xi32, #blocked1> loc(#loc989) + %qT_244 = arith.cmpi slt, %qT_243, %qT_80 : tensor<1x64xi32, #blocked1> loc(#loc987) + %qT_245 = tt.broadcast %qT_244 : tensor<1x64xi1, #blocked1> -> tensor<128x64xi1, #blocked1> loc(#loc985) + %qT_246 = ttg.memdesc_index %qT_207[%c1_i32] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc985) + %do_ptrs_247 = tt.splat %do_ptrs_237 : i1 -> tensor<128x64xi1, #blocked1> loc(#loc1024) + %do_ptrs_248 = arith.andi %do_ptrs_247, %qT_245 : tensor<128x64xi1, #blocked1> loc(#loc1024) + %qT_249 = ttg.async_copy_global_to_local %qT_ptrs_238, %qT_246 mask %do_ptrs_248 other %cst_4 : tensor<128x64x!tt.ptr, #blocked1> -> <128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc985) + %qT_250 = ttg.async_commit_group tokens %qT_249 loc(#loc985) + %lse_251 = arith.cmpi slt, %offs_m1_240, %lse : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc824) + %lse_252 = tt.addptr %lse_135, %offs_m1_240 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc825) + %lse_253 = ttg.memdesc_index %lse_208[%c1_i32] : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> -> !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> loc(#loc821) + %do_ptrs_254 = tt.splat %do_ptrs_237 : i1 -> tensor<64xi1, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc1024) + %do_ptrs_255 = arith.andi %do_ptrs_254, %lse_251 : tensor<64xi1, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc1024) + %lse_256 = ttg.async_copy_global_to_local %lse_252, %lse_253 mask 
%do_ptrs_255 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>> -> <64xf32, #shared2, #smem, mutable, 2x64> loc(#loc821) + %lse_257 = ttg.async_commit_group tokens %lse_256 loc(#loc821) + %do_258 = tt.expand_dims %offs_m1_242 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc990) + %do_259 = arith.cmpi slt, %do_258, %do : tensor<64x1xi32, #blocked> loc(#loc988) + %do_260 = tt.broadcast %do_259 : tensor<64x1xi1, #blocked> -> tensor<64x128xi1, #blocked> loc(#loc986) + %do_261 = ttg.memdesc_index %do_209[%c1_i32] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc986) + %do_ptrs_262 = tt.splat %do_ptrs_237 : i1 -> tensor<64x128xi1, #blocked> loc(#loc1024) + %do_ptrs_263 = arith.andi %do_ptrs_262, %do_260 : tensor<64x128xi1, #blocked> loc(#loc1024) + %do_264 = ttg.async_copy_global_to_local %do_ptrs_239, %do_261 mask %do_ptrs_263 other %cst_5 : tensor<64x128x!tt.ptr, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc986) + %do_265 = ttg.async_commit_group tokens %do_264 loc(#loc986) + %Di_266 = tt.addptr %Di, %offs_m1_240 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc826) + %Di_267 = ttg.memdesc_index %Di_210[%c1_i32] : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> -> !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> loc(#loc823) + %Di_268 = ttg.async_copy_global_to_local %Di_266, %Di_267 mask %do_ptrs_255 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>> -> <64xf32, #shared2, #smem, mutable, 2x64> loc(#loc823) + %Di_269 = ttg.async_commit_group tokens %Di_268 loc(#loc823) + %do_ptrs_270:20 = scf.for %do_ptrs_273 = %c0_i32 to %hi_102 step %c1_i32 iter_args(%do_ptrs_274 = %do_ptrs_199#1, %do_ptrs_275 = %do_ptrs_199#0, %qT_ptrs_276 = %qT_ptrs_238, %offs_m1_277 = %offs_m1_241, %do_ptrs_278 = %do_ptrs_239, %offs_m1_279 
= %offs_m1_242, %offs_m1_280 = %offs_m1_240, %arg38 = %c1_i32, %arg39 = %c-1_i32, %arg40 = %c1_i32, %arg41 = %c-1_i32, %offs_m1_281 = %offs_m1_94, %qT_282 = %qT_218, %qT_283 = %qT_250, %lse_284 = %lse_225, %lse_285 = %lse_257, %do_286 = %do_232, %do_287 = %do_265, %Di_288 = %Di_236, %Di_289 = %Di_269) -> (tensor<128x128xf32, #mma1>, tensor<128x128xf32, #mma1>, tensor<128x64x!tt.ptr, #blocked1>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>>, tensor<64x128x!tt.ptr, #blocked>, tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>>, i32, i32, i32, i32, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>>, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token) : i32 { + %do_ptrs_290 = arith.subi %hi_102, %c2_i32 : i32 loc(#loc1024) + %do_ptrs_291 = arith.cmpi slt, %do_ptrs_273, %do_ptrs_290 : i32 loc(#loc1024) + %do_ptrs_292 = arith.subi %hi_102, %c1_i32 : i32 loc(#loc1024) + %do_ptrs_293 = arith.cmpi slt, %do_ptrs_273, %do_ptrs_292 : i32 loc(#loc1024) + %do_ptrs_294 = arith.addi %arg41, %c1_i32 : i32 loc(#loc1024) + %do_ptrs_295 = arith.cmpi sge, %do_ptrs_294, %c2_i32 : i32 loc(#loc1024) + %do_ptrs_296 = arith.select %do_ptrs_295, %c0_i32, %do_ptrs_294 : i32 loc(#loc1024) + %do_ptrs_297 = arith.addi %arg39, %c1_i32 : i32 loc(#loc1024) + %do_ptrs_298 = arith.cmpi sge, %do_ptrs_297, %c3_i32 : i32 loc(#loc1024) + %do_ptrs_299 = arith.select %do_ptrs_298, %c0_i32, %do_ptrs_297 : i32 loc(#loc1024) + %qT_300 = tt.expand_dims %offs_m1_281 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> -> tensor<1x64xi32, #mma> loc(#loc989) + %qT_301 = arith.cmpi slt, %qT_300, %qT : tensor<1x64xi32, #mma> loc(#loc987) + %qT_302 = tt.broadcast %qT_301 : tensor<1x64xi1, #mma> -> tensor<128x64xi1, #mma> loc(#loc985) + %qT_303 = ttg.async_wait %qT_282, %lse_284, %do_286, %Di_288 {num = 4 : i32} loc(#loc985) + %qT_304 = 
ttg.memdesc_index %qT_207[%do_ptrs_299] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc985) + %dk_305 = ttg.memdesc_trans %qT_304 {order = array} : !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc830) + %lse_306 = ttg.memdesc_index %lse_208[%do_ptrs_296] : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> -> !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> loc(#loc821) + %lse_307 = ttg.local_load %lse_306 token %qT_303 : !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> -> tensor<64xf32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc821) + %lse_308 = arith.cmpf oeq, %lse_307, %cst_21 : tensor<64xf32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc831) + %lse_309 = arith.select %lse_308, %cst_22, %lse_307 : tensor<64xi1, #ttg.slice<{dim = 0, parent = #mma}>>, tensor<64xf32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc832) + %qkT = ttng.warp_group_dot %k_51, %qT_304, %cst_8 {inputPrecision = 0 : i32, isAsync = true} : !ttg.memdesc<128x128xbf16, #shared, #smem> * !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> -> tensor<128x64xf32, #mma> loc(#loc833) + %qkT_310:3 = ttng.warp_group_dot_wait %qkT, %k_51, %qT_304 {pendings = 0 : i32} : tensor<128x64xf32, #mma>, !ttg.memdesc<128x128xbf16, #shared, #smem>, !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc833) + %qkT_311 = arith.mulf %qkT_310#0, %cst_9 : tensor<128x64xf32, #mma> loc(#loc834) + %post_mod_scores = arith.select %qT_302, %qkT_311, %cst_10 : tensor<128x64xi1, #mma>, tensor<128x64xf32, #mma> loc(#loc835) + %post_mod_scores_312 = arith.mulf %post_mod_scores, %cst_11 : tensor<128x64xf32, #mma> loc(#loc836) + %pT = tt.expand_dims %lse_309 {axis = 0 : i32} : tensor<64xf32, #ttg.slice<{dim = 0, parent = #mma}>> -> tensor<1x64xf32, #mma> loc(#loc837) + %pT_313 = tt.broadcast %pT : tensor<1x64xf32, #mma> -> 
tensor<128x64xf32, #mma> loc(#loc838) + %pT_314 = arith.subf %post_mod_scores_312, %pT_313 : tensor<128x64xf32, #mma> loc(#loc838) + %pT_315 = math.exp2 %pT_314 : tensor<128x64xf32, #mma> loc(#loc839) + %do_316 = ttg.memdesc_index %do_209[%do_ptrs_299] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc986) + %dpT = ttg.memdesc_trans %do_316 {order = array} : !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc840) + %dv = arith.truncf %pT_315 : tensor<128x64xf32, #mma> to tensor<128x64xbf16, #mma> loc(#loc841) + %dv_317 = ttg.convert_layout %dv : tensor<128x64xbf16, #mma> -> tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 2}>> loc(#loc841) + %dv_318 = ttng.warp_group_dot %dv_317, %do_316, %do_ptrs_275 {inputPrecision = 0 : i32, isAsync = true} : tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 2}>> * !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> -> tensor<128x128xf32, #mma1> loc(#loc842) + %Di_319 = ttg.memdesc_index %Di_210[%do_ptrs_296] : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> -> !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> loc(#loc823) + %Di_320 = ttg.local_load %Di_319 token %qT_303 : !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> -> tensor<64xf32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc823) + %dpT_321 = ttng.warp_group_dot %v_56, %dpT, %cst_8 {inputPrecision = 0 : i32, isAsync = true} : !ttg.memdesc<128x128xbf16, #shared, #smem> * !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> -> tensor<128x64xf32, #mma> loc(#loc843) + %dpT_322:3 = ttng.warp_group_dot_wait %dpT_321, %v_56, %dpT {pendings = 0 : i32} : tensor<128x64xf32, #mma>, !ttg.memdesc<128x128xbf16, #shared, #smem>, !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc843) + %dsT = tt.expand_dims %Di_320 {axis = 0 : i32} : 
tensor<64xf32, #ttg.slice<{dim = 0, parent = #mma}>> -> tensor<1x64xf32, #mma> loc(#loc844) + %dsT_323 = tt.broadcast %dsT : tensor<1x64xf32, #mma> -> tensor<128x64xf32, #mma> loc(#loc845) + %dsT_324 = arith.subf %dpT_322#0, %dsT_323 : tensor<128x64xf32, #mma> loc(#loc845) + %dsT_325 = arith.mulf %pT_315, %dsT_324 : tensor<128x64xf32, #mma> loc(#loc846) + %grad_scores = arith.select %qT_302, %dsT_325, %cst_8 : tensor<128x64xi1, #mma>, tensor<128x64xf32, #mma> loc(#loc847) + %dk_326 = arith.truncf %grad_scores : tensor<128x64xf32, #mma> to tensor<128x64xbf16, #mma> loc(#loc848) + %dk_327 = ttg.convert_layout %dk_326 : tensor<128x64xbf16, #mma> -> tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 2}>> loc(#loc848) + %dk_328 = ttng.warp_group_dot %dk_327, %dk_305, %do_ptrs_274 {inputPrecision = 0 : i32, isAsync = true} : tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 2}>> * !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> -> tensor<128x128xf32, #mma1> loc(#loc849) + %do_ptrs_329 = arith.addi %do_ptrs_273, %c1_i32 : i32 loc(#loc1024) + %cur_block_idx = arith.divsi %do_ptrs_329, %c2_i32 : i32 loc(#loc991) + %cur_block = tt.addptr %q_indices_86, %cur_block_idx : !tt.ptr, i32 loc(#loc992) + %cur_block_330 = tt.load %cur_block, %do_ptrs_293 evictionPolicy = evict_last : !tt.ptr loc(#loc993) + %next_block = arith.addi %cur_block_idx, %c1_i32 : i32 loc(#loc994) + %next_block_331 = arith.cmpi slt, %next_block, %sparse_q_num_blocks_90 : i32 loc(#loc995) + %next_block_332 = tt.addptr %cur_block, %c1_i32 : !tt.ptr, i32 loc(#loc996) + %do_ptrs_333 = arith.andi %do_ptrs_293, %next_block_331 : i1 loc(#loc1024) + %next_block_334 = tt.load %next_block_332, %do_ptrs_333 evictionPolicy = evict_last : !tt.ptr loc(#loc997) + %needs_jump = arith.addi %do_ptrs_273, %c2_i32 : i32 loc(#loc998) + %needs_jump_335 = arith.remsi %needs_jump, %c2_i32 : i32 loc(#loc999) + %needs_jump_336 = arith.cmpi eq, %needs_jump_335, %c0_i32 : i32 
loc(#loc1000) + %jump_to_block = arith.subi %next_block_334, %cur_block_330 : i32 loc(#loc1001) + %jump_to_block_337 = arith.muli %jump_to_block, %c128_i32 : i32 loc(#loc1002) + %jump_to_block_338 = arith.subi %jump_to_block_337, %c64_i32 : i32 loc(#loc1003) + %offset = arith.extui %needs_jump_336 : i1 to i32 loc(#loc1004) + %offset_339 = arith.muli %jump_to_block_338, %offset : i32 loc(#loc1004) + %offset_340 = arith.subi %c1_i32, %offset : i32 loc(#loc1005) + %offset_341 = arith.muli %offset_340, %c64_i32 : i32 loc(#loc1006) + %offset_342 = arith.addi %offset_339, %offset_341 : i32 loc(#loc1007) + %qT_ptrs_343 = arith.muli %offset_342, %c4096_i32 : i32 loc(#loc851) + %qT_ptrs_344 = tt.splat %qT_ptrs_343 : i32 -> tensor<128x64xi32, #blocked1> loc(#loc827) + %qT_ptrs_345 = tt.addptr %qT_ptrs_276, %qT_ptrs_344 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> loc(#loc827) + %do_ptrs_346 = arith.muli %offset_342, %c128_i32 : i32 loc(#loc852) + %do_ptrs_347 = tt.splat %do_ptrs_346 : i32 -> tensor<64x128xi32, #blocked> loc(#loc828) + %do_ptrs_348 = tt.addptr %do_ptrs_278, %do_ptrs_347 : tensor<64x128x!tt.ptr, #blocked>, tensor<64x128xi32, #blocked> loc(#loc828) + %offs_m1_349 = tt.splat %offset_342 : i32 -> tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc829) + %offs_m1_350 = tt.splat %offset_342 : i32 -> tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc829) + %offs_m1_351 = tt.splat %offset_342 : i32 -> tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc829) + %offs_m1_352 = arith.addi %offs_m1_280, %offs_m1_349 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc829) + %offs_m1_353 = arith.addi %offs_m1_277, %offs_m1_350 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc829) + %offs_m1_354 = arith.addi %offs_m1_279, %offs_m1_351 : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc829) + %do_ptrs_355 = arith.addi %arg40, %c1_i32 : i32 loc(#loc1024) + %do_ptrs_356 = 
arith.cmpi sge, %do_ptrs_355, %c2_i32 : i32 loc(#loc1024) + %do_ptrs_357 = arith.select %do_ptrs_356, %c0_i32, %do_ptrs_355 : i32 loc(#loc1024) + %do_ptrs_358 = arith.addi %arg38, %c1_i32 : i32 loc(#loc1024) + %do_ptrs_359 = arith.cmpi sge, %do_ptrs_358, %c3_i32 : i32 loc(#loc1024) + %do_ptrs_360 = arith.select %do_ptrs_359, %c0_i32, %do_ptrs_358 : i32 loc(#loc1024) + %qT_361 = tt.expand_dims %offs_m1_353 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x64xi32, #blocked1> loc(#loc989) + %qT_362 = arith.cmpi slt, %qT_361, %qT_80 : tensor<1x64xi32, #blocked1> loc(#loc987) + %qT_363 = tt.broadcast %qT_362 : tensor<1x64xi1, #blocked1> -> tensor<128x64xi1, #blocked1> loc(#loc985) + %qT_364 = ttg.memdesc_index %qT_207[%do_ptrs_360] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc985) + %do_ptrs_365 = tt.splat %do_ptrs_291 : i1 -> tensor<128x64xi1, #blocked1> loc(#loc1024) + %do_ptrs_366 = arith.andi %do_ptrs_365, %qT_363 : tensor<128x64xi1, #blocked1> loc(#loc1024) + %qT_367 = ttg.async_copy_global_to_local %qT_ptrs_345, %qT_364 mask %do_ptrs_366 other %cst_4 : tensor<128x64x!tt.ptr, #blocked1> -> <128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc985) + %qT_368 = ttg.async_commit_group tokens %qT_367 loc(#loc985) + %lse_369 = arith.cmpi slt, %offs_m1_352, %lse : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc824) + %lse_370 = tt.addptr %lse_135, %offs_m1_352 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc825) + %lse_371 = ttg.memdesc_index %lse_208[%do_ptrs_357] : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> -> !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> loc(#loc821) + %do_ptrs_372 = tt.splat %do_ptrs_291 : i1 -> tensor<64xi1, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc1024) + %do_ptrs_373 = arith.andi %do_ptrs_372, %lse_369 : tensor<64xi1, 
#ttg.slice<{dim = 0, parent = #mma}>> loc(#loc1024) + %lse_374 = ttg.async_copy_global_to_local %lse_370, %lse_371 mask %do_ptrs_373 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>> -> <64xf32, #shared2, #smem, mutable, 2x64> loc(#loc821) + %lse_375 = ttg.async_commit_group tokens %lse_374 loc(#loc821) + %do_376 = tt.expand_dims %offs_m1_354 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc990) + %do_377 = arith.cmpi slt, %do_376, %do : tensor<64x1xi32, #blocked> loc(#loc988) + %do_378 = tt.broadcast %do_377 : tensor<64x1xi1, #blocked> -> tensor<64x128xi1, #blocked> loc(#loc986) + %do_379 = ttg.memdesc_index %do_209[%do_ptrs_360] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc986) + %do_ptrs_380 = tt.splat %do_ptrs_291 : i1 -> tensor<64x128xi1, #blocked> loc(#loc1024) + %do_ptrs_381 = arith.andi %do_ptrs_380, %do_378 : tensor<64x128xi1, #blocked> loc(#loc1024) + %do_382 = ttg.async_copy_global_to_local %do_ptrs_348, %do_379 mask %do_ptrs_381 other %cst_5 : tensor<64x128x!tt.ptr, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc986) + %do_383 = ttg.async_commit_group tokens %do_382 loc(#loc986) + %Di_384 = tt.addptr %Di, %offs_m1_352 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc826) + %Di_385 = ttg.memdesc_index %Di_210[%do_ptrs_357] : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> -> !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> loc(#loc823) + %Di_386 = ttg.async_copy_global_to_local %Di_384, %Di_385 mask %do_ptrs_373 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>> -> <64xf32, #shared2, #smem, mutable, 2x64> loc(#loc823) + %Di_387 = ttg.async_commit_group tokens %Di_386 loc(#loc823) + scf.yield %dk_328, %dv_318, %qT_ptrs_345, %offs_m1_353, %do_ptrs_348, %offs_m1_354, %offs_m1_352, %do_ptrs_360, 
%do_ptrs_299, %do_ptrs_357, %do_ptrs_296, %offs_m1_280, %qT_283, %qT_368, %lse_285, %lse_375, %do_287, %do_383, %Di_289, %Di_387 : tensor<128x128xf32, #mma1>, tensor<128x128xf32, #mma1>, tensor<128x64x!tt.ptr, #blocked1>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>>, tensor<64x128x!tt.ptr, #blocked>, tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>>, i32, i32, i32, i32, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>>, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token loc(#loc1024) + } loc(#loc1024) + %do_ptrs_271:2 = ttng.warp_group_dot_wait %do_ptrs_270#1, %do_ptrs_270#0 {pendings = 0 : i32} : tensor<128x128xf32, #mma1>, tensor<128x128xf32, #mma1> loc(#loc1024) + %do_ptrs_272 = ttg.async_wait {num = 0 : i32} loc(#loc1024) + ttg.local_dealloc %Di_210 : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> loc(#loc1024) + ttg.local_dealloc %do_209 : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> loc(#loc1024) + ttg.local_dealloc %lse_208 : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> loc(#loc1024) + ttg.local_dealloc %qT_207 : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> loc(#loc1024) + scf.yield %do_ptrs_271#0, %do_ptrs_271#1 : tensor<128x128xf32, #mma1>, tensor<128x128xf32, #mma1> loc(#loc321) + } loc(#loc771) + %dv_ptrs = tt.splat %DV : !tt.ptr -> tensor<128x1x!tt.ptr, #blocked> loc(#loc659) + %dv_ptrs_103 = tt.addptr %dv_ptrs, %ptr_39 : tensor<128x1x!tt.ptr, #blocked>, tensor<128x1xi32, #blocked> loc(#loc659) + %dv_ptrs_104 = tt.broadcast %dv_ptrs_103 : tensor<128x1x!tt.ptr, #blocked> -> tensor<128x128x!tt.ptr, #blocked> loc(#loc660) + %dv_ptrs_105 = tt.addptr %dv_ptrs_104, %ptr_45 : tensor<128x128x!tt.ptr, #blocked>, tensor<128x128xi32, #blocked> loc(#loc660) + %12 = arith.cmpi slt, %ptr_43, %cst_1 : tensor<1x128xi32, #blocked> loc(#loc324) + %13 = tt.broadcast %12 : 
tensor<1x128xi1, #blocked> -> tensor<128x128xi1, #blocked> loc(#loc325) + %14 = arith.andi %k_49, %13 : tensor<128x128xi1, #blocked> loc(#loc325) + %15 = arith.truncf %dk#0 : tensor<128x128xf32, #mma1> to tensor<128x128xbf16, #mma1> loc(#loc326) + %16 = ttg.convert_layout %15 : tensor<128x128xbf16, #mma1> -> tensor<128x128xbf16, #blocked> loc(#loc326) + tt.store %dv_ptrs_105, %16, %14 : tensor<128x128x!tt.ptr, #blocked> loc(#loc326) + %dk_106 = arith.mulf %dk#1, %cst_6 : tensor<128x128xf32, #mma1> loc(#loc661) + %xindex = tt.broadcast %ptr_39 : tensor<128x1xi32, #blocked> -> tensor<128x128xi32, #blocked> loc(#loc662) + %xindex_107 = arith.addi %ptr_45, %xindex : tensor<128x128xi32, #blocked> loc(#loc662) + %xindex_108 = arith.muli %off_hkv, %c128_i32 : i32 loc(#loc663) + %xindex_109 = arith.muli %xindex_108, %ks1 : i32 loc(#loc664) + %xindex_110 = tt.splat %xindex_109 : i32 -> tensor<128x128xi32, #blocked> loc(#loc665) + %xindex_111 = arith.addi %xindex_107, %xindex_110 : tensor<128x128xi32, #blocked> loc(#loc665) + %xindex_112 = arith.muli %off_zq, %c1024_i32 : i32 loc(#loc666) + %xindex_113 = arith.muli %xindex_112, %ks1 : i32 loc(#loc667) + %xindex_114 = tt.splat %xindex_113 : i32 -> tensor<128x128xi32, #blocked> loc(#loc668) + %xindex_115 = arith.addi %xindex_111, %xindex_114 : tensor<128x128xi32, #blocked> loc(#loc668) + %17 = tt.splat %out_ptr0 : !tt.ptr -> tensor<128x128x!tt.ptr, #blocked> loc(#loc335) + %18 = tt.addptr %17, %xindex_115 : tensor<128x128x!tt.ptr, #blocked>, tensor<128x128xi32, #blocked> loc(#loc335) + %19 = arith.truncf %dk_106 : tensor<128x128xf32, #mma1> to tensor<128x128xbf16, #mma1> loc(#loc336) + %20 = ttg.convert_layout %19 : tensor<128x128xbf16, #mma1> -> tensor<128x128xbf16, #blocked> loc(#loc336) + tt.store %18, %20, %k_49 : tensor<128x128x!tt.ptr, #blocked> loc(#loc336) + } loc(#loc32) + tt.return loc(#loc337) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":94:54) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":95:54) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":95:63) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":97:74) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":97:66) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":97:100) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":97:91) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":97:82) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":97:59) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":97:111) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":111:24) +#loc13 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":41:22) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":112:36) +#loc15 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":41:28) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":113:34) +#loc17 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":115:27) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":116:28) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":117:23) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":124:25) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":124:47) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":124:35) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":124:59) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":128:50) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":128:37) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":128:61) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":131:9) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":132:9) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":133:10) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":136:26) +#loc31 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":139:14) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":139:7) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":140:24) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":144:29) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":144:54) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":144:44) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":145:35) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":148:30) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":154:55) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":154:78) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":155:50) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":155:83) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":155:68) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":158:30) +#loc45 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":158:52) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":158:40) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":158:63) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":159:32) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":159:55) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":159:42) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":159:66) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":161:30) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":161:35) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":161:46) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":161:56) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":163:17) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":164:19) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":167:19) +#loc59 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":168:21) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":169:25) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":174:36) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":175:29) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":825:27) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":178:107) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":825:38) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":825:20) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":825:56) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":825:49) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":833:52) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":833:23) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":179:111) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":188:58) +#loc73 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":188:34) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":188:25) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":189:33) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":189:26) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":190:30) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":190:50) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":191:18) +#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":195:30) +#loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":196:27) +#loc82 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":196:41) +#loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":197:53) +#loc84 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":197:39) +#loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":199:42) +#loc86 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":199:29) +#loc87 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":390:26) +#loc88 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":207:12) +#loc89 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":390:37) +#loc90 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":390:18) +#loc91 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":390:56) +#loc92 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":390:49) +#loc93 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":391:18) +#loc94 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":391:49) +#loc95 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":395:43) +#loc96 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":395:90) +#loc97 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":395:101) +#loc98 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":395:63) +#loc99 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":831:52) +#loc100 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":458:105) +#loc101 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":405:12) +#loc102 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":798:21) +#loc103 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":467:46) +#loc104 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":482:23) +#loc105 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":485:34) +#loc106 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":397:28) +#loc107 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":485:23) +#loc108 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":486:22) +#loc109 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":487:23) +#loc110 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":488:23) +#loc111 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":489:23) +#loc112 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":493:24) +#loc113 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":498:92) +#loc114 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":499:25) +#loc115 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":507:25) +#loc116 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":510:25) +#loc117 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":525:39) +#loc118 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":531:22) +#loc119 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":531:19) +#loc120 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":831:23) +#loc121 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":528:104) +#loc122 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":414:19) +#loc123 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":415:19) +#loc124 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":417:19) +#loc125 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":831:41) +#loc126 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":459:19) +#loc127 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":553:30) +#loc128 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":461:14) +#loc129 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":464:46) +#loc130 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":476:79) +#loc131 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":483:23) +#loc132 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":490:23) +#loc133 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":494:24) +#loc134 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":496:25) +#loc135 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":497:92) +#loc136 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":500:24) +#loc137 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":501:24) +#loc138 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":502:39) +#loc139 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":503:25) +#loc140 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":504:24) +#loc141 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":505:24) +#loc142 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":506:23) +#loc143 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":508:25) +#loc144 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":509:92) +#loc145 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":511:24) +#loc146 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":512:24) +#loc147 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":513:39) +#loc148 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":514:25) +#loc149 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":515:24) +#loc150 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":516:24) +#loc151 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":521:69) +#loc152 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":524:27) +#loc153 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":525:21) +#loc154 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":530:20) +#loc155 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":531:14) +#loc156 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":538:71) +#loc157 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":549:43) +#loc158 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":551:15) +#loc159 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":553:21) +#loc160 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":788:33) +#loc161 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":411:64) +#loc162 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":789:38) +#loc163 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":789:24) +#loc164 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":790:109) +#loc165 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":790:113) +#loc166 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":790:55) +#loc167 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":790:25) +#loc168 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":791:30) +#loc169 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":791:35) +#loc170 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":791:60) +#loc171 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":792:34) +#loc172 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":792:48) +#loc173 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":792:63) +#loc174 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":793:29) +#loc175 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":793:47) +#loc176 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":793:61) +#loc177 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":793:42) +#loc178 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":414:28) +#loc179 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":214:39) +#loc180 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":215:31) +#loc181 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":215:45) +#loc182 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":216:62) +#loc183 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":216:43) +#loc184 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":218:33) +#loc185 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":226:16) +#loc186 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":231:24) +#loc187 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":231:56) +#loc188 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":232:14) +#loc189 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":236:87) +#loc190 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":236:69) +#loc191 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":236:30) +#loc192 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":245:29) +#loc193 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":252:25) +#loc194 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":253:29) +#loc195 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":256:107) +#loc196 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":257:107) +#loc197 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":263:32) +#loc198 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":266:56) +#loc199 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":267:59) +#loc200 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":269:34) +#loc201 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":281:58) +#loc202 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":281:80) +#loc203 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":282:53) +#loc204 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":282:81) +#loc205 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":282:70) +#loc206 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":286:32) +#loc207 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":287:30) +#loc208 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":287:43) +#loc209 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":288:55) +#loc210 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":288:42) +#loc211 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":290:45) +#loc212 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":290:32) +#loc213 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":601:26) +#loc214 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":298:16) +#loc215 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":601:37) +#loc216 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":601:56) +#loc217 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":601:49) +#loc218 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":602:27) +#loc219 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":602:38) +#loc220 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":602:51) +#loc221 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":608:42) +#loc222 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":608:87) +#loc223 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":608:98) +#loc224 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":608:61) +#loc225 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":669:105) +#loc226 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":618:12) +#loc227 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":674:52) +#loc228 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":683:46) +#loc229 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":698:25) +#loc230 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":699:25) +#loc231 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":701:35) +#loc232 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":610:28) +#loc233 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":701:24) +#loc234 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":702:24) +#loc235 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":704:24) +#loc236 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":705:24) +#loc237 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":709:25) +#loc238 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":710:25) +#loc239 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":712:25) +#loc240 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":713:92) +#loc241 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":714:92) +#loc242 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":715:25) +#loc243 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":716:24) +#loc244 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":717:24) +#loc245 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":718:39) +#loc246 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":719:25) +#loc247 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":720:24) +#loc248 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":721:24) +#loc249 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":723:25) +#loc250 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":726:25) +#loc251 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":731:24) +#loc252 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":741:99) +#loc253 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":306:41) +#loc254 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":307:34) +#loc255 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":307:47) +#loc256 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":308:64) +#loc257 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":308:46) +#loc258 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":310:36) +#loc259 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":318:20) +#loc260 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":676:20) +#loc261 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":262:30) +#loc262 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":263:51) +#loc263 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":266:34) +#loc264 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":266:44) +#loc265 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":266:67) +#loc266 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":267:36) +#loc267 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":267:46) +#loc268 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":267:70) +#loc269 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":269:39) +#loc270 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":269:50) +#loc271 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":269:60) +#loc272 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":271:21) +#loc273 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":272:23) +#loc274 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":275:25) +#loc275 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":276:29) +#loc276 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":601:18) +#loc277 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":602:19) +#loc278 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":674:28) +#loc279 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":748:29) +#loc280 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":674:22) +#loc281 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":748:21) +#loc282 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":626:19) +#loc283 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":627:19) +#loc284 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":628:19) +#loc285 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":833:41) +#loc286 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":775:52) +#loc287 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":675:26) +#loc288 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":675:46) +#loc289 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":678:15) +#loc290 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":680:46) +#loc291 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":692:78) +#loc292 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":703:25) +#loc293 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":706:24) +#loc294 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":722:24) +#loc295 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":724:25) +#loc296 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":725:92) +#loc297 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":727:24) +#loc298 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":728:24) +#loc299 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":729:39) +#loc300 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":730:25) +#loc301 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":732:24) +#loc302 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":736:69) +#loc303 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":739:27) +#loc304 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":740:44) +#loc305 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":740:40) +#loc306 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":740:22) +#loc307 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":750:29) +#loc308 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":744:24) +#loc309 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":744:43) +#loc310 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":750:20) +#loc311 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":751:25) +#loc312 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":751:22) +#loc313 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":751:16) +#loc314 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":759:70) +#loc315 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":773:45) +#loc316 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":775:24) +#loc317 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":775:43) +#loc318 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":623:62) +#loc319 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":626:28) +#loc320 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":627:28) +#loc321 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":303:12) +#loc322 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":323:23) +#loc323 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":323:55) +#loc324 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":332:71) +#loc325 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":332:61) +#loc326 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":332:30) +#loc327 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":334:14) +#loc328 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":344:27) +#loc329 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":344:45) +#loc330 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":344:53) +#loc331 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":344:41) +#loc332 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":344:64) +#loc333 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":344:71) +#loc334 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":344:59) +#loc335 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":345:29) +#loc336 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":345:69) +#loc337 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":139:4) +#loc365 = loc("pid"(#loc12)) +#loc366 = loc("NUM_KV_BLOCKS"(#loc14)) +#loc367 = loc("NUM_Q_BLOCKS"(#loc16)) +#loc368 = loc("off_zq"(#loc17)) +#loc369 = loc("off_hkv"(#loc18)) +#loc370 = 
loc("off_zkv"(#loc19)) +#loc371 = loc("k_adj"(#loc20)) +#loc372 = loc("k_adj"(#loc21)) +#loc373 = loc("k_adj"(#loc22)) +#loc374 = loc("k_adj"(#loc23)) +#loc375 = loc("dv_adj"(#loc24)) +#loc376 = loc("dv_adj"(#loc25)) +#loc377 = loc("dv_adj"(#loc26)) +#loc378 = loc("K"(#loc27)) +#loc379 = loc("V"(#loc28)) +#loc380 = loc("DV"(#loc29)) +#loc381 = loc("offs_k"(#loc30)) +#loc382 = loc("off_pid"(#loc33)) +#loc383 = loc("off_hq2"(#loc34)) +#loc384 = loc("off_hq2"(#loc35)) +#loc385 = loc("off_hq2"(#loc36)) +#loc386 = loc("start_m2_block"(#loc37)) +#loc387 = loc("stride_kv_idx_h"(#loc38)) +#loc388 = loc("sparse_kv_num_blks_offset"(#loc39)) +#loc389 = loc("sparse_kv_num_blks_offset"(#loc40)) +#loc390 = loc("sparse_kv_idx_offset"(#loc41)) +#loc391 = loc("sparse_kv_idx_offset"(#loc42)) +#loc392 = loc("sparse_kv_idx_offset"(#loc43)) +#loc393 = loc("q_adj2"(#loc44)) +#loc394 = loc("q_adj2"(#loc45)) +#loc395 = loc("q_adj2"(#loc46)) +#loc396 = loc("q_adj2"(#loc47)) +#loc397 = loc("do_adj2"(#loc48)) +#loc398 = loc("do_adj2"(#loc49)) +#loc399 = loc("do_adj2"(#loc50)) +#loc400 = loc("do_adj2"(#loc51)) +#loc401 = loc("off_chz2"(#loc52)) +#loc402 = loc("off_chz2"(#loc53)) +#loc403 = loc("off_chz2"(#loc54)) +#loc404 = loc("off_chz2"(#loc55)) +#loc405 = loc("Q2"(#loc56)) +#loc406 = loc("DO2"(#loc57)) +#loc407 = loc("DQ2"(#loc58)) +#loc408 = loc("LSE2"(#loc59)) +#loc409 = loc("DELTA2"(#loc60)) +#loc410 = loc("start_m2"(#loc61)) +#loc411 = loc("offs_m2"(#loc62)) +#loc412 = loc("ptr"(#loc63)) +#loc413 = loc("q"(#loc64)) +#loc414 = loc("ptr"(#loc65)) +#loc415 = loc("ptr"(#loc66)) +#loc416 = loc("ptr"(#loc67)) +#loc417 = loc("ptr"(#loc68)) +#loc418 = loc("do"(#loc71)) +#loc419 = loc("Di"(#loc72)) +#loc420 = loc("Di"(#loc73)) +#loc421 = loc("Di"(#loc74)) +#loc422 = loc("lse"(#loc75)) +#loc423 = loc("lse"(#loc76)) +#loc424 = loc("lse"(#loc77)) +#loc425 = loc("lse"(#loc78)) +#loc426 = loc("lse"(#loc79)) +#loc427 = loc("kv_indices"(#loc80)) +#loc428 = loc("kv_start"(#loc81)) +#loc429 = 
loc("kv_start"(#loc82)) +#loc430 = loc("sparse_kv_num_blocks"(#loc83)) +#loc431 = loc("sparse_kv_num_blocks"(#loc84)) +#loc432 = loc("offs_n2"(#loc85)) +#loc433 = loc("offs_n2"(#loc86)) +#loc434 = loc("kT_ptrs"(#loc87)) +#loc435 = loc("dq"(#loc88)) +#loc436 = loc("kT_ptrs"(#loc89)) +#loc437 = loc("kT_ptrs"(#loc90)) +#loc438 = loc("kT_ptrs"(#loc91)) +#loc439 = loc("kT_ptrs"(#loc92)) +#loc440 = loc("vT_ptrs"(#loc93)) +#loc441 = loc("vT_ptrs"(#loc94)) +#loc442 = loc("hi"(#loc95)) +#loc443 = loc("hi"(#loc96)) +#loc444 = loc("hi"(#loc97)) +#loc445 = loc("hi"(#loc98)) +#loc446 = loc("kT"(#loc100)) +#loc447 = loc("dq"(#loc101)) +#loc448 = loc("m"(#loc103)) +#loc449 = loc("tmp4"(#loc104)) +#loc450 = loc("tmp7"(#loc105)) +#loc451 = loc("dq"(#loc106)) +#loc452 = loc("tmp7"(#loc107)) +#loc453 = loc("tmp8"(#loc108)) +#loc454 = loc("tmp9"(#loc109)) +#loc455 = loc("tmp10"(#loc110)) +#loc456 = loc("tmp11"(#loc111)) +#loc457 = loc("tmp15"(#loc112)) +#loc458 = loc("tmp20"(#loc113)) +#loc459 = loc("tmp21"(#loc114)) +#loc460 = loc("tmp29"(#loc115)) +#loc461 = loc("tmp32"(#loc116)) +#loc462 = loc("p"(#loc117)) +#loc463 = loc("ds"(#loc118)) +#loc464 = loc("ds"(#loc119)) +#loc465 = loc("vT"(#loc121)) +#loc466 = loc("kT_ptrs"(#loc122)) +#loc467 = loc("vT_ptrs"(#loc123)) +#loc468 = loc("offs_n2"(#loc124)) +#loc469 = loc("qk"(#loc126)) +#loc470 = loc("dq"(#loc127)) +#loc471 = loc("qk"(#loc128)) +#loc472 = loc("n"(#loc129)) +#loc473 = loc("post_mod_scores"(#loc130)) +#loc474 = loc("tmp5"(#loc131)) +#loc475 = loc("tmp12"(#loc132)) +#loc476 = loc("tmp16"(#loc133)) +#loc477 = loc("tmp18"(#loc134)) +#loc478 = loc("tmp19"(#loc135)) +#loc479 = loc("tmp22"(#loc136)) +#loc480 = loc("tmp23"(#loc137)) +#loc481 = loc("tmp24"(#loc138)) +#loc482 = loc("tmp25"(#loc139)) +#loc483 = loc("tmp26"(#loc140)) +#loc484 = loc("tmp27"(#loc141)) +#loc485 = loc("tmp28"(#loc142)) +#loc486 = loc("tmp30"(#loc143)) +#loc487 = loc("tmp31"(#loc144)) +#loc488 = loc("tmp33"(#loc145)) +#loc489 = loc("tmp34"(#loc146)) 
+#loc490 = loc("tmp35"(#loc147)) +#loc491 = loc("tmp36"(#loc148)) +#loc492 = loc("tmp37"(#loc149)) +#loc493 = loc("tmp38"(#loc150)) +#loc494 = loc("post_mod_scores"(#loc151)) +#loc495 = loc("post_mod_scores"(#loc152)) +#loc496 = loc("p"(#loc153)) +#loc497 = loc("dp"(#loc154)) +#loc498 = loc("ds"(#loc155)) +#loc499 = loc("grad_scores"(#loc156)) +#loc500 = loc("ds"(#loc157)) +#loc501 = loc("ds"(#loc158)) +#loc502 = loc("dq"(#loc159)) +#loc503 = loc("cur_block_idx"(#loc160)) +#loc504 = loc("offset"(#loc161)) +#loc505 = loc("cur_block"(#loc162)) +#loc506 = loc("cur_block"(#loc163)) +#loc507 = loc("next_block"(#loc164)) +#loc508 = loc("next_block"(#loc165)) +#loc509 = loc("next_block"(#loc166)) +#loc510 = loc("next_block"(#loc167)) +#loc511 = loc("needs_jump"(#loc168)) +#loc512 = loc("needs_jump"(#loc169)) +#loc513 = loc("needs_jump"(#loc170)) +#loc514 = loc("jump_to_block"(#loc171)) +#loc515 = loc("jump_to_block"(#loc172)) +#loc516 = loc("jump_to_block"(#loc173)) +#loc517 = loc("offset"(#loc174)) +#loc518 = loc("offset"(#loc175)) +#loc519 = loc("offset"(#loc176)) +#loc520 = loc("offset"(#loc177)) +#loc521 = loc("kT_ptrs"(#loc178)) +#loc522 = loc("kv_indices"(#loc179)) +#loc523 = loc("kv_start"(#loc180)) +#loc524 = loc("kv_start"(#loc181)) +#loc525 = loc("sparse_kv_num_blocks"(#loc182)) +#loc526 = loc("sparse_kv_num_blocks"(#loc183)) +#loc527 = loc("offs_n2"(#loc184)) +#loc528 = loc("dq"(#loc185)) +#loc529 = loc("dq_ptrs"(#loc186)) +#loc530 = loc("dq_ptrs"(#loc187)) +#loc531 = loc("dq"(#loc188)) +#loc532 = loc("stride_q_idx_h"(#loc192)) +#loc533 = loc("start_n1"(#loc193)) +#loc534 = loc("offs_n1"(#loc194)) +#loc535 = loc("k"(#loc195)) +#loc536 = loc("v"(#loc196)) +#loc537 = loc("off_hq1"(#loc197)) +#loc538 = loc("q_adj1"(#loc198)) +#loc539 = loc("do_adj1"(#loc199)) +#loc540 = loc("off_chz1"(#loc200)) +#loc541 = loc("sparse_q_num_blks_offset"(#loc201)) +#loc542 = loc("sparse_q_num_blks_offset"(#loc202)) +#loc543 = loc("sparse_q_idx_offset"(#loc203)) +#loc544 = 
loc("sparse_q_idx_offset"(#loc204)) +#loc545 = loc("sparse_q_idx_offset"(#loc205)) +#loc546 = loc("q_indices"(#loc206)) +#loc547 = loc("q_start"(#loc207)) +#loc548 = loc("q_start"(#loc208)) +#loc549 = loc("sparse_q_num_blocks"(#loc209)) +#loc550 = loc("sparse_q_num_blocks"(#loc210)) +#loc551 = loc("offs_m1"(#loc211)) +#loc552 = loc("offs_m1"(#loc212)) +#loc553 = loc("qT_ptrs"(#loc213)) +#loc554 = loc("qT_ptrs"(#loc215)) +#loc555 = loc("qT_ptrs"(#loc216)) +#loc556 = loc("qT_ptrs"(#loc217)) +#loc557 = loc("do_ptrs"(#loc218)) +#loc558 = loc("do_ptrs"(#loc219)) +#loc559 = loc("do_ptrs"(#loc220)) +#loc560 = loc("hi"(#loc221)) +#loc561 = loc("hi"(#loc222)) +#loc562 = loc("hi"(#loc223)) +#loc563 = loc("hi"(#loc224)) +#loc564 = loc("qT"(#loc225)) +#loc565 = loc(callsite(#loc226 at #loc214)) +#loc566 = loc("lse"(#loc227)) +#loc567 = loc("n"(#loc228)) +#loc568 = loc("tmp44"(#loc229)) +#loc569 = loc("tmp45"(#loc230)) +#loc570 = loc("tmp47"(#loc231)) +#loc571 = loc("dk"(#loc232)) +#loc572 = loc("tmp47"(#loc233)) +#loc573 = loc("tmp48"(#loc234)) +#loc574 = loc("tmp50"(#loc235)) +#loc575 = loc("tmp51"(#loc236)) +#loc576 = loc("tmp55"(#loc237)) +#loc577 = loc("tmp56"(#loc238)) +#loc578 = loc("tmp58"(#loc239)) +#loc579 = loc("tmp59"(#loc240)) +#loc580 = loc("tmp60"(#loc241)) +#loc581 = loc("tmp61"(#loc242)) +#loc582 = loc("tmp62"(#loc243)) +#loc583 = loc("tmp63"(#loc244)) +#loc584 = loc("tmp64"(#loc245)) +#loc585 = loc("tmp65"(#loc246)) +#loc586 = loc("tmp66"(#loc247)) +#loc587 = loc("tmp67"(#loc248)) +#loc588 = loc("tmp69"(#loc249)) +#loc589 = loc("tmp72"(#loc250)) +#loc590 = loc("tmp77"(#loc251)) +#loc591 = loc("do"(#loc252)) +#loc592 = loc("q_indices"(#loc253)) +#loc593 = loc("q_start"(#loc254)) +#loc594 = loc("q_start"(#loc255)) +#loc595 = loc("sparse_q_num_blocks"(#loc256)) +#loc596 = loc("sparse_q_num_blocks"(#loc257)) +#loc597 = loc("offs_m1"(#loc258)) +#loc598 = loc("qkT"(#loc260)) +#loc599 = loc("dv"(#loc261)) +#loc600 = loc("off_hq1"(#loc262)) +#loc601 = 
loc("q_adj1"(#loc263)) +#loc602 = loc("q_adj1"(#loc264)) +#loc603 = loc("q_adj1"(#loc265)) +#loc604 = loc("do_adj1"(#loc266)) +#loc605 = loc("do_adj1"(#loc267)) +#loc606 = loc("do_adj1"(#loc268)) +#loc607 = loc("off_chz1"(#loc269)) +#loc608 = loc("off_chz1"(#loc270)) +#loc609 = loc("off_chz1"(#loc271)) +#loc610 = loc("Q1"(#loc272)) +#loc611 = loc("DO1"(#loc273)) +#loc612 = loc("LSE1"(#loc274)) +#loc613 = loc("DELTA1"(#loc275)) +#loc614 = loc("qT_ptrs"(#loc276)) +#loc615 = loc("do_ptrs"(#loc277)) +#loc616 = loc("lse"(#loc278)) +#loc617 = loc("Di"(#loc279)) +#loc618 = loc("lse"(#loc280)) +#loc619 = loc("Di"(#loc281)) +#loc620 = loc("qT_ptrs"(#loc282)) +#loc621 = loc("do_ptrs"(#loc283)) +#loc622 = loc("offs_m1"(#loc284)) +#loc623 = loc("dk"(#loc286)) +#loc624 = loc("lse"(#loc287)) +#loc625 = loc("lse"(#loc288)) +#loc626 = loc("qkT"(#loc289)) +#loc627 = loc("m"(#loc290)) +#loc628 = loc("post_mod_scores"(#loc291)) +#loc629 = loc("tmp49"(#loc292)) +#loc630 = loc("tmp52"(#loc293)) +#loc631 = loc("tmp68"(#loc294)) +#loc632 = loc("tmp70"(#loc295)) +#loc633 = loc("tmp71"(#loc296)) +#loc634 = loc("tmp73"(#loc297)) +#loc635 = loc("tmp74"(#loc298)) +#loc636 = loc("tmp75"(#loc299)) +#loc637 = loc("tmp76"(#loc300)) +#loc638 = loc("tmp78"(#loc301)) +#loc639 = loc("post_mod_scores"(#loc302)) +#loc640 = loc("post_mod_scores"(#loc303)) +#loc641 = loc("pT"(#loc304)) +#loc642 = loc("pT"(#loc305)) +#loc643 = loc("pT"(#loc306)) +#loc644 = loc("dpT"(#loc307)) +#loc645 = loc("dv"(#loc308)) +#loc646 = loc("dv"(#loc309)) +#loc647 = loc("dpT"(#loc310)) +#loc648 = loc("dsT"(#loc311)) +#loc649 = loc("dsT"(#loc312)) +#loc650 = loc("dsT"(#loc313)) +#loc651 = loc("grad_scores"(#loc314)) +#loc652 = loc("dsT"(#loc315)) +#loc653 = loc("dk"(#loc316)) +#loc654 = loc("dk"(#loc317)) +#loc655 = loc("offset"(#loc318)) +#loc656 = loc("qT_ptrs"(#loc319)) +#loc657 = loc("do_ptrs"(#loc320)) +#loc658 = loc(callsite(#loc226 at #loc259)) +#loc659 = loc("dv_ptrs"(#loc322)) +#loc660 = loc("dv_ptrs"(#loc323)) 
+#loc661 = loc("dk"(#loc327)) +#loc662 = loc("xindex"(#loc328)) +#loc663 = loc("xindex"(#loc329)) +#loc664 = loc("xindex"(#loc330)) +#loc665 = loc("xindex"(#loc331)) +#loc666 = loc("xindex"(#loc332)) +#loc667 = loc("xindex"(#loc333)) +#loc668 = loc("xindex"(#loc334)) +#loc669 = loc(callsite(#loc13 at #loc366)) +#loc670 = loc(callsite(#loc15 at #loc366)) +#loc671 = loc(callsite(#loc13 at #loc367)) +#loc672 = loc(callsite(#loc15 at #loc367)) +#loc673 = loc(callsite(#loc412 at #loc413)) +#loc674 = loc(callsite(#loc414 at #loc413)) +#loc675 = loc(callsite(#loc415 at #loc413)) +#loc676 = loc(callsite(#loc416 at #loc413)) +#loc677 = loc(callsite(#loc417 at #loc413)) +#loc678 = loc(callsite(#loc69 at #loc413)) +#loc679 = loc(callsite(#loc70 at #loc413)) +#loc680 = loc(callsite(#loc414 at #loc418)) +#loc681 = loc(callsite(#loc415 at #loc418)) +#loc682 = loc(callsite(#loc417 at #loc418)) +#loc683 = loc(callsite(#loc70 at #loc418)) +#loc684 = loc(callsite(#loc434 at #loc435)) +#loc685 = loc(callsite(#loc436 at #loc435)) +#loc686 = loc(callsite(#loc437 at #loc435)) +#loc687 = loc(callsite(#loc438 at #loc435)) +#loc688 = loc(callsite(#loc439 at #loc435)) +#loc689 = loc(callsite(#loc440 at #loc435)) +#loc690 = loc(callsite(#loc441 at #loc435)) +#loc691 = loc(callsite(#loc442 at #loc435)) +#loc692 = loc(callsite(#loc443 at #loc435)) +#loc693 = loc(callsite(#loc444 at #loc435)) +#loc694 = loc(callsite(#loc445 at #loc435)) +#loc695 = loc(callsite(#loc447 at #loc435)) +#loc696 = loc("offs_n2"(#loc451)) +#loc697 = loc(callsite(#loc466 at #loc435)) +#loc698 = loc(callsite(#loc467 at #loc435)) +#loc699 = loc(callsite(#loc468 at #loc435)) +#loc700 = loc(callsite(#loc504 at #loc435)) +#loc701 = loc(callsite(#loc521 at #loc435)) +#loc702 = loc(callsite(#loc434 at #loc528)) +#loc703 = loc(callsite(#loc436 at #loc528)) +#loc704 = loc(callsite(#loc437 at #loc528)) +#loc705 = loc(callsite(#loc439 at #loc528)) +#loc706 = loc(callsite(#loc440 at #loc528)) +#loc707 = loc(callsite(#loc441 at 
#loc528)) +#loc708 = loc(callsite(#loc442 at #loc528)) +#loc709 = loc(callsite(#loc445 at #loc528)) +#loc710 = loc(callsite(#loc447 at #loc528)) +#loc711 = loc(callsite(#loc466 at #loc528)) +#loc712 = loc(callsite(#loc467 at #loc528)) +#loc713 = loc(callsite(#loc468 at #loc528)) +#loc714 = loc(callsite(#loc504 at #loc528)) +#loc715 = loc(callsite(#loc521 at #loc528)) +#loc716 = loc(callsite(#loc412 at #loc535)) +#loc717 = loc(callsite(#loc414 at #loc535)) +#loc718 = loc(callsite(#loc415 at #loc535)) +#loc719 = loc(callsite(#loc416 at #loc535)) +#loc720 = loc(callsite(#loc417 at #loc535)) +#loc721 = loc(callsite(#loc69 at #loc535)) +#loc722 = loc(callsite(#loc70 at #loc535)) +#loc723 = loc(callsite(#loc415 at #loc536)) +#loc724 = loc(callsite(#loc417 at #loc536)) +#loc725 = loc(callsite(#loc70 at #loc536)) +#loc726 = loc(callsite(#loc553 at #loc214)) +#loc727 = loc(callsite(#loc554 at #loc214)) +#loc728 = loc(callsite(#loc555 at #loc214)) +#loc729 = loc(callsite(#loc556 at #loc214)) +#loc730 = loc(callsite(#loc557 at #loc214)) +#loc731 = loc(callsite(#loc558 at #loc214)) +#loc732 = loc(callsite(#loc559 at #loc214)) +#loc733 = loc(callsite(#loc560 at #loc214)) +#loc734 = loc(callsite(#loc561 at #loc214)) +#loc735 = loc(callsite(#loc562 at #loc214)) +#loc736 = loc(callsite(#loc563 at #loc214)) +#loc737 = loc(callsite(#loc564 at #loc565)) +#loc738 = loc(callsite(#loc566 at #loc565)) +#loc739 = loc(callsite(#loc567 at #loc565)) +#loc740 = loc(callsite(#loc568 at #loc565)) +#loc741 = loc(callsite(#loc569 at #loc565)) +#loc742 = loc(callsite(#loc570 at #loc565)) +#loc743 = loc("dv"(#loc571)) +#loc744 = loc(callsite(#loc572 at #loc565)) +#loc745 = loc(callsite(#loc573 at #loc565)) +#loc746 = loc(callsite(#loc574 at #loc565)) +#loc747 = loc(callsite(#loc575 at #loc565)) +#loc748 = loc(callsite(#loc576 at #loc565)) +#loc749 = loc(callsite(#loc577 at #loc565)) +#loc750 = loc(callsite(#loc578 at #loc565)) +#loc751 = loc(callsite(#loc579 at #loc565)) +#loc752 = 
loc(callsite(#loc580 at #loc565)) +#loc753 = loc(callsite(#loc581 at #loc565)) +#loc754 = loc(callsite(#loc582 at #loc565)) +#loc755 = loc(callsite(#loc583 at #loc565)) +#loc756 = loc(callsite(#loc584 at #loc565)) +#loc757 = loc(callsite(#loc585 at #loc565)) +#loc758 = loc(callsite(#loc586 at #loc565)) +#loc759 = loc(callsite(#loc587 at #loc565)) +#loc760 = loc(callsite(#loc588 at #loc565)) +#loc761 = loc(callsite(#loc589 at #loc565)) +#loc762 = loc(callsite(#loc590 at #loc565)) +#loc763 = loc(callsite(#loc591 at #loc565)) +#loc764 = loc(callsite(#loc553 at #loc259)) +#loc765 = loc(callsite(#loc554 at #loc259)) +#loc766 = loc(callsite(#loc557 at #loc259)) +#loc767 = loc(callsite(#loc558 at #loc259)) +#loc768 = loc(callsite(#loc560 at #loc259)) +#loc769 = loc(callsite(#loc563 at #loc259)) +#loc770 = loc(callsite(#loc598 at #loc565)) +#loc771 = loc("dk"(#loc599)) +#loc772 = loc(callsite(#loc614 at #loc214)) +#loc773 = loc(callsite(#loc615 at #loc214)) +#loc774 = loc(callsite(#loc616 at #loc565)) +#loc775 = loc(callsite(#loc617 at #loc565)) +#loc776 = loc(callsite(#loc618 at #loc565)) +#loc777 = loc(callsite(#loc619 at #loc565)) +#loc778 = loc(callsite(#loc620 at #loc214)) +#loc779 = loc(callsite(#loc621 at #loc214)) +#loc780 = loc(callsite(#loc622 at #loc214)) +#loc781 = loc(callsite(#loc623 at #loc565)) +#loc782 = loc(callsite(#loc624 at #loc565)) +#loc783 = loc(callsite(#loc625 at #loc565)) +#loc784 = loc(callsite(#loc626 at #loc565)) +#loc785 = loc(callsite(#loc627 at #loc565)) +#loc786 = loc(callsite(#loc628 at #loc565)) +#loc787 = loc(callsite(#loc629 at #loc565)) +#loc788 = loc(callsite(#loc630 at #loc565)) +#loc789 = loc(callsite(#loc631 at #loc565)) +#loc790 = loc(callsite(#loc632 at #loc565)) +#loc791 = loc(callsite(#loc633 at #loc565)) +#loc792 = loc(callsite(#loc634 at #loc565)) +#loc793 = loc(callsite(#loc635 at #loc565)) +#loc794 = loc(callsite(#loc636 at #loc565)) +#loc795 = loc(callsite(#loc637 at #loc565)) +#loc796 = loc(callsite(#loc638 at #loc565)) 
+#loc797 = loc(callsite(#loc639 at #loc565)) +#loc798 = loc(callsite(#loc640 at #loc565)) +#loc799 = loc(callsite(#loc641 at #loc565)) +#loc800 = loc(callsite(#loc642 at #loc565)) +#loc801 = loc(callsite(#loc643 at #loc565)) +#loc802 = loc(callsite(#loc644 at #loc565)) +#loc803 = loc(callsite(#loc645 at #loc565)) +#loc804 = loc(callsite(#loc646 at #loc565)) +#loc805 = loc(callsite(#loc647 at #loc565)) +#loc806 = loc(callsite(#loc648 at #loc565)) +#loc807 = loc(callsite(#loc649 at #loc565)) +#loc808 = loc(callsite(#loc650 at #loc565)) +#loc809 = loc(callsite(#loc651 at #loc565)) +#loc810 = loc(callsite(#loc652 at #loc565)) +#loc811 = loc(callsite(#loc653 at #loc565)) +#loc812 = loc(callsite(#loc654 at #loc565)) +#loc813 = loc(callsite(#loc655 at #loc214)) +#loc814 = loc(callsite(#loc656 at #loc214)) +#loc815 = loc(callsite(#loc657 at #loc214)) +#loc816 = loc(callsite(#loc614 at #loc259)) +#loc817 = loc(callsite(#loc556 at #loc259)) +#loc818 = loc(callsite(#loc615 at #loc259)) +#loc819 = loc(callsite(#loc559 at #loc259)) +#loc820 = loc(callsite(#loc564 at #loc658)) +#loc821 = loc(callsite(#loc618 at #loc658)) +#loc822 = loc(callsite(#loc591 at #loc658)) +#loc823 = loc(callsite(#loc619 at #loc658)) +#loc824 = loc(callsite(#loc566 at #loc658)) +#loc825 = loc(callsite(#loc616 at #loc658)) +#loc826 = loc(callsite(#loc617 at #loc658)) +#loc827 = loc(callsite(#loc620 at #loc259)) +#loc828 = loc(callsite(#loc621 at #loc259)) +#loc829 = loc(callsite(#loc622 at #loc259)) +#loc830 = loc(callsite(#loc623 at #loc658)) +#loc831 = loc(callsite(#loc624 at #loc658)) +#loc832 = loc(callsite(#loc625 at #loc658)) +#loc833 = loc(callsite(#loc598 at #loc658)) +#loc834 = loc(callsite(#loc626 at #loc658)) +#loc835 = loc(callsite(#loc628 at #loc658)) +#loc836 = loc(callsite(#loc640 at #loc658)) +#loc837 = loc(callsite(#loc641 at #loc658)) +#loc838 = loc(callsite(#loc642 at #loc658)) +#loc839 = loc(callsite(#loc643 at #loc658)) +#loc840 = loc(callsite(#loc644 at #loc658)) +#loc841 = 
loc(callsite(#loc645 at #loc658)) +#loc842 = loc(callsite(#loc646 at #loc658)) +#loc843 = loc(callsite(#loc647 at #loc658)) +#loc844 = loc(callsite(#loc648 at #loc658)) +#loc845 = loc(callsite(#loc649 at #loc658)) +#loc846 = loc(callsite(#loc650 at #loc658)) +#loc847 = loc(callsite(#loc651 at #loc658)) +#loc848 = loc(callsite(#loc653 at #loc658)) +#loc849 = loc(callsite(#loc654 at #loc658)) +#loc850 = loc(callsite(#loc655 at #loc259)) +#loc851 = loc(callsite(#loc656 at #loc259)) +#loc852 = loc(callsite(#loc657 at #loc259)) +#loc853 = loc(callsite(#loc13 at #loc692)) +#loc854 = loc(callsite(#loc15 at #loc692)) +#loc855 = loc(callsite(#loc446 at #loc695)) +#loc856 = loc(callsite(#loc448 at #loc695)) +#loc857 = loc(callsite(#loc449 at #loc695)) +#loc858 = loc(callsite(#loc450 at #loc695)) +#loc859 = loc("kT_ptrs"(#loc696)) +#loc860 = loc(callsite(#loc452 at #loc695)) +#loc861 = loc(callsite(#loc453 at #loc695)) +#loc862 = loc(callsite(#loc454 at #loc695)) +#loc863 = loc(callsite(#loc455 at #loc695)) +#loc864 = loc(callsite(#loc456 at #loc695)) +#loc865 = loc(callsite(#loc457 at #loc695)) +#loc866 = loc(callsite(#loc458 at #loc695)) +#loc867 = loc(callsite(#loc459 at #loc695)) +#loc868 = loc(callsite(#loc460 at #loc695)) +#loc869 = loc(callsite(#loc461 at #loc695)) +#loc870 = loc(callsite(#loc462 at #loc695)) +#loc871 = loc(callsite(#loc463 at #loc695)) +#loc872 = loc(callsite(#loc464 at #loc695)) +#loc873 = loc(callsite(#loc465 at #loc695)) +#loc874 = loc(callsite(#loc469 at #loc695)) +#loc875 = loc(callsite(#loc470 at #loc695)) +#loc876 = loc(callsite(#loc471 at #loc695)) +#loc877 = loc(callsite(#loc472 at #loc695)) +#loc878 = loc(callsite(#loc473 at #loc695)) +#loc879 = loc(callsite(#loc474 at #loc695)) +#loc880 = loc(callsite(#loc475 at #loc695)) +#loc881 = loc(callsite(#loc476 at #loc695)) +#loc882 = loc(callsite(#loc477 at #loc695)) +#loc883 = loc(callsite(#loc478 at #loc695)) +#loc884 = loc(callsite(#loc479 at #loc695)) +#loc885 = loc(callsite(#loc480 at 
#loc695)) +#loc886 = loc(callsite(#loc481 at #loc695)) +#loc887 = loc(callsite(#loc482 at #loc695)) +#loc888 = loc(callsite(#loc483 at #loc695)) +#loc889 = loc(callsite(#loc484 at #loc695)) +#loc890 = loc(callsite(#loc485 at #loc695)) +#loc891 = loc(callsite(#loc486 at #loc695)) +#loc892 = loc(callsite(#loc487 at #loc695)) +#loc893 = loc(callsite(#loc488 at #loc695)) +#loc894 = loc(callsite(#loc489 at #loc695)) +#loc895 = loc(callsite(#loc490 at #loc695)) +#loc896 = loc(callsite(#loc491 at #loc695)) +#loc897 = loc(callsite(#loc492 at #loc695)) +#loc898 = loc(callsite(#loc493 at #loc695)) +#loc899 = loc(callsite(#loc494 at #loc695)) +#loc900 = loc(callsite(#loc495 at #loc695)) +#loc901 = loc(callsite(#loc496 at #loc695)) +#loc902 = loc(callsite(#loc497 at #loc695)) +#loc903 = loc(callsite(#loc498 at #loc695)) +#loc904 = loc(callsite(#loc499 at #loc695)) +#loc905 = loc(callsite(#loc500 at #loc695)) +#loc906 = loc(callsite(#loc501 at #loc695)) +#loc907 = loc(callsite(#loc502 at #loc695)) +#loc908 = loc(callsite(#loc503 at #loc700)) +#loc909 = loc(callsite(#loc505 at #loc700)) +#loc910 = loc(callsite(#loc506 at #loc700)) +#loc911 = loc(callsite(#loc507 at #loc700)) +#loc912 = loc(callsite(#loc508 at #loc700)) +#loc913 = loc(callsite(#loc509 at #loc700)) +#loc914 = loc(callsite(#loc510 at #loc700)) +#loc915 = loc(callsite(#loc511 at #loc700)) +#loc916 = loc(callsite(#loc512 at #loc700)) +#loc917 = loc(callsite(#loc513 at #loc700)) +#loc918 = loc(callsite(#loc514 at #loc700)) +#loc919 = loc(callsite(#loc515 at #loc700)) +#loc920 = loc(callsite(#loc516 at #loc700)) +#loc921 = loc(callsite(#loc517 at #loc700)) +#loc922 = loc(callsite(#loc518 at #loc700)) +#loc923 = loc(callsite(#loc519 at #loc700)) +#loc924 = loc(callsite(#loc520 at #loc700)) +#loc925 = loc(callsite(#loc446 at #loc710)) +#loc926 = loc(callsite(#loc465 at #loc710)) +#loc927 = loc(callsite(#loc469 at #loc710)) +#loc928 = loc(callsite(#loc470 at #loc710)) +#loc929 = loc(callsite(#loc471 at #loc710)) +#loc930 
= loc(callsite(#loc473 at #loc710)) +#loc931 = loc(callsite(#loc495 at #loc710)) +#loc932 = loc(callsite(#loc462 at #loc710)) +#loc933 = loc(callsite(#loc496 at #loc710)) +#loc934 = loc(callsite(#loc497 at #loc710)) +#loc935 = loc(callsite(#loc464 at #loc710)) +#loc936 = loc(callsite(#loc498 at #loc710)) +#loc937 = loc(callsite(#loc499 at #loc710)) +#loc938 = loc(callsite(#loc501 at #loc710)) +#loc939 = loc(callsite(#loc502 at #loc710)) +#loc940 = loc(callsite(#loc503 at #loc714)) +#loc941 = loc(callsite(#loc505 at #loc714)) +#loc942 = loc(callsite(#loc506 at #loc714)) +#loc943 = loc(callsite(#loc507 at #loc714)) +#loc944 = loc(callsite(#loc508 at #loc714)) +#loc945 = loc(callsite(#loc509 at #loc714)) +#loc946 = loc(callsite(#loc510 at #loc714)) +#loc947 = loc(callsite(#loc511 at #loc714)) +#loc948 = loc(callsite(#loc512 at #loc714)) +#loc949 = loc(callsite(#loc513 at #loc714)) +#loc950 = loc(callsite(#loc514 at #loc714)) +#loc951 = loc(callsite(#loc515 at #loc714)) +#loc952 = loc(callsite(#loc516 at #loc714)) +#loc953 = loc(callsite(#loc517 at #loc714)) +#loc954 = loc(callsite(#loc518 at #loc714)) +#loc955 = loc(callsite(#loc519 at #loc714)) +#loc956 = loc(callsite(#loc520 at #loc714)) +#loc957 = loc(callsite(#loc13 at #loc734)) +#loc958 = loc(callsite(#loc15 at #loc734)) +#loc959 = loc(callsite(#loc99 at #loc737)) +#loc960 = loc(callsite(#loc102 at #loc739)) +#loc961 = loc("offs_m1"(#loc743)) +#loc962 = loc(callsite(#loc69 at #loc763)) +#loc963 = loc(callsite(#loc120 at #loc737)) +#loc964 = loc(callsite(#loc70 at #loc763)) +#loc965 = loc(callsite(#loc125 at #loc737)) +#loc966 = loc(callsite(#loc285 at #loc763)) +#loc967 = loc(callsite(#loc102 at #loc785)) +#loc968 = loc(callsite(#loc503 at #loc813)) +#loc969 = loc(callsite(#loc505 at #loc813)) +#loc970 = loc(callsite(#loc506 at #loc813)) +#loc971 = loc(callsite(#loc507 at #loc813)) +#loc972 = loc(callsite(#loc508 at #loc813)) +#loc973 = loc(callsite(#loc509 at #loc813)) +#loc974 = loc(callsite(#loc510 at 
#loc813)) +#loc975 = loc(callsite(#loc511 at #loc813)) +#loc976 = loc(callsite(#loc512 at #loc813)) +#loc977 = loc(callsite(#loc513 at #loc813)) +#loc978 = loc(callsite(#loc514 at #loc813)) +#loc979 = loc(callsite(#loc515 at #loc813)) +#loc980 = loc(callsite(#loc516 at #loc813)) +#loc981 = loc(callsite(#loc517 at #loc813)) +#loc982 = loc(callsite(#loc518 at #loc813)) +#loc983 = loc(callsite(#loc519 at #loc813)) +#loc984 = loc(callsite(#loc520 at #loc813)) +#loc985 = loc(callsite(#loc120 at #loc820)) +#loc986 = loc(callsite(#loc70 at #loc822)) +#loc987 = loc(callsite(#loc99 at #loc820)) +#loc988 = loc(callsite(#loc69 at #loc822)) +#loc989 = loc(callsite(#loc125 at #loc820)) +#loc990 = loc(callsite(#loc285 at #loc822)) +#loc991 = loc(callsite(#loc503 at #loc850)) +#loc992 = loc(callsite(#loc505 at #loc850)) +#loc993 = loc(callsite(#loc506 at #loc850)) +#loc994 = loc(callsite(#loc507 at #loc850)) +#loc995 = loc(callsite(#loc508 at #loc850)) +#loc996 = loc(callsite(#loc509 at #loc850)) +#loc997 = loc(callsite(#loc510 at #loc850)) +#loc998 = loc(callsite(#loc511 at #loc850)) +#loc999 = loc(callsite(#loc512 at #loc850)) +#loc1000 = loc(callsite(#loc513 at #loc850)) +#loc1001 = loc(callsite(#loc514 at #loc850)) +#loc1002 = loc(callsite(#loc515 at #loc850)) +#loc1003 = loc(callsite(#loc516 at #loc850)) +#loc1004 = loc(callsite(#loc517 at #loc850)) +#loc1005 = loc(callsite(#loc518 at #loc850)) +#loc1006 = loc(callsite(#loc519 at #loc850)) +#loc1007 = loc(callsite(#loc520 at #loc850)) +#loc1008 = loc(callsite(#loc99 at #loc855)) +#loc1009 = loc(callsite(#loc102 at #loc856)) +#loc1010 = loc("vT_ptrs"(#loc859)) +#loc1011 = loc(callsite(#loc120 at #loc855)) +#loc1012 = loc(callsite(#loc120 at #loc873)) +#loc1013 = loc(callsite(#loc125 at #loc855)) +#loc1014 = loc(callsite(#loc102 at #loc877)) +#loc1015 = loc(callsite(#loc120 at #loc925)) +#loc1016 = loc(callsite(#loc120 at #loc926)) +#loc1017 = loc(callsite(#loc99 at #loc925)) +#loc1018 = loc(callsite(#loc125 at #loc925)) 
+#loc1019 = loc("qT_ptrs"(#loc961)) +#loc1020 = loc(callsite(#loc1010 at #loc435)) +#loc1021 = loc(callsite(#loc1010 at #loc528)) +#loc1022 = loc("do_ptrs"(#loc1019)) +#loc1023 = loc(callsite(#loc1022 at #loc214)) +#loc1024 = loc(callsite(#loc1022 at #loc259)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/ISV24HMA4J4LKKVO22GYN5K43WAJJLKWYPR3LOEREI3KLJOVDZ5Q/triton_tem_fused_zeros_1.ttir b/SpecForge-ext/cache/compiled_kernels/triton/7/ISV24HMA4J4LKKVO22GYN5K43WAJJLKWYPR3LOEREI3KLJOVDZ5Q/triton_tem_fused_zeros_1.ttir new file mode 100644 index 0000000000000000000000000000000000000000..8b8329184a32a76ee46966dcb0ecfe4218579515 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/ISV24HMA4J4LKKVO22GYN5K43WAJJLKWYPR3LOEREI3KLJOVDZ5Q/triton_tem_fused_zeros_1.ttir @@ -0,0 +1,1667 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":18:0) +#loc341 = loc("arg_Q"(#loc)) +#loc342 = loc("arg_K"(#loc)) +#loc343 = loc("arg_V"(#loc)) +#loc344 = loc("arg_LSE"(#loc)) +#loc345 = loc("arg_DELTA"(#loc)) +#loc346 = loc("arg_DO"(#loc)) +#loc347 = loc("arg_DQ"(#loc)) +#loc348 = loc("arg_DV"(#loc)) +#loc349 = loc("arg_KV_NUM_BLKS"(#loc)) +#loc350 = loc("arg_KV_IDX"(#loc)) +#loc351 = loc("arg_Q_NUM_BLKS"(#loc)) +#loc352 = loc("arg_Q_IDX"(#loc)) +#loc353 = loc("arg_FULL_KV_NUM_BLKS"(#loc)) +#loc354 = loc("arg_FULL_KV_IDX"(#loc)) +#loc355 = loc("arg_FULL_Q_NUM_BLKS"(#loc)) +#loc356 = loc("arg_FULL_Q_IDX"(#loc)) +#loc357 = loc("in_ptr16"(#loc)) +#loc358 = loc("out_ptr0"(#loc)) +#loc359 = loc("ks0"(#loc)) +#loc360 = loc("ks1"(#loc)) +#loc361 = loc("ks2"(#loc)) +#loc362 = loc("ks3"(#loc)) +#loc363 = loc("ks4"(#loc)) +#loc364 = loc("ks5"(#loc)) +#loc365 = loc("ks6"(#loc)) +#loc366 = loc("ks7"(#loc)) +#loc367 = loc("ks8"(#loc)) +module { + tt.func public @triton_tem_fused_zeros_1(%arg_Q: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_Q"(#loc)), %arg_K: !tt.ptr {tt.divisibility = 16 : i32} 
loc("arg_K"(#loc)), %arg_V: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_V"(#loc)), %arg_LSE: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_LSE"(#loc)), %arg_DELTA: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_DELTA"(#loc)), %arg_DO: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_DO"(#loc)), %arg_DQ: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_DQ"(#loc)), %arg_DV: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_DV"(#loc)), %arg_KV_NUM_BLKS: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_KV_NUM_BLKS"(#loc)), %arg_KV_IDX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_KV_IDX"(#loc)), %arg_Q_NUM_BLKS: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_Q_NUM_BLKS"(#loc)), %arg_Q_IDX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_Q_IDX"(#loc)), %arg_FULL_KV_NUM_BLKS: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_FULL_KV_NUM_BLKS"(#loc)), %arg_FULL_KV_IDX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_FULL_KV_IDX"(#loc)), %arg_FULL_Q_NUM_BLKS: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_FULL_Q_NUM_BLKS"(#loc)), %arg_FULL_Q_IDX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_FULL_Q_IDX"(#loc)), %in_ptr16: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr16"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i32 loc("ks0"(#loc)), %ks1: i32 loc("ks1"(#loc)), %ks2: i32 loc("ks2"(#loc)), %ks3: i32 loc("ks3"(#loc)), %ks4: i32 loc("ks4"(#loc)), %ks5: i32 loc("ks5"(#loc)), %ks6: i32 loc("ks6"(#loc)), %ks7: i32 loc("ks7"(#loc)), %ks8: i32 loc("ks8"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<128> : tensor<64x1xi32> loc(#loc1) + %cst_0 = arith.constant dense<4096> : tensor<1x64xi32> loc(#loc1) + %cst_1 = arith.constant dense<128> : tensor<1x64xi32> loc(#loc1) + %cst_2 = arith.constant dense<0.000000e+00> : tensor<64x128xbf16> loc(#loc1) + %cst_3 = arith.constant dense<0> : tensor<128x1xi32> loc(#loc1) + %cst_4 = arith.constant dense<0.000000e+00> : tensor<64xf32> loc(#loc1) + %cst_5 = arith.constant dense<0xFF800000> : 
tensor<64xf32> loc(#loc1) + %c2_i32 = arith.constant 2 : i32 loc(#loc1) + %c64_i32 = arith.constant 64 : i32 loc(#loc1) + %c63_i32 = arith.constant 63 : i32 loc(#loc1) + %cst_6 = arith.constant dense<1.44269502> : tensor<128x64xf32> loc(#loc1) + %cst_7 = arith.constant dense<0> : tensor<128x64xi32> loc(#loc1) + %cst_8 = arith.constant dense<0> : tensor<1x64xi32> loc(#loc1) + %cst_9 = arith.constant dense<0xFF800000> : tensor<128x64xf32> loc(#loc1) + %cst_10 = arith.constant dense<0.0883883461> : tensor<128x64xf32> loc(#loc1) + %cst_11 = arith.constant dense<0.000000e+00> : tensor<128x64xf32> loc(#loc1) + %cst_12 = arith.constant dense<0.000000e+00> : tensor<128x64xbf16> loc(#loc1) + %cst_13 = arith.constant dense<0.000000e+00> : tensor<128x128xbf16> loc(#loc1) + %cst_14 = arith.constant dense<0.000000e+00> : tensor<128x128xf32> loc(#loc1) + %c127_i32 = arith.constant 127 : i32 loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %cst_15 = arith.constant dense<128> : tensor<128x1xi32> loc(#loc1) + %cst_16 = arith.constant dense<128> : tensor<1x128xi32> loc(#loc1) + %cst_17 = arith.constant dense<0.0883883461> : tensor<128x128xf32> loc(#loc1) + %cst_18 = arith.constant dense<4096> : tensor<128x1xi32> loc(#loc1) + %cst_19 = arith.constant dense<0.000000e+00> : tensor<128xf32> loc(#loc1) + %cst_20 = arith.constant dense<0xFF800000> : tensor<128xf32> loc(#loc1) + %c4_i32 = arith.constant 4 : i32 loc(#loc1) + %HQ = arith.constant 32 : i32 loc(#loc368) + %c8_i32 = arith.constant 8 : i32 loc(#loc1) + %c1024_i32 = arith.constant 1024 : i32 loc(#loc1) + %c1_i32 = arith.constant 1 : i32 loc(#loc1) + %c128_i32 = arith.constant 128 : i32 loc(#loc1) + %c4096_i32 = arith.constant 4096 : i32 loc(#loc1) + %0 = arith.muli %ks0, %c4096_i32 : i32 loc(#loc3) + %1 = arith.muli %ks1, %c1024_i32 : i32 loc(#loc4) + %2 = arith.muli %ks1, %c128_i32 : i32 loc(#loc5) + %3 = arith.cmpi sle, %ks0, %c1_i32 : i32 loc(#loc6) + %4 = arith.extui %3 : i1 to i32 loc(#loc7) + %5 = arith.cmpi sgt, 
%ks0, %c1_i32 : i32 loc(#loc8) + %6 = arith.extui %5 : i1 to i32 loc(#loc9) + %7 = arith.muli %ks0, %6 : i32 loc(#loc9) + %8 = arith.addi %4, %7 : i32 loc(#loc10) + %9 = arith.muli %8, %c4096_i32 : i32 loc(#loc11) + %10 = arith.muli %8, %c128_i32 : i32 loc(#loc12) + %pid = tt.get_program_id x : i32 loc(#loc369) + %NUM_KV_BLOCKS = arith.addi %ks1, %c127_i32 : i32 loc(#loc675) + %NUM_KV_BLOCKS_21 = arith.divsi %NUM_KV_BLOCKS, %c128_i32 : i32 loc(#loc676) + %NUM_Q_BLOCKS = arith.addi %ks0, %c127_i32 : i32 loc(#loc677) + %NUM_Q_BLOCKS_22 = arith.divsi %NUM_Q_BLOCKS, %c128_i32 : i32 loc(#loc678) + %off_zq = tt.get_program_id y : i32 loc(#loc372) + %off_hkv = tt.get_program_id z : i32 loc(#loc373) + %off_zkv = arith.remsi %off_zq, %c8_i32 : i32 loc(#loc374) + %k_adj = arith.muli %2, %off_hkv : i32 loc(#loc375) + %k_adj_23 = arith.muli %1, %off_zkv : i32 loc(#loc376) + %k_adj_24 = arith.addi %k_adj, %k_adj_23 : i32 loc(#loc377) + %k_adj_25 = arith.extsi %k_adj_24 : i32 to i64 loc(#loc378) + %dv_adj = arith.muli %1, %off_zq : i32 loc(#loc379) + %dv_adj_26 = arith.addi %k_adj, %dv_adj : i32 loc(#loc380) + %dv_adj_27 = arith.extsi %dv_adj_26 : i32 to i64 loc(#loc381) + %K = tt.addptr %arg_K, %k_adj_25 : !tt.ptr, i64 loc(#loc382) + %V = tt.addptr %arg_V, %k_adj_25 : !tt.ptr, i64 loc(#loc383) + %DV = tt.addptr %arg_DV, %dv_adj_27 : !tt.ptr, i64 loc(#loc384) + %offs_k = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc385) + %11 = arith.cmpi sge, %pid, %NUM_KV_BLOCKS_21 : i32 loc(#loc32) + scf.if %11 { + %off_pid = arith.subi %pid, %NUM_KV_BLOCKS_21 : i32 loc(#loc386) + %off_hq2 = arith.divsi %off_pid, %NUM_Q_BLOCKS_22 : i32 loc(#loc387) + %off_hq2_28 = arith.muli %off_hkv, %c4_i32 : i32 loc(#loc388) + %off_hq2_29 = arith.addi %off_hq2, %off_hq2_28 : i32 loc(#loc389) + %start_m2_block = arith.remsi %off_pid, %NUM_Q_BLOCKS_22 : i32 loc(#loc390) + %stride_kv_idx_h = arith.muli %ks3, %ks4 : i32 loc(#loc391) + %sparse_kv_num_blks_offset = arith.muli 
%off_zkv, %ks2 : i32 loc(#loc392) + %sparse_kv_num_blks_offset_30 = arith.addi %sparse_kv_num_blks_offset, %start_m2_block : i32 loc(#loc393) + %sparse_kv_idx_offset = arith.muli %off_zkv, %stride_kv_idx_h : i32 loc(#loc394) + %sparse_kv_idx_offset_31 = arith.muli %start_m2_block, %ks4 : i32 loc(#loc395) + %sparse_kv_idx_offset_32 = arith.addi %sparse_kv_idx_offset, %sparse_kv_idx_offset_31 : i32 loc(#loc396) + %q_adj2 = arith.muli %off_hq2_29, %c128_i32 : i32 loc(#loc397) + %q_adj2_33 = arith.muli %0, %off_zq : i32 loc(#loc398) + %q_adj2_34 = arith.addi %q_adj2, %q_adj2_33 : i32 loc(#loc399) + %q_adj2_35 = arith.extsi %q_adj2_34 : i32 to i64 loc(#loc400) + %do_adj2 = arith.muli %10, %off_hq2_29 : i32 loc(#loc401) + %do_adj2_36 = arith.muli %9, %off_zq : i32 loc(#loc402) + %do_adj2_37 = arith.addi %do_adj2, %do_adj2_36 : i32 loc(#loc403) + %do_adj2_38 = arith.extsi %do_adj2_37 : i32 to i64 loc(#loc404) + %off_chz2 = arith.muli %off_zq, %HQ : i32 loc(#loc405) + %off_chz2_39 = arith.addi %off_chz2, %off_hq2_29 : i32 loc(#loc406) + %off_chz2_40 = arith.muli %off_chz2_39, %ks0 : i32 loc(#loc407) + %off_chz2_41 = arith.extsi %off_chz2_40 : i32 to i64 loc(#loc408) + %Q2 = tt.addptr %arg_Q, %q_adj2_35 : !tt.ptr, i64 loc(#loc409) + %DO2 = tt.addptr %arg_DO, %do_adj2_38 : !tt.ptr, i64 loc(#loc410) + %DQ2 = tt.addptr %arg_DQ, %q_adj2_35 : !tt.ptr, i64 loc(#loc411) + %LSE2 = tt.addptr %arg_LSE, %off_chz2_41 : !tt.ptr, i64 loc(#loc412) + %DELTA2 = tt.addptr %arg_DELTA, %off_chz2_41 : !tt.ptr, i64 loc(#loc413) + %start_m2 = arith.muli %start_m2_block, %c128_i32 : i32 loc(#loc414) + %offs_m2 = tt.splat %start_m2 : i32 -> tensor<128xi32> loc(#loc415) + %offs_m2_42 = arith.addi %offs_m2, %offs_k : tensor<128xi32> loc(#loc415) + %ptr = tt.expand_dims %offs_m2_42 {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc679) + %ptr_43 = arith.muli %ptr, %cst_18 : tensor<128x1xi32> loc(#loc680) + %ptr_44 = tt.splat %Q2 : !tt.ptr -> tensor<128x1x!tt.ptr> loc(#loc681) + %ptr_45 = 
tt.addptr %ptr_44, %ptr_43 : tensor<128x1x!tt.ptr>, tensor<128x1xi32> loc(#loc681) + %ptr_46 = tt.expand_dims %offs_k {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc682) + %ptr_47 = tt.broadcast %ptr_45 : tensor<128x1x!tt.ptr> -> tensor<128x128x!tt.ptr> loc(#loc683) + %ptr_48 = tt.broadcast %ptr_46 : tensor<1x128xi32> -> tensor<128x128xi32> loc(#loc683) + %ptr_49 = tt.addptr %ptr_47, %ptr_48 : tensor<128x128x!tt.ptr>, tensor<128x128xi32> loc(#loc683) + %q = tt.splat %ks0 : i32 -> tensor<128x1xi32> loc(#loc684) + %q_50 = arith.cmpi slt, %ptr, %q : tensor<128x1xi32> loc(#loc684) + %q_51 = tt.broadcast %q_50 : tensor<128x1xi1> -> tensor<128x128xi1> loc(#loc685) + %q_52 = tt.load %ptr_49, %q_51, %cst_13 : tensor<128x128x!tt.ptr> loc(#loc685) + %ptr_53 = arith.muli %ptr, %cst_15 : tensor<128x1xi32> loc(#loc686) + %ptr_54 = tt.splat %DO2 : !tt.ptr -> tensor<128x1x!tt.ptr> loc(#loc687) + %ptr_55 = tt.addptr %ptr_54, %ptr_53 : tensor<128x1x!tt.ptr>, tensor<128x1xi32> loc(#loc687) + %ptr_56 = tt.broadcast %ptr_55 : tensor<128x1x!tt.ptr> -> tensor<128x128x!tt.ptr> loc(#loc688) + %ptr_57 = tt.addptr %ptr_56, %ptr_48 : tensor<128x128x!tt.ptr>, tensor<128x128xi32> loc(#loc688) + %do = tt.load %ptr_57, %q_51, %cst_13 : tensor<128x128x!tt.ptr> loc(#loc689) + %Di = tt.splat %ks0 : i32 -> tensor<128xi32> loc(#loc423) + %Di_58 = arith.cmpi slt, %offs_m2_42, %Di : tensor<128xi32> loc(#loc423) + %Di_59 = tt.splat %DELTA2 : !tt.ptr -> tensor<128x!tt.ptr> loc(#loc424) + %Di_60 = tt.addptr %Di_59, %offs_m2_42 : tensor<128x!tt.ptr>, tensor<128xi32> loc(#loc424) + %Di_61 = tt.load %Di_60, %Di_58 : tensor<128x!tt.ptr> loc(#loc425) + %lse = tt.splat %LSE2 : !tt.ptr -> tensor<128x!tt.ptr> loc(#loc426) + %lse_62 = tt.addptr %lse, %offs_m2_42 : tensor<128x!tt.ptr>, tensor<128xi32> loc(#loc426) + %lse_63 = tt.load %lse_62, %Di_58 : tensor<128x!tt.ptr> loc(#loc427) + %lse_64 = arith.cmpf oeq, %lse_63, %cst_20 : tensor<128xf32> loc(#loc428) + %lse_65 = arith.select %lse_64, 
%cst_19, %lse_63 : tensor<128xi1>, tensor<128xf32> loc(#loc429) + %lse_66 = tt.expand_dims %lse_65 {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc430) + %kv_indices = tt.addptr %arg_KV_IDX, %sparse_kv_idx_offset_32 : !tt.ptr, i32 loc(#loc431) + %kv_start = tt.load %kv_indices : !tt.ptr loc(#loc432) + %kv_start_67 = arith.muli %kv_start, %c128_i32 : i32 loc(#loc433) + %sparse_kv_num_blocks = tt.addptr %arg_KV_NUM_BLKS, %sparse_kv_num_blks_offset_30 : !tt.ptr, i32 loc(#loc434) + %sparse_kv_num_blocks_68 = tt.load %sparse_kv_num_blocks : !tt.ptr loc(#loc435) + %offs_n2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc436) + %offs_n2_69 = tt.splat %kv_start_67 : i32 -> tensor<64xi32> loc(#loc437) + %offs_n2_70 = arith.addi %offs_n2_69, %offs_n2 : tensor<64xi32> loc(#loc437) + %kT_ptrs = tt.expand_dims %offs_n2_70 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc690) + %kT_ptrs_71 = arith.muli %kT_ptrs, %cst_1 : tensor<1x64xi32> loc(#loc691) + %kT_ptrs_72 = tt.splat %K : !tt.ptr -> tensor<1x64x!tt.ptr> loc(#loc692) + %kT_ptrs_73 = tt.addptr %kT_ptrs_72, %kT_ptrs_71 : tensor<1x64x!tt.ptr>, tensor<1x64xi32> loc(#loc692) + %kT_ptrs_74 = tt.expand_dims %offs_k {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc693) + %kT_ptrs_75 = tt.broadcast %kT_ptrs_73 : tensor<1x64x!tt.ptr> -> tensor<128x64x!tt.ptr> loc(#loc694) + %kT_ptrs_76 = tt.broadcast %kT_ptrs_74 : tensor<128x1xi32> -> tensor<128x64xi32> loc(#loc694) + %kT_ptrs_77 = tt.addptr %kT_ptrs_75, %kT_ptrs_76 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc694) + %vT_ptrs = tt.splat %V : !tt.ptr -> tensor<1x64x!tt.ptr> loc(#loc695) + %vT_ptrs_78 = tt.addptr %vT_ptrs, %kT_ptrs_71 : tensor<1x64x!tt.ptr>, tensor<1x64xi32> loc(#loc695) + %vT_ptrs_79 = tt.broadcast %vT_ptrs_78 : tensor<1x64x!tt.ptr> -> tensor<128x64x!tt.ptr> loc(#loc696) + %vT_ptrs_80 = tt.addptr %vT_ptrs_79, %kT_ptrs_76 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc696) + %hi = 
arith.muli %sparse_kv_num_blocks_68, %c2_i32 : i32 loc(#loc697) + %hi_81 = arith.addi %ks1, %c63_i32 : i32 loc(#loc861) + %hi_82 = arith.divsi %hi_81, %c64_i32 : i32 loc(#loc862) + %hi_83 = arith.maxsi %hi_82, %c1_i32 : i32 loc(#loc699) + %hi_84 = arith.minsi %hi, %hi_83 : i32 loc(#loc700) + %vT_ptrs_85:4 = scf.for %start_n = %c0_i32 to %hi_84 step %c1_i32 iter_args(%dq_107 = %cst_14, %offs_n2_108 = %offs_n2_70, %kT_ptrs_109 = %kT_ptrs_77, %vT_ptrs_110 = %vT_ptrs_80) -> (tensor<128x128xf32>, tensor<64xi32>, tensor<128x64x!tt.ptr>, tensor<128x64x!tt.ptr>) : i32 { + %kT = tt.expand_dims %offs_n2_108 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc1018) + %kT_111 = tt.splat %ks1 : i32 -> tensor<1x64xi32> loc(#loc1019) + %kT_112 = arith.cmpi slt, %kT, %kT_111 : tensor<1x64xi32> loc(#loc1019) + %kT_113 = tt.broadcast %kT_112 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc1020) + %kT_114 = tt.load %kT_ptrs_109, %kT_113, %cst_12 : tensor<128x64x!tt.ptr> loc(#loc1020) + %qk = tt.dot %q_52, %kT_114, %cst_11, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc865) + %qk_115 = arith.mulf %qk, %cst_10 : tensor<128x64xf32> loc(#loc866) + %n = arith.remsi %kT, %kT_111 : tensor<1x64xi32> loc(#loc1021) + %m = arith.remsi %ptr, %q : tensor<128x1xi32> loc(#loc1022) + %post_mod_scores = arith.select %kT_113, %qk_115, %cst_9 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc869) + %tmp4 = tt.broadcast %m : tensor<128x1xi32> -> tensor<128x64xi32> loc(#loc870) + %tmp4_116 = tt.broadcast %n : tensor<1x64xi32> -> tensor<128x64xi32> loc(#loc870) + %tmp4_117 = arith.cmpi sge, %tmp4, %tmp4_116 : tensor<128x64xi32> loc(#loc870) + %tmp5 = arith.extsi %n : tensor<1x64xi32> to tensor<1x64xi64> loc(#loc871) + %tmp7 = tt.addptr %in_ptr16, %off_zq : !tt.ptr, i32 loc(#loc872) + %tmp7_118 = tt.load %tmp7 : !tt.ptr loc(#loc873) + %tmp8 = tt.splat %tmp7_118 : i64 -> tensor<1x64xi64> loc(#loc874) + %tmp8_119 = arith.cmpi slt, %tmp5, %tmp8 : 
tensor<1x64xi64> loc(#loc874) + %tmp9 = arith.extsi %m : tensor<128x1xi32> to tensor<128x1xi64> loc(#loc875) + %tmp10 = tt.splat %tmp7_118 : i64 -> tensor<128x1xi64> loc(#loc876) + %tmp10_120 = arith.cmpi slt, %tmp9, %tmp10 : tensor<128x1xi64> loc(#loc876) + %tmp11 = tt.broadcast %tmp8_119 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc877) + %tmp11_121 = tt.broadcast %tmp10_120 : tensor<128x1xi1> -> tensor<128x64xi1> loc(#loc877) + %tmp11_122 = arith.andi %tmp11, %tmp11_121 : tensor<128x64xi1> loc(#loc877) + %tmp12 = arith.andi %tmp4_117, %tmp11_122 : tensor<128x64xi1> loc(#loc878) + %tmp15 = tt.splat %ks8 : i32 -> tensor<1x64xi32> loc(#loc879) + %tmp15_123 = arith.cmpi sge, %n, %tmp15 : tensor<1x64xi32> loc(#loc879) + %tmp16 = arith.remsi %n, %tmp15 : tensor<1x64xi32> loc(#loc880) + %tmp18 = arith.cmpi ne, %tmp16, %cst_8 : tensor<1x64xi32> loc(#loc881) + %tmp19 = arith.cmpi slt, %tmp16, %cst_8 : tensor<1x64xi32> loc(#loc882) + %tmp20 = arith.cmpi slt, %ks8, %c0_i32 : i32 loc(#loc883) + %tmp21 = tt.splat %tmp20 : i1 -> tensor<1x64xi1> loc(#loc884) + %tmp21_124 = arith.cmpi ne, %tmp19, %tmp21 : tensor<1x64xi1> loc(#loc884) + %tmp22 = arith.andi %tmp18, %tmp21_124 : tensor<1x64xi1> loc(#loc885) + %tmp23 = arith.addi %tmp16, %tmp15 : tensor<1x64xi32> loc(#loc886) + %tmp24 = arith.select %tmp22, %tmp23, %tmp16 : tensor<1x64xi1>, tensor<1x64xi32> loc(#loc887) + %tmp25 = arith.extsi %tmp24 : tensor<1x64xi32> to tensor<1x64xi64> loc(#loc888) + %tmp26 = arith.cmpi slt, %tmp25, %tmp8 : tensor<1x64xi64> loc(#loc889) + %tmp27 = arith.andi %tmp15_123, %tmp26 : tensor<1x64xi1> loc(#loc890) + %tmp28 = arith.subi %tmp4_116, %tmp4 : tensor<128x64xi32> loc(#loc891) + %tmp29 = tt.splat %ks8 : i32 -> tensor<128x64xi32> loc(#loc892) + %tmp29_125 = arith.remsi %tmp28, %tmp29 : tensor<128x64xi32> loc(#loc892) + %tmp30 = arith.cmpi ne, %tmp29_125, %cst_7 : tensor<128x64xi32> loc(#loc893) + %tmp31 = arith.cmpi slt, %tmp29_125, %cst_7 : tensor<128x64xi32> loc(#loc894) + %tmp32 = tt.splat 
%tmp20 : i1 -> tensor<128x64xi1> loc(#loc895) + %tmp32_126 = arith.cmpi ne, %tmp31, %tmp32 : tensor<128x64xi1> loc(#loc895) + %tmp33 = arith.andi %tmp30, %tmp32_126 : tensor<128x64xi1> loc(#loc896) + %tmp34 = arith.addi %tmp29_125, %tmp29 : tensor<128x64xi32> loc(#loc897) + %tmp35 = arith.select %tmp33, %tmp34, %tmp29_125 : tensor<128x64xi1>, tensor<128x64xi32> loc(#loc898) + %tmp36 = arith.cmpi eq, %tmp35, %cst_7 : tensor<128x64xi32> loc(#loc899) + %tmp37 = tt.broadcast %tmp27 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc900) + %tmp37_127 = arith.andi %tmp37, %tmp36 : tensor<128x64xi1> loc(#loc900) + %tmp38 = arith.ori %tmp12, %tmp37_127 : tensor<128x64xi1> loc(#loc901) + %post_mod_scores_128 = arith.select %tmp38, %post_mod_scores, %cst_9 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc902) + %post_mod_scores_129 = arith.mulf %post_mod_scores_128, %cst_6 : tensor<128x64xf32> loc(#loc903) + %p = tt.broadcast %lse_66 : tensor<128x1xf32> -> tensor<128x64xf32> loc(#loc904) + %p_130 = arith.subf %post_mod_scores_129, %p : tensor<128x64xf32> loc(#loc904) + %p_131 = math.exp2 %p_130 : tensor<128x64xf32> loc(#loc905) + %vT = tt.load %vT_ptrs_110, %kT_113, %cst_12 : tensor<128x64x!tt.ptr> loc(#loc1023) + %dp = tt.dot %do, %vT, %cst_11, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc907) + %ds = tt.expand_dims %Di_61 {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc908) + %ds_132 = tt.broadcast %ds : tensor<128x1xf32> -> tensor<128x64xf32> loc(#loc909) + %ds_133 = arith.subf %dp, %ds_132 : tensor<128x64xf32> loc(#loc909) + %ds_134 = arith.mulf %p_131, %ds_133 : tensor<128x64xf32> loc(#loc910) + %grad_scores = arith.select %kT_113, %ds_134, %cst_11 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc911) + %ds_135 = arith.select %tmp38, %grad_scores, %cst_11 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc912) + %ds_136 = arith.truncf %ds_135 : tensor<128x64xf32> to tensor<128x64xbf16> loc(#loc913) + %dq_137 = 
tt.trans %kT_114 {order = array} : tensor<128x64xbf16> -> tensor<64x128xbf16> loc(#loc914) + %dq_138 = tt.dot %ds_136, %dq_137, %dq_107, inputPrecision = tf32 : tensor<128x64xbf16> * tensor<64x128xbf16> -> tensor<128x128xf32> loc(#loc915) + %cur_block_idx = arith.divsi %start_n, %c2_i32 : i32 loc(#loc916) + %cur_block = tt.addptr %kv_indices, %cur_block_idx : !tt.ptr, i32 loc(#loc917) + %cur_block_139 = tt.load %cur_block evictionPolicy = evict_last : !tt.ptr loc(#loc918) + %next_block = arith.addi %cur_block_idx, %c1_i32 : i32 loc(#loc919) + %next_block_140 = arith.cmpi slt, %next_block, %sparse_kv_num_blocks_68 : i32 loc(#loc920) + %next_block_141 = tt.addptr %cur_block, %c1_i32 : !tt.ptr, i32 loc(#loc921) + %next_block_142 = tt.load %next_block_141, %next_block_140 evictionPolicy = evict_last : !tt.ptr loc(#loc922) + %needs_jump = arith.addi %start_n, %c1_i32 : i32 loc(#loc923) + %needs_jump_143 = arith.remsi %needs_jump, %c2_i32 : i32 loc(#loc924) + %needs_jump_144 = arith.cmpi eq, %needs_jump_143, %c0_i32 : i32 loc(#loc925) + %jump_to_block = arith.subi %next_block_142, %cur_block_139 : i32 loc(#loc926) + %jump_to_block_145 = arith.muli %jump_to_block, %c128_i32 : i32 loc(#loc927) + %jump_to_block_146 = arith.subi %jump_to_block_145, %c64_i32 : i32 loc(#loc928) + %offset = arith.extui %needs_jump_144 : i1 to i32 loc(#loc929) + %offset_147 = arith.muli %jump_to_block_146, %offset : i32 loc(#loc929) + %offset_148 = arith.subi %c1_i32, %offset : i32 loc(#loc930) + %offset_149 = arith.muli %offset_148, %c64_i32 : i32 loc(#loc931) + %offset_150 = arith.addi %offset_147, %offset_149 : i32 loc(#loc932) + %kT_ptrs_151 = arith.muli %offset_150, %c128_i32 : i32 loc(#loc704) + %kT_ptrs_152 = tt.splat %kT_ptrs_151 : i32 -> tensor<128x64xi32> loc(#loc705) + %kT_ptrs_153 = tt.addptr %kT_ptrs_109, %kT_ptrs_152 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc705) + %vT_ptrs_154 = tt.addptr %vT_ptrs_110, %kT_ptrs_152 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> 
loc(#loc706) + %offs_n2_155 = tt.splat %offset_150 : i32 -> tensor<64xi32> loc(#loc707) + %offs_n2_156 = arith.addi %offs_n2_108, %offs_n2_155 : tensor<64xi32> loc(#loc707) + scf.yield %dq_138, %offs_n2_156, %kT_ptrs_153, %vT_ptrs_154 : tensor<128x128xf32>, tensor<64xi32>, tensor<128x64x!tt.ptr>, tensor<128x64x!tt.ptr> loc(#loc708) + } loc(#loc1029) + %kv_indices_86 = tt.addptr %arg_FULL_KV_IDX, %sparse_kv_idx_offset_32 : !tt.ptr, i32 loc(#loc526) + %kv_start_87 = tt.load %kv_indices_86 : !tt.ptr loc(#loc527) + %kv_start_88 = arith.muli %kv_start_87, %c128_i32 : i32 loc(#loc528) + %sparse_kv_num_blocks_89 = tt.addptr %arg_FULL_KV_NUM_BLKS, %sparse_kv_num_blks_offset_30 : !tt.ptr, i32 loc(#loc529) + %sparse_kv_num_blocks_90 = tt.load %sparse_kv_num_blocks_89 : !tt.ptr loc(#loc530) + %offs_n2_91 = tt.splat %kv_start_88 : i32 -> tensor<64xi32> loc(#loc531) + %offs_n2_92 = arith.addi %offs_n2_91, %offs_n2 : tensor<64xi32> loc(#loc531) + %kT_ptrs_93 = tt.expand_dims %offs_n2_92 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc709) + %kT_ptrs_94 = arith.muli %kT_ptrs_93, %cst_1 : tensor<1x64xi32> loc(#loc710) + %kT_ptrs_95 = tt.addptr %kT_ptrs_72, %kT_ptrs_94 : tensor<1x64x!tt.ptr>, tensor<1x64xi32> loc(#loc711) + %kT_ptrs_96 = tt.broadcast %kT_ptrs_95 : tensor<1x64x!tt.ptr> -> tensor<128x64x!tt.ptr> loc(#loc712) + %kT_ptrs_97 = tt.addptr %kT_ptrs_96, %kT_ptrs_76 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc712) + %vT_ptrs_98 = tt.addptr %vT_ptrs, %kT_ptrs_94 : tensor<1x64x!tt.ptr>, tensor<1x64xi32> loc(#loc713) + %vT_ptrs_99 = tt.broadcast %vT_ptrs_98 : tensor<1x64x!tt.ptr> -> tensor<128x64x!tt.ptr> loc(#loc714) + %vT_ptrs_100 = tt.addptr %vT_ptrs_99, %kT_ptrs_76 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc714) + %hi_101 = arith.muli %sparse_kv_num_blocks_90, %c2_i32 : i32 loc(#loc715) + %hi_102 = arith.minsi %hi_101, %hi_83 : i32 loc(#loc716) + %vT_ptrs_103:4 = scf.for %start_n = %c0_i32 to %hi_102 step %c1_i32 iter_args(%dq_107 = 
%vT_ptrs_85#0, %offs_n2_108 = %offs_n2_92, %kT_ptrs_109 = %kT_ptrs_97, %vT_ptrs_110 = %vT_ptrs_100) -> (tensor<128x128xf32>, tensor<64xi32>, tensor<128x64x!tt.ptr>, tensor<128x64x!tt.ptr>) : i32 { + %kT = tt.expand_dims %offs_n2_108 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc1024) + %kT_111 = tt.splat %ks1 : i32 -> tensor<1x64xi32> loc(#loc1025) + %kT_112 = arith.cmpi slt, %kT, %kT_111 : tensor<1x64xi32> loc(#loc1025) + %kT_113 = tt.broadcast %kT_112 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc1026) + %kT_114 = tt.load %kT_ptrs_109, %kT_113, %cst_12 : tensor<128x64x!tt.ptr> loc(#loc1026) + %qk = tt.dot %q_52, %kT_114, %cst_11, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc934) + %qk_115 = arith.mulf %qk, %cst_10 : tensor<128x64xf32> loc(#loc935) + %post_mod_scores = arith.select %kT_113, %qk_115, %cst_9 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc936) + %post_mod_scores_116 = arith.mulf %post_mod_scores, %cst_6 : tensor<128x64xf32> loc(#loc937) + %p = tt.broadcast %lse_66 : tensor<128x1xf32> -> tensor<128x64xf32> loc(#loc938) + %p_117 = arith.subf %post_mod_scores_116, %p : tensor<128x64xf32> loc(#loc938) + %p_118 = math.exp2 %p_117 : tensor<128x64xf32> loc(#loc939) + %vT = tt.load %vT_ptrs_110, %kT_113, %cst_12 : tensor<128x64x!tt.ptr> loc(#loc1027) + %dp = tt.dot %do, %vT, %cst_11, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc941) + %ds = tt.expand_dims %Di_61 {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc942) + %ds_119 = tt.broadcast %ds : tensor<128x1xf32> -> tensor<128x64xf32> loc(#loc943) + %ds_120 = arith.subf %dp, %ds_119 : tensor<128x64xf32> loc(#loc943) + %ds_121 = arith.mulf %p_118, %ds_120 : tensor<128x64xf32> loc(#loc944) + %grad_scores = arith.select %kT_113, %ds_121, %cst_11 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc945) + %ds_122 = arith.truncf %grad_scores : tensor<128x64xf32> to 
tensor<128x64xbf16> loc(#loc946) + %dq_123 = tt.trans %kT_114 {order = array} : tensor<128x64xbf16> -> tensor<64x128xbf16> loc(#loc947) + %dq_124 = tt.dot %ds_122, %dq_123, %dq_107, inputPrecision = tf32 : tensor<128x64xbf16> * tensor<64x128xbf16> -> tensor<128x128xf32> loc(#loc948) + %cur_block_idx = arith.divsi %start_n, %c2_i32 : i32 loc(#loc949) + %cur_block = tt.addptr %kv_indices_86, %cur_block_idx : !tt.ptr, i32 loc(#loc950) + %cur_block_125 = tt.load %cur_block evictionPolicy = evict_last : !tt.ptr loc(#loc951) + %next_block = arith.addi %cur_block_idx, %c1_i32 : i32 loc(#loc952) + %next_block_126 = arith.cmpi slt, %next_block, %sparse_kv_num_blocks_90 : i32 loc(#loc953) + %next_block_127 = tt.addptr %cur_block, %c1_i32 : !tt.ptr, i32 loc(#loc954) + %next_block_128 = tt.load %next_block_127, %next_block_126 evictionPolicy = evict_last : !tt.ptr loc(#loc955) + %needs_jump = arith.addi %start_n, %c1_i32 : i32 loc(#loc956) + %needs_jump_129 = arith.remsi %needs_jump, %c2_i32 : i32 loc(#loc957) + %needs_jump_130 = arith.cmpi eq, %needs_jump_129, %c0_i32 : i32 loc(#loc958) + %jump_to_block = arith.subi %next_block_128, %cur_block_125 : i32 loc(#loc959) + %jump_to_block_131 = arith.muli %jump_to_block, %c128_i32 : i32 loc(#loc960) + %jump_to_block_132 = arith.subi %jump_to_block_131, %c64_i32 : i32 loc(#loc961) + %offset = arith.extui %needs_jump_130 : i1 to i32 loc(#loc962) + %offset_133 = arith.muli %jump_to_block_132, %offset : i32 loc(#loc962) + %offset_134 = arith.subi %c1_i32, %offset : i32 loc(#loc963) + %offset_135 = arith.muli %offset_134, %c64_i32 : i32 loc(#loc964) + %offset_136 = arith.addi %offset_133, %offset_135 : i32 loc(#loc965) + %kT_ptrs_137 = arith.muli %offset_136, %c128_i32 : i32 loc(#loc719) + %kT_ptrs_138 = tt.splat %kT_ptrs_137 : i32 -> tensor<128x64xi32> loc(#loc720) + %kT_ptrs_139 = tt.addptr %kT_ptrs_109, %kT_ptrs_138 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc720) + %vT_ptrs_140 = tt.addptr %vT_ptrs_110, %kT_ptrs_138 : 
tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc721) + %offs_n2_141 = tt.splat %offset_136 : i32 -> tensor<64xi32> loc(#loc722) + %offs_n2_142 = arith.addi %offs_n2_108, %offs_n2_141 : tensor<64xi32> loc(#loc722) + scf.yield %dq_124, %offs_n2_142, %kT_ptrs_139, %vT_ptrs_140 : tensor<128x128xf32>, tensor<64xi32>, tensor<128x64x!tt.ptr>, tensor<128x64x!tt.ptr> loc(#loc723) + } loc(#loc1030) + %dq_ptrs = tt.splat %DQ2 : !tt.ptr -> tensor<128x1x!tt.ptr> loc(#loc533) + %dq_ptrs_104 = tt.addptr %dq_ptrs, %ptr_43 : tensor<128x1x!tt.ptr>, tensor<128x1xi32> loc(#loc533) + %dq_ptrs_105 = tt.broadcast %dq_ptrs_104 : tensor<128x1x!tt.ptr> -> tensor<128x128x!tt.ptr> loc(#loc534) + %dq_ptrs_106 = tt.addptr %dq_ptrs_105, %ptr_48 : tensor<128x128x!tt.ptr>, tensor<128x128xi32> loc(#loc534) + %dq = arith.mulf %vT_ptrs_103#0, %cst_17 : tensor<128x128xf32> loc(#loc535) + %12 = arith.cmpi slt, %ptr_46, %cst_16 : tensor<1x128xi32> loc(#loc191) + %13 = tt.broadcast %12 : tensor<1x128xi1> -> tensor<128x128xi1> loc(#loc192) + %14 = arith.andi %q_51, %13 : tensor<128x128xi1> loc(#loc192) + %15 = arith.truncf %dq : tensor<128x128xf32> to tensor<128x128xbf16> loc(#loc193) + tt.store %dq_ptrs_106, %15, %14 : tensor<128x128x!tt.ptr> loc(#loc193) + } else { + %stride_q_idx_h = arith.muli %ks6, %ks7 : i32 loc(#loc536) + %start_n1 = arith.muli %pid, %c128_i32 : i32 loc(#loc537) + %offs_n1 = tt.splat %start_n1 : i32 -> tensor<128xi32> loc(#loc538) + %offs_n1_28 = arith.addi %offs_n1, %offs_k : tensor<128xi32> loc(#loc538) + %ptr = tt.expand_dims %offs_n1_28 {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc724) + %ptr_29 = arith.muli %ptr, %cst_15 : tensor<128x1xi32> loc(#loc725) + %ptr_30 = tt.splat %K : !tt.ptr -> tensor<128x1x!tt.ptr> loc(#loc726) + %ptr_31 = tt.addptr %ptr_30, %ptr_29 : tensor<128x1x!tt.ptr>, tensor<128x1xi32> loc(#loc726) + %ptr_32 = tt.expand_dims %offs_k {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc727) + %ptr_33 = tt.broadcast %ptr_31 : 
tensor<128x1x!tt.ptr> -> tensor<128x128x!tt.ptr> loc(#loc728) + %ptr_34 = tt.broadcast %ptr_32 : tensor<1x128xi32> -> tensor<128x128xi32> loc(#loc728) + %ptr_35 = tt.addptr %ptr_33, %ptr_34 : tensor<128x128x!tt.ptr>, tensor<128x128xi32> loc(#loc728) + %k = tt.splat %ks1 : i32 -> tensor<128x1xi32> loc(#loc729) + %k_36 = arith.cmpi slt, %ptr, %k : tensor<128x1xi32> loc(#loc729) + %k_37 = tt.broadcast %k_36 : tensor<128x1xi1> -> tensor<128x128xi1> loc(#loc730) + %k_38 = tt.load %ptr_35, %k_37, %cst_13 : tensor<128x128x!tt.ptr> loc(#loc730) + %ptr_39 = tt.splat %V : !tt.ptr -> tensor<128x1x!tt.ptr> loc(#loc731) + %ptr_40 = tt.addptr %ptr_39, %ptr_29 : tensor<128x1x!tt.ptr>, tensor<128x1xi32> loc(#loc731) + %ptr_41 = tt.broadcast %ptr_40 : tensor<128x1x!tt.ptr> -> tensor<128x128x!tt.ptr> loc(#loc732) + %ptr_42 = tt.addptr %ptr_41, %ptr_34 : tensor<128x128x!tt.ptr>, tensor<128x128xi32> loc(#loc732) + %v = tt.load %ptr_42, %k_37, %cst_13 : tensor<128x128x!tt.ptr> loc(#loc733) + %dk:2 = scf.for %off_g = %c0_i32 to %c4_i32 step %c1_i32 iter_args(%dv = %cst_14, %dk_56 = %cst_14) -> (tensor<128x128xf32>, tensor<128x128xf32>) : i32 { + %off_hq1 = arith.muli %off_hkv, %c4_i32 : i32 loc(#loc542) + %off_hq1_57 = arith.addi %off_hq1, %off_g : i32 loc(#loc543) + %q_adj1 = arith.muli %off_hq1_57, %c128_i32 : i32 loc(#loc544) + %q_adj1_58 = arith.muli %0, %off_zq : i32 loc(#loc545) + %q_adj1_59 = arith.addi %q_adj1, %q_adj1_58 : i32 loc(#loc546) + %q_adj1_60 = arith.extsi %q_adj1_59 : i32 to i64 loc(#loc547) + %do_adj1 = arith.muli %10, %off_hq1_57 : i32 loc(#loc548) + %do_adj1_61 = arith.muli %9, %off_zq : i32 loc(#loc549) + %do_adj1_62 = arith.addi %do_adj1, %do_adj1_61 : i32 loc(#loc550) + %do_adj1_63 = arith.extsi %do_adj1_62 : i32 to i64 loc(#loc551) + %off_chz1 = arith.muli %off_zq, %HQ : i32 loc(#loc552) + %off_chz1_64 = arith.addi %off_chz1, %off_hq1_57 : i32 loc(#loc553) + %off_chz1_65 = arith.muli %off_chz1_64, %ks0 : i32 loc(#loc554) + %off_chz1_66 = arith.extsi 
%off_chz1_65 : i32 to i64 loc(#loc555) + %Q1 = tt.addptr %arg_Q, %q_adj1_60 : !tt.ptr, i64 loc(#loc556) + %DO1 = tt.addptr %arg_DO, %do_adj1_63 : !tt.ptr, i64 loc(#loc557) + %LSE1 = tt.addptr %arg_LSE, %off_chz1_66 : !tt.ptr, i64 loc(#loc558) + %DELTA1 = tt.addptr %arg_DELTA, %off_chz1_66 : !tt.ptr, i64 loc(#loc559) + %sparse_q_num_blks_offset = arith.muli %off_zkv, %ks5 : i32 loc(#loc560) + %sparse_q_num_blks_offset_67 = arith.addi %sparse_q_num_blks_offset, %pid : i32 loc(#loc561) + %sparse_q_idx_offset = arith.muli %off_zkv, %stride_q_idx_h : i32 loc(#loc562) + %sparse_q_idx_offset_68 = arith.muli %pid, %ks6 : i32 loc(#loc563) + %sparse_q_idx_offset_69 = arith.addi %sparse_q_idx_offset, %sparse_q_idx_offset_68 : i32 loc(#loc564) + %q_indices = tt.addptr %arg_Q_IDX, %sparse_q_idx_offset_69 : !tt.ptr, i32 loc(#loc565) + %q_start = tt.load %q_indices : !tt.ptr loc(#loc566) + %q_start_70 = arith.muli %q_start, %c128_i32 : i32 loc(#loc567) + %sparse_q_num_blocks = tt.addptr %arg_Q_NUM_BLKS, %sparse_q_num_blks_offset_67 : !tt.ptr, i32 loc(#loc568) + %sparse_q_num_blocks_71 = tt.load %sparse_q_num_blocks : !tt.ptr loc(#loc569) + %offs_m1 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc570) + %offs_m1_72 = tt.splat %q_start_70 : i32 -> tensor<64xi32> loc(#loc571) + %offs_m1_73 = arith.addi %offs_m1_72, %offs_m1 : tensor<64xi32> loc(#loc571) + %qT_ptrs = tt.expand_dims %offs_m1_73 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc735) + %qT_ptrs_74 = arith.muli %qT_ptrs, %cst_0 : tensor<1x64xi32> loc(#loc736) + %qT_ptrs_75 = tt.splat %Q1 : !tt.ptr -> tensor<1x64x!tt.ptr> loc(#loc737) + %qT_ptrs_76 = tt.addptr %qT_ptrs_75, %qT_ptrs_74 : tensor<1x64x!tt.ptr>, tensor<1x64xi32> loc(#loc737) + %qT_ptrs_77 = tt.expand_dims %offs_k {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc738) + %qT_ptrs_78 = tt.broadcast %qT_ptrs_76 : tensor<1x64x!tt.ptr> -> tensor<128x64x!tt.ptr> loc(#loc739) + %qT_ptrs_79 = tt.broadcast 
%qT_ptrs_77 : tensor<128x1xi32> -> tensor<128x64xi32> loc(#loc739) + %qT_ptrs_80 = tt.addptr %qT_ptrs_78, %qT_ptrs_79 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc739) + %do_ptrs = tt.expand_dims %offs_m1_73 {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc740) + %do_ptrs_81 = arith.muli %do_ptrs, %cst : tensor<64x1xi32> loc(#loc741) + %do_ptrs_82 = tt.splat %DO1 : !tt.ptr -> tensor<64x1x!tt.ptr> loc(#loc742) + %do_ptrs_83 = tt.addptr %do_ptrs_82, %do_ptrs_81 : tensor<64x1x!tt.ptr>, tensor<64x1xi32> loc(#loc742) + %do_ptrs_84 = tt.broadcast %do_ptrs_83 : tensor<64x1x!tt.ptr> -> tensor<64x128x!tt.ptr> loc(#loc743) + %do_ptrs_85 = tt.broadcast %ptr_32 : tensor<1x128xi32> -> tensor<64x128xi32> loc(#loc743) + %do_ptrs_86 = tt.addptr %do_ptrs_84, %do_ptrs_85 : tensor<64x128x!tt.ptr>, tensor<64x128xi32> loc(#loc743) + %hi = arith.muli %sparse_q_num_blocks_71, %c2_i32 : i32 loc(#loc744) + %hi_87 = arith.addi %ks0, %c63_i32 : i32 loc(#loc966) + %hi_88 = arith.divsi %hi_87, %c64_i32 : i32 loc(#loc967) + %hi_89 = arith.maxsi %hi_88, %c1_i32 : i32 loc(#loc746) + %hi_90 = arith.minsi %hi, %hi_89 : i32 loc(#loc747) + %do_ptrs_91:5 = scf.for %start_m = %c0_i32 to %hi_90 step %c1_i32 iter_args(%dk_112 = %dk_56, %dv_113 = %dv, %offs_m1_114 = %offs_m1_73, %qT_ptrs_115 = %qT_ptrs_80, %do_ptrs_116 = %do_ptrs_86) -> (tensor<128x128xf32>, tensor<128x128xf32>, tensor<64xi32>, tensor<128x64x!tt.ptr>, tensor<64x128x!tt.ptr>) : i32 { + %qT = tt.expand_dims %offs_m1_114 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc969) + %qT_117 = tt.splat %ks0 : i32 -> tensor<1x64xi32> loc(#loc970) + %qT_118 = arith.cmpi slt, %qT, %qT_117 : tensor<1x64xi32> loc(#loc970) + %qT_119 = tt.broadcast %qT_118 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc971) + %qT_120 = tt.load %qT_ptrs_115, %qT_119, %cst_12 : tensor<128x64x!tt.ptr> loc(#loc971) + %lse = tt.splat %ks0 : i32 -> tensor<64xi32> loc(#loc750) + %lse_121 = arith.cmpi slt, %offs_m1_114, %lse : tensor<64xi32> 
loc(#loc750) + %lse_122 = tt.splat %LSE1 : !tt.ptr -> tensor<64x!tt.ptr> loc(#loc751) + %lse_123 = tt.addptr %lse_122, %offs_m1_114 : tensor<64x!tt.ptr>, tensor<64xi32> loc(#loc751) + %lse_124 = tt.load %lse_123, %lse_121 : tensor<64x!tt.ptr> loc(#loc752) + %lse_125 = arith.cmpf oeq, %lse_124, %cst_5 : tensor<64xf32> loc(#loc753) + %lse_126 = arith.select %lse_125, %cst_4, %lse_124 : tensor<64xi1>, tensor<64xf32> loc(#loc754) + %qkT = tt.dot %k_38, %qT_120, %cst_11, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc755) + %qkT_127 = arith.mulf %qkT, %cst_10 : tensor<128x64xf32> loc(#loc756) + %m = arith.remsi %qT, %qT_117 : tensor<1x64xi32> loc(#loc972) + %n = arith.remsi %ptr, %k : tensor<128x1xi32> loc(#loc973) + %post_mod_scores = arith.select %qT_119, %qkT_127, %cst_9 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc759) + %tmp44 = tt.broadcast %m : tensor<1x64xi32> -> tensor<128x64xi32> loc(#loc760) + %tmp44_128 = tt.broadcast %n : tensor<128x1xi32> -> tensor<128x64xi32> loc(#loc760) + %tmp44_129 = arith.cmpi sge, %tmp44, %tmp44_128 : tensor<128x64xi32> loc(#loc760) + %tmp45 = arith.extsi %n : tensor<128x1xi32> to tensor<128x1xi64> loc(#loc761) + %tmp47 = tt.addptr %in_ptr16, %off_zq : !tt.ptr, i32 loc(#loc762) + %tmp47_130 = tt.load %tmp47 : !tt.ptr loc(#loc763) + %tmp48 = tt.splat %tmp47_130 : i64 -> tensor<128x1xi64> loc(#loc764) + %tmp48_131 = arith.cmpi slt, %tmp45, %tmp48 : tensor<128x1xi64> loc(#loc764) + %tmp49 = arith.extsi %m : tensor<1x64xi32> to tensor<1x64xi64> loc(#loc765) + %tmp50 = tt.splat %tmp47_130 : i64 -> tensor<1x64xi64> loc(#loc766) + %tmp50_132 = arith.cmpi slt, %tmp49, %tmp50 : tensor<1x64xi64> loc(#loc766) + %tmp51 = tt.broadcast %tmp48_131 : tensor<128x1xi1> -> tensor<128x64xi1> loc(#loc767) + %tmp51_133 = tt.broadcast %tmp50_132 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc767) + %tmp51_134 = arith.andi %tmp51, %tmp51_133 : tensor<128x64xi1> loc(#loc767) + %tmp52 = arith.andi 
%tmp44_129, %tmp51_134 : tensor<128x64xi1> loc(#loc768) + %tmp55 = tt.splat %ks8 : i32 -> tensor<128x1xi32> loc(#loc769) + %tmp55_135 = arith.cmpi sge, %n, %tmp55 : tensor<128x1xi32> loc(#loc769) + %tmp56 = arith.remsi %n, %tmp55 : tensor<128x1xi32> loc(#loc770) + %tmp58 = arith.cmpi ne, %tmp56, %cst_3 : tensor<128x1xi32> loc(#loc771) + %tmp59 = arith.cmpi slt, %tmp56, %cst_3 : tensor<128x1xi32> loc(#loc772) + %tmp60 = arith.cmpi slt, %ks8, %c0_i32 : i32 loc(#loc773) + %tmp61 = tt.splat %tmp60 : i1 -> tensor<128x1xi1> loc(#loc774) + %tmp61_136 = arith.cmpi ne, %tmp59, %tmp61 : tensor<128x1xi1> loc(#loc774) + %tmp62 = arith.andi %tmp58, %tmp61_136 : tensor<128x1xi1> loc(#loc775) + %tmp63 = arith.addi %tmp56, %tmp55 : tensor<128x1xi32> loc(#loc776) + %tmp64 = arith.select %tmp62, %tmp63, %tmp56 : tensor<128x1xi1>, tensor<128x1xi32> loc(#loc777) + %tmp65 = arith.extsi %tmp64 : tensor<128x1xi32> to tensor<128x1xi64> loc(#loc778) + %tmp66 = arith.cmpi slt, %tmp65, %tmp48 : tensor<128x1xi64> loc(#loc779) + %tmp67 = arith.andi %tmp55_135, %tmp66 : tensor<128x1xi1> loc(#loc780) + %tmp68 = arith.subi %tmp44_128, %tmp44 : tensor<128x64xi32> loc(#loc781) + %tmp69 = tt.splat %ks8 : i32 -> tensor<128x64xi32> loc(#loc782) + %tmp69_137 = arith.remsi %tmp68, %tmp69 : tensor<128x64xi32> loc(#loc782) + %tmp70 = arith.cmpi ne, %tmp69_137, %cst_7 : tensor<128x64xi32> loc(#loc783) + %tmp71 = arith.cmpi slt, %tmp69_137, %cst_7 : tensor<128x64xi32> loc(#loc784) + %tmp72 = tt.splat %tmp60 : i1 -> tensor<128x64xi1> loc(#loc785) + %tmp72_138 = arith.cmpi ne, %tmp71, %tmp72 : tensor<128x64xi1> loc(#loc785) + %tmp73 = arith.andi %tmp70, %tmp72_138 : tensor<128x64xi1> loc(#loc786) + %tmp74 = arith.addi %tmp69_137, %tmp69 : tensor<128x64xi32> loc(#loc787) + %tmp75 = arith.select %tmp73, %tmp74, %tmp69_137 : tensor<128x64xi1>, tensor<128x64xi32> loc(#loc788) + %tmp76 = arith.cmpi eq, %tmp75, %cst_7 : tensor<128x64xi32> loc(#loc789) + %tmp77 = tt.broadcast %tmp67 : tensor<128x1xi1> -> 
tensor<128x64xi1> loc(#loc790) + %tmp77_139 = arith.andi %tmp77, %tmp76 : tensor<128x64xi1> loc(#loc790) + %tmp78 = arith.ori %tmp52, %tmp77_139 : tensor<128x64xi1> loc(#loc791) + %post_mod_scores_140 = arith.select %tmp78, %post_mod_scores, %cst_9 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc792) + %post_mod_scores_141 = arith.mulf %post_mod_scores_140, %cst_6 : tensor<128x64xf32> loc(#loc793) + %pT = tt.expand_dims %lse_126 {axis = 0 : i32} : tensor<64xf32> -> tensor<1x64xf32> loc(#loc794) + %pT_142 = tt.broadcast %pT : tensor<1x64xf32> -> tensor<128x64xf32> loc(#loc795) + %pT_143 = arith.subf %post_mod_scores_141, %pT_142 : tensor<128x64xf32> loc(#loc795) + %pT_144 = math.exp2 %pT_143 : tensor<128x64xf32> loc(#loc796) + %do = tt.expand_dims %offs_m1_114 {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc974) + %do_145 = tt.splat %ks0 : i32 -> tensor<64x1xi32> loc(#loc975) + %do_146 = arith.cmpi slt, %do, %do_145 : tensor<64x1xi32> loc(#loc975) + %do_147 = tt.broadcast %do_146 : tensor<64x1xi1> -> tensor<64x128xi1> loc(#loc976) + %do_148 = tt.load %do_ptrs_116, %do_147, %cst_2 : tensor<64x128x!tt.ptr> loc(#loc976) + %dv_149 = arith.truncf %pT_144 : tensor<128x64xf32> to tensor<128x64xbf16> loc(#loc798) + %dv_150 = tt.dot %dv_149, %do_148, %dv_113, inputPrecision = tf32 : tensor<128x64xbf16> * tensor<64x128xbf16> -> tensor<128x128xf32> loc(#loc799) + %Di = tt.splat %DELTA1 : !tt.ptr -> tensor<64x!tt.ptr> loc(#loc800) + %Di_151 = tt.addptr %Di, %offs_m1_114 : tensor<64x!tt.ptr>, tensor<64xi32> loc(#loc800) + %Di_152 = tt.load %Di_151, %lse_121 : tensor<64x!tt.ptr> loc(#loc801) + %dpT = tt.trans %do_148 {order = array} : tensor<64x128xbf16> -> tensor<128x64xbf16> loc(#loc802) + %dpT_153 = tt.dot %v, %dpT, %cst_11, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc803) + %dsT = tt.expand_dims %Di_152 {axis = 0 : i32} : tensor<64xf32> -> tensor<1x64xf32> loc(#loc804) + %dsT_154 = tt.broadcast %dsT : 
tensor<1x64xf32> -> tensor<128x64xf32> loc(#loc805) + %dsT_155 = arith.subf %dpT_153, %dsT_154 : tensor<128x64xf32> loc(#loc805) + %dsT_156 = arith.mulf %pT_144, %dsT_155 : tensor<128x64xf32> loc(#loc806) + %grad_scores = arith.select %qT_119, %dsT_156, %cst_11 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc807) + %dsT_157 = arith.select %tmp78, %grad_scores, %cst_11 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc808) + %dk_158 = arith.truncf %dsT_157 : tensor<128x64xf32> to tensor<128x64xbf16> loc(#loc809) + %dk_159 = tt.trans %qT_120 {order = array} : tensor<128x64xbf16> -> tensor<64x128xbf16> loc(#loc810) + %dk_160 = tt.dot %dk_158, %dk_159, %dk_112, inputPrecision = tf32 : tensor<128x64xbf16> * tensor<64x128xbf16> -> tensor<128x128xf32> loc(#loc811) + %cur_block_idx = arith.divsi %start_m, %c2_i32 : i32 loc(#loc977) + %cur_block = tt.addptr %q_indices, %cur_block_idx : !tt.ptr, i32 loc(#loc978) + %cur_block_161 = tt.load %cur_block evictionPolicy = evict_last : !tt.ptr loc(#loc979) + %next_block = arith.addi %cur_block_idx, %c1_i32 : i32 loc(#loc980) + %next_block_162 = arith.cmpi slt, %next_block, %sparse_q_num_blocks_71 : i32 loc(#loc981) + %next_block_163 = tt.addptr %cur_block, %c1_i32 : !tt.ptr, i32 loc(#loc982) + %next_block_164 = tt.load %next_block_163, %next_block_162 evictionPolicy = evict_last : !tt.ptr loc(#loc983) + %needs_jump = arith.addi %start_m, %c1_i32 : i32 loc(#loc984) + %needs_jump_165 = arith.remsi %needs_jump, %c2_i32 : i32 loc(#loc985) + %needs_jump_166 = arith.cmpi eq, %needs_jump_165, %c0_i32 : i32 loc(#loc986) + %jump_to_block = arith.subi %next_block_164, %cur_block_161 : i32 loc(#loc987) + %jump_to_block_167 = arith.muli %jump_to_block, %c128_i32 : i32 loc(#loc988) + %jump_to_block_168 = arith.subi %jump_to_block_167, %c64_i32 : i32 loc(#loc989) + %offset = arith.extui %needs_jump_166 : i1 to i32 loc(#loc990) + %offset_169 = arith.muli %jump_to_block_168, %offset : i32 loc(#loc990) + %offset_170 = arith.subi %c1_i32, %offset : 
i32 loc(#loc991) + %offset_171 = arith.muli %offset_170, %c64_i32 : i32 loc(#loc992) + %offset_172 = arith.addi %offset_169, %offset_171 : i32 loc(#loc993) + %qT_ptrs_173 = arith.muli %offset_172, %c4096_i32 : i32 loc(#loc813) + %qT_ptrs_174 = tt.splat %qT_ptrs_173 : i32 -> tensor<128x64xi32> loc(#loc814) + %qT_ptrs_175 = tt.addptr %qT_ptrs_115, %qT_ptrs_174 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc814) + %do_ptrs_176 = arith.muli %offset_172, %c128_i32 : i32 loc(#loc815) + %do_ptrs_177 = tt.splat %do_ptrs_176 : i32 -> tensor<64x128xi32> loc(#loc816) + %do_ptrs_178 = tt.addptr %do_ptrs_116, %do_ptrs_177 : tensor<64x128x!tt.ptr>, tensor<64x128xi32> loc(#loc816) + %offs_m1_179 = tt.splat %offset_172 : i32 -> tensor<64xi32> loc(#loc817) + %offs_m1_180 = arith.addi %offs_m1_114, %offs_m1_179 : tensor<64xi32> loc(#loc817) + scf.yield %dk_160, %dv_150, %offs_m1_180, %qT_ptrs_175, %do_ptrs_178 : tensor<128x128xf32>, tensor<128x128xf32>, tensor<64xi32>, tensor<128x64x!tt.ptr>, tensor<64x128x!tt.ptr> loc(#loc656) + } loc(#loc1032) + %q_indices_92 = tt.addptr %arg_FULL_Q_IDX, %sparse_q_idx_offset_69 : !tt.ptr, i32 loc(#loc657) + %q_start_93 = tt.load %q_indices_92 : !tt.ptr loc(#loc658) + %q_start_94 = arith.muli %q_start_93, %c128_i32 : i32 loc(#loc659) + %sparse_q_num_blocks_95 = tt.addptr %arg_FULL_Q_NUM_BLKS, %sparse_q_num_blks_offset_67 : !tt.ptr, i32 loc(#loc660) + %sparse_q_num_blocks_96 = tt.load %sparse_q_num_blocks_95 : !tt.ptr loc(#loc661) + %offs_m1_97 = tt.splat %q_start_94 : i32 -> tensor<64xi32> loc(#loc662) + %offs_m1_98 = arith.addi %offs_m1_97, %offs_m1 : tensor<64xi32> loc(#loc662) + %qT_ptrs_99 = tt.expand_dims %offs_m1_98 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc818) + %qT_ptrs_100 = arith.muli %qT_ptrs_99, %cst_0 : tensor<1x64xi32> loc(#loc819) + %qT_ptrs_101 = tt.addptr %qT_ptrs_75, %qT_ptrs_100 : tensor<1x64x!tt.ptr>, tensor<1x64xi32> loc(#loc820) + %qT_ptrs_102 = tt.broadcast %qT_ptrs_101 : tensor<1x64x!tt.ptr> -> 
tensor<128x64x!tt.ptr> loc(#loc821) + %qT_ptrs_103 = tt.addptr %qT_ptrs_102, %qT_ptrs_79 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc821) + %do_ptrs_104 = tt.expand_dims %offs_m1_98 {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc822) + %do_ptrs_105 = arith.muli %do_ptrs_104, %cst : tensor<64x1xi32> loc(#loc823) + %do_ptrs_106 = tt.addptr %do_ptrs_82, %do_ptrs_105 : tensor<64x1x!tt.ptr>, tensor<64x1xi32> loc(#loc824) + %do_ptrs_107 = tt.broadcast %do_ptrs_106 : tensor<64x1x!tt.ptr> -> tensor<64x128x!tt.ptr> loc(#loc825) + %do_ptrs_108 = tt.addptr %do_ptrs_107, %do_ptrs_85 : tensor<64x128x!tt.ptr>, tensor<64x128xi32> loc(#loc825) + %hi_109 = arith.muli %sparse_q_num_blocks_96, %c2_i32 : i32 loc(#loc826) + %hi_110 = arith.minsi %hi_109, %hi_89 : i32 loc(#loc827) + %do_ptrs_111:5 = scf.for %start_m = %c0_i32 to %hi_110 step %c1_i32 iter_args(%dk_112 = %do_ptrs_91#0, %dv_113 = %do_ptrs_91#1, %offs_m1_114 = %offs_m1_98, %qT_ptrs_115 = %qT_ptrs_103, %do_ptrs_116 = %do_ptrs_108) -> (tensor<128x128xf32>, tensor<128x128xf32>, tensor<64xi32>, tensor<128x64x!tt.ptr>, tensor<64x128x!tt.ptr>) : i32 { + %qT = tt.expand_dims %offs_m1_114 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc994) + %qT_117 = tt.splat %ks0 : i32 -> tensor<1x64xi32> loc(#loc995) + %qT_118 = arith.cmpi slt, %qT, %qT_117 : tensor<1x64xi32> loc(#loc995) + %qT_119 = tt.broadcast %qT_118 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc996) + %qT_120 = tt.load %qT_ptrs_115, %qT_119, %cst_12 : tensor<128x64x!tt.ptr> loc(#loc996) + %lse = tt.splat %ks0 : i32 -> tensor<64xi32> loc(#loc829) + %lse_121 = arith.cmpi slt, %offs_m1_114, %lse : tensor<64xi32> loc(#loc829) + %lse_122 = tt.splat %LSE1 : !tt.ptr -> tensor<64x!tt.ptr> loc(#loc830) + %lse_123 = tt.addptr %lse_122, %offs_m1_114 : tensor<64x!tt.ptr>, tensor<64xi32> loc(#loc830) + %lse_124 = tt.load %lse_123, %lse_121 : tensor<64x!tt.ptr> loc(#loc831) + %lse_125 = arith.cmpf oeq, %lse_124, %cst_5 : tensor<64xf32> loc(#loc832) 
+ %lse_126 = arith.select %lse_125, %cst_4, %lse_124 : tensor<64xi1>, tensor<64xf32> loc(#loc833) + %qkT = tt.dot %k_38, %qT_120, %cst_11, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc834) + %qkT_127 = arith.mulf %qkT, %cst_10 : tensor<128x64xf32> loc(#loc835) + %post_mod_scores = arith.select %qT_119, %qkT_127, %cst_9 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc836) + %post_mod_scores_128 = arith.mulf %post_mod_scores, %cst_6 : tensor<128x64xf32> loc(#loc837) + %pT = tt.expand_dims %lse_126 {axis = 0 : i32} : tensor<64xf32> -> tensor<1x64xf32> loc(#loc838) + %pT_129 = tt.broadcast %pT : tensor<1x64xf32> -> tensor<128x64xf32> loc(#loc839) + %pT_130 = arith.subf %post_mod_scores_128, %pT_129 : tensor<128x64xf32> loc(#loc839) + %pT_131 = math.exp2 %pT_130 : tensor<128x64xf32> loc(#loc840) + %do = tt.expand_dims %offs_m1_114 {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc997) + %do_132 = tt.splat %ks0 : i32 -> tensor<64x1xi32> loc(#loc998) + %do_133 = arith.cmpi slt, %do, %do_132 : tensor<64x1xi32> loc(#loc998) + %do_134 = tt.broadcast %do_133 : tensor<64x1xi1> -> tensor<64x128xi1> loc(#loc999) + %do_135 = tt.load %do_ptrs_116, %do_134, %cst_2 : tensor<64x128x!tt.ptr> loc(#loc999) + %dv_136 = arith.truncf %pT_131 : tensor<128x64xf32> to tensor<128x64xbf16> loc(#loc842) + %dv_137 = tt.dot %dv_136, %do_135, %dv_113, inputPrecision = tf32 : tensor<128x64xbf16> * tensor<64x128xbf16> -> tensor<128x128xf32> loc(#loc843) + %Di = tt.splat %DELTA1 : !tt.ptr -> tensor<64x!tt.ptr> loc(#loc844) + %Di_138 = tt.addptr %Di, %offs_m1_114 : tensor<64x!tt.ptr>, tensor<64xi32> loc(#loc844) + %Di_139 = tt.load %Di_138, %lse_121 : tensor<64x!tt.ptr> loc(#loc845) + %dpT = tt.trans %do_135 {order = array} : tensor<64x128xbf16> -> tensor<128x64xbf16> loc(#loc846) + %dpT_140 = tt.dot %v, %dpT, %cst_11, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc847) + %dsT = 
tt.expand_dims %Di_139 {axis = 0 : i32} : tensor<64xf32> -> tensor<1x64xf32> loc(#loc848) + %dsT_141 = tt.broadcast %dsT : tensor<1x64xf32> -> tensor<128x64xf32> loc(#loc849) + %dsT_142 = arith.subf %dpT_140, %dsT_141 : tensor<128x64xf32> loc(#loc849) + %dsT_143 = arith.mulf %pT_131, %dsT_142 : tensor<128x64xf32> loc(#loc850) + %grad_scores = arith.select %qT_119, %dsT_143, %cst_11 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc851) + %dk_144 = arith.truncf %grad_scores : tensor<128x64xf32> to tensor<128x64xbf16> loc(#loc852) + %dk_145 = tt.trans %qT_120 {order = array} : tensor<128x64xbf16> -> tensor<64x128xbf16> loc(#loc853) + %dk_146 = tt.dot %dk_144, %dk_145, %dk_112, inputPrecision = tf32 : tensor<128x64xbf16> * tensor<64x128xbf16> -> tensor<128x128xf32> loc(#loc854) + %cur_block_idx = arith.divsi %start_m, %c2_i32 : i32 loc(#loc1000) + %cur_block = tt.addptr %q_indices_92, %cur_block_idx : !tt.ptr, i32 loc(#loc1001) + %cur_block_147 = tt.load %cur_block evictionPolicy = evict_last : !tt.ptr loc(#loc1002) + %next_block = arith.addi %cur_block_idx, %c1_i32 : i32 loc(#loc1003) + %next_block_148 = arith.cmpi slt, %next_block, %sparse_q_num_blocks_96 : i32 loc(#loc1004) + %next_block_149 = tt.addptr %cur_block, %c1_i32 : !tt.ptr, i32 loc(#loc1005) + %next_block_150 = tt.load %next_block_149, %next_block_148 evictionPolicy = evict_last : !tt.ptr loc(#loc1006) + %needs_jump = arith.addi %start_m, %c1_i32 : i32 loc(#loc1007) + %needs_jump_151 = arith.remsi %needs_jump, %c2_i32 : i32 loc(#loc1008) + %needs_jump_152 = arith.cmpi eq, %needs_jump_151, %c0_i32 : i32 loc(#loc1009) + %jump_to_block = arith.subi %next_block_150, %cur_block_147 : i32 loc(#loc1010) + %jump_to_block_153 = arith.muli %jump_to_block, %c128_i32 : i32 loc(#loc1011) + %jump_to_block_154 = arith.subi %jump_to_block_153, %c64_i32 : i32 loc(#loc1012) + %offset = arith.extui %needs_jump_152 : i1 to i32 loc(#loc1013) + %offset_155 = arith.muli %jump_to_block_154, %offset : i32 loc(#loc1013) + 
%offset_156 = arith.subi %c1_i32, %offset : i32 loc(#loc1014) + %offset_157 = arith.muli %offset_156, %c64_i32 : i32 loc(#loc1015) + %offset_158 = arith.addi %offset_155, %offset_157 : i32 loc(#loc1016) + %qT_ptrs_159 = arith.muli %offset_158, %c4096_i32 : i32 loc(#loc856) + %qT_ptrs_160 = tt.splat %qT_ptrs_159 : i32 -> tensor<128x64xi32> loc(#loc857) + %qT_ptrs_161 = tt.addptr %qT_ptrs_115, %qT_ptrs_160 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc857) + %do_ptrs_162 = arith.muli %offset_158, %c128_i32 : i32 loc(#loc858) + %do_ptrs_163 = tt.splat %do_ptrs_162 : i32 -> tensor<64x128xi32> loc(#loc859) + %do_ptrs_164 = tt.addptr %do_ptrs_116, %do_ptrs_163 : tensor<64x128x!tt.ptr>, tensor<64x128xi32> loc(#loc859) + %offs_m1_165 = tt.splat %offset_158 : i32 -> tensor<64xi32> loc(#loc860) + %offs_m1_166 = arith.addi %offs_m1_114, %offs_m1_165 : tensor<64xi32> loc(#loc860) + scf.yield %dk_146, %dv_137, %offs_m1_166, %qT_ptrs_161, %do_ptrs_164 : tensor<128x128xf32>, tensor<128x128xf32>, tensor<64xi32>, tensor<128x64x!tt.ptr>, tensor<64x128x!tt.ptr> loc(#loc664) + } loc(#loc1033) + scf.yield %do_ptrs_111#1, %do_ptrs_111#0 : tensor<128x128xf32>, tensor<128x128xf32> loc(#loc324) + } loc(#loc734) + %dv_ptrs = tt.splat %DV : !tt.ptr -> tensor<128x1x!tt.ptr> loc(#loc665) + %dv_ptrs_43 = tt.addptr %dv_ptrs, %ptr_29 : tensor<128x1x!tt.ptr>, tensor<128x1xi32> loc(#loc665) + %dv_ptrs_44 = tt.broadcast %dv_ptrs_43 : tensor<128x1x!tt.ptr> -> tensor<128x128x!tt.ptr> loc(#loc666) + %dv_ptrs_45 = tt.addptr %dv_ptrs_44, %ptr_34 : tensor<128x128x!tt.ptr>, tensor<128x128xi32> loc(#loc666) + %12 = arith.cmpi slt, %ptr_32, %cst_16 : tensor<1x128xi32> loc(#loc327) + %13 = tt.broadcast %12 : tensor<1x128xi1> -> tensor<128x128xi1> loc(#loc328) + %14 = arith.andi %k_37, %13 : tensor<128x128xi1> loc(#loc328) + %15 = arith.truncf %dk#0 : tensor<128x128xf32> to tensor<128x128xbf16> loc(#loc329) + tt.store %dv_ptrs_45, %15, %14 : tensor<128x128x!tt.ptr> loc(#loc329) + %dk_46 = arith.mulf 
%dk#1, %cst_17 : tensor<128x128xf32> loc(#loc667) + %xindex = tt.broadcast %ptr_29 : tensor<128x1xi32> -> tensor<128x128xi32> loc(#loc668) + %xindex_47 = arith.addi %ptr_34, %xindex : tensor<128x128xi32> loc(#loc668) + %xindex_48 = arith.muli %off_hkv, %c128_i32 : i32 loc(#loc669) + %xindex_49 = arith.muli %xindex_48, %ks1 : i32 loc(#loc670) + %xindex_50 = tt.splat %xindex_49 : i32 -> tensor<128x128xi32> loc(#loc671) + %xindex_51 = arith.addi %xindex_47, %xindex_50 : tensor<128x128xi32> loc(#loc671) + %xindex_52 = arith.muli %off_zq, %c1024_i32 : i32 loc(#loc672) + %xindex_53 = arith.muli %xindex_52, %ks1 : i32 loc(#loc673) + %xindex_54 = tt.splat %xindex_53 : i32 -> tensor<128x128xi32> loc(#loc674) + %xindex_55 = arith.addi %xindex_51, %xindex_54 : tensor<128x128xi32> loc(#loc674) + %16 = tt.splat %out_ptr0 : !tt.ptr -> tensor<128x128x!tt.ptr> loc(#loc338) + %17 = tt.addptr %16, %xindex_55 : tensor<128x128x!tt.ptr>, tensor<128x128xi32> loc(#loc338) + %18 = arith.truncf %dk_46 : tensor<128x128xf32> to tensor<128x128xbf16> loc(#loc339) + tt.store %17, %18, %k_37 : tensor<128x128x!tt.ptr> loc(#loc339) + } loc(#loc33) + tt.return loc(#loc340) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":103:9) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":94:54) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":95:54) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":95:63) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":97:74) +#loc7 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":97:66) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":97:100) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":97:91) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":97:82) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":97:59) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":97:111) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":111:24) +#loc14 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":41:22) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":112:36) +#loc16 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":41:28) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":113:34) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":115:27) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":116:28) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":117:23) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":124:25) +#loc22 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":124:47) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":124:35) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":124:59) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":128:50) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":128:37) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":128:61) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":131:9) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":132:9) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":133:10) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":136:26) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":139:14) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":139:7) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":140:24) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":144:29) +#loc36 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":144:54) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":144:44) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":145:35) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":148:30) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":154:55) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":154:78) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":155:50) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":155:83) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":155:68) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":158:30) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":158:52) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":158:40) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":158:63) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":159:32) +#loc50 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":159:55) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":159:42) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":159:66) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":161:30) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":161:35) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":161:46) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":161:56) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":163:17) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":164:19) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":167:19) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":168:21) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":169:25) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":174:36) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":175:29) +#loc64 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":825:27) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":178:107) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":825:38) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":825:20) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":825:56) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":825:49) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":833:52) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":833:23) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":179:111) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":188:58) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":188:34) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":188:25) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":189:33) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":189:26) +#loc78 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":190:30) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":190:50) +#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":191:18) +#loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":195:30) +#loc82 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":196:27) +#loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":196:41) +#loc84 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":197:53) +#loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":197:39) +#loc86 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":199:42) +#loc87 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":199:29) +#loc88 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":390:26) +#loc89 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":207:12) +#loc90 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":390:37) +#loc91 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":390:18) +#loc92 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":390:56) +#loc93 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":390:49) +#loc94 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":391:18) +#loc95 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":391:49) +#loc96 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":395:43) +#loc97 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":395:90) +#loc98 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":395:101) +#loc99 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":395:63) +#loc100 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":397:28) +#loc101 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":831:41) +#loc102 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":458:105) +#loc103 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":405:12) +#loc104 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":831:52) +#loc105 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":831:23) +#loc106 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":459:19) +#loc107 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":461:14) +#loc108 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":798:21) +#loc109 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":464:46) +#loc110 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":467:46) +#loc111 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":476:79) +#loc112 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":482:23) +#loc113 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":483:23) +#loc114 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":485:34) +#loc115 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":485:23) +#loc116 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":486:22) +#loc117 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":487:23) +#loc118 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":488:23) +#loc119 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":489:23) +#loc120 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":490:23) +#loc121 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":493:24) +#loc122 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":494:24) +#loc123 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":496:25) +#loc124 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":497:92) +#loc125 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":498:92) +#loc126 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":499:25) +#loc127 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":500:24) +#loc128 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":501:24) +#loc129 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":502:39) +#loc130 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":503:25) +#loc131 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":504:24) +#loc132 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":505:24) +#loc133 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":506:23) +#loc134 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":507:25) +#loc135 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":508:25) +#loc136 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":509:92) +#loc137 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":510:25) +#loc138 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":511:24) +#loc139 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":512:24) +#loc140 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":513:39) +#loc141 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":514:25) +#loc142 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":515:24) +#loc143 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":516:24) +#loc144 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":521:69) +#loc145 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":524:27) +#loc146 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":525:39) +#loc147 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":525:21) +#loc148 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":528:104) +#loc149 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":530:20) +#loc150 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":531:22) +#loc151 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":531:19) +#loc152 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":531:14) +#loc153 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":538:71) +#loc154 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":549:43) +#loc155 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":551:15) +#loc156 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":553:30) +#loc157 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":553:21) +#loc158 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":788:33) +#loc159 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":411:64) +#loc160 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":789:38) +#loc161 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":789:24) +#loc162 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":790:109) +#loc163 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":790:113) +#loc164 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":790:55) +#loc165 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":790:25) +#loc166 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":791:30) +#loc167 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":791:35) +#loc168 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":791:60) +#loc169 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":792:34) +#loc170 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":792:48) +#loc171 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":792:63) +#loc172 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":793:29) +#loc173 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":793:47) +#loc174 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":793:61) +#loc175 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":793:42) +#loc176 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":414:28) +#loc177 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":414:19) +#loc178 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":415:19) +#loc179 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":417:19) +#loc180 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":417:8) +#loc181 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":214:39) +#loc182 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":215:31) +#loc183 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":215:45) +#loc184 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":216:62) +#loc185 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":216:43) +#loc186 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":218:33) +#loc187 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":226:16) +#loc188 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":231:24) +#loc189 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":231:56) +#loc190 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":232:14) +#loc191 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":236:87) +#loc192 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":236:69) +#loc193 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":236:30) +#loc194 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":245:29) +#loc195 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":252:25) +#loc196 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":253:29) +#loc197 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":256:107) +#loc198 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":257:107) +#loc199 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":262:30) +#loc200 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":263:32) +#loc201 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":263:51) +#loc202 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":266:34) +#loc203 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":266:56) +#loc204 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":266:44) +#loc205 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":266:67) +#loc206 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":267:36) +#loc207 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":267:59) +#loc208 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":267:46) +#loc209 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":267:70) +#loc210 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":269:34) +#loc211 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":269:39) +#loc212 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":269:50) +#loc213 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":269:60) +#loc214 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":271:21) +#loc215 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":272:23) +#loc216 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":275:25) +#loc217 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":276:29) +#loc218 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":281:58) +#loc219 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":281:80) +#loc220 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":282:53) +#loc221 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":282:81) +#loc222 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":282:70) +#loc223 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":286:32) +#loc224 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":287:30) +#loc225 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":287:43) +#loc226 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":288:55) +#loc227 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":288:42) +#loc228 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":290:45) +#loc229 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":290:32) +#loc230 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":601:26) +#loc231 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":298:16) +#loc232 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":601:37) +#loc233 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":601:18) +#loc234 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":601:56) +#loc235 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":601:49) +#loc236 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":602:27) +#loc237 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":602:38) +#loc238 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":602:19) +#loc239 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":602:51) +#loc240 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":608:42) +#loc241 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":608:87) +#loc242 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":608:98) +#loc243 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":608:61) +#loc244 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":610:28) +#loc245 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":669:105) +#loc246 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":618:12) +#loc247 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":674:52) +#loc248 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":674:28) +#loc249 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":674:22) +#loc250 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":675:26) +#loc251 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":675:46) +#loc252 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":676:20) +#loc253 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":678:15) +#loc254 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":680:46) +#loc255 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":683:46) +#loc256 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":692:78) +#loc257 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":698:25) +#loc258 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":699:25) +#loc259 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":701:35) +#loc260 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":701:24) +#loc261 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":702:24) +#loc262 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":703:25) +#loc263 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":704:24) +#loc264 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":705:24) +#loc265 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":706:24) +#loc266 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":709:25) +#loc267 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":710:25) +#loc268 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":712:25) +#loc269 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":713:92) +#loc270 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":714:92) +#loc271 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":715:25) +#loc272 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":716:24) +#loc273 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":717:24) +#loc274 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":718:39) +#loc275 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":719:25) +#loc276 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":720:24) +#loc277 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":721:24) +#loc278 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":722:24) +#loc279 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":723:25) +#loc280 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":724:25) +#loc281 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":725:92) +#loc282 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":726:25) +#loc283 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":727:24) +#loc284 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":728:24) +#loc285 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":729:39) +#loc286 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":730:25) +#loc287 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":731:24) +#loc288 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":732:24) +#loc289 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":736:69) +#loc290 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":739:27) +#loc291 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":740:44) +#loc292 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":740:40) +#loc293 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":740:22) +#loc294 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":833:41) +#loc295 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":741:99) +#loc296 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":744:24) +#loc297 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":744:43) +#loc298 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":748:29) +#loc299 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":748:21) +#loc300 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":750:29) +#loc301 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":750:20) +#loc302 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":751:25) +#loc303 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":751:22) +#loc304 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":751:16) +#loc305 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":759:70) +#loc306 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":773:45) +#loc307 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":775:24) +#loc308 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":775:52) +#loc309 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":775:43) +#loc310 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":623:62) +#loc311 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":626:28) +#loc312 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":626:19) +#loc313 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":627:28) +#loc314 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":627:19) +#loc315 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":628:19) +#loc316 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":628:8) +#loc317 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":306:41) +#loc318 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":307:34) +#loc319 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":307:47) +#loc320 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":308:64) +#loc321 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":308:46) +#loc322 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":310:36) +#loc323 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":318:20) +#loc324 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":303:12) +#loc325 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":323:23) +#loc326 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":323:55) +#loc327 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":332:71) +#loc328 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":332:61) +#loc329 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":332:30) +#loc330 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":334:14) +#loc331 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":344:27) +#loc332 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":344:45) +#loc333 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":344:53) +#loc334 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":344:41) +#loc335 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":344:64) +#loc336 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":344:71) +#loc337 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":344:59) +#loc338 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":345:29) +#loc339 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":345:69) +#loc340 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/tv/ctvn756awhqumwxc7nh6h7s3zs4orgewod2ud6h3ic4vw5kk5m4e.py":139:4) +#loc368 = loc("HQ"(#loc2)) +#loc369 = loc("pid"(#loc13)) +#loc370 = loc("NUM_KV_BLOCKS"(#loc15)) +#loc371 = loc("NUM_Q_BLOCKS"(#loc17)) +#loc372 = loc("off_zq"(#loc18)) +#loc373 = loc("off_hkv"(#loc19)) +#loc374 = loc("off_zkv"(#loc20)) +#loc375 = loc("k_adj"(#loc21)) +#loc376 = loc("k_adj"(#loc22)) +#loc377 = loc("k_adj"(#loc23)) +#loc378 = loc("k_adj"(#loc24)) +#loc379 = loc("dv_adj"(#loc25)) +#loc380 = loc("dv_adj"(#loc26)) +#loc381 = loc("dv_adj"(#loc27)) +#loc382 = 
loc("K"(#loc28)) +#loc383 = loc("V"(#loc29)) +#loc384 = loc("DV"(#loc30)) +#loc385 = loc("offs_k"(#loc31)) +#loc386 = loc("off_pid"(#loc34)) +#loc387 = loc("off_hq2"(#loc35)) +#loc388 = loc("off_hq2"(#loc36)) +#loc389 = loc("off_hq2"(#loc37)) +#loc390 = loc("start_m2_block"(#loc38)) +#loc391 = loc("stride_kv_idx_h"(#loc39)) +#loc392 = loc("sparse_kv_num_blks_offset"(#loc40)) +#loc393 = loc("sparse_kv_num_blks_offset"(#loc41)) +#loc394 = loc("sparse_kv_idx_offset"(#loc42)) +#loc395 = loc("sparse_kv_idx_offset"(#loc43)) +#loc396 = loc("sparse_kv_idx_offset"(#loc44)) +#loc397 = loc("q_adj2"(#loc45)) +#loc398 = loc("q_adj2"(#loc46)) +#loc399 = loc("q_adj2"(#loc47)) +#loc400 = loc("q_adj2"(#loc48)) +#loc401 = loc("do_adj2"(#loc49)) +#loc402 = loc("do_adj2"(#loc50)) +#loc403 = loc("do_adj2"(#loc51)) +#loc404 = loc("do_adj2"(#loc52)) +#loc405 = loc("off_chz2"(#loc53)) +#loc406 = loc("off_chz2"(#loc54)) +#loc407 = loc("off_chz2"(#loc55)) +#loc408 = loc("off_chz2"(#loc56)) +#loc409 = loc("Q2"(#loc57)) +#loc410 = loc("DO2"(#loc58)) +#loc411 = loc("DQ2"(#loc59)) +#loc412 = loc("LSE2"(#loc60)) +#loc413 = loc("DELTA2"(#loc61)) +#loc414 = loc("start_m2"(#loc62)) +#loc415 = loc("offs_m2"(#loc63)) +#loc416 = loc("ptr"(#loc64)) +#loc417 = loc("q"(#loc65)) +#loc418 = loc("ptr"(#loc66)) +#loc419 = loc("ptr"(#loc67)) +#loc420 = loc("ptr"(#loc68)) +#loc421 = loc("ptr"(#loc69)) +#loc422 = loc("do"(#loc72)) +#loc423 = loc("Di"(#loc73)) +#loc424 = loc("Di"(#loc74)) +#loc425 = loc("Di"(#loc75)) +#loc426 = loc("lse"(#loc76)) +#loc427 = loc("lse"(#loc77)) +#loc428 = loc("lse"(#loc78)) +#loc429 = loc("lse"(#loc79)) +#loc430 = loc("lse"(#loc80)) +#loc431 = loc("kv_indices"(#loc81)) +#loc432 = loc("kv_start"(#loc82)) +#loc433 = loc("kv_start"(#loc83)) +#loc434 = loc("sparse_kv_num_blocks"(#loc84)) +#loc435 = loc("sparse_kv_num_blocks"(#loc85)) +#loc436 = loc("offs_n2"(#loc86)) +#loc437 = loc("offs_n2"(#loc87)) +#loc438 = loc("kT_ptrs"(#loc88)) +#loc439 = loc("dq"(#loc89)) +#loc440 = 
loc("kT_ptrs"(#loc90)) +#loc441 = loc("kT_ptrs"(#loc91)) +#loc442 = loc("kT_ptrs"(#loc92)) +#loc443 = loc("kT_ptrs"(#loc93)) +#loc444 = loc("vT_ptrs"(#loc94)) +#loc445 = loc("vT_ptrs"(#loc95)) +#loc446 = loc("hi"(#loc96)) +#loc447 = loc("hi"(#loc97)) +#loc448 = loc("hi"(#loc98)) +#loc449 = loc("hi"(#loc99)) +#loc450 = loc("dq"(#loc100)) +#loc451 = loc("kT"(#loc102)) +#loc452 = loc("dq"(#loc103)) +#loc453 = loc("qk"(#loc106)) +#loc454 = loc("qk"(#loc107)) +#loc455 = loc("n"(#loc109)) +#loc456 = loc("m"(#loc110)) +#loc457 = loc("post_mod_scores"(#loc111)) +#loc458 = loc("tmp4"(#loc112)) +#loc459 = loc("tmp5"(#loc113)) +#loc460 = loc("tmp7"(#loc114)) +#loc461 = loc("tmp7"(#loc115)) +#loc462 = loc("tmp8"(#loc116)) +#loc463 = loc("tmp9"(#loc117)) +#loc464 = loc("tmp10"(#loc118)) +#loc465 = loc("tmp11"(#loc119)) +#loc466 = loc("tmp12"(#loc120)) +#loc467 = loc("tmp15"(#loc121)) +#loc468 = loc("tmp16"(#loc122)) +#loc469 = loc("tmp18"(#loc123)) +#loc470 = loc("tmp19"(#loc124)) +#loc471 = loc("tmp20"(#loc125)) +#loc472 = loc("tmp21"(#loc126)) +#loc473 = loc("tmp22"(#loc127)) +#loc474 = loc("tmp23"(#loc128)) +#loc475 = loc("tmp24"(#loc129)) +#loc476 = loc("tmp25"(#loc130)) +#loc477 = loc("tmp26"(#loc131)) +#loc478 = loc("tmp27"(#loc132)) +#loc479 = loc("tmp28"(#loc133)) +#loc480 = loc("tmp29"(#loc134)) +#loc481 = loc("tmp30"(#loc135)) +#loc482 = loc("tmp31"(#loc136)) +#loc483 = loc("tmp32"(#loc137)) +#loc484 = loc("tmp33"(#loc138)) +#loc485 = loc("tmp34"(#loc139)) +#loc486 = loc("tmp35"(#loc140)) +#loc487 = loc("tmp36"(#loc141)) +#loc488 = loc("tmp37"(#loc142)) +#loc489 = loc("tmp38"(#loc143)) +#loc490 = loc("post_mod_scores"(#loc144)) +#loc491 = loc("post_mod_scores"(#loc145)) +#loc492 = loc("p"(#loc146)) +#loc493 = loc("p"(#loc147)) +#loc494 = loc("vT"(#loc148)) +#loc495 = loc("dp"(#loc149)) +#loc496 = loc("ds"(#loc150)) +#loc497 = loc("ds"(#loc151)) +#loc498 = loc("ds"(#loc152)) +#loc499 = loc("grad_scores"(#loc153)) +#loc500 = loc("ds"(#loc154)) +#loc501 = 
loc("ds"(#loc155)) +#loc502 = loc("dq"(#loc156)) +#loc503 = loc("dq"(#loc157)) +#loc504 = loc("cur_block_idx"(#loc158)) +#loc505 = loc("offset"(#loc159)) +#loc506 = loc("cur_block"(#loc160)) +#loc507 = loc("cur_block"(#loc161)) +#loc508 = loc("next_block"(#loc162)) +#loc509 = loc("next_block"(#loc163)) +#loc510 = loc("next_block"(#loc164)) +#loc511 = loc("next_block"(#loc165)) +#loc512 = loc("needs_jump"(#loc166)) +#loc513 = loc("needs_jump"(#loc167)) +#loc514 = loc("needs_jump"(#loc168)) +#loc515 = loc("jump_to_block"(#loc169)) +#loc516 = loc("jump_to_block"(#loc170)) +#loc517 = loc("jump_to_block"(#loc171)) +#loc518 = loc("offset"(#loc172)) +#loc519 = loc("offset"(#loc173)) +#loc520 = loc("offset"(#loc174)) +#loc521 = loc("offset"(#loc175)) +#loc522 = loc("kT_ptrs"(#loc176)) +#loc523 = loc("kT_ptrs"(#loc177)) +#loc524 = loc("vT_ptrs"(#loc178)) +#loc525 = loc("offs_n2"(#loc179)) +#loc526 = loc("kv_indices"(#loc181)) +#loc527 = loc("kv_start"(#loc182)) +#loc528 = loc("kv_start"(#loc183)) +#loc529 = loc("sparse_kv_num_blocks"(#loc184)) +#loc530 = loc("sparse_kv_num_blocks"(#loc185)) +#loc531 = loc("offs_n2"(#loc186)) +#loc532 = loc("dq"(#loc187)) +#loc533 = loc("dq_ptrs"(#loc188)) +#loc534 = loc("dq_ptrs"(#loc189)) +#loc535 = loc("dq"(#loc190)) +#loc536 = loc("stride_q_idx_h"(#loc194)) +#loc537 = loc("start_n1"(#loc195)) +#loc538 = loc("offs_n1"(#loc196)) +#loc539 = loc("k"(#loc197)) +#loc540 = loc("v"(#loc198)) +#loc541 = loc("dv"(#loc199)) +#loc542 = loc("off_hq1"(#loc200)) +#loc543 = loc("off_hq1"(#loc201)) +#loc544 = loc("q_adj1"(#loc202)) +#loc545 = loc("q_adj1"(#loc203)) +#loc546 = loc("q_adj1"(#loc204)) +#loc547 = loc("q_adj1"(#loc205)) +#loc548 = loc("do_adj1"(#loc206)) +#loc549 = loc("do_adj1"(#loc207)) +#loc550 = loc("do_adj1"(#loc208)) +#loc551 = loc("do_adj1"(#loc209)) +#loc552 = loc("off_chz1"(#loc210)) +#loc553 = loc("off_chz1"(#loc211)) +#loc554 = loc("off_chz1"(#loc212)) +#loc555 = loc("off_chz1"(#loc213)) +#loc556 = loc("Q1"(#loc214)) +#loc557 = 
loc("DO1"(#loc215)) +#loc558 = loc("LSE1"(#loc216)) +#loc559 = loc("DELTA1"(#loc217)) +#loc560 = loc("sparse_q_num_blks_offset"(#loc218)) +#loc561 = loc("sparse_q_num_blks_offset"(#loc219)) +#loc562 = loc("sparse_q_idx_offset"(#loc220)) +#loc563 = loc("sparse_q_idx_offset"(#loc221)) +#loc564 = loc("sparse_q_idx_offset"(#loc222)) +#loc565 = loc("q_indices"(#loc223)) +#loc566 = loc("q_start"(#loc224)) +#loc567 = loc("q_start"(#loc225)) +#loc568 = loc("sparse_q_num_blocks"(#loc226)) +#loc569 = loc("sparse_q_num_blocks"(#loc227)) +#loc570 = loc("offs_m1"(#loc228)) +#loc571 = loc("offs_m1"(#loc229)) +#loc572 = loc("qT_ptrs"(#loc230)) +#loc573 = loc("qT_ptrs"(#loc232)) +#loc574 = loc("qT_ptrs"(#loc233)) +#loc575 = loc("qT_ptrs"(#loc234)) +#loc576 = loc("qT_ptrs"(#loc235)) +#loc577 = loc("do_ptrs"(#loc236)) +#loc578 = loc("do_ptrs"(#loc237)) +#loc579 = loc("do_ptrs"(#loc238)) +#loc580 = loc("do_ptrs"(#loc239)) +#loc581 = loc("hi"(#loc240)) +#loc582 = loc("hi"(#loc241)) +#loc583 = loc("hi"(#loc242)) +#loc584 = loc("hi"(#loc243)) +#loc585 = loc("dk"(#loc244)) +#loc586 = loc("qT"(#loc245)) +#loc587 = loc(callsite(#loc246 at #loc231)) +#loc588 = loc("lse"(#loc247)) +#loc589 = loc("lse"(#loc248)) +#loc590 = loc("lse"(#loc249)) +#loc591 = loc("lse"(#loc250)) +#loc592 = loc("lse"(#loc251)) +#loc593 = loc("qkT"(#loc252)) +#loc594 = loc("qkT"(#loc253)) +#loc595 = loc("m"(#loc254)) +#loc596 = loc("n"(#loc255)) +#loc597 = loc("post_mod_scores"(#loc256)) +#loc598 = loc("tmp44"(#loc257)) +#loc599 = loc("tmp45"(#loc258)) +#loc600 = loc("tmp47"(#loc259)) +#loc601 = loc("tmp47"(#loc260)) +#loc602 = loc("tmp48"(#loc261)) +#loc603 = loc("tmp49"(#loc262)) +#loc604 = loc("tmp50"(#loc263)) +#loc605 = loc("tmp51"(#loc264)) +#loc606 = loc("tmp52"(#loc265)) +#loc607 = loc("tmp55"(#loc266)) +#loc608 = loc("tmp56"(#loc267)) +#loc609 = loc("tmp58"(#loc268)) +#loc610 = loc("tmp59"(#loc269)) +#loc611 = loc("tmp60"(#loc270)) +#loc612 = loc("tmp61"(#loc271)) +#loc613 = loc("tmp62"(#loc272)) +#loc614 = 
loc("tmp63"(#loc273)) +#loc615 = loc("tmp64"(#loc274)) +#loc616 = loc("tmp65"(#loc275)) +#loc617 = loc("tmp66"(#loc276)) +#loc618 = loc("tmp67"(#loc277)) +#loc619 = loc("tmp68"(#loc278)) +#loc620 = loc("tmp69"(#loc279)) +#loc621 = loc("tmp70"(#loc280)) +#loc622 = loc("tmp71"(#loc281)) +#loc623 = loc("tmp72"(#loc282)) +#loc624 = loc("tmp73"(#loc283)) +#loc625 = loc("tmp74"(#loc284)) +#loc626 = loc("tmp75"(#loc285)) +#loc627 = loc("tmp76"(#loc286)) +#loc628 = loc("tmp77"(#loc287)) +#loc629 = loc("tmp78"(#loc288)) +#loc630 = loc("post_mod_scores"(#loc289)) +#loc631 = loc("post_mod_scores"(#loc290)) +#loc632 = loc("pT"(#loc291)) +#loc633 = loc("pT"(#loc292)) +#loc634 = loc("pT"(#loc293)) +#loc635 = loc("do"(#loc295)) +#loc636 = loc("dv"(#loc296)) +#loc637 = loc("dv"(#loc297)) +#loc638 = loc("Di"(#loc298)) +#loc639 = loc("Di"(#loc299)) +#loc640 = loc("dpT"(#loc300)) +#loc641 = loc("dpT"(#loc301)) +#loc642 = loc("dsT"(#loc302)) +#loc643 = loc("dsT"(#loc303)) +#loc644 = loc("dsT"(#loc304)) +#loc645 = loc("grad_scores"(#loc305)) +#loc646 = loc("dsT"(#loc306)) +#loc647 = loc("dk"(#loc307)) +#loc648 = loc("dk"(#loc308)) +#loc649 = loc("dk"(#loc309)) +#loc650 = loc("offset"(#loc310)) +#loc651 = loc("qT_ptrs"(#loc311)) +#loc652 = loc("qT_ptrs"(#loc312)) +#loc653 = loc("do_ptrs"(#loc313)) +#loc654 = loc("do_ptrs"(#loc314)) +#loc655 = loc("offs_m1"(#loc315)) +#loc656 = loc(callsite(#loc316 at #loc231)) +#loc657 = loc("q_indices"(#loc317)) +#loc658 = loc("q_start"(#loc318)) +#loc659 = loc("q_start"(#loc319)) +#loc660 = loc("sparse_q_num_blocks"(#loc320)) +#loc661 = loc("sparse_q_num_blocks"(#loc321)) +#loc662 = loc("offs_m1"(#loc322)) +#loc663 = loc(callsite(#loc246 at #loc323)) +#loc664 = loc(callsite(#loc316 at #loc323)) +#loc665 = loc("dv_ptrs"(#loc325)) +#loc666 = loc("dv_ptrs"(#loc326)) +#loc667 = loc("dk"(#loc330)) +#loc668 = loc("xindex"(#loc331)) +#loc669 = loc("xindex"(#loc332)) +#loc670 = loc("xindex"(#loc333)) +#loc671 = loc("xindex"(#loc334)) +#loc672 = 
loc("xindex"(#loc335)) +#loc673 = loc("xindex"(#loc336)) +#loc674 = loc("xindex"(#loc337)) +#loc675 = loc(callsite(#loc14 at #loc370)) +#loc676 = loc(callsite(#loc16 at #loc370)) +#loc677 = loc(callsite(#loc14 at #loc371)) +#loc678 = loc(callsite(#loc16 at #loc371)) +#loc679 = loc(callsite(#loc416 at #loc417)) +#loc680 = loc(callsite(#loc418 at #loc417)) +#loc681 = loc(callsite(#loc419 at #loc417)) +#loc682 = loc(callsite(#loc420 at #loc417)) +#loc683 = loc(callsite(#loc421 at #loc417)) +#loc684 = loc(callsite(#loc70 at #loc417)) +#loc685 = loc(callsite(#loc71 at #loc417)) +#loc686 = loc(callsite(#loc418 at #loc422)) +#loc687 = loc(callsite(#loc419 at #loc422)) +#loc688 = loc(callsite(#loc421 at #loc422)) +#loc689 = loc(callsite(#loc71 at #loc422)) +#loc690 = loc(callsite(#loc438 at #loc439)) +#loc691 = loc(callsite(#loc440 at #loc439)) +#loc692 = loc(callsite(#loc441 at #loc439)) +#loc693 = loc(callsite(#loc442 at #loc439)) +#loc694 = loc(callsite(#loc443 at #loc439)) +#loc695 = loc(callsite(#loc444 at #loc439)) +#loc696 = loc(callsite(#loc445 at #loc439)) +#loc697 = loc(callsite(#loc446 at #loc439)) +#loc698 = loc(callsite(#loc447 at #loc439)) +#loc699 = loc(callsite(#loc448 at #loc439)) +#loc700 = loc(callsite(#loc449 at #loc439)) +#loc701 = loc("offs_n2"(#loc450)) +#loc702 = loc(callsite(#loc452 at #loc439)) +#loc703 = loc(callsite(#loc505 at #loc439)) +#loc704 = loc(callsite(#loc522 at #loc439)) +#loc705 = loc(callsite(#loc523 at #loc439)) +#loc706 = loc(callsite(#loc524 at #loc439)) +#loc707 = loc(callsite(#loc525 at #loc439)) +#loc708 = loc(callsite(#loc180 at #loc439)) +#loc709 = loc(callsite(#loc438 at #loc532)) +#loc710 = loc(callsite(#loc440 at #loc532)) +#loc711 = loc(callsite(#loc441 at #loc532)) +#loc712 = loc(callsite(#loc443 at #loc532)) +#loc713 = loc(callsite(#loc444 at #loc532)) +#loc714 = loc(callsite(#loc445 at #loc532)) +#loc715 = loc(callsite(#loc446 at #loc532)) +#loc716 = loc(callsite(#loc449 at #loc532)) +#loc717 = loc(callsite(#loc452 at 
#loc532)) +#loc718 = loc(callsite(#loc505 at #loc532)) +#loc719 = loc(callsite(#loc522 at #loc532)) +#loc720 = loc(callsite(#loc523 at #loc532)) +#loc721 = loc(callsite(#loc524 at #loc532)) +#loc722 = loc(callsite(#loc525 at #loc532)) +#loc723 = loc(callsite(#loc180 at #loc532)) +#loc724 = loc(callsite(#loc416 at #loc539)) +#loc725 = loc(callsite(#loc418 at #loc539)) +#loc726 = loc(callsite(#loc419 at #loc539)) +#loc727 = loc(callsite(#loc420 at #loc539)) +#loc728 = loc(callsite(#loc421 at #loc539)) +#loc729 = loc(callsite(#loc70 at #loc539)) +#loc730 = loc(callsite(#loc71 at #loc539)) +#loc731 = loc(callsite(#loc419 at #loc540)) +#loc732 = loc(callsite(#loc421 at #loc540)) +#loc733 = loc(callsite(#loc71 at #loc540)) +#loc734 = loc("dk"(#loc541)) +#loc735 = loc(callsite(#loc572 at #loc231)) +#loc736 = loc(callsite(#loc573 at #loc231)) +#loc737 = loc(callsite(#loc574 at #loc231)) +#loc738 = loc(callsite(#loc575 at #loc231)) +#loc739 = loc(callsite(#loc576 at #loc231)) +#loc740 = loc(callsite(#loc577 at #loc231)) +#loc741 = loc(callsite(#loc578 at #loc231)) +#loc742 = loc(callsite(#loc579 at #loc231)) +#loc743 = loc(callsite(#loc580 at #loc231)) +#loc744 = loc(callsite(#loc581 at #loc231)) +#loc745 = loc(callsite(#loc582 at #loc231)) +#loc746 = loc(callsite(#loc583 at #loc231)) +#loc747 = loc(callsite(#loc584 at #loc231)) +#loc748 = loc("dv"(#loc585)) +#loc749 = loc(callsite(#loc586 at #loc587)) +#loc750 = loc(callsite(#loc588 at #loc587)) +#loc751 = loc(callsite(#loc589 at #loc587)) +#loc752 = loc(callsite(#loc590 at #loc587)) +#loc753 = loc(callsite(#loc591 at #loc587)) +#loc754 = loc(callsite(#loc592 at #loc587)) +#loc755 = loc(callsite(#loc593 at #loc587)) +#loc756 = loc(callsite(#loc594 at #loc587)) +#loc757 = loc(callsite(#loc595 at #loc587)) +#loc758 = loc(callsite(#loc596 at #loc587)) +#loc759 = loc(callsite(#loc597 at #loc587)) +#loc760 = loc(callsite(#loc598 at #loc587)) +#loc761 = loc(callsite(#loc599 at #loc587)) +#loc762 = loc(callsite(#loc600 at 
#loc587)) +#loc763 = loc(callsite(#loc601 at #loc587)) +#loc764 = loc(callsite(#loc602 at #loc587)) +#loc765 = loc(callsite(#loc603 at #loc587)) +#loc766 = loc(callsite(#loc604 at #loc587)) +#loc767 = loc(callsite(#loc605 at #loc587)) +#loc768 = loc(callsite(#loc606 at #loc587)) +#loc769 = loc(callsite(#loc607 at #loc587)) +#loc770 = loc(callsite(#loc608 at #loc587)) +#loc771 = loc(callsite(#loc609 at #loc587)) +#loc772 = loc(callsite(#loc610 at #loc587)) +#loc773 = loc(callsite(#loc611 at #loc587)) +#loc774 = loc(callsite(#loc612 at #loc587)) +#loc775 = loc(callsite(#loc613 at #loc587)) +#loc776 = loc(callsite(#loc614 at #loc587)) +#loc777 = loc(callsite(#loc615 at #loc587)) +#loc778 = loc(callsite(#loc616 at #loc587)) +#loc779 = loc(callsite(#loc617 at #loc587)) +#loc780 = loc(callsite(#loc618 at #loc587)) +#loc781 = loc(callsite(#loc619 at #loc587)) +#loc782 = loc(callsite(#loc620 at #loc587)) +#loc783 = loc(callsite(#loc621 at #loc587)) +#loc784 = loc(callsite(#loc622 at #loc587)) +#loc785 = loc(callsite(#loc623 at #loc587)) +#loc786 = loc(callsite(#loc624 at #loc587)) +#loc787 = loc(callsite(#loc625 at #loc587)) +#loc788 = loc(callsite(#loc626 at #loc587)) +#loc789 = loc(callsite(#loc627 at #loc587)) +#loc790 = loc(callsite(#loc628 at #loc587)) +#loc791 = loc(callsite(#loc629 at #loc587)) +#loc792 = loc(callsite(#loc630 at #loc587)) +#loc793 = loc(callsite(#loc631 at #loc587)) +#loc794 = loc(callsite(#loc632 at #loc587)) +#loc795 = loc(callsite(#loc633 at #loc587)) +#loc796 = loc(callsite(#loc634 at #loc587)) +#loc797 = loc(callsite(#loc635 at #loc587)) +#loc798 = loc(callsite(#loc636 at #loc587)) +#loc799 = loc(callsite(#loc637 at #loc587)) +#loc800 = loc(callsite(#loc638 at #loc587)) +#loc801 = loc(callsite(#loc639 at #loc587)) +#loc802 = loc(callsite(#loc640 at #loc587)) +#loc803 = loc(callsite(#loc641 at #loc587)) +#loc804 = loc(callsite(#loc642 at #loc587)) +#loc805 = loc(callsite(#loc643 at #loc587)) +#loc806 = loc(callsite(#loc644 at #loc587)) +#loc807 
= loc(callsite(#loc645 at #loc587)) +#loc808 = loc(callsite(#loc646 at #loc587)) +#loc809 = loc(callsite(#loc647 at #loc587)) +#loc810 = loc(callsite(#loc648 at #loc587)) +#loc811 = loc(callsite(#loc649 at #loc587)) +#loc812 = loc(callsite(#loc650 at #loc231)) +#loc813 = loc(callsite(#loc651 at #loc231)) +#loc814 = loc(callsite(#loc652 at #loc231)) +#loc815 = loc(callsite(#loc653 at #loc231)) +#loc816 = loc(callsite(#loc654 at #loc231)) +#loc817 = loc(callsite(#loc655 at #loc231)) +#loc818 = loc(callsite(#loc572 at #loc323)) +#loc819 = loc(callsite(#loc573 at #loc323)) +#loc820 = loc(callsite(#loc574 at #loc323)) +#loc821 = loc(callsite(#loc576 at #loc323)) +#loc822 = loc(callsite(#loc577 at #loc323)) +#loc823 = loc(callsite(#loc578 at #loc323)) +#loc824 = loc(callsite(#loc579 at #loc323)) +#loc825 = loc(callsite(#loc580 at #loc323)) +#loc826 = loc(callsite(#loc581 at #loc323)) +#loc827 = loc(callsite(#loc584 at #loc323)) +#loc828 = loc(callsite(#loc586 at #loc663)) +#loc829 = loc(callsite(#loc588 at #loc663)) +#loc830 = loc(callsite(#loc589 at #loc663)) +#loc831 = loc(callsite(#loc590 at #loc663)) +#loc832 = loc(callsite(#loc591 at #loc663)) +#loc833 = loc(callsite(#loc592 at #loc663)) +#loc834 = loc(callsite(#loc593 at #loc663)) +#loc835 = loc(callsite(#loc594 at #loc663)) +#loc836 = loc(callsite(#loc597 at #loc663)) +#loc837 = loc(callsite(#loc631 at #loc663)) +#loc838 = loc(callsite(#loc632 at #loc663)) +#loc839 = loc(callsite(#loc633 at #loc663)) +#loc840 = loc(callsite(#loc634 at #loc663)) +#loc841 = loc(callsite(#loc635 at #loc663)) +#loc842 = loc(callsite(#loc636 at #loc663)) +#loc843 = loc(callsite(#loc637 at #loc663)) +#loc844 = loc(callsite(#loc638 at #loc663)) +#loc845 = loc(callsite(#loc639 at #loc663)) +#loc846 = loc(callsite(#loc640 at #loc663)) +#loc847 = loc(callsite(#loc641 at #loc663)) +#loc848 = loc(callsite(#loc642 at #loc663)) +#loc849 = loc(callsite(#loc643 at #loc663)) +#loc850 = loc(callsite(#loc644 at #loc663)) +#loc851 = 
loc(callsite(#loc645 at #loc663)) +#loc852 = loc(callsite(#loc647 at #loc663)) +#loc853 = loc(callsite(#loc648 at #loc663)) +#loc854 = loc(callsite(#loc649 at #loc663)) +#loc855 = loc(callsite(#loc650 at #loc323)) +#loc856 = loc(callsite(#loc651 at #loc323)) +#loc857 = loc(callsite(#loc652 at #loc323)) +#loc858 = loc(callsite(#loc653 at #loc323)) +#loc859 = loc(callsite(#loc654 at #loc323)) +#loc860 = loc(callsite(#loc655 at #loc323)) +#loc861 = loc(callsite(#loc14 at #loc698)) +#loc862 = loc(callsite(#loc16 at #loc698)) +#loc863 = loc("kT_ptrs"(#loc701)) +#loc864 = loc(callsite(#loc451 at #loc702)) +#loc865 = loc(callsite(#loc453 at #loc702)) +#loc866 = loc(callsite(#loc454 at #loc702)) +#loc867 = loc(callsite(#loc455 at #loc702)) +#loc868 = loc(callsite(#loc456 at #loc702)) +#loc869 = loc(callsite(#loc457 at #loc702)) +#loc870 = loc(callsite(#loc458 at #loc702)) +#loc871 = loc(callsite(#loc459 at #loc702)) +#loc872 = loc(callsite(#loc460 at #loc702)) +#loc873 = loc(callsite(#loc461 at #loc702)) +#loc874 = loc(callsite(#loc462 at #loc702)) +#loc875 = loc(callsite(#loc463 at #loc702)) +#loc876 = loc(callsite(#loc464 at #loc702)) +#loc877 = loc(callsite(#loc465 at #loc702)) +#loc878 = loc(callsite(#loc466 at #loc702)) +#loc879 = loc(callsite(#loc467 at #loc702)) +#loc880 = loc(callsite(#loc468 at #loc702)) +#loc881 = loc(callsite(#loc469 at #loc702)) +#loc882 = loc(callsite(#loc470 at #loc702)) +#loc883 = loc(callsite(#loc471 at #loc702)) +#loc884 = loc(callsite(#loc472 at #loc702)) +#loc885 = loc(callsite(#loc473 at #loc702)) +#loc886 = loc(callsite(#loc474 at #loc702)) +#loc887 = loc(callsite(#loc475 at #loc702)) +#loc888 = loc(callsite(#loc476 at #loc702)) +#loc889 = loc(callsite(#loc477 at #loc702)) +#loc890 = loc(callsite(#loc478 at #loc702)) +#loc891 = loc(callsite(#loc479 at #loc702)) +#loc892 = loc(callsite(#loc480 at #loc702)) +#loc893 = loc(callsite(#loc481 at #loc702)) +#loc894 = loc(callsite(#loc482 at #loc702)) +#loc895 = loc(callsite(#loc483 at 
#loc702)) +#loc896 = loc(callsite(#loc484 at #loc702)) +#loc897 = loc(callsite(#loc485 at #loc702)) +#loc898 = loc(callsite(#loc486 at #loc702)) +#loc899 = loc(callsite(#loc487 at #loc702)) +#loc900 = loc(callsite(#loc488 at #loc702)) +#loc901 = loc(callsite(#loc489 at #loc702)) +#loc902 = loc(callsite(#loc490 at #loc702)) +#loc903 = loc(callsite(#loc491 at #loc702)) +#loc904 = loc(callsite(#loc492 at #loc702)) +#loc905 = loc(callsite(#loc493 at #loc702)) +#loc906 = loc(callsite(#loc494 at #loc702)) +#loc907 = loc(callsite(#loc495 at #loc702)) +#loc908 = loc(callsite(#loc496 at #loc702)) +#loc909 = loc(callsite(#loc497 at #loc702)) +#loc910 = loc(callsite(#loc498 at #loc702)) +#loc911 = loc(callsite(#loc499 at #loc702)) +#loc912 = loc(callsite(#loc500 at #loc702)) +#loc913 = loc(callsite(#loc501 at #loc702)) +#loc914 = loc(callsite(#loc502 at #loc702)) +#loc915 = loc(callsite(#loc503 at #loc702)) +#loc916 = loc(callsite(#loc504 at #loc703)) +#loc917 = loc(callsite(#loc506 at #loc703)) +#loc918 = loc(callsite(#loc507 at #loc703)) +#loc919 = loc(callsite(#loc508 at #loc703)) +#loc920 = loc(callsite(#loc509 at #loc703)) +#loc921 = loc(callsite(#loc510 at #loc703)) +#loc922 = loc(callsite(#loc511 at #loc703)) +#loc923 = loc(callsite(#loc512 at #loc703)) +#loc924 = loc(callsite(#loc513 at #loc703)) +#loc925 = loc(callsite(#loc514 at #loc703)) +#loc926 = loc(callsite(#loc515 at #loc703)) +#loc927 = loc(callsite(#loc516 at #loc703)) +#loc928 = loc(callsite(#loc517 at #loc703)) +#loc929 = loc(callsite(#loc518 at #loc703)) +#loc930 = loc(callsite(#loc519 at #loc703)) +#loc931 = loc(callsite(#loc520 at #loc703)) +#loc932 = loc(callsite(#loc521 at #loc703)) +#loc933 = loc(callsite(#loc451 at #loc717)) +#loc934 = loc(callsite(#loc453 at #loc717)) +#loc935 = loc(callsite(#loc454 at #loc717)) +#loc936 = loc(callsite(#loc457 at #loc717)) +#loc937 = loc(callsite(#loc491 at #loc717)) +#loc938 = loc(callsite(#loc492 at #loc717)) +#loc939 = loc(callsite(#loc493 at #loc717)) +#loc940 
= loc(callsite(#loc494 at #loc717)) +#loc941 = loc(callsite(#loc495 at #loc717)) +#loc942 = loc(callsite(#loc496 at #loc717)) +#loc943 = loc(callsite(#loc497 at #loc717)) +#loc944 = loc(callsite(#loc498 at #loc717)) +#loc945 = loc(callsite(#loc499 at #loc717)) +#loc946 = loc(callsite(#loc501 at #loc717)) +#loc947 = loc(callsite(#loc502 at #loc717)) +#loc948 = loc(callsite(#loc503 at #loc717)) +#loc949 = loc(callsite(#loc504 at #loc718)) +#loc950 = loc(callsite(#loc506 at #loc718)) +#loc951 = loc(callsite(#loc507 at #loc718)) +#loc952 = loc(callsite(#loc508 at #loc718)) +#loc953 = loc(callsite(#loc509 at #loc718)) +#loc954 = loc(callsite(#loc510 at #loc718)) +#loc955 = loc(callsite(#loc511 at #loc718)) +#loc956 = loc(callsite(#loc512 at #loc718)) +#loc957 = loc(callsite(#loc513 at #loc718)) +#loc958 = loc(callsite(#loc514 at #loc718)) +#loc959 = loc(callsite(#loc515 at #loc718)) +#loc960 = loc(callsite(#loc516 at #loc718)) +#loc961 = loc(callsite(#loc517 at #loc718)) +#loc962 = loc(callsite(#loc518 at #loc718)) +#loc963 = loc(callsite(#loc519 at #loc718)) +#loc964 = loc(callsite(#loc520 at #loc718)) +#loc965 = loc(callsite(#loc521 at #loc718)) +#loc966 = loc(callsite(#loc14 at #loc745)) +#loc967 = loc(callsite(#loc16 at #loc745)) +#loc968 = loc("offs_m1"(#loc748)) +#loc969 = loc(callsite(#loc101 at #loc749)) +#loc970 = loc(callsite(#loc104 at #loc749)) +#loc971 = loc(callsite(#loc105 at #loc749)) +#loc972 = loc(callsite(#loc108 at #loc757)) +#loc973 = loc(callsite(#loc108 at #loc758)) +#loc974 = loc(callsite(#loc294 at #loc797)) +#loc975 = loc(callsite(#loc70 at #loc797)) +#loc976 = loc(callsite(#loc71 at #loc797)) +#loc977 = loc(callsite(#loc504 at #loc812)) +#loc978 = loc(callsite(#loc506 at #loc812)) +#loc979 = loc(callsite(#loc507 at #loc812)) +#loc980 = loc(callsite(#loc508 at #loc812)) +#loc981 = loc(callsite(#loc509 at #loc812)) +#loc982 = loc(callsite(#loc510 at #loc812)) +#loc983 = loc(callsite(#loc511 at #loc812)) +#loc984 = loc(callsite(#loc512 at 
#loc812)) +#loc985 = loc(callsite(#loc513 at #loc812)) +#loc986 = loc(callsite(#loc514 at #loc812)) +#loc987 = loc(callsite(#loc515 at #loc812)) +#loc988 = loc(callsite(#loc516 at #loc812)) +#loc989 = loc(callsite(#loc517 at #loc812)) +#loc990 = loc(callsite(#loc518 at #loc812)) +#loc991 = loc(callsite(#loc519 at #loc812)) +#loc992 = loc(callsite(#loc520 at #loc812)) +#loc993 = loc(callsite(#loc521 at #loc812)) +#loc994 = loc(callsite(#loc101 at #loc828)) +#loc995 = loc(callsite(#loc104 at #loc828)) +#loc996 = loc(callsite(#loc105 at #loc828)) +#loc997 = loc(callsite(#loc294 at #loc841)) +#loc998 = loc(callsite(#loc70 at #loc841)) +#loc999 = loc(callsite(#loc71 at #loc841)) +#loc1000 = loc(callsite(#loc504 at #loc855)) +#loc1001 = loc(callsite(#loc506 at #loc855)) +#loc1002 = loc(callsite(#loc507 at #loc855)) +#loc1003 = loc(callsite(#loc508 at #loc855)) +#loc1004 = loc(callsite(#loc509 at #loc855)) +#loc1005 = loc(callsite(#loc510 at #loc855)) +#loc1006 = loc(callsite(#loc511 at #loc855)) +#loc1007 = loc(callsite(#loc512 at #loc855)) +#loc1008 = loc(callsite(#loc513 at #loc855)) +#loc1009 = loc(callsite(#loc514 at #loc855)) +#loc1010 = loc(callsite(#loc515 at #loc855)) +#loc1011 = loc(callsite(#loc516 at #loc855)) +#loc1012 = loc(callsite(#loc517 at #loc855)) +#loc1013 = loc(callsite(#loc518 at #loc855)) +#loc1014 = loc(callsite(#loc519 at #loc855)) +#loc1015 = loc(callsite(#loc520 at #loc855)) +#loc1016 = loc(callsite(#loc521 at #loc855)) +#loc1017 = loc("vT_ptrs"(#loc863)) +#loc1018 = loc(callsite(#loc101 at #loc864)) +#loc1019 = loc(callsite(#loc104 at #loc864)) +#loc1020 = loc(callsite(#loc105 at #loc864)) +#loc1021 = loc(callsite(#loc108 at #loc867)) +#loc1022 = loc(callsite(#loc108 at #loc868)) +#loc1023 = loc(callsite(#loc105 at #loc906)) +#loc1024 = loc(callsite(#loc101 at #loc933)) +#loc1025 = loc(callsite(#loc104 at #loc933)) +#loc1026 = loc(callsite(#loc105 at #loc933)) +#loc1027 = loc(callsite(#loc105 at #loc940)) +#loc1028 = loc("qT_ptrs"(#loc968)) 
+#loc1029 = loc(callsite(#loc1017 at #loc439)) +#loc1030 = loc(callsite(#loc1017 at #loc532)) +#loc1031 = loc("do_ptrs"(#loc1028)) +#loc1032 = loc(callsite(#loc1031 at #loc231)) +#loc1033 = loc(callsite(#loc1031 at #loc323)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/K4PIF4OEGY47VNRRRSDGBDB6YVIBWTCXSMNXY5DW2PP37PLKOBWQ/__grp__triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.json b/SpecForge-ext/cache/compiled_kernels/triton/7/K4PIF4OEGY47VNRRRSDGBDB6YVIBWTCXSMNXY5DW2PP37PLKOBWQ/__grp__triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.json new file mode 100644 index 0000000000000000000000000000000000000000..826f51bb1d1f1de94bf946f8b2d55ac7367d7f94 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/K4PIF4OEGY47VNRRRSDGBDB6YVIBWTCXSMNXY5DW2PP37PLKOBWQ/__grp__triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/K4PIF4OEGY47VNRRRSDGBDB6YVIBWTCXSMNXY5DW2PP37PLKOBWQ/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.source", "triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/K4PIF4OEGY47VNRRRSDGBDB6YVIBWTCXSMNXY5DW2PP37PLKOBWQ/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ttir", "triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ttgir": 
"/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/K4PIF4OEGY47VNRRRSDGBDB6YVIBWTCXSMNXY5DW2PP37PLKOBWQ/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ttgir", "triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/K4PIF4OEGY47VNRRRSDGBDB6YVIBWTCXSMNXY5DW2PP37PLKOBWQ/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.llir", "triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/K4PIF4OEGY47VNRRRSDGBDB6YVIBWTCXSMNXY5DW2PP37PLKOBWQ/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ptx", "triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/K4PIF4OEGY47VNRRRSDGBDB6YVIBWTCXSMNXY5DW2PP37PLKOBWQ/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.cubin", "triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/K4PIF4OEGY47VNRRRSDGBDB6YVIBWTCXSMNXY5DW2PP37PLKOBWQ/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/K4PIF4OEGY47VNRRRSDGBDB6YVIBWTCXSMNXY5DW2PP37PLKOBWQ/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.cubin b/SpecForge-ext/cache/compiled_kernels/triton/7/K4PIF4OEGY47VNRRRSDGBDB6YVIBWTCXSMNXY5DW2PP37PLKOBWQ/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.cubin new file mode 100644 
index 0000000000000000000000000000000000000000..5cc115cc42346ff264338924e81bc6771a927f11 Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/7/K4PIF4OEGY47VNRRRSDGBDB6YVIBWTCXSMNXY5DW2PP37PLKOBWQ/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/K4PIF4OEGY47VNRRRSDGBDB6YVIBWTCXSMNXY5DW2PP37PLKOBWQ/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.json b/SpecForge-ext/cache/compiled_kernels/triton/7/K4PIF4OEGY47VNRRRSDGBDB6YVIBWTCXSMNXY5DW2PP37PLKOBWQ/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.json new file mode 100644 index 0000000000000000000000000000000000000000..7a9511149dc32a2d666aeaefa4ec5b43ed6533d3 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/K4PIF4OEGY47VNRRRSDGBDB6YVIBWTCXSMNXY5DW2PP37PLKOBWQ/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.json @@ -0,0 +1 @@ +{"hash": "571e82f1c43639fab6318c86608c3ec5501b4c57931b7c7476d3dfbfbd6a706d", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 2, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", 
"tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/K4PIF4OEGY47VNRRRSDGBDB6YVIBWTCXSMNXY5DW2PP37PLKOBWQ/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.llir b/SpecForge-ext/cache/compiled_kernels/triton/7/K4PIF4OEGY47VNRRRSDGBDB6YVIBWTCXSMNXY5DW2PP37PLKOBWQ/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.llir new file mode 100644 index 0000000000000000000000000000000000000000..e8ece7e968c1da9eeeb294b5664d61302b66ec6b --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/K4PIF4OEGY47VNRRRSDGBDB6YVIBWTCXSMNXY5DW2PP37PLKOBWQ/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.llir @@ -0,0 +1,266 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@assertFunc_0 = internal constant [8 x i8] c"unknown\00" +@assertFile_0 = internal constant [114 x i8] c"/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py\00" +@assertMessage_0 = internal constant [90 x i8] c"index out of bounds: 0 <= tmp15 < 1 + (triton_helpers.div_floor_integer(127 + ks1, 128))\00" + +; Function Attrs: noreturn +declare !dbg !5 void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr #0 + +define ptx_kernel void @triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, i64 %5, i64 %6, i32 %7, i32 %8, ptr 
addrspace(1) readnone captures(none) %9, ptr addrspace(1) readnone captures(none) %10) local_unnamed_addr #1 !dbg !9 { + %12 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !10 + %13 = icmp samesign ult i32 %12, 128, !dbg !11 + %14 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !12 + %15 = and i32 %14, 31, !dbg !12 + %16 = zext nneg i32 %12 to i64, !dbg !13 + %17 = mul i64 %5, %16, !dbg !13 + %18 = icmp sgt i32 %8, 0, !dbg !14 + br i1 %18, label %.lr.ph, label %._crit_edge, !dbg !14 + +.lr.ph: ; preds = %11 + %19 = getelementptr i32, ptr addrspace(1) %0, i64 %17 + br i1 %13, label %.lr.ph.split, label %.lr.ph.split.us + +.lr.ph.split.us: ; preds = %.lr.ph, %.lr.ph.split.us + %20 = phi i32 [ %26, %.lr.ph.split.us ], [ 0, %.lr.ph ] + %21 = or disjoint i32 %20, %15, !dbg !15 + %22 = sext i32 %21 to i64, !dbg !16 + %23 = getelementptr i32, ptr addrspace(1) %19, i64 %22, !dbg !17 + %24 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #5, !dbg !18 + %25 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_first.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %23, i64 %24, i1 false) #5, !dbg !18 + %26 = add i32 %20, 32, !dbg !14 + %27 = icmp slt i32 %26, %8, !dbg !14 + br i1 %27, label %.lr.ph.split.us, label %._crit_edge, !dbg !14 + +.lr.ph.split: ; preds = %.lr.ph, %.lr.ph.split + %28 = phi i64 [ %36, %.lr.ph.split ], [ 0, %.lr.ph ] + %29 = phi i32 [ %37, %.lr.ph.split ], [ 0, %.lr.ph ] + %30 = or disjoint i32 %29, %15, !dbg !15 + %31 = icmp slt i32 %30, %8, !dbg !19 + %32 = sext i32 %30 to i64, !dbg !16 + %33 = getelementptr i32, ptr addrspace(1) %19, i64 %32, !dbg !17 + %34 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #5, !dbg !18 + %35 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_first.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", 
"=r,l,l,b"(ptr addrspace(1) %33, i64 %34, i1 %31) #5, !dbg !18 + %narrow16 = select i1 %31, i32 %35, i32 0, !dbg !20 + %spec.select = sext i32 %narrow16 to i64, !dbg !20 + %36 = add i64 %28, %spec.select, !dbg !20 + %37 = add i32 %29, 32, !dbg !14 + %38 = icmp slt i32 %37, %8, !dbg !14 + br i1 %38, label %.lr.ph.split, label %._crit_edge, !dbg !14 + +._crit_edge: ; preds = %.lr.ph.split.us, %.lr.ph.split, %11 + %.lcssa = phi i64 [ 0, %11 ], [ %36, %.lr.ph.split ], [ 0, %.lr.ph.split.us ], !dbg !21 + %extelt.offset = lshr i64 %.lcssa, 32, !dbg !22 + %39 = trunc nuw i64 %extelt.offset to i32, !dbg !22 + %40 = trunc i64 %.lcssa to i32, !dbg !22 + %41 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %40, i32 16, i32 31), !dbg !22 + %42 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %39, i32 16, i32 31), !dbg !22 + %43 = insertelement <2 x i32> poison, i32 %41, i64 0, !dbg !22 + %44 = insertelement <2 x i32> %43, i32 %42, i64 1, !dbg !22 + %45 = bitcast <2 x i32> %44 to i64, !dbg !22 + %46 = add i64 %.lcssa, %45, !dbg !26 + %extelt.offset3 = lshr i64 %46, 32, !dbg !22 + %47 = trunc nuw i64 %extelt.offset3 to i32, !dbg !22 + %48 = trunc i64 %46 to i32, !dbg !22 + %49 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %48, i32 8, i32 31), !dbg !22 + %50 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %47, i32 8, i32 31), !dbg !22 + %51 = insertelement <2 x i32> poison, i32 %49, i64 0, !dbg !22 + %52 = insertelement <2 x i32> %51, i32 %50, i64 1, !dbg !22 + %53 = bitcast <2 x i32> %52 to i64, !dbg !22 + %54 = add i64 %46, %53, !dbg !26 + %extelt.offset4 = lshr i64 %54, 32, !dbg !22 + %55 = trunc nuw i64 %extelt.offset4 to i32, !dbg !22 + %56 = trunc i64 %54 to i32, !dbg !22 + %57 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %56, i32 4, i32 31), !dbg !22 + %58 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %55, i32 4, i32 31), !dbg !22 + %59 = insertelement <2 x i32> poison, i32 %57, i64 0, !dbg !22 + %60 = 
insertelement <2 x i32> %59, i32 %58, i64 1, !dbg !22 + %61 = bitcast <2 x i32> %60 to i64, !dbg !22 + %62 = add i64 %54, %61, !dbg !26 + %extelt.offset5 = lshr i64 %62, 32, !dbg !22 + %63 = trunc nuw i64 %extelt.offset5 to i32, !dbg !22 + %64 = trunc i64 %62 to i32, !dbg !22 + %65 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %64, i32 2, i32 31), !dbg !22 + %66 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %63, i32 2, i32 31), !dbg !22 + %67 = insertelement <2 x i32> poison, i32 %65, i64 0, !dbg !22 + %68 = insertelement <2 x i32> %67, i32 %66, i64 1, !dbg !22 + %69 = bitcast <2 x i32> %68 to i64, !dbg !22 + %70 = add i64 %62, %69, !dbg !26 + %extelt.offset6 = lshr i64 %70, 32, !dbg !22 + %71 = trunc nuw i64 %extelt.offset6 to i32, !dbg !22 + %72 = trunc i64 %70 to i32, !dbg !22 + %73 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %72, i32 1, i32 31), !dbg !22 + %74 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %71, i32 1, i32 31), !dbg !22 + %75 = insertelement <2 x i32> poison, i32 %73, i64 0, !dbg !22 + %76 = insertelement <2 x i32> %75, i32 %74, i64 1, !dbg !22 + %77 = bitcast <2 x i32> %76 to i64, !dbg !22 + %78 = add i64 %70, %77, !dbg !26 + %79 = trunc i64 %78 to i32, !dbg !27 + %80 = getelementptr i32, ptr addrspace(1) %2, i64 %16, !dbg !28 + %81 = and i32 %14, 32, !dbg !29 + %82 = icmp eq i32 %81, 0, !dbg !29 + %83 = and i32 %14, 63, !dbg !29 + %84 = icmp eq i32 %83, 0, !dbg !29 + %85 = and i1 %13, %84, !dbg !29 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %79, ptr addrspace(1) %80, i1 %85) #5, !dbg !29 + %86 = icmp slt i64 %5, 2, !dbg !30 + %87 = icmp sgt i64 %5, 1, !dbg !31 + %88 = select i1 %87, i64 %5, i64 0, !dbg !32 + %89 = zext i1 %86 to i64, !dbg !33 + %90 = add i64 %88, %89, !dbg !34 + %91 = mul i64 %90, %16, !dbg !35 + %92 = add i64 %5, 1, !dbg !36 + %93 = add i64 %6, 127, !dbg !37 + %94 = sdiv i64 %93, 128, !dbg !38 + %95 = and i64 %93, 127, !dbg !42 + %.not 
= icmp ne i64 %95, 0, !dbg !42 + %96 = icmp slt i64 %93, 0, !dbg !43 + %narrow = and i1 %96, %.not, !dbg !44 + %97 = sext i1 %narrow to i64, !dbg !44 + %98 = add nsw i64 %94, %97, !dbg !44 + br i1 %18, label %.lr.ph14, label %._crit_edge15, !dbg !45 + +.lr.ph14: ; preds = %._crit_edge, %119 + %99 = phi i32 [ %131, %119 ], [ 0, %._crit_edge ] + %100 = or disjoint i32 %99, %15, !dbg !46 + %101 = icmp slt i32 %100, %8, !dbg !47 + %102 = sext i32 %100 to i64, !dbg !48 + %103 = add i64 %91, %102, !dbg !48 + %104 = getelementptr i64, ptr addrspace(1) %1, i64 %103, !dbg !49 + %105 = and i1 %13, %101, !dbg !50 + %106 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #5, !dbg !51 + %107 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_first.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %104, i64 %106, i1 %105) #5, !dbg !51 + %108 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #5, !dbg !51 + %109 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_first.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %104, i64 %108, i1 %105) #5, !dbg !51 + %110 = icmp slt i32 %100, %79, !dbg !52 + %sext7 = shl i64 %109, 32, !dbg !53 + %111 = ashr exact i64 %sext7, 32, !dbg !53 + %112 = select i1 %110, i64 %111, i64 %5, !dbg !53 + %113 = icmp slt i64 %112, 0, !dbg !54 + %114 = select i1 %113, i64 %92, i64 0, !dbg !55 + %115 = add i64 %114, %112, !dbg !55 + %116 = icmp slt i64 %115, 0, !dbg !56 + %117 = icmp sgt i64 %115, %98, !dbg !57 + %.not12 = or i1 %116, %117, !dbg !58 + %.not9 = and i1 %105, %.not12, !dbg !59 + br i1 %.not9, label %118, label %119, !dbg !59 + +118: ; preds = %.lr.ph14 + tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 59, ptr nonnull @assertFunc_0, i64 1), !dbg !59 + unreachable, !dbg !59 + 
+119: ; preds = %.lr.ph14 + %sext = shl i64 %107, 32, !dbg !53 + %120 = ashr exact i64 %sext, 32, !dbg !53 + %121 = select i1 %110, i64 %120, i64 %5, !dbg !53 + %122 = icmp slt i64 %121, 0, !dbg !54 + %123 = select i1 %122, i64 %92, i64 0, !dbg !55 + %124 = trunc i64 %109 to i32, !dbg !60 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !59 + %125 = getelementptr i32, ptr addrspace(1) %3, i64 %103, !dbg !61 + %126 = and i1 %82, %105, !dbg !62 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %124, ptr addrspace(1) %125, i1 %126) #5, !dbg !62 + %127 = getelementptr i32, ptr addrspace(1) %4, i64 %121, !dbg !63 + %128 = getelementptr i32, ptr addrspace(1) %127, i64 %123, !dbg !63 + %129 = getelementptr i32, ptr addrspace(1) %128, i64 %16, !dbg !63 + %130 = getelementptr i32, ptr addrspace(1) %129, i64 %17, !dbg !63 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 1, ptr addrspace(1) %130, i1 %126) #5, !dbg !64 + %131 = add i32 %99, 32, !dbg !45 + %132 = icmp slt i32 %131, %8, !dbg !45 + br i1 %132, label %.lr.ph14, label %._crit_edge15, !dbg !45 + +._crit_edge15: ; preds = %119, %._crit_edge + ret void, !dbg !65 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #2 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #3 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #4 + +attributes #0 = { noreturn } +attributes #1 = { "nvvm.reqntid"="64" } +attributes #2 = { mustprogress nocallback nofree nosync nounwind 
speculatable willreturn memory(none) } +attributes #3 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #4 = { convergent nocallback nounwind } +attributes #5 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} +!llvm.ident = !{!4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!5 = !DISubprogram(name: "__assertfail", linkageName: "__assertfail", scope: !6, file: !6, type: !7, spFlags: DISPFlagOptimized) +!6 = !DIFile(filename: "", directory: "") +!7 = !DISubroutineType(cc: DW_CC_normal, types: !8) +!8 = !{} +!9 = distinct !DISubprogram(name: "triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2", linkageName: "triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2", scope: !1, file: !1, line: 18, type: !7, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!10 = !DILocation(line: 22, column: 28, scope: !9) +!11 = !DILocation(line: 24, column: 21, scope: !9) +!12 = !DILocation(line: 25, column: 37, scope: !9) +!13 = !DILocation(line: 35, column: 45, scope: !9) +!14 = !DILocation(line: 29, column: 40, scope: !9) +!15 = !DILocation(line: 30, column: 31, scope: !9) +!16 = !DILocation(line: 35, column: 41, scope: !9) +!17 = !DILocation(line: 35, column: 34, scope: !9) +!18 = !DILocation(line: 35, column: 50, scope: !9) +!19 = !DILocation(line: 31, column: 29, scope: !9) +!20 = !DILocation(line: 39, column: 48, scope: !9) +!21 = !DILocation(line: 28, column: 43, scope: !9) +!22 = !DILocation(line: 291, column: 
36, scope: !23, inlinedAt: !25) +!23 = distinct !DILexicalBlockFile(scope: !9, file: !24, discriminator: 0) +!24 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language") +!25 = !DILocation(line: 40, column: 25, scope: !9) +!26 = !DILocation(line: 261, column: 15, scope: !23, inlinedAt: !25) +!27 = !DILocation(line: 41, column: 19, scope: !9) +!28 = !DILocation(line: 42, column: 25, scope: !9) +!29 = !DILocation(line: 42, column: 36, scope: !9) +!30 = !DILocation(line: 49, column: 60, scope: !9) +!31 = !DILocation(line: 49, column: 86, scope: !9) +!32 = !DILocation(line: 49, column: 77, scope: !9) +!33 = !DILocation(line: 49, scope: !9) +!34 = !DILocation(line: 49, column: 68, scope: !9) +!35 = !DILocation(line: 49, column: 45, scope: !9) +!36 = !DILocation(line: 55, column: 20, scope: !9) +!37 = !DILocation(line: 59, column: 94, scope: !9) +!38 = !DILocation(line: 72, column: 16, scope: !39, inlinedAt: !41) +!39 = distinct !DILexicalBlockFile(scope: !9, file: !40, discriminator: 0) +!40 = !DIFile(filename: "triton_helpers.py", directory: "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime") +!41 = !DILocation(line: 59, column: 100, scope: !9) +!42 = !DILocation(line: 74, column: 34, scope: !39, inlinedAt: !41) +!43 = !DILocation(line: 75, column: 25, scope: !39, inlinedAt: !41) +!44 = !DILocation(line: 75, column: 47, scope: !39, inlinedAt: !41) +!45 = !DILocation(line: 43, column: 40, scope: !9) +!46 = !DILocation(line: 44, column: 31, scope: !9) +!47 = !DILocation(line: 45, column: 29, scope: !9) +!48 = !DILocation(line: 49, column: 41, scope: !9) +!49 = !DILocation(line: 49, column: 34, scope: !9) +!50 = !DILocation(line: 49, column: 103, scope: !9) +!51 = !DILocation(line: 49, column: 93, scope: !9) +!52 = !DILocation(line: 52, column: 22, scope: !9) +!53 = !DILocation(line: 54, column: 37, scope: !9) +!54 = !DILocation(line: 57, column: 24, scope: !9) +!55 = !DILocation(line: 58, 
column: 39, scope: !9) +!56 = !DILocation(line: 59, column: 32, scope: !9) +!57 = !DILocation(line: 59, column: 50, scope: !9) +!58 = !DILocation(line: 59, column: 112, scope: !9) +!59 = !DILocation(line: 59, column: 130, scope: !9) +!60 = !DILocation(line: 50, column: 23, scope: !9) +!61 = !DILocation(line: 61, column: 29, scope: !9) +!62 = !DILocation(line: 61, column: 94, scope: !9) +!63 = !DILocation(line: 62, column: 29, scope: !9) +!64 = !DILocation(line: 62, column: 95, scope: !9) +!65 = !DILocation(line: 43, column: 4, scope: !9) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/K4PIF4OEGY47VNRRRSDGBDB6YVIBWTCXSMNXY5DW2PP37PLKOBWQ/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ptx b/SpecForge-ext/cache/compiled_kernels/triton/7/K4PIF4OEGY47VNRRRSDGBDB6YVIBWTCXSMNXY5DW2PP37PLKOBWQ/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ptx new file mode 100644 index 0000000000000000000000000000000000000000..dda2bb86850f749e7f9e3166157a3be973eba112 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/K4PIF4OEGY47VNRRRSDGBDB6YVIBWTCXSMNXY5DW2PP37PLKOBWQ/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ptx @@ -0,0 +1,640 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2 // -- Begin function triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2 +.extern .func __assertfail +( + .param .b64 __assertfail_param_0, + .param .b64 __assertfail_param_1, + .param .b32 __assertfail_param_2, + .param .b64 __assertfail_param_3, + .param .b64 __assertfail_param_4 +) +.noreturn; +.global .align 1 .b8 assertFunc_0[8] = {117, 110, 107, 110, 111, 119, 110}; +.global .align 1 .b8 assertFile_0[114] = {47, 119, 111, 114, 107, 
115, 112, 97, 99, 101, 47, 104, 97, 110, 114, 117, 105, 47, 83, 112, 101, 99, 70, 111, 114, 103, 101, 45, 101, 120, 116, 47, 99, 97, 99, 104, 101, 47, 99, 111, 109, 112, 105, 108, 101, 100, 95, 107, 101, 114, 110, 101, 108, 115, 47, 108, 55, 47, 99, 108, 55, 117, 111, 121, 111, 52, 114, 50, 113, 106, 54, 98, 119, 106, 103, 121, 102, 112, 50, 113, 111, 105, 115, 122, 122, 120, 119, 52, 120, 109, 120, 111, 108, 98, 112, 101, 109, 54, 97, 52, 102, 117, 106, 109, 108, 108, 105, 118, 111, 115, 46, 112, 121}; +.global .align 1 .b8 assertMessage_0[90] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 49, 53, 32, 60, 32, 49, 32, 43, 32, 40, 116, 114, 105, 116, 111, 110, 95, 104, 101, 108, 112, 101, 114, 115, 46, 100, 105, 118, 95, 102, 108, 111, 111, 114, 95, 105, 110, 116, 101, 103, 101, 114, 40, 49, 50, 55, 32, 43, 32, 107, 115, 49, 44, 32, 32, 49, 50, 56, 41, 41}; + // @triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2 +.visible .entry triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2( + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_1, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_2, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_3, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_4, + .param .u64 
triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_5, + .param .u64 triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_6, + .param .u32 triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_7, + .param .u32 triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_8, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_9, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_10 +) +.reqntid 64 +{ + .reg .pred %p<32>; + .reg .b32 %r<53>; + .reg .b64 %rd<103>; + .loc 1 18 0 // cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py:18:0 + +// %bb.0: + ld.param.b32 %r12, [triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_8]; + ld.param.b64 %rd18, [triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_5]; + ld.param.b64 %rd15, [triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_2]; +$L__tmp0: + .loc 1 22 28 // cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py:22:28 + mov.u32 %r13, %ctaid.x; + .loc 1 25 37 // cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py:25:37 + mov.u32 %r1, %tid.x; + and.b32 %r2, %r1, 31; + .loc 1 35 45 // cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py:35:45 + cvt.u64.u32 %rd1, %r13; + mul.lo.s64 %rd2, %rd18, %rd1; + .loc 1 29 40 // cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py:29:40 + setp.lt.s32 %p2, %r12, 1; + mov.b64 %rd102, 0; + cvt.u32.u64 %r49, %rd1; + shl.b64 %rd100, %rd2, 2; + @%p2 bra $L__BB0_6; +// 
%bb.1: // %.lr.ph + .loc 1 0 40 // cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py:0:40 + ld.param.b64 %rd13, [triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_0]; + .loc 1 24 21 // cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py:24:21 + setp.lt.u32 %p3, %r49, 128; + add.s64 %rd3, %rd13, %rd100; + @%p3 bra $L__BB0_4; + bra.uni $L__BB0_2; +$L__BB0_4: // %.lr.ph.split.preheader + .loc 1 0 21 // cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py:0:21 + mov.b32 %r51, 0; + mov.b64 %rd102, 0; +$L__BB0_5: // %.lr.ph.split + // =>This Inner Loop Header: Depth=1 + .loc 1 31 29 // cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py:31:29 + add.s32 %r20, %r2, %r51; + setp.lt.s32 %p6, %r20, %r12; + .loc 1 35 34 // cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py:35:34 + mad.wide.s32 %rd28, %r20, 4, %rd3; + .loc 1 35 50 // cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py:35:50 + // begin inline asm + mov.u64 %rd27, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd27, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r19, 0x0; + @%p6 ld.global.L1::evict_first.L2::cache_hint.b32 { %r19 }, [ %rd28 + 0 ], %rd27; + // end inline asm + .loc 1 39 48 // cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py:39:48 + selp.b32 %r21, %r19, 0, %p6; + cvt.s64.s32 %rd30, %r21; + add.s64 %rd102, %rd102, %rd30; + .loc 1 29 40 // cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py:29:40 + add.s32 %r51, %r51, 32; + setp.lt.s32 %p7, %r51, %r12; + @%p7 bra $L__BB0_5; + bra.uni $L__BB0_6; +$L__BB0_2: // %.lr.ph.split.us.preheader + .loc 1 0 40 // cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py:0:40 + mov.b32 %r50, 0; +$L__BB0_3: // %.lr.ph.split.us + // =>This Inner Loop Header: Depth=1 + .loc 1 35 41 // cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py:35:41 + add.s32 %r17, %r2, %r50; + .loc 1 35 34 // cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py:35:34 + 
mad.wide.s32 %rd23, %r17, 4, %rd3; + .loc 1 35 50 // cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py:35:50 + // begin inline asm + mov.u64 %rd22, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd22, 1.0; + // end inline asm + mov.pred %p4, 0; + // begin inline asm + mov.u32 %r16, 0x0; + @%p4 ld.global.L1::evict_first.L2::cache_hint.b32 { %r16 }, [ %rd23 + 0 ], %rd22; + // end inline asm + .loc 1 29 40 // cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py:29:40 + add.s32 %r50, %r50, 32; + setp.lt.s32 %p5, %r50, %r12; + @%p5 bra $L__BB0_3; +$L__BB0_6: // %._crit_edge + .loc 1 24 21 // cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py:24:21 + setp.lt.u32 %p10, %r49, 128; +$L__tmp1: + .loc 2 291 36 // standard.py:291:36 @[ cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py:40:25 ] + mov.b64 {_, %r24}, %rd102; + cvt.u32.u64 %r25, %rd102; + shfl.sync.bfly.b32 %r26, %r25, 16, 31, -1; + shfl.sync.bfly.b32 %r27, %r24, 16, 31, -1; + cvt.u64.u32 %rd32, %r26; + cvt.u64.u32 %rd33, %r27; + shl.b64 %rd34, %rd33, 32; + or.b64 %rd35, %rd32, %rd34; + .loc 2 261 15 // standard.py:261:15 @[ cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py:40:25 ] + add.s64 %rd36, %rd102, %rd35; + .loc 2 291 36 // standard.py:291:36 @[ cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py:40:25 ] + mov.b64 {_, %r28}, %rd36; + cvt.u32.u64 %r29, %rd36; + shfl.sync.bfly.b32 %r30, %r29, 8, 31, -1; + shfl.sync.bfly.b32 %r31, %r28, 8, 31, -1; + cvt.u64.u32 %rd37, %r30; + cvt.u64.u32 %rd38, %r31; + shl.b64 %rd39, %rd38, 32; + or.b64 %rd40, %rd37, %rd39; + .loc 2 261 15 // standard.py:261:15 @[ cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py:40:25 ] + add.s64 %rd41, %rd36, %rd40; + .loc 2 291 36 // standard.py:291:36 @[ cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py:40:25 ] + mov.b64 {_, %r32}, %rd41; + cvt.u32.u64 %r33, %rd41; + shfl.sync.bfly.b32 %r34, %r33, 4, 31, -1; + shfl.sync.bfly.b32 %r35, %r32, 4, 31, -1; + cvt.u64.u32 %rd42, %r34; + cvt.u64.u32 
%rd43, %r35; + shl.b64 %rd44, %rd43, 32; + or.b64 %rd45, %rd42, %rd44; + .loc 2 261 15 // standard.py:261:15 @[ cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py:40:25 ] + add.s64 %rd46, %rd41, %rd45; + .loc 2 291 36 // standard.py:291:36 @[ cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py:40:25 ] + mov.b64 {_, %r36}, %rd46; + cvt.u32.u64 %r37, %rd46; + shfl.sync.bfly.b32 %r38, %r37, 2, 31, -1; + shfl.sync.bfly.b32 %r39, %r36, 2, 31, -1; + cvt.u64.u32 %rd47, %r38; + cvt.u64.u32 %rd48, %r39; + shl.b64 %rd49, %rd48, 32; + or.b64 %rd50, %rd47, %rd49; + .loc 2 261 15 // standard.py:261:15 @[ cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py:40:25 ] + add.s64 %rd51, %rd46, %rd50; + .loc 2 291 36 // standard.py:291:36 @[ cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py:40:25 ] + mov.b64 {_, %r40}, %rd51; + cvt.u32.u64 %r41, %rd51; + shfl.sync.bfly.b32 %r42, %r41, 1, 31, -1; + shfl.sync.bfly.b32 %r43, %r40, 1, 31, -1; + cvt.u64.u32 %rd52, %r42; + .loc 2 261 15 // standard.py:261:15 @[ cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py:40:25 ] + add.s64 %rd53, %rd51, %rd52; +$L__tmp2: + .loc 1 41 19 // cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py:41:19 + cvt.u32.u64 %r22, %rd53; + .loc 1 42 25 // cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py:42:25 + shl.b64 %rd54, %rd1, 2; + add.s64 %rd31, %rd15, %rd54; + .loc 1 42 36 // cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py:42:36 + and.b32 %r44, %r1, 63; + setp.eq.b32 %p11, %r44, 0; + and.pred %p8, %p10, %p11; + // begin inline asm + @%p8 st.global.b32 [ %rd31 + 0 ], { %r22 }; + // end inline asm + .loc 1 43 40 // cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py:43:40 + @%p2 bra $L__BB0_11; +// %bb.7: // %.lr.ph14.preheader + .loc 1 0 40 // cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py:0:40 + ld.param.b64 %rd19, [triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_6]; + ld.param.b64 %rd17, 
[triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_4]; + ld.param.b64 %rd16, [triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_3]; + ld.param.b64 %rd14, [triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_1]; + and.b32 %r8, %r1, 32; + setp.lt.s64 %p12, %rd18, 2; + setp.gt.s64 %p13, %rd18, 1; + selp.b64 %rd55, %rd18, 0, %p13; + selp.b64 %rd56, 1, 0, %p12; + add.s64 %rd57, %rd55, %rd56; + mul.lo.s64 %rd7, %rd57, %rd1; + add.s64 %rd8, %rd18, 1; + add.s64 %rd58, %rd19, 127; + shr.s64 %rd59, %rd58, 63; + shr.u64 %rd60, %rd59, 57; + add.s64 %rd61, %rd58, %rd60; + shr.s64 %rd62, %rd61, 7; + and.b64 %rd63, %rd58, 127; + setp.ne.b64 %p14, %rd63, 0; + setp.lt.s64 %p15, %rd58, 0; + and.pred %p16, %p15, %p14; + selp.b64 %rd64, -1, 0, %p16; + add.s64 %rd9, %rd62, %rd64; + mov.b32 %r52, 0; +$L__BB0_8: // %.lr.ph14 + // =>This Inner Loop Header: Depth=1 + .loc 1 45 29 // cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py:45:29 + add.s32 %r10, %r2, %r52; + setp.lt.s32 %p20, %r10, %r12; + .loc 1 49 41 // cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py:49:41 + cvt.s64.s32 %rd73, %r10; + add.s64 %rd10, %rd7, %rd73; + .loc 1 49 34 // cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py:49:34 + shl.b64 %rd74, %rd10, 3; + add.s64 %rd67, %rd14, %rd74; + .loc 1 49 103 // cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py:49:103 + and.pred %p18, %p10, %p20; + .loc 1 49 93 // cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py:49:93 + // begin inline asm + mov.u64 %rd65, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd65, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd66, 0x0; + @%p18 ld.global.L1::evict_first.L2::cache_hint.b64 { %rd66 }, [ %rd67 + 0 ], %rd65; + // end inline asm + // begin inline asm + mov.u64 %rd69, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd69, 1.0; + // end 
inline asm + // begin inline asm + mov.u64 %rd70, 0x0; + @%p18 ld.global.L1::evict_first.L2::cache_hint.b64 { %rd70 }, [ %rd67 + 0 ], %rd69; + // end inline asm + .loc 1 52 22 // cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py:52:22 + setp.lt.s32 %p21, %r10, %r22; + .loc 1 54 37 // cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py:54:37 + cvt.s64.s32 %rd75, %rd70; + selp.b64 %rd76, %rd75, %rd18, %p21; + .loc 1 58 39 // cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py:58:39 + shr.s64 %rd77, %rd76, 63; + and.b64 %rd78, %rd77, %rd8; + add.s64 %rd79, %rd78, %rd76; + .loc 1 59 32 // cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py:59:32 + setp.lt.s64 %p22, %rd79, 0; + .loc 1 59 50 // cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py:59:50 + setp.gt.s64 %p23, %rd79, %rd9; + .loc 1 59 112 // cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py:59:112 + or.pred %p24, %p22, %p23; + .loc 1 59 130 // cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py:59:130 + and.pred %p25, %p18, %p24; + not.pred %p26, %p25; + @%p26 bra $L__BB0_10; + bra.uni $L__BB0_9; +$L__BB0_10: // in Loop: Header=BB0_8 Depth=1 + .loc 1 42 36 // cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py:42:36 + setp.eq.b32 %p30, %r8, 0; + .loc 1 54 37 // cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py:54:37 + cvt.s64.s32 %rd82, %rd66; + selp.b64 %rd83, %rd82, %rd18, %p21; + .loc 1 58 39 // cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py:58:39 + shr.s64 %rd84, %rd83, 63; + and.b64 %rd85, %rd84, %rd8; + .loc 1 50 23 // cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py:50:23 + cvt.u32.u64 %r47, %rd70; + .loc 1 59 130 // cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py:59:130 + bar.sync 0; + .loc 1 61 29 // cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py:61:29 + shl.b64 %rd86, %rd10, 2; + add.s64 %rd80, %rd16, %rd86; + .loc 1 61 94 // cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py:61:94 + and.pred %p27, %p30, %p18; + // begin 
inline asm + @%p27 st.global.b32 [ %rd80 + 0 ], { %r47 }; + // end inline asm + .loc 1 62 29 // cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py:62:29 + shl.b64 %rd87, %rd83, 2; + add.s64 %rd88, %rd17, %rd87; + shl.b64 %rd89, %rd85, 2; + add.s64 %rd90, %rd88, %rd89; + add.s64 %rd92, %rd90, %rd54; + add.s64 %rd81, %rd92, %rd100; + mov.b32 %r48, 1; + .loc 1 62 95 // cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py:62:95 + // begin inline asm + @%p27 st.global.b32 [ %rd81 + 0 ], { %r48 }; + // end inline asm + .loc 1 43 40 // cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py:43:40 + add.s32 %r52, %r52, 32; + setp.lt.s32 %p31, %r52, %r12; + @%p31 bra $L__BB0_8; +$L__BB0_11: // %._crit_edge15 + .loc 1 43 4 // cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py:43:4 + ret; +$L__BB0_9: + .loc 1 59 130 // cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py:59:130 + { // callseq 0, 0 + .param .b64 param0; + .param .b64 param1; + .param .b32 param2; + .param .b64 param3; + .param .b64 param4; + mov.b64 %rd94, assertFunc_0; + cvta.global.u64 %rd95, %rd94; + st.param.b64 [param3], %rd95; + mov.b64 %rd96, assertFile_0; + cvta.global.u64 %rd97, %rd96; + st.param.b64 [param1], %rd97; + mov.b64 %rd98, assertMessage_0; + cvta.global.u64 %rd99, %rd98; + st.param.b64 [param0], %rd99; + st.param.b64 [param4], 1; + st.param.b32 [param2], 59; + call.uni __assertfail, (param0, param1, param2, param3, param4); + } // callseq 0 + trap; +$L__tmp3: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 
// DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 281 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. 
Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x112 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 108 +.b8 55 +.b8 117 +.b8 111 +.b8 121 +.b8 111 +.b8 52 +.b8 114 +.b8 50 +.b8 113 +.b8 106 +.b8 54 +.b8 98 +.b8 119 +.b8 106 +.b8 103 +.b8 121 +.b8 102 +.b8 112 +.b8 50 +.b8 113 +.b8 111 +.b8 105 +.b8 115 +.b8 122 +.b8 122 +.b8 120 +.b8 119 +.b8 52 +.b8 120 +.b8 109 +.b8 120 +.b8 111 +.b8 108 +.b8 98 +.b8 112 +.b8 101 +.b8 109 +.b8 54 +.b8 97 +.b8 52 +.b8 102 +.b8 117 +.b8 106 +.b8 109 +.b8 108 +.b8 108 +.b8 105 +.b8 118 +.b8 111 +.b8 115 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 108 +.b8 55 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x63 DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 95 +.b8 116 +.b8 111 +.b8 95 +.b8 99 +.b8 111 +.b8 112 +.b8 121 +.b8 95 +.b8 97 +.b8 114 +.b8 97 +.b8 110 +.b8 103 +.b8 101 +.b8 95 +.b8 105 +.b8 110 +.b8 100 +.b8 101 +.b8 120 +.b8 95 +.b8 112 +.b8 117 +.b8 116 +.b8 95 +.b8 108 +.b8 116 +.b8 95 +.b8 110 +.b8 101 +.b8 119 +.b8 95 +.b8 122 +.b8 101 +.b8 114 +.b8 111 +.b8 115 +.b8 95 +.b8 115 +.b8 99 +.b8 97 +.b8 108 +.b8 97 +.b8 114 +.b8 95 +.b8 116 +.b8 101 +.b8 110 +.b8 115 +.b8 111 +.b8 114 +.b8 95 +.b8 115 +.b8 117 +.b8 109 +.b8 95 +.b8 117 +.b8 110 +.b8 115 
+.b8 113 +.b8 117 +.b8 101 +.b8 101 +.b8 122 +.b8 101 +.b8 95 +.b8 118 +.b8 105 +.b8 101 +.b8 119 +.b8 95 +.b8 119 +.b8 104 +.b8 101 +.b8 114 +.b8 101 +.b8 95 +.b8 50 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0xee:0x2e DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0x103:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp2 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 40 // DW_AT_call_line +.b8 25 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/K4PIF4OEGY47VNRRRSDGBDB6YVIBWTCXSMNXY5DW2PP37PLKOBWQ/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.source b/SpecForge-ext/cache/compiled_kernels/triton/7/K4PIF4OEGY47VNRRRSDGBDB6YVIBWTCXSMNXY5DW2PP37PLKOBWQ/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.source new file mode 100644 index 0000000000000000000000000000000000000000..b0f3beedba51c0776355505841d58a2f7f2f1a24 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/K4PIF4OEGY47VNRRRSDGBDB6YVIBWTCXSMNXY5DW2PP37PLKOBWQ/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.source @@ -0,0 +1,379 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":18:0) +#loc77 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0) +#loc79 = loc(unknown) +#loc82 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0) +#loc86 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":69:0) +#loc97 = loc("in_ptr0"(#loc)) +#loc98 = loc("in_ptr1"(#loc)) 
+#loc99 = loc("out_ptr1"(#loc)) +#loc100 = loc("out_ptr2"(#loc)) +#loc101 = loc("out_ptr3"(#loc)) +#loc102 = loc("ks0"(#loc)) +#loc103 = loc("ks1"(#loc)) +#loc104 = loc("xnumel"(#loc)) +#loc105 = loc("r0_numel"(#loc)) +#loc151 = loc("input"(#loc77)) +#loc152 = loc("a"(#loc82)) +#loc153 = loc("b"(#loc82)) +#loc154 = loc("a"(#loc86)) +module { + tt.func public @triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %out_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr3"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 loc("r0_numel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 128 : i32 loc(#loc106) + %xoffset = tt.get_program_id x : i32 loc(#loc107) + %xoffset_1 = arith.constant 1 : i32 loc(#loc108) + %xoffset_2 = arith.constant 1 : i32 loc(#loc108) + %xoffset_3 = arith.muli %xoffset, %xoffset_2 : i32 loc(#loc108) + %xindex = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc109) + %xindex_4 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc110) + %xindex_5 = tt.splat %xoffset_3 : i32 -> tensor<1x1xi32> loc(#loc111) + %xindex_6 = arith.addi %xindex_5, %xindex_4 : tensor<1x1xi32> loc(#loc111) + %xmask = arith.constant dense<128> : tensor<1x1xi32> loc(#loc112) + %xmask_7 = arith.cmpi slt, %xindex_6, %xmask : tensor<1x1xi32> loc(#loc112) + %r0_base = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc113) + %r0_base_8 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<32xi32> -> tensor<1x32xi32> loc(#loc114) + %_tmp3 = arith.constant 0 : i64 loc(#loc115) 
+ %_tmp3_9 = arith.constant dense<0> : tensor<1x32xi64> loc(#loc115) + %c0_i32 = arith.constant 0 : i32 loc(#loc11) + %c32_i32 = arith.constant 32 : i32 loc(#loc11) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc11) + %1 = arith.bitcast %r0_numel : i32 to i32 loc(#loc11) + %2 = arith.bitcast %c32_i32 : i32 to i32 loc(#loc11) + %3 = ub.poison : i32 loc(#loc11) + %_tmp3_10 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp3_14 = %_tmp3_9) -> (tensor<1x32xi64>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x32xi32> loc(#loc117) + %r0_index_15 = arith.addi %r0_index, %r0_base_8 : tensor<1x32xi32> loc(#loc117) + %r0_mask = tt.splat %r0_numel : i32 -> tensor<1x32xi32> loc(#loc118) + %r0_mask_16 = arith.cmpi slt, %r0_index_15, %r0_mask : tensor<1x32xi32> loc(#loc118) + %tmp0 = arith.extsi %xindex_6 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc119) + %tmp0_17 = tt.splat %ks0 : i64 -> tensor<1x1xi64> loc(#loc119) + %tmp0_18 = arith.muli %tmp0_17, %tmp0 : tensor<1x1xi64> loc(#loc119) + %tmp0_19 = arith.extsi %r0_index_15 : tensor<1x32xi32> to tensor<1x32xi64> loc(#loc120) + %tmp0_20 = tt.broadcast %tmp0_18 : tensor<1x1xi64> -> tensor<1x32xi64> loc(#loc120) + %tmp0_21 = arith.addi %tmp0_19, %tmp0_20 : tensor<1x32xi64> loc(#loc120) + %tmp0_22 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x32x!tt.ptr> loc(#loc121) + %tmp0_23 = tt.addptr %tmp0_22, %tmp0_21 : tensor<1x32x!tt.ptr>, tensor<1x32xi64> loc(#loc121) + %tmp0_24 = tt.broadcast %xmask_7 : tensor<1x1xi1> -> tensor<1x32xi1> loc(#loc122) + %tmp0_25 = arith.andi %r0_mask_16, %tmp0_24 : tensor<1x32xi1> loc(#loc122) + %tmp0_26 = arith.constant 0.000000e+00 : f32 loc(#loc123) + %tmp0_27 = arith.constant dense<0.000000e+00> : tensor<1x32xf32> loc(#loc123) + %tmp0_28 = arith.fptosi %tmp0_27 : tensor<1x32xf32> to tensor<1x32xi32> loc(#loc123) + %tmp0_29 = tt.load %tmp0_23, %tmp0_25, %tmp0_28 evictionPolicy = evict_first : tensor<1x32x!tt.ptr> loc(#loc123) + %tmp1 = arith.extsi %tmp0_29 : tensor<1x32xi32> to 
tensor<1x32xi64> loc(#loc124) + %tmp4 = arith.addi %_tmp3_14, %tmp1 : tensor<1x32xi64> loc(#loc125) + %_tmp3_30 = tt.broadcast %xmask_7 : tensor<1x1xi1> -> tensor<1x32xi1> loc(#loc126) + %_tmp3_31 = arith.andi %r0_mask_16, %_tmp3_30 : tensor<1x32xi1> loc(#loc126) + %_tmp3_32 = arith.select %_tmp3_31, %tmp4, %_tmp3_14 : tensor<1x32xi1>, tensor<1x32xi64> loc(#loc127) + scf.yield %_tmp3_32 : tensor<1x32xi64> loc(#loc23) + } loc(#loc116) + %tmp3 = tt.call @"triton.language.standard.sum__i64S1_32S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp3_10) : (tensor<1x32xi64>) -> tensor<1xi64> loc(#loc128) + %tmp3_11 = tt.expand_dims %tmp3 {axis = 1 : i32} : tensor<1xi64> -> tensor<1x1xi64> loc(#loc129) + %tmp5 = arith.trunci %tmp3_11 : tensor<1x1xi64> to tensor<1x1xi32> loc(#loc130) + %4 = tt.splat %out_ptr1 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc27) + %5 = tt.addptr %4, %xindex_6 : tensor<1x1x!tt.ptr>, tensor<1x1xi32> loc(#loc27) + tt.store %5, %tmp5, %xmask_7 : tensor<1x1x!tt.ptr> loc(#loc28) + %c0_i32_12 = arith.constant 0 : i32 loc(#loc29) + %c32_i32_13 = arith.constant 32 : i32 loc(#loc29) + %6 = arith.bitcast %c0_i32_12 : i32 to i32 loc(#loc29) + %7 = arith.bitcast %r0_numel : i32 to i32 loc(#loc29) + %8 = arith.bitcast %c32_i32_13 : i32 to i32 loc(#loc29) + %9 = ub.poison : i32 loc(#loc29) + scf.for %r0_offset = %6 to %7 step %8 : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x32xi32> loc(#loc131) + %r0_index_14 = arith.addi %r0_index, %r0_base_8 : tensor<1x32xi32> loc(#loc131) + %r0_mask = tt.splat %r0_numel : i32 -> tensor<1x32xi32> loc(#loc132) + %r0_mask_15 = arith.cmpi slt, %r0_index_14, %r0_mask : tensor<1x32xi32> loc(#loc132) + %tmp6 = arith.constant 1 : i32 loc(#loc133) + %tmp6_16 = arith.extsi %tmp6 : i32 to i64 loc(#loc133) + %tmp6_17 = arith.cmpi sge, %tmp6_16, %ks0 : i64 loc(#loc133) + %tmp6_18 = arith.constant 1 : i32 loc(#loc134) + %tmp6_19 = arith.constant 1 : i32 loc(#loc134) + %tmp6_20 = arith.extui %tmp6_17 : i1 to i32 
loc(#loc134) + %tmp6_21 = arith.muli %tmp6_19, %tmp6_20 : i32 loc(#loc134) + %tmp6_22 = arith.constant 1 : i32 loc(#loc135) + %tmp6_23 = arith.extsi %tmp6_22 : i32 to i64 loc(#loc135) + %tmp6_24 = arith.cmpi sgt, %ks0, %tmp6_23 : i64 loc(#loc135) + %tmp6_25 = arith.extui %tmp6_24 : i1 to i64 loc(#loc136) + %tmp6_26 = arith.muli %ks0, %tmp6_25 : i64 loc(#loc136) + %tmp6_27 = arith.extsi %tmp6_21 : i32 to i64 loc(#loc137) + %tmp6_28 = arith.addi %tmp6_27, %tmp6_26 : i64 loc(#loc137) + %tmp6_29 = arith.extsi %xindex_6 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc138) + %tmp6_30 = tt.splat %tmp6_28 : i64 -> tensor<1x1xi64> loc(#loc138) + %tmp6_31 = arith.muli %tmp6_29, %tmp6_30 : tensor<1x1xi64> loc(#loc138) + %tmp6_32 = arith.extsi %r0_index_14 : tensor<1x32xi32> to tensor<1x32xi64> loc(#loc139) + %tmp6_33 = tt.broadcast %tmp6_31 : tensor<1x1xi64> -> tensor<1x32xi64> loc(#loc139) + %tmp6_34 = arith.addi %tmp6_32, %tmp6_33 : tensor<1x32xi64> loc(#loc139) + %tmp6_35 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x32x!tt.ptr> loc(#loc140) + %tmp6_36 = tt.addptr %tmp6_35, %tmp6_34 : tensor<1x32x!tt.ptr>, tensor<1x32xi64> loc(#loc140) + %tmp6_37 = tt.broadcast %xmask_7 : tensor<1x1xi1> -> tensor<1x32xi1> loc(#loc141) + %tmp6_38 = arith.andi %r0_mask_15, %tmp6_37 : tensor<1x32xi1> loc(#loc141) + %tmp6_39 = arith.constant 0.000000e+00 : f32 loc(#loc142) + %tmp6_40 = arith.constant dense<0.000000e+00> : tensor<1x32xf32> loc(#loc142) + %tmp6_41 = arith.fptosi %tmp6_40 : tensor<1x32xf32> to tensor<1x32xi64> loc(#loc142) + %tmp6_42 = tt.load %tmp6_36, %tmp6_38, %tmp6_41 evictionPolicy = evict_first : tensor<1x32x!tt.ptr> loc(#loc142) + %tmp7 = arith.trunci %tmp6_42 : tensor<1x32xi64> to tensor<1x32xi32> loc(#loc143) + %tmp9 = tt.broadcast %tmp5 : tensor<1x1xi32> -> tensor<1x32xi32> loc(#loc144) + %tmp9_43 = arith.cmpi slt, %r0_index_14, %tmp9 : tensor<1x32xi32> loc(#loc144) + %tmp11 = arith.extsi %tmp7 : tensor<1x32xi32> to tensor<1x32xi64> loc(#loc145) + %tmp11_44 = tt.splat %ks0 : 
i64 -> tensor<1x32xi64> loc(#loc145) + %tmp11_45 = arith.select %tmp9_43, %tmp11, %tmp11_44 : tensor<1x32xi1>, tensor<1x32xi64> loc(#loc145) + %tmp12 = arith.constant 1 : i32 loc(#loc146) + %tmp12_46 = arith.constant 1 : i64 loc(#loc146) + %tmp12_47 = arith.addi %tmp12_46, %ks0 : i64 loc(#loc146) + %tmp13 = tt.splat %tmp12_47 : i64 -> tensor<1x32xi64> loc(#loc147) + %tmp13_48 = arith.addi %tmp11_45, %tmp13 : tensor<1x32xi64> loc(#loc147) + %tmp14 = arith.constant 0 : i32 loc(#loc148) + %tmp14_49 = arith.extsi %tmp14 : i32 to i64 loc(#loc148) + %tmp14_50 = tt.splat %tmp14_49 : i64 -> tensor<1x32xi64> loc(#loc148) + %tmp14_51 = arith.cmpi slt, %tmp11_45, %tmp14_50 : tensor<1x32xi64> loc(#loc148) + %tmp15 = arith.select %tmp14_51, %tmp13_48, %tmp11_45 : tensor<1x32xi1>, tensor<1x32xi64> loc(#loc149) + %c0_i32_52 = arith.constant 0 : i32 loc(#loc49) + %10 = arith.extsi %c0_i32_52 : i32 to i64 loc(#loc49) + %11 = tt.splat %10 : i64 -> tensor<1x32xi64> loc(#loc49) + %12 = arith.cmpi sle, %11, %tmp15 : tensor<1x32xi64> loc(#loc49) + %c127_i32 = arith.constant 127 : i32 loc(#loc50) + %c127_i64 = arith.constant 127 : i64 loc(#loc50) + %13 = arith.addi %c127_i64, %ks1 : i64 loc(#loc50) + %14 = tt.call @"torch._inductor.runtime.triton_helpers.div_floor_integer__i64__(1,)cconstexpr_128_"(%13) : (i64) -> i64 loc(#loc51) + %c1_i32 = arith.constant 1 : i32 loc(#loc52) + %c1_i64 = arith.constant 1 : i64 loc(#loc52) + %15 = arith.addi %c1_i64, %14 : i64 loc(#loc52) + %16 = tt.splat %15 : i64 -> tensor<1x32xi64> loc(#loc53) + %17 = arith.cmpi slt, %tmp15, %16 : tensor<1x32xi64> loc(#loc53) + %18 = arith.andi %12, %17 : tensor<1x32xi1> loc(#loc54) + %19 = tt.broadcast %xmask_7 : tensor<1x1xi1> -> tensor<1x32xi1> loc(#loc55) + %20 = arith.andi %r0_mask_15, %19 : tensor<1x32xi1> loc(#loc55) + %true = arith.constant true loc(#loc56) + %cst = arith.constant dense : tensor<1x32xi1> loc(#loc56) + %21 = arith.xori %20, %cst : tensor<1x32xi1> loc(#loc56) + %22 = arith.ori %18, %21 : 
tensor<1x32xi1> loc(#loc57) + tt.assert %22, "index out of bounds: 0 <= tmp15 < 1 + (triton_helpers.div_floor_integer(127 + ks1, 128))" : tensor<1x32xi1> loc(#loc58) + %tmp17 = arith.constant 1 : i32 loc(#loc150) + %tmp17_53 = arith.constant dense<1> : tensor<1x1xi32> loc(#loc150) + %c1_i32_54 = arith.constant 1 : i32 loc(#loc60) + %23 = arith.extsi %c1_i32_54 : i32 to i64 loc(#loc60) + %24 = arith.cmpi sge, %23, %ks0 : i64 loc(#loc60) + %c1_i32_55 = arith.constant 1 : i32 loc(#loc61) + %c1_i32_56 = arith.constant 1 : i32 loc(#loc61) + %25 = arith.extui %24 : i1 to i32 loc(#loc61) + %26 = arith.muli %c1_i32_56, %25 : i32 loc(#loc61) + %c1_i32_57 = arith.constant 1 : i32 loc(#loc62) + %27 = arith.extsi %c1_i32_57 : i32 to i64 loc(#loc62) + %28 = arith.cmpi sgt, %ks0, %27 : i64 loc(#loc62) + %29 = arith.extui %28 : i1 to i64 loc(#loc63) + %30 = arith.muli %ks0, %29 : i64 loc(#loc63) + %31 = arith.extsi %26 : i32 to i64 loc(#loc64) + %32 = arith.addi %31, %30 : i64 loc(#loc64) + %33 = arith.extsi %xindex_6 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc65) + %34 = tt.splat %32 : i64 -> tensor<1x1xi64> loc(#loc65) + %35 = arith.muli %33, %34 : tensor<1x1xi64> loc(#loc65) + %36 = arith.extsi %r0_index_14 : tensor<1x32xi32> to tensor<1x32xi64> loc(#loc66) + %37 = tt.broadcast %35 : tensor<1x1xi64> -> tensor<1x32xi64> loc(#loc66) + %38 = arith.addi %36, %37 : tensor<1x32xi64> loc(#loc66) + %39 = tt.splat %out_ptr2 : !tt.ptr -> tensor<1x32x!tt.ptr> loc(#loc67) + %40 = tt.addptr %39, %38 : tensor<1x32x!tt.ptr>, tensor<1x32xi64> loc(#loc67) + %41 = tt.broadcast %xmask_7 : tensor<1x1xi1> -> tensor<1x32xi1> loc(#loc68) + %42 = arith.andi %r0_mask_15, %41 : tensor<1x32xi1> loc(#loc68) + tt.store %40, %tmp7, %42 : tensor<1x32x!tt.ptr> loc(#loc69) + %43 = arith.extsi %xindex_6 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc70) + %44 = tt.broadcast %43 : tensor<1x1xi64> -> tensor<1x32xi64> loc(#loc70) + %45 = arith.addi %tmp15, %44 : tensor<1x32xi64> loc(#loc70) + %46 = arith.extsi 
%xindex_6 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc71) + %47 = tt.splat %ks0 : i64 -> tensor<1x1xi64> loc(#loc71) + %48 = arith.muli %47, %46 : tensor<1x1xi64> loc(#loc71) + %49 = tt.broadcast %48 : tensor<1x1xi64> -> tensor<1x32xi64> loc(#loc72) + %50 = arith.addi %45, %49 : tensor<1x32xi64> loc(#loc72) + %51 = tt.splat %out_ptr3 : !tt.ptr -> tensor<1x32x!tt.ptr> loc(#loc73) + %52 = tt.addptr %51, %50 : tensor<1x32x!tt.ptr>, tensor<1x32xi64> loc(#loc73) + %53 = tt.broadcast %xmask_7 : tensor<1x1xi1> -> tensor<1x32xi1> loc(#loc74) + %54 = arith.andi %r0_mask_15, %53 : tensor<1x32xi1> loc(#loc74) + %cst_58 = arith.constant dense<1> : tensor<1x32xi32> loc(#loc75) + tt.store %52, %cst_58, %54 : tensor<1x32x!tt.ptr> loc(#loc75) + } loc(#loc29) + tt.return loc(#loc76) + } loc(#loc) + tt.func private @"triton.language.standard.sum__i64S1_32S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<1x32xi64> loc("input"(#loc77))) -> tensor<1xi64> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i64 loc(unknown), %arg2: i64 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i64_i64__(%arg1, %arg2) : (i64, i64) -> i64 loc(#loc78) + tt.reduce.return %2 : i64 loc(#loc78) + }) : (tensor<1x32xi64>) -> tensor<1xi64> loc(#loc78) + tt.return %0 : tensor<1xi64> loc(#loc80) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1xi64> loc(#loc81) + tt.return %1 : tensor<1xi64> loc(#loc81) + } loc(#loc77) + tt.func private @triton.language.standard._sum_combine__i64_i64__(%a: i64 loc("a"(#loc82)), %b: i64 loc("b"(#loc82))) -> i64 attributes {noinline = false} { + %0 = arith.addi %a, %b : i64 loc(#loc83) + tt.return %0 : i64 loc(#loc84) + ^bb1: // no predecessors + %1 = ub.poison : i64 loc(#loc85) + tt.return %1 : i64 loc(#loc85) + } loc(#loc82) + tt.func private @"torch._inductor.runtime.triton_helpers.div_floor_integer__i64__(1,)cconstexpr_128_"(%a: i64 loc("a"(#loc86))) -> i64 attributes {noinline = false} 
{ + %quot = arith.constant 128 : i32 loc(#loc155) + %quot_0 = arith.constant 128 : i64 loc(#loc155) + %quot_1 = arith.divsi %a, %quot_0 : i64 loc(#loc155) + %remainder = arith.constant 128 : i32 loc(#loc156) + %remainder_2 = arith.constant 128 : i64 loc(#loc156) + %remainder_3 = arith.remsi %a, %remainder_2 : i64 loc(#loc156) + %fixed = arith.constant 0 : i32 loc(#loc157) + %fixed_4 = arith.extsi %fixed : i32 to i64 loc(#loc157) + %fixed_5 = arith.cmpi ne, %remainder_3, %fixed_4 : i64 loc(#loc157) + %fixed_6 = arith.constant 1 : i32 loc(#loc158) + %fixed_7 = arith.constant 1 : i64 loc(#loc158) + %fixed_8 = arith.subi %quot_1, %fixed_7 : i64 loc(#loc158) + %fixed_9 = arith.select %fixed_5, %fixed_8, %quot_1 : i64 loc(#loc159) + %c0_i32 = arith.constant 0 : i32 loc(#loc92) + %0 = arith.extsi %c0_i32 : i32 to i64 loc(#loc92) + %1 = arith.cmpi slt, %a, %0 : i64 loc(#loc92) + %false = arith.constant false loc(#loc93) + %2 = arith.cmpi ne, %1, %false : i1 loc(#loc93) + %3 = arith.select %2, %fixed_9, %quot_1 : i64 loc(#loc94) + tt.return %3 : i64 loc(#loc95) + ^bb1: // no predecessors + %4 = ub.poison : i64 loc(#loc96) + tt.return %4 : i64 loc(#loc96) + } loc(#loc86) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":19:13) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":22:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":22:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":23:36) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":23:44) +#loc6 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":23:23) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":24:21) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":25:27) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":25:37) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":28:43) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":29:40) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":30:31) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":31:29) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":35:45) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":35:41) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":35:34) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":35:60) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":35:50) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":36:23) +#loc20 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":38:23) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":39:35) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":39:48) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":39:8) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":40:25) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":40:28) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":41:19) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":42:25) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":42:36) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":43:40) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":44:31) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":45:29) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":49:60) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":49:52) +#loc34 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":49:86) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":49:77) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":49:68) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":49:45) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":49:41) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":49:34) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":49:103) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":49:93) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":50:23) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":52:22) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":54:37) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":55:20) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":56:24) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":57:24) +#loc48 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":58:39) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":59:32) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":59:94) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":59:100) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":59:55) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":59:50) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":59:42) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":59:122) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":59:112) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":59:110) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":59:130) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":60:35) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":61:55) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":61:47) +#loc62 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":61:81) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":61:72) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":61:63) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":61:40) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":61:36) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":61:29) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":61:104) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":61:94) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":62:53) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":62:62) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":62:58) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":62:29) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":62:105) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":62:95) +#loc76 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":43:4) +#loc78 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc80 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11) +#loc81 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4) +#loc83 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc84 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11) +#loc85 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4) +#loc87 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":72:16) +#loc88 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":73:20) +#loc89 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:34) +#loc90 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:44) +#loc91 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:47) +#loc92 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:25) +#loc93 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:32) +#loc94 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:47) +#loc95 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:11) +#loc96 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:4) +#loc106 = loc("xnumel"(#loc1)) +#loc107 = loc("xoffset"(#loc2)) +#loc108 = loc("xoffset"(#loc3)) +#loc109 = 
loc("xindex"(#loc4)) +#loc110 = loc("xindex"(#loc5)) +#loc111 = loc("xindex"(#loc6)) +#loc112 = loc("xmask"(#loc7)) +#loc113 = loc("r0_base"(#loc8)) +#loc114 = loc("r0_base"(#loc9)) +#loc115 = loc("_tmp3"(#loc10)) +#loc116 = loc("_tmp3"(#loc11)) +#loc117 = loc("r0_index"(#loc12)) +#loc118 = loc("r0_mask"(#loc13)) +#loc119 = loc("tmp0"(#loc14)) +#loc120 = loc("tmp0"(#loc15)) +#loc121 = loc("tmp0"(#loc16)) +#loc122 = loc("tmp0"(#loc17)) +#loc123 = loc("tmp0"(#loc18)) +#loc124 = loc("tmp1"(#loc19)) +#loc125 = loc("tmp4"(#loc20)) +#loc126 = loc("_tmp3"(#loc21)) +#loc127 = loc("_tmp3"(#loc22)) +#loc128 = loc("tmp3"(#loc24)) +#loc129 = loc("tmp3"(#loc25)) +#loc130 = loc("tmp5"(#loc26)) +#loc131 = loc("r0_index"(#loc30)) +#loc132 = loc("r0_mask"(#loc31)) +#loc133 = loc("tmp6"(#loc32)) +#loc134 = loc("tmp6"(#loc33)) +#loc135 = loc("tmp6"(#loc34)) +#loc136 = loc("tmp6"(#loc35)) +#loc137 = loc("tmp6"(#loc36)) +#loc138 = loc("tmp6"(#loc37)) +#loc139 = loc("tmp6"(#loc38)) +#loc140 = loc("tmp6"(#loc39)) +#loc141 = loc("tmp6"(#loc40)) +#loc142 = loc("tmp6"(#loc41)) +#loc143 = loc("tmp7"(#loc42)) +#loc144 = loc("tmp9"(#loc43)) +#loc145 = loc("tmp11"(#loc44)) +#loc146 = loc("tmp12"(#loc45)) +#loc147 = loc("tmp13"(#loc46)) +#loc148 = loc("tmp14"(#loc47)) +#loc149 = loc("tmp15"(#loc48)) +#loc150 = loc("tmp17"(#loc59)) +#loc155 = loc("quot"(#loc87)) +#loc156 = loc("remainder"(#loc88)) +#loc157 = loc("fixed"(#loc89)) +#loc158 = loc("fixed"(#loc90)) +#loc159 = loc("fixed"(#loc91)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/K4PIF4OEGY47VNRRRSDGBDB6YVIBWTCXSMNXY5DW2PP37PLKOBWQ/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/7/K4PIF4OEGY47VNRRRSDGBDB6YVIBWTCXSMNXY5DW2PP37PLKOBWQ/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ttgir new file mode 100644 index 
0000000000000000000000000000000000000000..f50a7aa31d285df12b91f7f79e93284c3a3a3e88 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/K4PIF4OEGY47VNRRRSDGBDB6YVIBWTCXSMNXY5DW2PP37PLKOBWQ/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ttgir @@ -0,0 +1,271 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 2], order = [0, 1]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [2, 1], order = [1, 0]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":18:0) +#loc1 = loc(unknown) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":40:25) +#loc68 = loc("in_ptr0"(#loc)) +#loc69 = loc("in_ptr1"(#loc)) +#loc70 = loc("out_ptr1"(#loc)) +#loc71 = loc("out_ptr2"(#loc)) +#loc72 = loc("out_ptr3"(#loc)) +#loc73 = loc("ks0"(#loc)) +#loc74 = loc("ks1"(#loc)) +#loc75 = loc("xnumel"(#loc)) +#loc76 = loc("r0_numel"(#loc)) +#loc91 = loc("tmp3"(#loc18)) +#loc124 = loc(callsite(#loc1 at #loc91)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 2 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %out_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr3"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = 
arith.constant dense<0> : tensor<1x32xi64, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<0> : tensor<1x32xi64, #blocked1> loc(#loc1) + %c1_i64 = arith.constant 1 : i64 loc(#loc1) + %c127_i64 = arith.constant 127 : i64 loc(#loc1) + %cst_1 = arith.constant dense : tensor<1x32xi1, #blocked1> loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %c32_i32 = arith.constant 32 : i32 loc(#loc1) + %cst_2 = arith.constant dense<0> : tensor<1x32xi32, #blocked1> loc(#loc1) + %c0_i64 = arith.constant 0 : i64 loc(#loc1) + %c128_i64 = arith.constant 128 : i64 loc(#loc1) + %c128_i32 = arith.constant 128 : i32 loc(#loc1) + %cst_3 = arith.constant dense<1> : tensor<1x32xi32, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc77) + %xmask = arith.cmpi slt, %xoffset, %c128_i32 : i32 loc(#loc78) + %r0_base = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc79) + %r0_base_4 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc79) + %r0_base_5 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<32xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x32xi32, #blocked> loc(#loc79) + %r0_base_6 = tt.expand_dims %r0_base_4 {axis = 0 : i32} : tensor<32xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x32xi32, #blocked1> loc(#loc79) + %r0_mask = tt.splat %r0_numel : i32 -> tensor<1x32xi32, #blocked1> loc(#loc80) + %tmp0 = arith.extsi %xoffset : i32 to i64 loc(#loc81) + %tmp0_7 = arith.muli %ks0, %tmp0 : i64 loc(#loc81) + %tmp0_8 = tt.splat %tmp0_7 : i64 -> tensor<1x32xi64, #blocked1> loc(#loc121) + %tmp0_9 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x32x!tt.ptr, #blocked1> loc(#loc83) + %tmp0_10 = tt.splat %xmask : i1 -> tensor<1x32xi1, #blocked1> loc(#loc122) + %_tmp3 = scf.for %r0_offset = %c0_i32 to %r0_numel step %c32_i32 iter_args(%_tmp3_31 = %cst_0) -> (tensor<1x32xi64, #blocked1>) : i32 { + %r0_index = tt.splat 
%r0_offset : i32 -> tensor<1x32xi32, #blocked1> loc(#loc86) + %r0_index_32 = arith.addi %r0_index, %r0_base_6 : tensor<1x32xi32, #blocked1> loc(#loc86) + %r0_mask_33 = arith.cmpi slt, %r0_index_32, %r0_mask : tensor<1x32xi32, #blocked1> loc(#loc80) + %tmp0_34 = arith.extsi %r0_index_32 : tensor<1x32xi32, #blocked1> to tensor<1x32xi64, #blocked1> loc(#loc82) + %tmp0_35 = arith.addi %tmp0_34, %tmp0_8 : tensor<1x32xi64, #blocked1> loc(#loc82) + %tmp0_36 = tt.addptr %tmp0_9, %tmp0_35 : tensor<1x32x!tt.ptr, #blocked1>, tensor<1x32xi64, #blocked1> loc(#loc83) + %tmp0_37 = arith.andi %r0_mask_33, %tmp0_10 : tensor<1x32xi1, #blocked1> loc(#loc84) + %tmp0_38 = tt.load %tmp0_36, %tmp0_37, %cst_2 evictionPolicy = evict_first : tensor<1x32x!tt.ptr, #blocked1> loc(#loc87) + %tmp1 = arith.extsi %tmp0_38 : tensor<1x32xi32, #blocked1> to tensor<1x32xi64, #blocked1> loc(#loc88) + %tmp4 = arith.addi %_tmp3_31, %tmp1 : tensor<1x32xi64, #blocked1> loc(#loc89) + %_tmp3_39 = arith.select %tmp0_37, %tmp4, %_tmp3_31 : tensor<1x32xi1, #blocked1>, tensor<1x32xi64, #blocked1> loc(#loc90) + scf.yield %_tmp3_39 : tensor<1x32xi64, #blocked1> loc(#loc16) + } loc(#loc85) + %tmp3 = "tt.reduce"(%_tmp3) <{axis = 1 : i32}> ({ + ^bb0(%tmp3_31: i64 loc(callsite(#loc1 at #loc91)), %tmp3_32: i64 loc(callsite(#loc1 at #loc91))): + %tmp3_33 = arith.addi %tmp3_31, %tmp3_32 : i64 loc(#loc133) + tt.reduce.return %tmp3_33 : i64 loc(#loc123) + }) : (tensor<1x32xi64, #blocked1>) -> tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc123) + %0 = ttg.convert_layout %tmp3 : tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc20) + %tmp3_11 = tt.expand_dims %0 {axis = 1 : i32} : tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1x1xi64, #blocked> loc(#loc92) + %tmp3_12 = tt.expand_dims %tmp3 {axis = 1 : i32} : tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<1x1xi64, #blocked1> loc(#loc92) + %tmp5 = 
arith.trunci %tmp3_11 : tensor<1x1xi64, #blocked> to tensor<1x1xi32, #blocked> loc(#loc93) + %tmp5_13 = arith.trunci %tmp3_12 : tensor<1x1xi64, #blocked1> to tensor<1x1xi32, #blocked1> loc(#loc93) + %1 = tt.addptr %out_ptr1, %xoffset : !tt.ptr, i32 loc(#loc23) + %2 = tt.splat %1 : !tt.ptr -> tensor<1x1x!tt.ptr, #blocked> loc(#loc24) + %3 = tt.splat %xmask : i1 -> tensor<1x1xi1, #blocked> loc(#loc24) + tt.store %2, %tmp5, %3 : tensor<1x1x!tt.ptr, #blocked> loc(#loc24) + %r0_mask_14 = tt.splat %r0_numel : i32 -> tensor<1x32xi32, #blocked> loc(#loc94) + %tmp6 = arith.cmpi sle, %ks0, %c1_i64 : i64 loc(#loc95) + %tmp6_15 = arith.cmpi sgt, %ks0, %c1_i64 : i64 loc(#loc96) + %tmp6_16 = arith.extui %tmp6_15 : i1 to i64 loc(#loc97) + %tmp6_17 = arith.muli %ks0, %tmp6_16 : i64 loc(#loc97) + %tmp6_18 = arith.extui %tmp6 : i1 to i64 loc(#loc125) + %tmp6_19 = arith.addi %tmp6_18, %tmp6_17 : i64 loc(#loc98) + %tmp6_20 = arith.muli %tmp0, %tmp6_19 : i64 loc(#loc100) + %tmp6_21 = tt.splat %tmp6_20 : i64 -> tensor<1x32xi64, #blocked> loc(#loc126) + %tmp6_22 = tt.splat %tmp6_20 : i64 -> tensor<1x32xi64, #blocked1> loc(#loc126) + %tmp6_23 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x32x!tt.ptr, #blocked> loc(#loc102) + %tmp6_24 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x32x!tt.ptr, #blocked1> loc(#loc102) + %tmp6_25 = tt.splat %xmask : i1 -> tensor<1x32xi1, #blocked> loc(#loc127) + %tmp9 = tt.broadcast %tmp5 : tensor<1x1xi32, #blocked> -> tensor<1x32xi32, #blocked> loc(#loc104) + %tmp9_26 = tt.broadcast %tmp5_13 : tensor<1x1xi32, #blocked1> -> tensor<1x32xi32, #blocked1> loc(#loc104) + %tmp11 = tt.splat %ks0 : i64 -> tensor<1x32xi64, #blocked> loc(#loc105) + %tmp11_27 = tt.splat %ks0 : i64 -> tensor<1x32xi64, #blocked1> loc(#loc105) + %tmp12 = arith.addi %ks0, %c1_i64 : i64 loc(#loc106) + %tmp13 = tt.splat %tmp12 : i64 -> tensor<1x32xi64, #blocked> loc(#loc107) + %tmp13_28 = tt.splat %tmp12 : i64 -> tensor<1x32xi64, #blocked1> loc(#loc107) + %4 = arith.addi %ks1, %c127_i64 : i64 
loc(#loc39) + %quot = arith.divsi %4, %c128_i64 : i64 loc(#loc128) + %remainder = arith.remsi %4, %c128_i64 : i64 loc(#loc129) + %fixed = arith.cmpi ne, %remainder, %c0_i64 : i64 loc(#loc130) + %fixed_29 = arith.subi %quot, %c1_i64 : i64 loc(#loc131) + %fixed_30 = arith.select %fixed, %fixed_29, %quot : i64 loc(#loc132) + %5 = arith.cmpi slt, %4, %c0_i64 : i64 loc(#loc113) + %6 = arith.select %5, %fixed_30, %quot : i64 loc(#loc114) + %7 = arith.addi %6, %c1_i64 : i64 loc(#loc48) + %8 = tt.splat %7 : i64 -> tensor<1x32xi64, #blocked1> loc(#loc49) + %9 = tt.splat %out_ptr2 : !tt.ptr -> tensor<1x32x!tt.ptr, #blocked1> loc(#loc50) + %10 = tt.splat %tmp0 : i64 -> tensor<1x32xi64, #blocked> loc(#loc51) + %11 = tt.splat %tmp0_7 : i64 -> tensor<1x32xi64, #blocked> loc(#loc115) + %12 = tt.splat %out_ptr3 : !tt.ptr -> tensor<1x32x!tt.ptr, #blocked> loc(#loc54) + scf.for %r0_offset = %c0_i32 to %r0_numel step %c32_i32 : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x32xi32, #blocked> loc(#loc116) + %r0_index_31 = tt.splat %r0_offset : i32 -> tensor<1x32xi32, #blocked1> loc(#loc116) + %r0_index_32 = arith.addi %r0_index, %r0_base_5 : tensor<1x32xi32, #blocked> loc(#loc116) + %r0_index_33 = arith.addi %r0_index_31, %r0_base_6 : tensor<1x32xi32, #blocked1> loc(#loc116) + %r0_mask_34 = arith.cmpi slt, %r0_index_32, %r0_mask_14 : tensor<1x32xi32, #blocked> loc(#loc94) + %r0_mask_35 = arith.cmpi slt, %r0_index_33, %r0_mask : tensor<1x32xi32, #blocked1> loc(#loc94) + %tmp6_36 = arith.extsi %r0_index_32 : tensor<1x32xi32, #blocked> to tensor<1x32xi64, #blocked> loc(#loc101) + %tmp6_37 = arith.extsi %r0_index_33 : tensor<1x32xi32, #blocked1> to tensor<1x32xi64, #blocked1> loc(#loc101) + %tmp6_38 = arith.addi %tmp6_36, %tmp6_21 : tensor<1x32xi64, #blocked> loc(#loc101) + %tmp6_39 = arith.addi %tmp6_37, %tmp6_22 : tensor<1x32xi64, #blocked1> loc(#loc101) + %tmp6_40 = tt.addptr %tmp6_23, %tmp6_38 : tensor<1x32x!tt.ptr, #blocked>, tensor<1x32xi64, #blocked> loc(#loc102) + 
%tmp6_41 = tt.addptr %tmp6_24, %tmp6_39 : tensor<1x32x!tt.ptr, #blocked1>, tensor<1x32xi64, #blocked1> loc(#loc102) + %tmp6_42 = arith.andi %r0_mask_34, %tmp6_25 : tensor<1x32xi1, #blocked> loc(#loc103) + %tmp6_43 = arith.andi %r0_mask_35, %tmp0_10 : tensor<1x32xi1, #blocked1> loc(#loc103) + %tmp6_44 = tt.load %tmp6_40, %tmp6_42, %cst evictionPolicy = evict_first : tensor<1x32x!tt.ptr, #blocked> loc(#loc117) + %tmp6_45 = tt.load %tmp6_41, %tmp6_43, %cst_0 evictionPolicy = evict_first : tensor<1x32x!tt.ptr, #blocked1> loc(#loc117) + %tmp7 = arith.trunci %tmp6_44 : tensor<1x32xi64, #blocked> to tensor<1x32xi32, #blocked> loc(#loc118) + %tmp7_46 = arith.trunci %tmp6_45 : tensor<1x32xi64, #blocked1> to tensor<1x32xi32, #blocked1> loc(#loc118) + %tmp9_47 = arith.cmpi slt, %r0_index_32, %tmp9 : tensor<1x32xi32, #blocked> loc(#loc104) + %tmp9_48 = arith.cmpi slt, %r0_index_33, %tmp9_26 : tensor<1x32xi32, #blocked1> loc(#loc104) + %tmp11_49 = arith.extsi %tmp7 : tensor<1x32xi32, #blocked> to tensor<1x32xi64, #blocked> loc(#loc105) + %tmp11_50 = arith.extsi %tmp7_46 : tensor<1x32xi32, #blocked1> to tensor<1x32xi64, #blocked1> loc(#loc105) + %tmp11_51 = arith.select %tmp9_47, %tmp11_49, %tmp11 : tensor<1x32xi1, #blocked>, tensor<1x32xi64, #blocked> loc(#loc105) + %tmp11_52 = arith.select %tmp9_48, %tmp11_50, %tmp11_27 : tensor<1x32xi1, #blocked1>, tensor<1x32xi64, #blocked1> loc(#loc105) + %tmp13_53 = arith.addi %tmp11_51, %tmp13 : tensor<1x32xi64, #blocked> loc(#loc107) + %tmp13_54 = arith.addi %tmp11_52, %tmp13_28 : tensor<1x32xi64, #blocked1> loc(#loc107) + %tmp14 = arith.cmpi slt, %tmp11_51, %cst : tensor<1x32xi64, #blocked> loc(#loc119) + %tmp14_55 = arith.cmpi slt, %tmp11_52, %cst_0 : tensor<1x32xi64, #blocked1> loc(#loc119) + %tmp15 = arith.select %tmp14, %tmp13_53, %tmp11_51 : tensor<1x32xi1, #blocked>, tensor<1x32xi64, #blocked> loc(#loc120) + %tmp15_56 = arith.select %tmp14_55, %tmp13_54, %tmp11_52 : tensor<1x32xi1, #blocked1>, tensor<1x32xi64, #blocked1> 
loc(#loc120) + %13 = arith.cmpi sge, %tmp15_56, %cst_0 : tensor<1x32xi64, #blocked1> loc(#loc61) + %14 = arith.cmpi slt, %tmp15_56, %8 : tensor<1x32xi64, #blocked1> loc(#loc49) + %15 = arith.andi %13, %14 : tensor<1x32xi1, #blocked1> loc(#loc62) + %16 = arith.xori %tmp6_43, %cst_1 : tensor<1x32xi1, #blocked1> loc(#loc63) + %17 = arith.ori %15, %16 : tensor<1x32xi1, #blocked1> loc(#loc64) + tt.assert %17, "index out of bounds: 0 <= tmp15 < 1 + (triton_helpers.div_floor_integer(127 + ks1, 128))" : tensor<1x32xi1, #blocked1> loc(#loc65) + %18 = tt.addptr %9, %tmp6_39 : tensor<1x32x!tt.ptr, #blocked1>, tensor<1x32xi64, #blocked1> loc(#loc50) + tt.store %18, %tmp7_46, %tmp6_43 : tensor<1x32x!tt.ptr, #blocked1> loc(#loc66) + %19 = arith.addi %tmp15, %10 : tensor<1x32xi64, #blocked> loc(#loc51) + %20 = arith.addi %19, %11 : tensor<1x32xi64, #blocked> loc(#loc52) + %21 = tt.addptr %12, %20 : tensor<1x32x!tt.ptr, #blocked>, tensor<1x32xi64, #blocked> loc(#loc54) + tt.store %21, %cst_3, %tmp6_42 : tensor<1x32x!tt.ptr, #blocked> loc(#loc20) + } loc(#loc55) + tt.return loc(#loc67) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":22:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":24:21) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":25:37) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":31:29) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":35:45) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":35:41) +#loc8 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":35:34) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":35:60) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":29:40) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":30:31) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":35:50) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":36:23) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":38:23) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":39:48) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":39:8) +#loc17 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc19 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":62:95) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":40:28) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":41:19) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":42:25) +#loc24 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":42:36) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":45:29) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":49:60) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":49:86) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":49:77) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":49:68) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":49:52) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":49:45) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":49:41) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":49:34) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":49:103) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":52:22) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":54:37) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":55:20) +#loc38 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":56:24) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":59:94) +#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":72:16) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":59:100) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":73:20) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:34) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:44) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:47) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:25) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:47) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":59:55) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":59:50) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":61:29) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":62:53) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":62:58) +#loc53 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":62:62) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":62:29) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":43:40) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":44:31) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":49:93) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":50:23) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":57:24) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":58:39) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":59:32) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":59:42) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":59:112) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":59:110) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":59:130) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":61:94) +#loc67 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":43:4) +#loc77 = loc("xoffset"(#loc2)) +#loc78 = loc("xmask"(#loc3)) +#loc79 = loc("r0_base"(#loc4)) +#loc80 = loc("r0_mask"(#loc5)) +#loc81 = loc("tmp0"(#loc6)) +#loc82 = loc("tmp0"(#loc7)) +#loc83 = loc("tmp0"(#loc8)) +#loc84 = loc("tmp0"(#loc9)) +#loc85 = loc("_tmp3"(#loc10)) +#loc86 = loc("r0_index"(#loc11)) +#loc87 = loc("tmp0"(#loc12)) +#loc88 = loc("tmp1"(#loc13)) +#loc89 = loc("tmp4"(#loc14)) +#loc90 = loc("_tmp3"(#loc15)) +#loc92 = loc("tmp3"(#loc21)) +#loc93 = loc("tmp5"(#loc22)) +#loc94 = loc("r0_mask"(#loc25)) +#loc95 = loc("tmp6"(#loc26)) +#loc96 = loc("tmp6"(#loc27)) +#loc97 = loc("tmp6"(#loc28)) +#loc98 = loc("tmp6"(#loc29)) +#loc99 = loc("tmp6"(#loc30)) +#loc100 = loc("tmp6"(#loc31)) +#loc101 = loc("tmp6"(#loc32)) +#loc102 = loc("tmp6"(#loc33)) +#loc103 = loc("tmp6"(#loc34)) +#loc104 = loc("tmp9"(#loc35)) +#loc105 = loc("tmp11"(#loc36)) +#loc106 = loc("tmp12"(#loc37)) +#loc107 = loc("tmp13"(#loc38)) +#loc108 = loc("quot"(#loc40)) +#loc109 = loc("remainder"(#loc42)) +#loc110 = loc("fixed"(#loc43)) +#loc111 = loc("fixed"(#loc44)) +#loc112 = loc("fixed"(#loc45)) +#loc113 = loc(callsite(#loc46 at #loc41)) +#loc114 = loc(callsite(#loc47 at #loc41)) +#loc115 = loc(fused[#loc52, #loc53]) +#loc116 = loc("r0_index"(#loc56)) +#loc117 = loc("tmp6"(#loc57)) +#loc118 = loc("tmp7"(#loc58)) +#loc119 = loc("tmp14"(#loc59)) +#loc120 = loc("tmp15"(#loc60)) +#loc121 = loc(fused[#loc82, #loc81]) +#loc122 = loc(fused[#loc84, #loc78]) +#loc123 = loc(callsite(#loc17 at #loc91)) +#loc125 = loc(fused[#loc98, #loc99]) +#loc126 = loc(fused[#loc101, #loc100]) +#loc127 = loc(fused[#loc103, #loc78]) +#loc128 = loc(callsite(#loc108 at #loc41)) +#loc129 = loc(callsite(#loc109 at #loc41)) +#loc130 = loc(callsite(#loc110 at #loc41)) +#loc131 = loc(callsite(#loc111 at #loc41)) +#loc132 = loc(callsite(#loc112 at #loc41)) +#loc133 = loc(callsite(#loc19 at #loc123)) 
diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/K4PIF4OEGY47VNRRRSDGBDB6YVIBWTCXSMNXY5DW2PP37PLKOBWQ/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ttir b/SpecForge-ext/cache/compiled_kernels/triton/7/K4PIF4OEGY47VNRRRSDGBDB6YVIBWTCXSMNXY5DW2PP37PLKOBWQ/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ttir new file mode 100644 index 0000000000000000000000000000000000000000..571965231f78ac5634083e9612b624f5d8593bb5 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/K4PIF4OEGY47VNRRRSDGBDB6YVIBWTCXSMNXY5DW2PP37PLKOBWQ/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ttir @@ -0,0 +1,247 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":18:0) +#loc2 = loc(unknown) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":40:25) +#loc69 = loc("in_ptr0"(#loc)) +#loc70 = loc("in_ptr1"(#loc)) +#loc71 = loc("out_ptr1"(#loc)) +#loc72 = loc("out_ptr2"(#loc)) +#loc73 = loc("out_ptr3"(#loc)) +#loc74 = loc("ks0"(#loc)) +#loc75 = loc("ks1"(#loc)) +#loc76 = loc("xnumel"(#loc)) +#loc77 = loc("r0_numel"(#loc)) +#loc93 = loc("tmp3"(#loc19)) +#loc126 = loc(callsite(#loc2 at #loc93)) +module { + tt.func public @triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %out_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr3"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), 
%r0_numel: i32 loc("r0_numel"(#loc))) attributes {noinline = false} { + %xmask = arith.constant 128 : i32 loc(#loc78) + %c128_i64 = arith.constant 128 : i64 loc(#loc2) + %c0_i64 = arith.constant 0 : i64 loc(#loc2) + %cst = arith.constant dense<0> : tensor<1x32xi32> loc(#loc2) + %c32_i32 = arith.constant 32 : i32 loc(#loc2) + %c0_i32 = arith.constant 0 : i32 loc(#loc2) + %cst_0 = arith.constant dense<1> : tensor<1x32xi32> loc(#loc2) + %cst_1 = arith.constant dense : tensor<1x32xi1> loc(#loc2) + %c127_i64 = arith.constant 127 : i64 loc(#loc2) + %c1_i64 = arith.constant 1 : i64 loc(#loc2) + %cst_2 = arith.constant dense<0> : tensor<1x32xi64> loc(#loc2) + %xoffset = tt.get_program_id x : i32 loc(#loc79) + %xmask_3 = arith.cmpi slt, %xoffset, %xmask : i32 loc(#loc78) + %xmask_4 = tt.splat %xmask_3 : i1 -> tensor<1x1xi1> loc(#loc78) + %r0_base = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc80) + %r0_base_5 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<32xi32> -> tensor<1x32xi32> loc(#loc81) + %_tmp3 = scf.for %r0_offset = %c0_i32 to %r0_numel step %c32_i32 iter_args(%_tmp3_7 = %cst_2) -> (tensor<1x32xi64>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x32xi32> loc(#loc83) + %r0_index_8 = arith.addi %r0_index, %r0_base_5 : tensor<1x32xi32> loc(#loc83) + %r0_mask = tt.splat %r0_numel : i32 -> tensor<1x32xi32> loc(#loc84) + %r0_mask_9 = arith.cmpi slt, %r0_index_8, %r0_mask : tensor<1x32xi32> loc(#loc84) + %tmp0 = arith.extsi %xoffset : i32 to i64 loc(#loc85) + %tmp0_10 = arith.muli %ks0, %tmp0 : i64 loc(#loc85) + %tmp0_11 = arith.extsi %r0_index_8 : tensor<1x32xi32> to tensor<1x32xi64> loc(#loc86) + %tmp0_12 = tt.splat %tmp0_10 : i64 -> tensor<1x32xi64> loc(#loc123) + %tmp0_13 = arith.addi %tmp0_11, %tmp0_12 : tensor<1x32xi64> loc(#loc86) + %tmp0_14 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x32x!tt.ptr> loc(#loc87) + %tmp0_15 = tt.addptr %tmp0_14, %tmp0_13 : tensor<1x32x!tt.ptr>, tensor<1x32xi64> loc(#loc87) + %tmp0_16 = 
tt.splat %xmask_3 : i1 -> tensor<1x32xi1> loc(#loc124) + %tmp0_17 = arith.andi %r0_mask_9, %tmp0_16 : tensor<1x32xi1> loc(#loc88) + %tmp0_18 = tt.load %tmp0_15, %tmp0_17, %cst evictionPolicy = evict_first : tensor<1x32x!tt.ptr> loc(#loc89) + %tmp1 = arith.extsi %tmp0_18 : tensor<1x32xi32> to tensor<1x32xi64> loc(#loc90) + %tmp4 = arith.addi %_tmp3_7, %tmp1 : tensor<1x32xi64> loc(#loc91) + %_tmp3_19 = arith.select %tmp0_17, %tmp4, %_tmp3_7 : tensor<1x32xi1>, tensor<1x32xi64> loc(#loc92) + scf.yield %_tmp3_19 : tensor<1x32xi64> loc(#loc17) + } loc(#loc82) + %tmp3 = "tt.reduce"(%_tmp3) <{axis = 1 : i32}> ({ + ^bb0(%tmp3_7: i64 loc(callsite(#loc2 at #loc93)), %tmp3_8: i64 loc(callsite(#loc2 at #loc93))): + %tmp3_9 = arith.addi %tmp3_7, %tmp3_8 : i64 loc(#loc135) + tt.reduce.return %tmp3_9 : i64 loc(#loc125) + }) : (tensor<1x32xi64>) -> tensor<1xi64> loc(#loc125) + %tmp3_6 = tt.expand_dims %tmp3 {axis = 1 : i32} : tensor<1xi64> -> tensor<1x1xi64> loc(#loc94) + %tmp5 = arith.trunci %tmp3_6 : tensor<1x1xi64> to tensor<1x1xi32> loc(#loc95) + %0 = tt.addptr %out_ptr1, %xoffset : !tt.ptr, i32 loc(#loc23) + %1 = tt.splat %0 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc23) + tt.store %1, %tmp5, %xmask_4 : tensor<1x1x!tt.ptr> loc(#loc24) + scf.for %r0_offset = %c0_i32 to %r0_numel step %c32_i32 : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x32xi32> loc(#loc96) + %r0_index_7 = arith.addi %r0_index, %r0_base_5 : tensor<1x32xi32> loc(#loc96) + %r0_mask = tt.splat %r0_numel : i32 -> tensor<1x32xi32> loc(#loc97) + %r0_mask_8 = arith.cmpi slt, %r0_index_7, %r0_mask : tensor<1x32xi32> loc(#loc97) + %tmp6 = arith.cmpi sle, %ks0, %c1_i64 : i64 loc(#loc98) + %tmp6_9 = arith.cmpi sgt, %ks0, %c1_i64 : i64 loc(#loc99) + %tmp6_10 = arith.extui %tmp6_9 : i1 to i64 loc(#loc100) + %tmp6_11 = arith.muli %ks0, %tmp6_10 : i64 loc(#loc100) + %tmp6_12 = arith.extui %tmp6 : i1 to i64 loc(#loc127) + %tmp6_13 = arith.addi %tmp6_12, %tmp6_11 : i64 loc(#loc101) + %tmp6_14 = arith.extsi %xoffset 
: i32 to i64 loc(#loc103) + %tmp6_15 = arith.muli %tmp6_14, %tmp6_13 : i64 loc(#loc103) + %tmp6_16 = arith.extsi %r0_index_7 : tensor<1x32xi32> to tensor<1x32xi64> loc(#loc104) + %tmp6_17 = tt.splat %tmp6_15 : i64 -> tensor<1x32xi64> loc(#loc128) + %tmp6_18 = arith.addi %tmp6_16, %tmp6_17 : tensor<1x32xi64> loc(#loc104) + %tmp6_19 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x32x!tt.ptr> loc(#loc105) + %tmp6_20 = tt.addptr %tmp6_19, %tmp6_18 : tensor<1x32x!tt.ptr>, tensor<1x32xi64> loc(#loc105) + %tmp6_21 = tt.splat %xmask_3 : i1 -> tensor<1x32xi1> loc(#loc129) + %tmp6_22 = arith.andi %r0_mask_8, %tmp6_21 : tensor<1x32xi1> loc(#loc106) + %tmp6_23 = tt.load %tmp6_20, %tmp6_22, %cst_2 evictionPolicy = evict_first : tensor<1x32x!tt.ptr> loc(#loc107) + %tmp7 = arith.trunci %tmp6_23 : tensor<1x32xi64> to tensor<1x32xi32> loc(#loc108) + %tmp9 = tt.broadcast %tmp5 : tensor<1x1xi32> -> tensor<1x32xi32> loc(#loc109) + %tmp9_24 = arith.cmpi slt, %r0_index_7, %tmp9 : tensor<1x32xi32> loc(#loc109) + %tmp11 = arith.extsi %tmp7 : tensor<1x32xi32> to tensor<1x32xi64> loc(#loc110) + %tmp11_25 = tt.splat %ks0 : i64 -> tensor<1x32xi64> loc(#loc110) + %tmp11_26 = arith.select %tmp9_24, %tmp11, %tmp11_25 : tensor<1x32xi1>, tensor<1x32xi64> loc(#loc110) + %tmp12 = arith.addi %ks0, %c1_i64 : i64 loc(#loc111) + %tmp13 = tt.splat %tmp12 : i64 -> tensor<1x32xi64> loc(#loc112) + %tmp13_27 = arith.addi %tmp11_26, %tmp13 : tensor<1x32xi64> loc(#loc112) + %tmp14 = arith.cmpi slt, %tmp11_26, %cst_2 : tensor<1x32xi64> loc(#loc113) + %tmp15 = arith.select %tmp14, %tmp13_27, %tmp11_26 : tensor<1x32xi1>, tensor<1x32xi64> loc(#loc114) + %2 = arith.cmpi sge, %tmp15, %cst_2 : tensor<1x32xi64> loc(#loc45) + %3 = arith.addi %ks1, %c127_i64 : i64 loc(#loc46) + %quot = arith.divsi %3, %c128_i64 : i64 loc(#loc130) + %remainder = arith.remsi %3, %c128_i64 : i64 loc(#loc131) + %fixed = arith.cmpi ne, %remainder, %c0_i64 : i64 loc(#loc132) + %fixed_28 = arith.subi %quot, %c1_i64 : i64 loc(#loc133) + %fixed_29 = 
arith.select %fixed, %fixed_28, %quot : i64 loc(#loc134) + %4 = arith.cmpi slt, %3, %c0_i64 : i64 loc(#loc120) + %5 = arith.select %4, %fixed_29, %quot : i64 loc(#loc121) + %6 = arith.addi %5, %c1_i64 : i64 loc(#loc55) + %7 = tt.splat %6 : i64 -> tensor<1x32xi64> loc(#loc56) + %8 = arith.cmpi slt, %tmp15, %7 : tensor<1x32xi64> loc(#loc56) + %9 = arith.andi %2, %8 : tensor<1x32xi1> loc(#loc57) + %10 = arith.xori %tmp6_22, %cst_1 : tensor<1x32xi1> loc(#loc58) + %11 = arith.ori %9, %10 : tensor<1x32xi1> loc(#loc59) + tt.assert %11, "index out of bounds: 0 <= tmp15 < 1 + (triton_helpers.div_floor_integer(127 + ks1, 128))" : tensor<1x32xi1> loc(#loc60) + %12 = tt.splat %out_ptr2 : !tt.ptr -> tensor<1x32x!tt.ptr> loc(#loc61) + %13 = tt.addptr %12, %tmp6_18 : tensor<1x32x!tt.ptr>, tensor<1x32xi64> loc(#loc61) + tt.store %13, %tmp7, %tmp6_22 : tensor<1x32x!tt.ptr> loc(#loc62) + %14 = tt.splat %tmp6_14 : i64 -> tensor<1x32xi64> loc(#loc63) + %15 = arith.addi %tmp15, %14 : tensor<1x32xi64> loc(#loc63) + %16 = arith.muli %ks0, %tmp6_14 : i64 loc(#loc64) + %17 = tt.splat %16 : i64 -> tensor<1x32xi64> loc(#loc122) + %18 = arith.addi %15, %17 : tensor<1x32xi64> loc(#loc65) + %19 = tt.splat %out_ptr3 : !tt.ptr -> tensor<1x32x!tt.ptr> loc(#loc66) + %20 = tt.addptr %19, %18 : tensor<1x32x!tt.ptr>, tensor<1x32xi64> loc(#loc66) + tt.store %20, %cst_0, %tmp6_22 : tensor<1x32x!tt.ptr> loc(#loc67) + } loc(#loc25) + tt.return loc(#loc68) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":24:21) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":22:28) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":25:27) +#loc5 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":25:37) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":29:40) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":30:31) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":31:29) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":35:45) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":35:41) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":35:34) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":35:60) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":35:50) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":36:23) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":38:23) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":39:48) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":39:8) +#loc18 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc20 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc21 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":40:28) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":41:19) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":42:25) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":42:36) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":43:40) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":44:31) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":45:29) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":49:60) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":49:86) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":49:77) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":49:68) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":49:52) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":49:45) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":49:41) +#loc35 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":49:34) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":49:103) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":49:93) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":50:23) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":52:22) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":54:37) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":55:20) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":56:24) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":57:24) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":58:39) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":59:32) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":59:94) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":72:16) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":59:100) +#loc49 = 
loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":73:20) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:34) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:44) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:47) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:25) +#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:47) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":59:55) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":59:50) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":59:42) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":59:112) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":59:110) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":59:130) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":61:29) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":61:94) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":62:53) +#loc64 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":62:62) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":62:58) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":62:29) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":62:95) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/l7/cl7uoyo4r2qj6bwjgyfp2qoiszzxw4xmxolbpem6a4fujmllivos.py":43:4) +#loc78 = loc("xmask"(#loc1)) +#loc79 = loc("xoffset"(#loc3)) +#loc80 = loc("r0_base"(#loc4)) +#loc81 = loc("r0_base"(#loc5)) +#loc82 = loc("_tmp3"(#loc6)) +#loc83 = loc("r0_index"(#loc7)) +#loc84 = loc("r0_mask"(#loc8)) +#loc85 = loc("tmp0"(#loc9)) +#loc86 = loc("tmp0"(#loc10)) +#loc87 = loc("tmp0"(#loc11)) +#loc88 = loc("tmp0"(#loc12)) +#loc89 = loc("tmp0"(#loc13)) +#loc90 = loc("tmp1"(#loc14)) +#loc91 = loc("tmp4"(#loc15)) +#loc92 = loc("_tmp3"(#loc16)) +#loc94 = loc("tmp3"(#loc21)) +#loc95 = loc("tmp5"(#loc22)) +#loc96 = loc("r0_index"(#loc26)) +#loc97 = loc("r0_mask"(#loc27)) +#loc98 = loc("tmp6"(#loc28)) +#loc99 = loc("tmp6"(#loc29)) +#loc100 = loc("tmp6"(#loc30)) +#loc101 = loc("tmp6"(#loc31)) +#loc102 = loc("tmp6"(#loc32)) +#loc103 = loc("tmp6"(#loc33)) +#loc104 = loc("tmp6"(#loc34)) +#loc105 = loc("tmp6"(#loc35)) +#loc106 = loc("tmp6"(#loc36)) +#loc107 = loc("tmp6"(#loc37)) +#loc108 = loc("tmp7"(#loc38)) +#loc109 = loc("tmp9"(#loc39)) +#loc110 = loc("tmp11"(#loc40)) +#loc111 = loc("tmp12"(#loc41)) +#loc112 = loc("tmp13"(#loc42)) +#loc113 = loc("tmp14"(#loc43)) +#loc114 = loc("tmp15"(#loc44)) +#loc115 = loc("quot"(#loc47)) +#loc116 = loc("remainder"(#loc49)) +#loc117 = loc("fixed"(#loc50)) +#loc118 = loc("fixed"(#loc51)) +#loc119 = loc("fixed"(#loc52)) +#loc120 = loc(callsite(#loc53 at #loc48)) 
+#loc121 = loc(callsite(#loc54 at #loc48)) +#loc122 = loc(fused[#loc65, #loc64]) +#loc123 = loc(fused[#loc86, #loc85]) +#loc124 = loc(fused[#loc88, #loc78]) +#loc125 = loc(callsite(#loc18 at #loc93)) +#loc127 = loc(fused[#loc101, #loc102]) +#loc128 = loc(fused[#loc104, #loc103]) +#loc129 = loc(fused[#loc106, #loc78]) +#loc130 = loc(callsite(#loc115 at #loc48)) +#loc131 = loc(callsite(#loc116 at #loc48)) +#loc132 = loc(callsite(#loc117 at #loc48)) +#loc133 = loc(callsite(#loc118 at #loc48)) +#loc134 = loc(callsite(#loc119 at #loc48)) +#loc135 = loc(callsite(#loc20 at #loc125)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/LB6GSIBNROJIH45A7E7GR2D3MKLQMRXRW4JCJKBX5IKCUYNTUE6Q/__grp__triton_red_fused_argmax_1.json b/SpecForge-ext/cache/compiled_kernels/triton/7/LB6GSIBNROJIH45A7E7GR2D3MKLQMRXRW4JCJKBX5IKCUYNTUE6Q/__grp__triton_red_fused_argmax_1.json new file mode 100644 index 0000000000000000000000000000000000000000..025db0ac5d03ad4b4b526d39ab652e667271ee59 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/LB6GSIBNROJIH45A7E7GR2D3MKLQMRXRW4JCJKBX5IKCUYNTUE6Q/__grp__triton_red_fused_argmax_1.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused_argmax_1.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/LB6GSIBNROJIH45A7E7GR2D3MKLQMRXRW4JCJKBX5IKCUYNTUE6Q/triton_red_fused_argmax_1.source", "triton_red_fused_argmax_1.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/LB6GSIBNROJIH45A7E7GR2D3MKLQMRXRW4JCJKBX5IKCUYNTUE6Q/triton_red_fused_argmax_1.ttir", "triton_red_fused_argmax_1.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/LB6GSIBNROJIH45A7E7GR2D3MKLQMRXRW4JCJKBX5IKCUYNTUE6Q/triton_red_fused_argmax_1.ttgir", "triton_red_fused_argmax_1.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/LB6GSIBNROJIH45A7E7GR2D3MKLQMRXRW4JCJKBX5IKCUYNTUE6Q/triton_red_fused_argmax_1.llir", "triton_red_fused_argmax_1.ptx": 
"/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/LB6GSIBNROJIH45A7E7GR2D3MKLQMRXRW4JCJKBX5IKCUYNTUE6Q/triton_red_fused_argmax_1.ptx", "triton_red_fused_argmax_1.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/LB6GSIBNROJIH45A7E7GR2D3MKLQMRXRW4JCJKBX5IKCUYNTUE6Q/triton_red_fused_argmax_1.cubin", "triton_red_fused_argmax_1.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/LB6GSIBNROJIH45A7E7GR2D3MKLQMRXRW4JCJKBX5IKCUYNTUE6Q/triton_red_fused_argmax_1.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/LB6GSIBNROJIH45A7E7GR2D3MKLQMRXRW4JCJKBX5IKCUYNTUE6Q/triton_red_fused_argmax_1.cubin b/SpecForge-ext/cache/compiled_kernels/triton/7/LB6GSIBNROJIH45A7E7GR2D3MKLQMRXRW4JCJKBX5IKCUYNTUE6Q/triton_red_fused_argmax_1.cubin new file mode 100644 index 0000000000000000000000000000000000000000..acb939e23d6e8a5dec5c91ca1fc4cd21fd77679a Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/7/LB6GSIBNROJIH45A7E7GR2D3MKLQMRXRW4JCJKBX5IKCUYNTUE6Q/triton_red_fused_argmax_1.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/LB6GSIBNROJIH45A7E7GR2D3MKLQMRXRW4JCJKBX5IKCUYNTUE6Q/triton_red_fused_argmax_1.json b/SpecForge-ext/cache/compiled_kernels/triton/7/LB6GSIBNROJIH45A7E7GR2D3MKLQMRXRW4JCJKBX5IKCUYNTUE6Q/triton_red_fused_argmax_1.json new file mode 100644 index 0000000000000000000000000000000000000000..9b69ae2a32b055467b8df54df6adaf9195b25019 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/LB6GSIBNROJIH45A7E7GR2D3MKLQMRXRW4JCJKBX5IKCUYNTUE6Q/triton_red_fused_argmax_1.json @@ -0,0 +1 @@ +{"hash": "587c69202d8b9283f3a0f93e68e87b62970646f1b71224a837ea142a61b3a13d", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 16, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, 
"launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 1024, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused_argmax_1"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/LB6GSIBNROJIH45A7E7GR2D3MKLQMRXRW4JCJKBX5IKCUYNTUE6Q/triton_red_fused_argmax_1.llir b/SpecForge-ext/cache/compiled_kernels/triton/7/LB6GSIBNROJIH45A7E7GR2D3MKLQMRXRW4JCJKBX5IKCUYNTUE6Q/triton_red_fused_argmax_1.llir new file mode 100644 index 0000000000000000000000000000000000000000..21ec6ef35e757515a96eb344cc9a6c100898ad6e --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/LB6GSIBNROJIH45A7E7GR2D3MKLQMRXRW4JCJKBX5IKCUYNTUE6Q/triton_red_fused_argmax_1.llir @@ -0,0 +1,1187 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@global_smem = external addrspace(3) global [0 x i8], align 16 + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused_argmax_1(ptr addrspace(1) %0, ptr addrspace(1) %1, i64 %2, i64 %3, i32 %4, i32 %5, ptr addrspace(1) readnone captures(none) %6, ptr addrspace(1) readnone captures(none) %7) local_unnamed_addr #0 !dbg !4 { + %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %10 = tail call i32 
@llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8 + %11 = shl i32 %9, 3, !dbg !9 + %12 = or disjoint i32 %11, 1, !dbg !10 + %13 = or disjoint i32 %11, 2, !dbg !10 + %14 = or disjoint i32 %11, 3, !dbg !10 + %15 = insertelement <4 x i32> poison, i32 %11, i64 0, !dbg !10 + %16 = shufflevector <4 x i32> %15, <4 x i32> poison, <4 x i32> zeroinitializer, !dbg !10 + %17 = or disjoint <4 x i32> %16, , !dbg !10 + %18 = insertelement <8 x i32> poison, i32 %14, i64 4, !dbg !11 + %19 = insertelement <8 x i32> %18, i32 %13, i64 5, !dbg !11 + %20 = insertelement <8 x i32> %19, i32 %12, i64 6, !dbg !11 + %21 = insertelement <8 x i32> %20, i32 %11, i64 7, !dbg !11 + %22 = shufflevector <4 x i32> %17, <4 x i32> poison, <8 x i32> , !dbg !11 + %23 = shufflevector <8 x i32> %22, <8 x i32> %21, <8 x i32> , !dbg !11 + %24 = insertelement <8 x i32> poison, i32 %4, i64 0, !dbg !11 + %25 = shufflevector <8 x i32> %24, <8 x i32> poison, <8 x i32> zeroinitializer, !dbg !11 + %26 = icmp slt <8 x i32> %23, %25, !dbg !11 + %27 = and i32 %10, 511, !dbg !12 + %28 = sext i32 %11 to i64, !dbg !13 + %29 = sext i32 %12 to i64, !dbg !13 + %30 = sext i32 %13 to i64, !dbg !13 + %31 = sext i32 %14 to i64, !dbg !13 + %32 = extractelement <4 x i32> %17, i64 3, !dbg !13 + %33 = sext i32 %32 to i64, !dbg !13 + %34 = extractelement <4 x i32> %17, i64 2, !dbg !13 + %35 = sext i32 %34 to i64, !dbg !13 + %36 = extractelement <4 x i32> %17, i64 1, !dbg !13 + %37 = sext i32 %36 to i64, !dbg !13 + %38 = extractelement <4 x i32> %17, i64 0, !dbg !13 + %39 = sext i32 %38 to i64, !dbg !13 + %.frozen = freeze i64 %2, !dbg !14 + %40 = sdiv i64 %28, %.frozen, !dbg !14 + %41 = mul i64 %40, %.frozen, !dbg !13 + %.decomposed = sub i64 %28, %41, !dbg !13 + %.frozen70 = freeze i64 %2, !dbg !14 + %42 = sdiv i64 %29, %.frozen70, !dbg !14 + %43 = mul i64 %42, %.frozen70, !dbg !13 + %.decomposed71 = sub i64 %29, %43, !dbg !13 + %.frozen72 = freeze i64 %2, !dbg !14 + %44 = sdiv i64 %30, %.frozen72, !dbg !14 + %45 = mul i64 %44, 
%.frozen72, !dbg !13 + %.decomposed73 = sub i64 %30, %45, !dbg !13 + %.frozen74 = freeze i64 %2, !dbg !14 + %46 = sdiv i64 %31, %.frozen74, !dbg !14 + %47 = mul i64 %46, %.frozen74, !dbg !13 + %.decomposed75 = sub i64 %31, %47, !dbg !13 + %.frozen76 = freeze i64 %33, !dbg !14 + %.frozen77 = freeze i64 %2, !dbg !14 + %48 = sdiv i64 %.frozen76, %.frozen77, !dbg !14 + %49 = mul i64 %48, %.frozen77, !dbg !13 + %.decomposed78 = sub i64 %.frozen76, %49, !dbg !13 + %.frozen79 = freeze i64 %35, !dbg !14 + %.frozen80 = freeze i64 %2, !dbg !14 + %50 = sdiv i64 %.frozen79, %.frozen80, !dbg !14 + %51 = mul i64 %50, %.frozen80, !dbg !13 + %.decomposed81 = sub i64 %.frozen79, %51, !dbg !13 + %.frozen82 = freeze i64 %37, !dbg !14 + %.frozen83 = freeze i64 %2, !dbg !14 + %52 = sdiv i64 %.frozen82, %.frozen83, !dbg !14 + %53 = mul i64 %52, %.frozen83, !dbg !13 + %.decomposed84 = sub i64 %.frozen82, %53, !dbg !13 + %.frozen85 = freeze i64 %39, !dbg !14 + %.frozen86 = freeze i64 %2, !dbg !14 + %54 = sdiv i64 %.frozen85, %.frozen86, !dbg !14 + %55 = mul i64 %54, %.frozen86, !dbg !13 + %.decomposed87 = sub i64 %.frozen85, %55, !dbg !13 + %56 = mul i64 %40, %3, !dbg !15 + %57 = mul i64 %42, %3, !dbg !15 + %58 = mul i64 %44, %3, !dbg !15 + %59 = mul i64 %46, %3, !dbg !15 + %60 = mul i64 %48, %3, !dbg !15 + %61 = mul i64 %50, %3, !dbg !15 + %62 = mul i64 %52, %3, !dbg !15 + %63 = mul i64 %54, %3, !dbg !15 + %.idx = mul nsw i64 %.decomposed, 128000 + %64 = getelementptr i8, ptr addrspace(1) %0, i64 %.idx + %invariant.gep = getelementptr float, ptr addrspace(1) %64, i64 %56, !dbg !16 + %.idx1 = mul nsw i64 %.decomposed71, 128000 + %65 = getelementptr i8, ptr addrspace(1) %0, i64 %.idx1 + %invariant.gep9 = getelementptr float, ptr addrspace(1) %65, i64 %57, !dbg !16 + %.idx2 = mul nsw i64 %.decomposed73, 128000 + %66 = getelementptr i8, ptr addrspace(1) %0, i64 %.idx2 + %invariant.gep11 = getelementptr float, ptr addrspace(1) %66, i64 %58, !dbg !16 + %.idx3 = mul nsw i64 %.decomposed75, 
128000 + %67 = getelementptr i8, ptr addrspace(1) %0, i64 %.idx3 + %invariant.gep13 = getelementptr float, ptr addrspace(1) %67, i64 %59, !dbg !16 + %.idx4 = mul nsw i64 %.decomposed78, 128000 + %68 = getelementptr i8, ptr addrspace(1) %0, i64 %.idx4 + %invariant.gep15 = getelementptr float, ptr addrspace(1) %68, i64 %60, !dbg !16 + %.idx5 = mul nsw i64 %.decomposed81, 128000 + %69 = getelementptr i8, ptr addrspace(1) %0, i64 %.idx5 + %invariant.gep17 = getelementptr float, ptr addrspace(1) %69, i64 %61, !dbg !16 + %.idx6 = mul nsw i64 %.decomposed84, 128000 + %70 = getelementptr i8, ptr addrspace(1) %0, i64 %.idx6 + %invariant.gep19 = getelementptr float, ptr addrspace(1) %70, i64 %62, !dbg !16 + %.idx7 = mul nsw i64 %.decomposed87, 128000 + %71 = getelementptr i8, ptr addrspace(1) %0, i64 %.idx7 + %invariant.gep21 = getelementptr float, ptr addrspace(1) %71, i64 %63, !dbg !16 + %72 = zext nneg i32 %27 to i64, !dbg !16 + br label %73, !dbg !16 + +73: ; preds = %8, %73 + %indvars.iv = phi i64 [ 0, %8 ], [ %indvars.iv.next, %73 ] + %74 = phi <8 x float> [ splat (float 0xFFF0000000000000), %8 ], [ %134, %73 ] + %75 = phi <8 x i32> [ splat (i32 2147483647), %8 ], [ %135, %73 ] + %76 = or disjoint i64 %indvars.iv, %72, !dbg !17 + %77 = icmp samesign ult i64 %76, 32000, !dbg !18 + %gep = getelementptr float, ptr addrspace(1) %invariant.gep, i64 %76, !dbg !19 + %gep10 = getelementptr float, ptr addrspace(1) %invariant.gep9, i64 %76, !dbg !19 + %gep12 = getelementptr float, ptr addrspace(1) %invariant.gep11, i64 %76, !dbg !19 + %gep14 = getelementptr float, ptr addrspace(1) %invariant.gep13, i64 %76, !dbg !19 + %gep16 = getelementptr float, ptr addrspace(1) %invariant.gep15, i64 %76, !dbg !19 + %gep18 = getelementptr float, ptr addrspace(1) %invariant.gep17, i64 %76, !dbg !19 + %gep20 = getelementptr float, ptr addrspace(1) %invariant.gep19, i64 %76, !dbg !19 + %gep22 = getelementptr float, ptr addrspace(1) %invariant.gep21, i64 %76, !dbg !19 + %78 = tail call i64 asm 
sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !20 + %79 = fcmp uno <8 x float> %74, zeroinitializer, !dbg !21 + %80 = trunc nuw nsw i64 %76 to i32, !dbg !25 + %81 = insertelement <8 x i1> poison, i1 %77, i64 0, !dbg !26 + %82 = shufflevector <8 x i1> %81, <8 x i1> poison, <8 x i32> zeroinitializer, !dbg !26 + %83 = and <8 x i1> %26, %82, !dbg !26 + %84 = extractelement <8 x i1> %83, i64 7, !dbg !20 + %85 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %gep, i64 %78, i1 %84) #4, !dbg !20 + %86 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !20 + %87 = extractelement <8 x i1> %83, i64 6, !dbg !20 + %88 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %gep10, i64 %86, i1 %87) #4, !dbg !20 + %89 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !20 + %90 = extractelement <8 x i1> %83, i64 5, !dbg !20 + %91 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %gep12, i64 %89, i1 %90) #4, !dbg !20 + %92 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !20 + %93 = extractelement <8 x i1> %83, i64 4, !dbg !20 + %94 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %gep14, i64 %92, i1 %93) #4, !dbg !20 + %95 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !20 + %96 
= extractelement <8 x i1> %83, i64 3, !dbg !20 + %97 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %gep16, i64 %95, i1 %96) #4, !dbg !20 + %98 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !20 + %99 = extractelement <8 x i1> %83, i64 2, !dbg !20 + %100 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %gep18, i64 %98, i1 %99) #4, !dbg !20 + %101 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !20 + %102 = extractelement <8 x i1> %83, i64 1, !dbg !20 + %103 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %gep20, i64 %101, i1 %102) #4, !dbg !20 + %104 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !20 + %105 = extractelement <8 x i1> %83, i64 0, !dbg !20 + %106 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %gep22, i64 %104, i1 %105) #4, !dbg !20 + %107 = insertelement <8 x i32> poison, i32 %106, i64 0, !dbg !20 + %108 = insertelement <8 x i32> %107, i32 %103, i64 1, !dbg !20 + %109 = insertelement <8 x i32> %108, i32 %100, i64 2, !dbg !20 + %110 = insertelement <8 x i32> %109, i32 %97, i64 3, !dbg !20 + %111 = insertelement <8 x i32> %110, i32 %94, i64 4, !dbg !20 + %112 = insertelement <8 x i32> %111, i32 %91, i64 5, !dbg !20 + %113 = insertelement <8 x i32> %112, i32 %88, i64 6, !dbg !20 + %114 = insertelement <8 x i32> %113, i32 %85, i64 7, !dbg !20 + %115 = bitcast <8 x 
i32> %114 to <8 x float>, !dbg !20 + %116 = fcmp ogt <8 x float> %74, %115, !dbg !27 + %117 = fcmp oeq <8 x float> %74, %115, !dbg !28 + %118 = fcmp uno <8 x float> %115, zeroinitializer, !dbg !29 + %119 = xor <8 x i1> %118, splat (i1 true), !dbg !30 + %120 = and <8 x i1> %79, %119, !dbg !31 + %121 = or <8 x i1> %116, %120, !dbg !32 + %122 = and <8 x i1> %79, %118, !dbg !33 + %123 = or <8 x i1> %117, %122, !dbg !34 + %124 = insertelement <8 x i64> poison, i64 %76, i64 0, !dbg !35 + %125 = shufflevector <8 x i64> %124, <8 x i64> poison, <8 x i32> zeroinitializer, !dbg !35 + %126 = sext <8 x i32> %75 to <8 x i64>, !dbg !35 + %127 = icmp sgt <8 x i64> %125, %126, !dbg !35 + %128 = and <8 x i1> %127, %123, !dbg !36 + %129 = or <8 x i1> %121, %128, !dbg !37 + %130 = select <8 x i1> %129, <8 x float> %74, <8 x float> %115, !dbg !38 + %131 = insertelement <8 x i32> poison, i32 %80, i64 0, !dbg !25 + %132 = shufflevector <8 x i32> %131, <8 x i32> poison, <8 x i32> zeroinitializer, !dbg !25 + %133 = select <8 x i1> %129, <8 x i32> %75, <8 x i32> %132, !dbg !25 + %134 = select <8 x i1> %83, <8 x float> %130, <8 x float> %74, !dbg !39 + %135 = select <8 x i1> %83, <8 x i32> %133, <8 x i32> %75, !dbg !40 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 512, !dbg !16 + %136 = icmp samesign ult i64 %indvars.iv, 31488, !dbg !16 + br i1 %136, label %73, label %137, !dbg !16 + +137: ; preds = %73 + %138 = and i32 %10, 7, !dbg !8 + %139 = or disjoint i32 %11, %138, !dbg !10 + %140 = icmp slt i32 %139, %4, !dbg !11 + %141 = and i32 %10, 31, !dbg !8 + %142 = lshr i32 %10, 5, !dbg !8 + %143 = extractelement <8 x float> %134, i64 7, !dbg !41 + %144 = bitcast float %143 to i32, !dbg !41 + %145 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %144, i32 16, i32 31), !dbg !41 + %146 = bitcast i32 %145 to float, !dbg !41 + %147 = extractelement <8 x i32> %135, i64 7, !dbg !41 + %148 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %147, i32 16, i32 31), !dbg !41 + %149 = 
fcmp ogt float %143, %146, !dbg !43 + %150 = fcmp oeq float %143, %146, !dbg !44 + %151 = fcmp uno <8 x float> %134, zeroinitializer, !dbg !45 + %152 = fcmp uno float %146, 0.000000e+00, !dbg !46 + %153 = xor i1 %152, true, !dbg !47 + %154 = extractelement <8 x i1> %151, i64 7, !dbg !48 + %155 = and i1 %154, %153, !dbg !49 + %156 = or i1 %149, %155, !dbg !50 + %157 = and i1 %154, %152, !dbg !48 + %158 = or i1 %150, %157, !dbg !51 + %159 = icmp slt i32 %147, %148, !dbg !52 + %160 = and i1 %159, %158, !dbg !53 + %161 = or i1 %156, %160, !dbg !54 + %162 = select i1 %161, float %143, float %146, !dbg !55 + %163 = select i1 %161, i32 %147, i32 %148, !dbg !56 + %164 = bitcast float %162 to i32, !dbg !41 + %165 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %164, i32 8, i32 31), !dbg !41 + %166 = bitcast i32 %165 to float, !dbg !41 + %167 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %163, i32 8, i32 31), !dbg !41 + %168 = fcmp ogt float %162, %166, !dbg !43 + %169 = fcmp oeq float %162, %166, !dbg !44 + %170 = fcmp uno float %162, 0.000000e+00, !dbg !45 + %171 = fcmp uno float %166, 0.000000e+00, !dbg !46 + %172 = xor i1 %171, true, !dbg !47 + %173 = and i1 %170, %172, !dbg !49 + %174 = or i1 %168, %173, !dbg !50 + %175 = and i1 %171, %170, !dbg !48 + %176 = or i1 %169, %175, !dbg !51 + %177 = icmp slt i32 %163, %167, !dbg !52 + %178 = and i1 %177, %176, !dbg !53 + %179 = or i1 %174, %178, !dbg !54 + %180 = select i1 %179, float %162, float %166, !dbg !55 + %181 = select i1 %179, i32 %163, i32 %167, !dbg !56 + %182 = bitcast float %180 to i32, !dbg !41 + %183 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %182, i32 4, i32 31), !dbg !41 + %184 = bitcast i32 %183 to float, !dbg !41 + %185 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %181, i32 4, i32 31), !dbg !41 + %186 = fcmp ogt float %180, %184, !dbg !43 + %187 = fcmp oeq float %180, %184, !dbg !44 + %188 = fcmp uno float %180, 0.000000e+00, !dbg !45 + %189 = fcmp uno float 
%184, 0.000000e+00, !dbg !46 + %190 = xor i1 %189, true, !dbg !47 + %191 = and i1 %188, %190, !dbg !49 + %192 = or i1 %186, %191, !dbg !50 + %193 = and i1 %189, %188, !dbg !48 + %194 = or i1 %187, %193, !dbg !51 + %195 = icmp slt i32 %181, %185, !dbg !52 + %196 = and i1 %195, %194, !dbg !53 + %197 = or i1 %192, %196, !dbg !54 + %198 = select i1 %197, float %180, float %184, !dbg !55 + %199 = select i1 %197, i32 %181, i32 %185, !dbg !56 + %200 = bitcast float %198 to i32, !dbg !41 + %201 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %200, i32 2, i32 31), !dbg !41 + %202 = bitcast i32 %201 to float, !dbg !41 + %203 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %199, i32 2, i32 31), !dbg !41 + %204 = fcmp ogt float %198, %202, !dbg !43 + %205 = fcmp oeq float %198, %202, !dbg !44 + %206 = fcmp uno float %198, 0.000000e+00, !dbg !45 + %207 = fcmp uno float %202, 0.000000e+00, !dbg !46 + %208 = xor i1 %207, true, !dbg !47 + %209 = and i1 %206, %208, !dbg !49 + %210 = or i1 %204, %209, !dbg !50 + %211 = and i1 %207, %206, !dbg !48 + %212 = or i1 %205, %211, !dbg !51 + %213 = icmp slt i32 %199, %203, !dbg !52 + %214 = and i1 %213, %212, !dbg !53 + %215 = or i1 %210, %214, !dbg !54 + %216 = select i1 %215, float %198, float %202, !dbg !55 + %217 = select i1 %215, i32 %199, i32 %203, !dbg !56 + %218 = bitcast float %216 to i32, !dbg !41 + %219 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %218, i32 1, i32 31), !dbg !41 + %220 = bitcast i32 %219 to float, !dbg !41 + %221 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %217, i32 1, i32 31), !dbg !41 + %222 = fcmp ogt float %216, %220, !dbg !43 + %223 = fcmp oeq float %216, %220, !dbg !44 + %224 = fcmp uno float %216, 0.000000e+00, !dbg !45 + %225 = fcmp uno float %220, 0.000000e+00, !dbg !46 + %226 = xor i1 %225, true, !dbg !47 + %227 = and i1 %224, %226, !dbg !49 + %228 = or i1 %222, %227, !dbg !50 + %229 = and i1 %225, %224, !dbg !48 + %230 = or i1 %223, %229, !dbg !51 + %231 = 
icmp slt i32 %217, %221, !dbg !52 + %232 = and i1 %231, %230, !dbg !53 + %233 = or i1 %228, %232, !dbg !54 + %234 = select i1 %233, i32 %217, i32 %221, !dbg !56 + %235 = extractelement <8 x float> %134, i64 6, !dbg !41 + %236 = bitcast float %235 to i32, !dbg !41 + %237 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %236, i32 16, i32 31), !dbg !41 + %238 = bitcast i32 %237 to float, !dbg !41 + %239 = extractelement <8 x i32> %135, i64 6, !dbg !41 + %240 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %239, i32 16, i32 31), !dbg !41 + %241 = fcmp ogt float %235, %238, !dbg !43 + %242 = fcmp oeq float %235, %238, !dbg !44 + %243 = fcmp uno float %238, 0.000000e+00, !dbg !46 + %244 = xor i1 %243, true, !dbg !47 + %245 = extractelement <8 x i1> %151, i64 6, !dbg !48 + %246 = and i1 %245, %244, !dbg !49 + %247 = or i1 %241, %246, !dbg !50 + %248 = and i1 %245, %243, !dbg !48 + %249 = or i1 %242, %248, !dbg !51 + %250 = icmp slt i32 %239, %240, !dbg !52 + %251 = and i1 %250, %249, !dbg !53 + %252 = or i1 %247, %251, !dbg !54 + %253 = select i1 %252, float %235, float %238, !dbg !55 + %254 = select i1 %252, i32 %239, i32 %240, !dbg !56 + %255 = bitcast float %253 to i32, !dbg !41 + %256 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %255, i32 8, i32 31), !dbg !41 + %257 = bitcast i32 %256 to float, !dbg !41 + %258 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %254, i32 8, i32 31), !dbg !41 + %259 = fcmp ogt float %253, %257, !dbg !43 + %260 = fcmp oeq float %253, %257, !dbg !44 + %261 = fcmp uno float %253, 0.000000e+00, !dbg !45 + %262 = fcmp uno float %257, 0.000000e+00, !dbg !46 + %263 = xor i1 %262, true, !dbg !47 + %264 = and i1 %261, %263, !dbg !49 + %265 = or i1 %259, %264, !dbg !50 + %266 = and i1 %262, %261, !dbg !48 + %267 = or i1 %260, %266, !dbg !51 + %268 = icmp slt i32 %254, %258, !dbg !52 + %269 = and i1 %268, %267, !dbg !53 + %270 = or i1 %265, %269, !dbg !54 + %271 = select i1 %270, float %253, float %257, !dbg 
!55 + %272 = select i1 %270, i32 %254, i32 %258, !dbg !56 + %273 = bitcast float %271 to i32, !dbg !41 + %274 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %273, i32 4, i32 31), !dbg !41 + %275 = bitcast i32 %274 to float, !dbg !41 + %276 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %272, i32 4, i32 31), !dbg !41 + %277 = fcmp ogt float %271, %275, !dbg !43 + %278 = fcmp oeq float %271, %275, !dbg !44 + %279 = fcmp uno float %271, 0.000000e+00, !dbg !45 + %280 = fcmp uno float %275, 0.000000e+00, !dbg !46 + %281 = xor i1 %280, true, !dbg !47 + %282 = and i1 %279, %281, !dbg !49 + %283 = or i1 %277, %282, !dbg !50 + %284 = and i1 %280, %279, !dbg !48 + %285 = or i1 %278, %284, !dbg !51 + %286 = icmp slt i32 %272, %276, !dbg !52 + %287 = and i1 %286, %285, !dbg !53 + %288 = or i1 %283, %287, !dbg !54 + %289 = select i1 %288, float %271, float %275, !dbg !55 + %290 = select i1 %288, i32 %272, i32 %276, !dbg !56 + %291 = bitcast float %289 to i32, !dbg !41 + %292 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %291, i32 2, i32 31), !dbg !41 + %293 = bitcast i32 %292 to float, !dbg !41 + %294 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %290, i32 2, i32 31), !dbg !41 + %295 = fcmp ogt float %289, %293, !dbg !43 + %296 = fcmp oeq float %289, %293, !dbg !44 + %297 = fcmp uno float %289, 0.000000e+00, !dbg !45 + %298 = fcmp uno float %293, 0.000000e+00, !dbg !46 + %299 = xor i1 %298, true, !dbg !47 + %300 = and i1 %297, %299, !dbg !49 + %301 = or i1 %295, %300, !dbg !50 + %302 = and i1 %298, %297, !dbg !48 + %303 = or i1 %296, %302, !dbg !51 + %304 = icmp slt i32 %290, %294, !dbg !52 + %305 = and i1 %304, %303, !dbg !53 + %306 = or i1 %301, %305, !dbg !54 + %307 = select i1 %306, float %289, float %293, !dbg !55 + %308 = select i1 %306, i32 %290, i32 %294, !dbg !56 + %309 = bitcast float %307 to i32, !dbg !41 + %310 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %309, i32 1, i32 31), !dbg !41 + %311 = bitcast i32 
%310 to float, !dbg !41 + %312 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %308, i32 1, i32 31), !dbg !41 + %313 = fcmp ogt float %307, %311, !dbg !43 + %314 = fcmp oeq float %307, %311, !dbg !44 + %315 = fcmp uno float %307, 0.000000e+00, !dbg !45 + %316 = fcmp uno float %311, 0.000000e+00, !dbg !46 + %317 = xor i1 %316, true, !dbg !47 + %318 = and i1 %315, %317, !dbg !49 + %319 = or i1 %313, %318, !dbg !50 + %320 = and i1 %316, %315, !dbg !48 + %321 = or i1 %314, %320, !dbg !51 + %322 = icmp slt i32 %308, %312, !dbg !52 + %323 = and i1 %322, %321, !dbg !53 + %324 = or i1 %319, %323, !dbg !54 + %325 = select i1 %324, i32 %308, i32 %312, !dbg !56 + %326 = extractelement <8 x float> %134, i64 5, !dbg !41 + %327 = bitcast float %326 to i32, !dbg !41 + %328 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %327, i32 16, i32 31), !dbg !41 + %329 = bitcast i32 %328 to float, !dbg !41 + %330 = extractelement <8 x i32> %135, i64 5, !dbg !41 + %331 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %330, i32 16, i32 31), !dbg !41 + %332 = fcmp ogt float %326, %329, !dbg !43 + %333 = fcmp oeq float %326, %329, !dbg !44 + %334 = fcmp uno float %329, 0.000000e+00, !dbg !46 + %335 = xor i1 %334, true, !dbg !47 + %336 = extractelement <8 x i1> %151, i64 5, !dbg !48 + %337 = and i1 %336, %335, !dbg !49 + %338 = or i1 %332, %337, !dbg !50 + %339 = and i1 %336, %334, !dbg !48 + %340 = or i1 %333, %339, !dbg !51 + %341 = icmp slt i32 %330, %331, !dbg !52 + %342 = and i1 %341, %340, !dbg !53 + %343 = or i1 %338, %342, !dbg !54 + %344 = select i1 %343, float %326, float %329, !dbg !55 + %345 = select i1 %343, i32 %330, i32 %331, !dbg !56 + %346 = bitcast float %344 to i32, !dbg !41 + %347 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %346, i32 8, i32 31), !dbg !41 + %348 = bitcast i32 %347 to float, !dbg !41 + %349 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %345, i32 8, i32 31), !dbg !41 + %350 = fcmp ogt float %344, %348, !dbg 
!43 + %351 = fcmp oeq float %344, %348, !dbg !44 + %352 = fcmp uno float %344, 0.000000e+00, !dbg !45 + %353 = fcmp uno float %348, 0.000000e+00, !dbg !46 + %354 = xor i1 %353, true, !dbg !47 + %355 = and i1 %352, %354, !dbg !49 + %356 = or i1 %350, %355, !dbg !50 + %357 = and i1 %353, %352, !dbg !48 + %358 = or i1 %351, %357, !dbg !51 + %359 = icmp slt i32 %345, %349, !dbg !52 + %360 = and i1 %359, %358, !dbg !53 + %361 = or i1 %356, %360, !dbg !54 + %362 = select i1 %361, float %344, float %348, !dbg !55 + %363 = select i1 %361, i32 %345, i32 %349, !dbg !56 + %364 = bitcast float %362 to i32, !dbg !41 + %365 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %364, i32 4, i32 31), !dbg !41 + %366 = bitcast i32 %365 to float, !dbg !41 + %367 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %363, i32 4, i32 31), !dbg !41 + %368 = fcmp ogt float %362, %366, !dbg !43 + %369 = fcmp oeq float %362, %366, !dbg !44 + %370 = fcmp uno float %362, 0.000000e+00, !dbg !45 + %371 = fcmp uno float %366, 0.000000e+00, !dbg !46 + %372 = xor i1 %371, true, !dbg !47 + %373 = and i1 %370, %372, !dbg !49 + %374 = or i1 %368, %373, !dbg !50 + %375 = and i1 %371, %370, !dbg !48 + %376 = or i1 %369, %375, !dbg !51 + %377 = icmp slt i32 %363, %367, !dbg !52 + %378 = and i1 %377, %376, !dbg !53 + %379 = or i1 %374, %378, !dbg !54 + %380 = select i1 %379, float %362, float %366, !dbg !55 + %381 = select i1 %379, i32 %363, i32 %367, !dbg !56 + %382 = bitcast float %380 to i32, !dbg !41 + %383 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %382, i32 2, i32 31), !dbg !41 + %384 = bitcast i32 %383 to float, !dbg !41 + %385 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %381, i32 2, i32 31), !dbg !41 + %386 = fcmp ogt float %380, %384, !dbg !43 + %387 = fcmp oeq float %380, %384, !dbg !44 + %388 = fcmp uno float %380, 0.000000e+00, !dbg !45 + %389 = fcmp uno float %384, 0.000000e+00, !dbg !46 + %390 = xor i1 %389, true, !dbg !47 + %391 = and i1 %388, %390, 
!dbg !49 + %392 = or i1 %386, %391, !dbg !50 + %393 = and i1 %389, %388, !dbg !48 + %394 = or i1 %387, %393, !dbg !51 + %395 = icmp slt i32 %381, %385, !dbg !52 + %396 = and i1 %395, %394, !dbg !53 + %397 = or i1 %392, %396, !dbg !54 + %398 = select i1 %397, float %380, float %384, !dbg !55 + %399 = select i1 %397, i32 %381, i32 %385, !dbg !56 + %400 = bitcast float %398 to i32, !dbg !41 + %401 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %400, i32 1, i32 31), !dbg !41 + %402 = bitcast i32 %401 to float, !dbg !41 + %403 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %399, i32 1, i32 31), !dbg !41 + %404 = fcmp ogt float %398, %402, !dbg !43 + %405 = fcmp oeq float %398, %402, !dbg !44 + %406 = fcmp uno float %398, 0.000000e+00, !dbg !45 + %407 = fcmp uno float %402, 0.000000e+00, !dbg !46 + %408 = xor i1 %407, true, !dbg !47 + %409 = and i1 %406, %408, !dbg !49 + %410 = or i1 %404, %409, !dbg !50 + %411 = and i1 %407, %406, !dbg !48 + %412 = or i1 %405, %411, !dbg !51 + %413 = icmp slt i32 %399, %403, !dbg !52 + %414 = and i1 %413, %412, !dbg !53 + %415 = or i1 %410, %414, !dbg !54 + %416 = select i1 %415, i32 %399, i32 %403, !dbg !56 + %417 = extractelement <8 x float> %134, i64 4, !dbg !41 + %418 = bitcast float %417 to i32, !dbg !41 + %419 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %418, i32 16, i32 31), !dbg !41 + %420 = bitcast i32 %419 to float, !dbg !41 + %421 = extractelement <8 x i32> %135, i64 4, !dbg !41 + %422 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %421, i32 16, i32 31), !dbg !41 + %423 = fcmp ogt float %417, %420, !dbg !43 + %424 = fcmp oeq float %417, %420, !dbg !44 + %425 = fcmp uno float %420, 0.000000e+00, !dbg !46 + %426 = xor i1 %425, true, !dbg !47 + %427 = extractelement <8 x i1> %151, i64 4, !dbg !48 + %428 = and i1 %427, %426, !dbg !49 + %429 = or i1 %423, %428, !dbg !50 + %430 = and i1 %427, %425, !dbg !48 + %431 = or i1 %424, %430, !dbg !51 + %432 = icmp slt i32 %421, %422, !dbg !52 
+ %433 = and i1 %432, %431, !dbg !53 + %434 = or i1 %429, %433, !dbg !54 + %435 = select i1 %434, float %417, float %420, !dbg !55 + %436 = select i1 %434, i32 %421, i32 %422, !dbg !56 + %437 = bitcast float %435 to i32, !dbg !41 + %438 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %437, i32 8, i32 31), !dbg !41 + %439 = bitcast i32 %438 to float, !dbg !41 + %440 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %436, i32 8, i32 31), !dbg !41 + %441 = fcmp ogt float %435, %439, !dbg !43 + %442 = fcmp oeq float %435, %439, !dbg !44 + %443 = fcmp uno float %435, 0.000000e+00, !dbg !45 + %444 = fcmp uno float %439, 0.000000e+00, !dbg !46 + %445 = xor i1 %444, true, !dbg !47 + %446 = and i1 %443, %445, !dbg !49 + %447 = or i1 %441, %446, !dbg !50 + %448 = and i1 %444, %443, !dbg !48 + %449 = or i1 %442, %448, !dbg !51 + %450 = icmp slt i32 %436, %440, !dbg !52 + %451 = and i1 %450, %449, !dbg !53 + %452 = or i1 %447, %451, !dbg !54 + %453 = select i1 %452, float %435, float %439, !dbg !55 + %454 = select i1 %452, i32 %436, i32 %440, !dbg !56 + %455 = bitcast float %453 to i32, !dbg !41 + %456 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %455, i32 4, i32 31), !dbg !41 + %457 = bitcast i32 %456 to float, !dbg !41 + %458 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %454, i32 4, i32 31), !dbg !41 + %459 = fcmp ogt float %453, %457, !dbg !43 + %460 = fcmp oeq float %453, %457, !dbg !44 + %461 = fcmp uno float %453, 0.000000e+00, !dbg !45 + %462 = fcmp uno float %457, 0.000000e+00, !dbg !46 + %463 = xor i1 %462, true, !dbg !47 + %464 = and i1 %461, %463, !dbg !49 + %465 = or i1 %459, %464, !dbg !50 + %466 = and i1 %462, %461, !dbg !48 + %467 = or i1 %460, %466, !dbg !51 + %468 = icmp slt i32 %454, %458, !dbg !52 + %469 = and i1 %468, %467, !dbg !53 + %470 = or i1 %465, %469, !dbg !54 + %471 = select i1 %470, float %453, float %457, !dbg !55 + %472 = select i1 %470, i32 %454, i32 %458, !dbg !56 + %473 = bitcast float %471 to i32, 
!dbg !41 + %474 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %473, i32 2, i32 31), !dbg !41 + %475 = bitcast i32 %474 to float, !dbg !41 + %476 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %472, i32 2, i32 31), !dbg !41 + %477 = fcmp ogt float %471, %475, !dbg !43 + %478 = fcmp oeq float %471, %475, !dbg !44 + %479 = fcmp uno float %471, 0.000000e+00, !dbg !45 + %480 = fcmp uno float %475, 0.000000e+00, !dbg !46 + %481 = xor i1 %480, true, !dbg !47 + %482 = and i1 %479, %481, !dbg !49 + %483 = or i1 %477, %482, !dbg !50 + %484 = and i1 %480, %479, !dbg !48 + %485 = or i1 %478, %484, !dbg !51 + %486 = icmp slt i32 %472, %476, !dbg !52 + %487 = and i1 %486, %485, !dbg !53 + %488 = or i1 %483, %487, !dbg !54 + %489 = select i1 %488, float %471, float %475, !dbg !55 + %490 = select i1 %488, i32 %472, i32 %476, !dbg !56 + %491 = bitcast float %489 to i32, !dbg !41 + %492 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %491, i32 1, i32 31), !dbg !41 + %493 = bitcast i32 %492 to float, !dbg !41 + %494 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %490, i32 1, i32 31), !dbg !41 + %495 = fcmp ogt float %489, %493, !dbg !43 + %496 = fcmp oeq float %489, %493, !dbg !44 + %497 = fcmp uno float %489, 0.000000e+00, !dbg !45 + %498 = fcmp uno float %493, 0.000000e+00, !dbg !46 + %499 = xor i1 %498, true, !dbg !47 + %500 = and i1 %497, %499, !dbg !49 + %501 = or i1 %495, %500, !dbg !50 + %502 = and i1 %498, %497, !dbg !48 + %503 = or i1 %496, %502, !dbg !51 + %504 = icmp slt i32 %490, %494, !dbg !52 + %505 = and i1 %504, %503, !dbg !53 + %506 = or i1 %501, %505, !dbg !54 + %507 = select i1 %506, i32 %490, i32 %494, !dbg !56 + %508 = extractelement <8 x float> %134, i64 3, !dbg !41 + %509 = bitcast float %508 to i32, !dbg !41 + %510 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %509, i32 16, i32 31), !dbg !41 + %511 = bitcast i32 %510 to float, !dbg !41 + %512 = extractelement <8 x i32> %135, i64 3, !dbg !41 + %513 = tail 
call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %512, i32 16, i32 31), !dbg !41 + %514 = fcmp ogt float %508, %511, !dbg !43 + %515 = fcmp oeq float %508, %511, !dbg !44 + %516 = fcmp uno float %511, 0.000000e+00, !dbg !46 + %517 = xor i1 %516, true, !dbg !47 + %518 = extractelement <8 x i1> %151, i64 3, !dbg !48 + %519 = and i1 %518, %517, !dbg !49 + %520 = or i1 %514, %519, !dbg !50 + %521 = and i1 %518, %516, !dbg !48 + %522 = or i1 %515, %521, !dbg !51 + %523 = icmp slt i32 %512, %513, !dbg !52 + %524 = and i1 %523, %522, !dbg !53 + %525 = or i1 %520, %524, !dbg !54 + %526 = select i1 %525, float %508, float %511, !dbg !55 + %527 = select i1 %525, i32 %512, i32 %513, !dbg !56 + %528 = bitcast float %526 to i32, !dbg !41 + %529 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %528, i32 8, i32 31), !dbg !41 + %530 = bitcast i32 %529 to float, !dbg !41 + %531 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %527, i32 8, i32 31), !dbg !41 + %532 = fcmp ogt float %526, %530, !dbg !43 + %533 = fcmp oeq float %526, %530, !dbg !44 + %534 = fcmp uno float %526, 0.000000e+00, !dbg !45 + %535 = fcmp uno float %530, 0.000000e+00, !dbg !46 + %536 = xor i1 %535, true, !dbg !47 + %537 = and i1 %534, %536, !dbg !49 + %538 = or i1 %532, %537, !dbg !50 + %539 = and i1 %535, %534, !dbg !48 + %540 = or i1 %533, %539, !dbg !51 + %541 = icmp slt i32 %527, %531, !dbg !52 + %542 = and i1 %541, %540, !dbg !53 + %543 = or i1 %538, %542, !dbg !54 + %544 = select i1 %543, float %526, float %530, !dbg !55 + %545 = select i1 %543, i32 %527, i32 %531, !dbg !56 + %546 = bitcast float %544 to i32, !dbg !41 + %547 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %546, i32 4, i32 31), !dbg !41 + %548 = bitcast i32 %547 to float, !dbg !41 + %549 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %545, i32 4, i32 31), !dbg !41 + %550 = fcmp ogt float %544, %548, !dbg !43 + %551 = fcmp oeq float %544, %548, !dbg !44 + %552 = fcmp uno float %544, 0.000000e+00, 
!dbg !45 + %553 = fcmp uno float %548, 0.000000e+00, !dbg !46 + %554 = xor i1 %553, true, !dbg !47 + %555 = and i1 %552, %554, !dbg !49 + %556 = or i1 %550, %555, !dbg !50 + %557 = and i1 %553, %552, !dbg !48 + %558 = or i1 %551, %557, !dbg !51 + %559 = icmp slt i32 %545, %549, !dbg !52 + %560 = and i1 %559, %558, !dbg !53 + %561 = or i1 %556, %560, !dbg !54 + %562 = select i1 %561, float %544, float %548, !dbg !55 + %563 = select i1 %561, i32 %545, i32 %549, !dbg !56 + %564 = bitcast float %562 to i32, !dbg !41 + %565 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %564, i32 2, i32 31), !dbg !41 + %566 = bitcast i32 %565 to float, !dbg !41 + %567 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %563, i32 2, i32 31), !dbg !41 + %568 = fcmp ogt float %562, %566, !dbg !43 + %569 = fcmp oeq float %562, %566, !dbg !44 + %570 = fcmp uno float %562, 0.000000e+00, !dbg !45 + %571 = fcmp uno float %566, 0.000000e+00, !dbg !46 + %572 = xor i1 %571, true, !dbg !47 + %573 = and i1 %570, %572, !dbg !49 + %574 = or i1 %568, %573, !dbg !50 + %575 = and i1 %571, %570, !dbg !48 + %576 = or i1 %569, %575, !dbg !51 + %577 = icmp slt i32 %563, %567, !dbg !52 + %578 = and i1 %577, %576, !dbg !53 + %579 = or i1 %574, %578, !dbg !54 + %580 = select i1 %579, float %562, float %566, !dbg !55 + %581 = select i1 %579, i32 %563, i32 %567, !dbg !56 + %582 = bitcast float %580 to i32, !dbg !41 + %583 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %582, i32 1, i32 31), !dbg !41 + %584 = bitcast i32 %583 to float, !dbg !41 + %585 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %581, i32 1, i32 31), !dbg !41 + %586 = fcmp ogt float %580, %584, !dbg !43 + %587 = fcmp oeq float %580, %584, !dbg !44 + %588 = fcmp uno float %580, 0.000000e+00, !dbg !45 + %589 = fcmp uno float %584, 0.000000e+00, !dbg !46 + %590 = xor i1 %589, true, !dbg !47 + %591 = and i1 %588, %590, !dbg !49 + %592 = or i1 %586, %591, !dbg !50 + %593 = and i1 %589, %588, !dbg !48 + %594 = or 
i1 %587, %593, !dbg !51 + %595 = icmp slt i32 %581, %585, !dbg !52 + %596 = and i1 %595, %594, !dbg !53 + %597 = or i1 %592, %596, !dbg !54 + %598 = select i1 %597, i32 %581, i32 %585, !dbg !56 + %599 = extractelement <8 x float> %134, i64 2, !dbg !41 + %600 = bitcast float %599 to i32, !dbg !41 + %601 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %600, i32 16, i32 31), !dbg !41 + %602 = bitcast i32 %601 to float, !dbg !41 + %603 = extractelement <8 x i32> %135, i64 2, !dbg !41 + %604 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %603, i32 16, i32 31), !dbg !41 + %605 = fcmp ogt float %599, %602, !dbg !43 + %606 = fcmp oeq float %599, %602, !dbg !44 + %607 = fcmp uno float %602, 0.000000e+00, !dbg !46 + %608 = xor i1 %607, true, !dbg !47 + %609 = extractelement <8 x i1> %151, i64 2, !dbg !48 + %610 = and i1 %609, %608, !dbg !49 + %611 = or i1 %605, %610, !dbg !50 + %612 = and i1 %609, %607, !dbg !48 + %613 = or i1 %606, %612, !dbg !51 + %614 = icmp slt i32 %603, %604, !dbg !52 + %615 = and i1 %614, %613, !dbg !53 + %616 = or i1 %611, %615, !dbg !54 + %617 = select i1 %616, float %599, float %602, !dbg !55 + %618 = select i1 %616, i32 %603, i32 %604, !dbg !56 + %619 = bitcast float %617 to i32, !dbg !41 + %620 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %619, i32 8, i32 31), !dbg !41 + %621 = bitcast i32 %620 to float, !dbg !41 + %622 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %618, i32 8, i32 31), !dbg !41 + %623 = fcmp ogt float %617, %621, !dbg !43 + %624 = fcmp oeq float %617, %621, !dbg !44 + %625 = fcmp uno float %617, 0.000000e+00, !dbg !45 + %626 = fcmp uno float %621, 0.000000e+00, !dbg !46 + %627 = xor i1 %626, true, !dbg !47 + %628 = and i1 %625, %627, !dbg !49 + %629 = or i1 %623, %628, !dbg !50 + %630 = and i1 %626, %625, !dbg !48 + %631 = or i1 %624, %630, !dbg !51 + %632 = icmp slt i32 %618, %622, !dbg !52 + %633 = and i1 %632, %631, !dbg !53 + %634 = or i1 %629, %633, !dbg !54 + %635 = select i1 
%634, float %617, float %621, !dbg !55 + %636 = select i1 %634, i32 %618, i32 %622, !dbg !56 + %637 = bitcast float %635 to i32, !dbg !41 + %638 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %637, i32 4, i32 31), !dbg !41 + %639 = bitcast i32 %638 to float, !dbg !41 + %640 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %636, i32 4, i32 31), !dbg !41 + %641 = fcmp ogt float %635, %639, !dbg !43 + %642 = fcmp oeq float %635, %639, !dbg !44 + %643 = fcmp uno float %635, 0.000000e+00, !dbg !45 + %644 = fcmp uno float %639, 0.000000e+00, !dbg !46 + %645 = xor i1 %644, true, !dbg !47 + %646 = and i1 %643, %645, !dbg !49 + %647 = or i1 %641, %646, !dbg !50 + %648 = and i1 %644, %643, !dbg !48 + %649 = or i1 %642, %648, !dbg !51 + %650 = icmp slt i32 %636, %640, !dbg !52 + %651 = and i1 %650, %649, !dbg !53 + %652 = or i1 %647, %651, !dbg !54 + %653 = select i1 %652, float %635, float %639, !dbg !55 + %654 = select i1 %652, i32 %636, i32 %640, !dbg !56 + %655 = bitcast float %653 to i32, !dbg !41 + %656 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %655, i32 2, i32 31), !dbg !41 + %657 = bitcast i32 %656 to float, !dbg !41 + %658 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %654, i32 2, i32 31), !dbg !41 + %659 = fcmp ogt float %653, %657, !dbg !43 + %660 = fcmp oeq float %653, %657, !dbg !44 + %661 = fcmp uno float %653, 0.000000e+00, !dbg !45 + %662 = fcmp uno float %657, 0.000000e+00, !dbg !46 + %663 = xor i1 %662, true, !dbg !47 + %664 = and i1 %661, %663, !dbg !49 + %665 = or i1 %659, %664, !dbg !50 + %666 = and i1 %662, %661, !dbg !48 + %667 = or i1 %660, %666, !dbg !51 + %668 = icmp slt i32 %654, %658, !dbg !52 + %669 = and i1 %668, %667, !dbg !53 + %670 = or i1 %665, %669, !dbg !54 + %671 = select i1 %670, float %653, float %657, !dbg !55 + %672 = select i1 %670, i32 %654, i32 %658, !dbg !56 + %673 = bitcast float %671 to i32, !dbg !41 + %674 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %673, i32 1, i32 
31), !dbg !41 + %675 = bitcast i32 %674 to float, !dbg !41 + %676 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %672, i32 1, i32 31), !dbg !41 + %677 = fcmp ogt float %671, %675, !dbg !43 + %678 = fcmp oeq float %671, %675, !dbg !44 + %679 = fcmp uno float %671, 0.000000e+00, !dbg !45 + %680 = fcmp uno float %675, 0.000000e+00, !dbg !46 + %681 = xor i1 %680, true, !dbg !47 + %682 = and i1 %679, %681, !dbg !49 + %683 = or i1 %677, %682, !dbg !50 + %684 = and i1 %680, %679, !dbg !48 + %685 = or i1 %678, %684, !dbg !51 + %686 = icmp slt i32 %672, %676, !dbg !52 + %687 = and i1 %686, %685, !dbg !53 + %688 = or i1 %683, %687, !dbg !54 + %689 = select i1 %688, i32 %672, i32 %676, !dbg !56 + %690 = extractelement <8 x float> %134, i64 1, !dbg !41 + %691 = bitcast float %690 to i32, !dbg !41 + %692 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %691, i32 16, i32 31), !dbg !41 + %693 = bitcast i32 %692 to float, !dbg !41 + %694 = extractelement <8 x i32> %135, i64 1, !dbg !41 + %695 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %694, i32 16, i32 31), !dbg !41 + %696 = fcmp ogt float %690, %693, !dbg !43 + %697 = fcmp oeq float %690, %693, !dbg !44 + %698 = fcmp uno float %693, 0.000000e+00, !dbg !46 + %699 = xor i1 %698, true, !dbg !47 + %700 = extractelement <8 x i1> %151, i64 1, !dbg !48 + %701 = and i1 %700, %699, !dbg !49 + %702 = or i1 %696, %701, !dbg !50 + %703 = and i1 %700, %698, !dbg !48 + %704 = or i1 %697, %703, !dbg !51 + %705 = icmp slt i32 %694, %695, !dbg !52 + %706 = and i1 %705, %704, !dbg !53 + %707 = or i1 %702, %706, !dbg !54 + %708 = select i1 %707, float %690, float %693, !dbg !55 + %709 = select i1 %707, i32 %694, i32 %695, !dbg !56 + %710 = bitcast float %708 to i32, !dbg !41 + %711 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %710, i32 8, i32 31), !dbg !41 + %712 = bitcast i32 %711 to float, !dbg !41 + %713 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %709, i32 8, i32 31), !dbg !41 + %714 
= fcmp ogt float %708, %712, !dbg !43 + %715 = fcmp oeq float %708, %712, !dbg !44 + %716 = fcmp uno float %708, 0.000000e+00, !dbg !45 + %717 = fcmp uno float %712, 0.000000e+00, !dbg !46 + %718 = xor i1 %717, true, !dbg !47 + %719 = and i1 %716, %718, !dbg !49 + %720 = or i1 %714, %719, !dbg !50 + %721 = and i1 %717, %716, !dbg !48 + %722 = or i1 %715, %721, !dbg !51 + %723 = icmp slt i32 %709, %713, !dbg !52 + %724 = and i1 %723, %722, !dbg !53 + %725 = or i1 %720, %724, !dbg !54 + %726 = select i1 %725, float %708, float %712, !dbg !55 + %727 = select i1 %725, i32 %709, i32 %713, !dbg !56 + %728 = bitcast float %726 to i32, !dbg !41 + %729 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %728, i32 4, i32 31), !dbg !41 + %730 = bitcast i32 %729 to float, !dbg !41 + %731 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %727, i32 4, i32 31), !dbg !41 + %732 = fcmp ogt float %726, %730, !dbg !43 + %733 = fcmp oeq float %726, %730, !dbg !44 + %734 = fcmp uno float %726, 0.000000e+00, !dbg !45 + %735 = fcmp uno float %730, 0.000000e+00, !dbg !46 + %736 = xor i1 %735, true, !dbg !47 + %737 = and i1 %734, %736, !dbg !49 + %738 = or i1 %732, %737, !dbg !50 + %739 = and i1 %735, %734, !dbg !48 + %740 = or i1 %733, %739, !dbg !51 + %741 = icmp slt i32 %727, %731, !dbg !52 + %742 = and i1 %741, %740, !dbg !53 + %743 = or i1 %738, %742, !dbg !54 + %744 = select i1 %743, float %726, float %730, !dbg !55 + %745 = select i1 %743, i32 %727, i32 %731, !dbg !56 + %746 = bitcast float %744 to i32, !dbg !41 + %747 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %746, i32 2, i32 31), !dbg !41 + %748 = bitcast i32 %747 to float, !dbg !41 + %749 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %745, i32 2, i32 31), !dbg !41 + %750 = fcmp ogt float %744, %748, !dbg !43 + %751 = fcmp oeq float %744, %748, !dbg !44 + %752 = fcmp uno float %744, 0.000000e+00, !dbg !45 + %753 = fcmp uno float %748, 0.000000e+00, !dbg !46 + %754 = xor i1 %753, true, !dbg 
!47 + %755 = and i1 %752, %754, !dbg !49 + %756 = or i1 %750, %755, !dbg !50 + %757 = and i1 %753, %752, !dbg !48 + %758 = or i1 %751, %757, !dbg !51 + %759 = icmp slt i32 %745, %749, !dbg !52 + %760 = and i1 %759, %758, !dbg !53 + %761 = or i1 %756, %760, !dbg !54 + %762 = select i1 %761, float %744, float %748, !dbg !55 + %763 = select i1 %761, i32 %745, i32 %749, !dbg !56 + %764 = bitcast float %762 to i32, !dbg !41 + %765 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %764, i32 1, i32 31), !dbg !41 + %766 = bitcast i32 %765 to float, !dbg !41 + %767 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %763, i32 1, i32 31), !dbg !41 + %768 = fcmp ogt float %762, %766, !dbg !43 + %769 = fcmp oeq float %762, %766, !dbg !44 + %770 = fcmp uno float %762, 0.000000e+00, !dbg !45 + %771 = fcmp uno float %766, 0.000000e+00, !dbg !46 + %772 = xor i1 %771, true, !dbg !47 + %773 = and i1 %770, %772, !dbg !49 + %774 = or i1 %768, %773, !dbg !50 + %775 = and i1 %771, %770, !dbg !48 + %776 = or i1 %769, %775, !dbg !51 + %777 = icmp slt i32 %763, %767, !dbg !52 + %778 = and i1 %777, %776, !dbg !53 + %779 = or i1 %774, %778, !dbg !54 + %780 = select i1 %779, i32 %763, i32 %767, !dbg !56 + %781 = extractelement <8 x float> %134, i64 0, !dbg !41 + %782 = bitcast float %781 to i32, !dbg !41 + %783 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %782, i32 16, i32 31), !dbg !41 + %784 = bitcast i32 %783 to float, !dbg !41 + %785 = extractelement <8 x i32> %135, i64 0, !dbg !41 + %786 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %785, i32 16, i32 31), !dbg !41 + %787 = fcmp ogt float %781, %784, !dbg !43 + %788 = fcmp oeq float %781, %784, !dbg !44 + %789 = fcmp uno float %784, 0.000000e+00, !dbg !46 + %790 = xor i1 %789, true, !dbg !47 + %791 = extractelement <8 x i1> %151, i64 0, !dbg !48 + %792 = and i1 %791, %790, !dbg !49 + %793 = or i1 %787, %792, !dbg !50 + %794 = and i1 %791, %789, !dbg !48 + %795 = or i1 %788, %794, !dbg !51 + %796 = 
icmp slt i32 %785, %786, !dbg !52 + %797 = and i1 %796, %795, !dbg !53 + %798 = or i1 %793, %797, !dbg !54 + %799 = select i1 %798, float %781, float %784, !dbg !55 + %800 = select i1 %798, i32 %785, i32 %786, !dbg !56 + %801 = bitcast float %799 to i32, !dbg !41 + %802 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %801, i32 8, i32 31), !dbg !41 + %803 = bitcast i32 %802 to float, !dbg !41 + %804 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %800, i32 8, i32 31), !dbg !41 + %805 = fcmp ogt float %799, %803, !dbg !43 + %806 = fcmp oeq float %799, %803, !dbg !44 + %807 = fcmp uno float %799, 0.000000e+00, !dbg !45 + %808 = fcmp uno float %803, 0.000000e+00, !dbg !46 + %809 = xor i1 %808, true, !dbg !47 + %810 = and i1 %807, %809, !dbg !49 + %811 = or i1 %805, %810, !dbg !50 + %812 = and i1 %808, %807, !dbg !48 + %813 = or i1 %806, %812, !dbg !51 + %814 = icmp slt i32 %800, %804, !dbg !52 + %815 = and i1 %814, %813, !dbg !53 + %816 = or i1 %811, %815, !dbg !54 + %817 = select i1 %816, float %799, float %803, !dbg !55 + %818 = select i1 %816, i32 %800, i32 %804, !dbg !56 + %819 = bitcast float %817 to i32, !dbg !41 + %820 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %819, i32 4, i32 31), !dbg !41 + %821 = bitcast i32 %820 to float, !dbg !41 + %822 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %818, i32 4, i32 31), !dbg !41 + %823 = fcmp ogt float %817, %821, !dbg !43 + %824 = fcmp oeq float %817, %821, !dbg !44 + %825 = fcmp uno float %817, 0.000000e+00, !dbg !45 + %826 = fcmp uno float %821, 0.000000e+00, !dbg !46 + %827 = xor i1 %826, true, !dbg !47 + %828 = and i1 %825, %827, !dbg !49 + %829 = or i1 %823, %828, !dbg !50 + %830 = and i1 %826, %825, !dbg !48 + %831 = or i1 %824, %830, !dbg !51 + %832 = icmp slt i32 %818, %822, !dbg !52 + %833 = and i1 %832, %831, !dbg !53 + %834 = or i1 %829, %833, !dbg !54 + %835 = select i1 %834, float %817, float %821, !dbg !55 + %836 = select i1 %834, i32 %818, i32 %822, !dbg !56 + 
%837 = bitcast float %835 to i32, !dbg !41 + %838 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %837, i32 2, i32 31), !dbg !41 + %839 = bitcast i32 %838 to float, !dbg !41 + %840 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %836, i32 2, i32 31), !dbg !41 + %841 = fcmp ogt float %835, %839, !dbg !43 + %842 = fcmp oeq float %835, %839, !dbg !44 + %843 = fcmp uno float %835, 0.000000e+00, !dbg !45 + %844 = fcmp uno float %839, 0.000000e+00, !dbg !46 + %845 = xor i1 %844, true, !dbg !47 + %846 = and i1 %843, %845, !dbg !49 + %847 = or i1 %841, %846, !dbg !50 + %848 = and i1 %844, %843, !dbg !48 + %849 = or i1 %842, %848, !dbg !51 + %850 = icmp slt i32 %836, %840, !dbg !52 + %851 = and i1 %850, %849, !dbg !53 + %852 = or i1 %847, %851, !dbg !54 + %853 = select i1 %852, float %835, float %839, !dbg !55 + %854 = select i1 %852, i32 %836, i32 %840, !dbg !56 + %855 = bitcast float %853 to i32, !dbg !41 + %856 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %855, i32 1, i32 31), !dbg !41 + %857 = bitcast i32 %856 to float, !dbg !41 + %858 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %854, i32 1, i32 31), !dbg !41 + %859 = fcmp ogt float %853, %857, !dbg !43 + %860 = fcmp oeq float %853, %857, !dbg !44 + %861 = fcmp uno float %853, 0.000000e+00, !dbg !45 + %862 = fcmp uno float %857, 0.000000e+00, !dbg !46 + %863 = xor i1 %862, true, !dbg !47 + %864 = and i1 %861, %863, !dbg !49 + %865 = or i1 %859, %864, !dbg !50 + %866 = and i1 %862, %861, !dbg !48 + %867 = or i1 %860, %866, !dbg !51 + %868 = icmp slt i32 %854, %858, !dbg !52 + %869 = and i1 %868, %867, !dbg !53 + %870 = or i1 %865, %869, !dbg !54 + %871 = select i1 %870, i32 %854, i32 %858, !dbg !56 + %872 = and i32 %142, 15, !dbg !41 + %873 = icmp eq i32 %141, 0, !dbg !41 + %874 = getelementptr float, ptr addrspace(3) @global_smem, i32 %872, !dbg !41 + %875 = select i1 %233, i32 %218, i32 %219, !dbg !55 + %876 = insertelement <1 x i32> poison, i32 %875, i64 0, !dbg !41 + 
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %874, <1 x i32> %876, i1 %873) #4, !dbg !41 + %877 = getelementptr i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 512), i32 %872, !dbg !41 + %878 = insertelement <1 x i32> poison, i32 %234, i64 0, !dbg !41 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %877, <1 x i32> %878, i1 %873) #4, !dbg !41 + %879 = or i32 %142, 16, !dbg !41 + %880 = getelementptr float, ptr addrspace(3) @global_smem, i32 %879, !dbg !41 + %881 = select i1 %324, i32 %309, i32 %310, !dbg !55 + %882 = insertelement <1 x i32> poison, i32 %881, i64 0, !dbg !41 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %880, <1 x i32> %882, i1 %873) #4, !dbg !41 + %883 = getelementptr i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 512), i32 %879, !dbg !41 + %884 = insertelement <1 x i32> poison, i32 %325, i64 0, !dbg !41 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %883, <1 x i32> %884, i1 %873) #4, !dbg !41 + %885 = or disjoint i32 %872, 32, !dbg !41 + %886 = getelementptr float, ptr addrspace(3) @global_smem, i32 %885, !dbg !41 + %887 = select i1 %415, i32 %400, i32 %401, !dbg !55 + %888 = insertelement <1 x i32> poison, i32 %887, i64 0, !dbg !41 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %886, <1 x i32> %888, i1 %873) #4, !dbg !41 + %889 = getelementptr i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 512), i32 %885, !dbg !41 + %890 = insertelement <1 x i32> poison, i32 %416, i64 0, !dbg !41 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %889, <1 x i32> %890, i1 %873) #4, !dbg !41 + %891 = or i32 %142, 48, !dbg !41 + %892 = getelementptr float, ptr addrspace(3) @global_smem, i32 %891, !dbg !41 + 
%893 = select i1 %506, i32 %491, i32 %492, !dbg !55 + %894 = insertelement <1 x i32> poison, i32 %893, i64 0, !dbg !41 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %892, <1 x i32> %894, i1 %873) #4, !dbg !41 + %895 = getelementptr i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 512), i32 %891, !dbg !41 + %896 = insertelement <1 x i32> poison, i32 %507, i64 0, !dbg !41 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %895, <1 x i32> %896, i1 %873) #4, !dbg !41 + %897 = or disjoint i32 %872, 64, !dbg !41 + %898 = getelementptr float, ptr addrspace(3) @global_smem, i32 %897, !dbg !41 + %899 = select i1 %597, i32 %582, i32 %583, !dbg !55 + %900 = insertelement <1 x i32> poison, i32 %899, i64 0, !dbg !41 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %898, <1 x i32> %900, i1 %873) #4, !dbg !41 + %901 = getelementptr i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 512), i32 %897, !dbg !41 + %902 = insertelement <1 x i32> poison, i32 %598, i64 0, !dbg !41 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %901, <1 x i32> %902, i1 %873) #4, !dbg !41 + %903 = or i32 %142, 80, !dbg !41 + %904 = getelementptr float, ptr addrspace(3) @global_smem, i32 %903, !dbg !41 + %905 = select i1 %688, i32 %673, i32 %674, !dbg !55 + %906 = insertelement <1 x i32> poison, i32 %905, i64 0, !dbg !41 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %904, <1 x i32> %906, i1 %873) #4, !dbg !41 + %907 = getelementptr i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 512), i32 %903, !dbg !41 + %908 = insertelement <1 x i32> poison, i32 %689, i64 0, !dbg !41 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %907, <1 x i32> %908, i1 %873) #4, !dbg 
!41 + %909 = or disjoint i32 %872, 96, !dbg !41 + %910 = getelementptr float, ptr addrspace(3) @global_smem, i32 %909, !dbg !41 + %911 = select i1 %779, i32 %764, i32 %765, !dbg !55 + %912 = insertelement <1 x i32> poison, i32 %911, i64 0, !dbg !41 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %910, <1 x i32> %912, i1 %873) #4, !dbg !41 + %913 = getelementptr i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 512), i32 %909, !dbg !41 + %914 = insertelement <1 x i32> poison, i32 %780, i64 0, !dbg !41 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %913, <1 x i32> %914, i1 %873) #4, !dbg !41 + %915 = or i32 %142, 112, !dbg !41 + %916 = getelementptr float, ptr addrspace(3) @global_smem, i32 %915, !dbg !41 + %917 = select i1 %870, i32 %855, i32 %856, !dbg !55 + %918 = insertelement <1 x i32> poison, i32 %917, i64 0, !dbg !41 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %916, <1 x i32> %918, i1 %873) #4, !dbg !41 + %919 = getelementptr i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 512), i32 %915, !dbg !41 + %920 = insertelement <1 x i32> poison, i32 %871, i64 0, !dbg !41 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %919, <1 x i32> %920, i1 %873) #4, !dbg !41 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !41 + %921 = icmp samesign ult i32 %10, 128, !dbg !41 + %922 = getelementptr float, ptr addrspace(3) @global_smem, i32 %10, !dbg !41 + %923 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %922, i1 %921) #4, !dbg !41 + %924 = bitcast i32 %923 to float, !dbg !41 + %925 = getelementptr i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 512), i32 %10, !dbg !41 + %926 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", 
"=r,r,b"(ptr addrspace(3) %925, i1 %921) #4, !dbg !41 + %927 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %923, i32 8, i32 31), !dbg !41 + %928 = bitcast i32 %927 to float, !dbg !41 + %929 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %926, i32 8, i32 31), !dbg !41 + %930 = fcmp ogt float %924, %928, !dbg !43 + %931 = fcmp oeq float %924, %928, !dbg !44 + %932 = fcmp uno float %924, 0.000000e+00, !dbg !45 + %933 = fcmp uno float %928, 0.000000e+00, !dbg !46 + %934 = xor i1 %933, true, !dbg !47 + %935 = and i1 %932, %934, !dbg !49 + %936 = or i1 %930, %935, !dbg !50 + %937 = and i1 %932, %933, !dbg !48 + %938 = or i1 %931, %937, !dbg !51 + %939 = icmp slt i32 %926, %929, !dbg !52 + %940 = and i1 %939, %938, !dbg !53 + %941 = or i1 %936, %940, !dbg !54 + %942 = select i1 %941, float %924, float %928, !dbg !55 + %943 = select i1 %941, i32 %926, i32 %929, !dbg !56 + %944 = bitcast float %942 to i32, !dbg !41 + %945 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %944, i32 4, i32 31), !dbg !41 + %946 = bitcast i32 %945 to float, !dbg !41 + %947 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %943, i32 4, i32 31), !dbg !41 + %948 = fcmp ogt float %942, %946, !dbg !43 + %949 = fcmp oeq float %942, %946, !dbg !44 + %950 = fcmp uno float %942, 0.000000e+00, !dbg !45 + %951 = fcmp uno float %946, 0.000000e+00, !dbg !46 + %952 = xor i1 %951, true, !dbg !47 + %953 = and i1 %950, %952, !dbg !49 + %954 = or i1 %948, %953, !dbg !50 + %955 = and i1 %951, %950, !dbg !48 + %956 = or i1 %949, %955, !dbg !51 + %957 = icmp slt i32 %943, %947, !dbg !52 + %958 = and i1 %957, %956, !dbg !53 + %959 = or i1 %954, %958, !dbg !54 + %960 = select i1 %959, float %942, float %946, !dbg !55 + %961 = select i1 %959, i32 %943, i32 %947, !dbg !56 + %962 = bitcast float %960 to i32, !dbg !41 + %963 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %962, i32 2, i32 31), !dbg !41 + %964 = bitcast i32 %963 to float, !dbg !41 + %965 = tail call i32 
@llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %961, i32 2, i32 31), !dbg !41 + %966 = fcmp ogt float %960, %964, !dbg !43 + %967 = fcmp oeq float %960, %964, !dbg !44 + %968 = fcmp uno float %960, 0.000000e+00, !dbg !45 + %969 = fcmp uno float %964, 0.000000e+00, !dbg !46 + %970 = xor i1 %969, true, !dbg !47 + %971 = and i1 %968, %970, !dbg !49 + %972 = or i1 %966, %971, !dbg !50 + %973 = and i1 %969, %968, !dbg !48 + %974 = or i1 %967, %973, !dbg !51 + %975 = icmp slt i32 %961, %965, !dbg !52 + %976 = and i1 %975, %974, !dbg !53 + %977 = or i1 %972, %976, !dbg !54 + %978 = select i1 %977, float %960, float %964, !dbg !55 + %979 = select i1 %977, i32 %961, i32 %965, !dbg !56 + %980 = bitcast float %978 to i32, !dbg !41 + %981 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %980, i32 1, i32 31), !dbg !41 + %982 = bitcast i32 %981 to float, !dbg !41 + %983 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %979, i32 1, i32 31), !dbg !41 + %984 = fcmp ogt float %978, %982, !dbg !43 + %985 = fcmp oeq float %978, %982, !dbg !44 + %986 = fcmp uno float %978, 0.000000e+00, !dbg !45 + %987 = fcmp uno float %982, 0.000000e+00, !dbg !46 + %988 = xor i1 %987, true, !dbg !47 + %989 = and i1 %986, %988, !dbg !49 + %990 = or i1 %984, %989, !dbg !50 + %991 = and i1 %987, %986, !dbg !48 + %992 = or i1 %985, %991, !dbg !51 + %993 = icmp slt i32 %979, %983, !dbg !52 + %994 = and i1 %993, %992, !dbg !53 + %995 = or i1 %990, %994, !dbg !54 + %996 = select i1 %995, i32 %979, i32 %983, !dbg !56 + %997 = and i32 %10, 911, !dbg !41 + %998 = icmp eq i32 %997, 0, !dbg !41 + %999 = select i1 %995, i32 %980, i32 %981, !dbg !55 + %1000 = insertelement <1 x i32> poison, i32 %999, i64 0, !dbg !41 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %922, <1 x i32> %1000, i1 %998) #4, !dbg !41 + %1001 = insertelement <1 x i32> poison, i32 %996, i64 0, !dbg !41 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr 
addrspace(3) %925, <1 x i32> %1001, i1 %998) #4, !dbg !41 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !41 + %1002 = load i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 512), align 16, !dbg !41 + %1003 = load i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 576), align 16, !dbg !41 + %1004 = load i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 640), align 16, !dbg !41 + %1005 = load i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 704), align 16, !dbg !41 + %1006 = load i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 768), align 16, !dbg !41 + %1007 = load i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 832), align 16, !dbg !41 + %1008 = load i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 896), align 16, !dbg !41 + %1009 = load i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 960), align 16, !dbg !41 + %1010 = sext i32 %139 to i64, !dbg !57 + %1011 = getelementptr i64, ptr addrspace(1) %1, i64 %1010, !dbg !57 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !58 + %1012 = insertelement <4 x i32> poison, i32 %1002, i64 0, !dbg !58 + %1013 = insertelement <4 x i32> %1012, i32 %1003, i64 1, !dbg !58 + %1014 = insertelement <4 x i32> %1013, i32 %1004, i64 2, !dbg !58 + %1015 = insertelement <4 x i32> %1014, i32 %1005, i64 3, !dbg !58 + store <4 x i32> %1015, ptr addrspace(3) @global_smem, align 16, !dbg !58 + %1016 = insertelement <4 x i32> poison, i32 %1006, i64 0, !dbg !58 + %1017 = insertelement <4 x i32> %1016, i32 %1007, i64 1, !dbg !58 + %1018 = insertelement <4 x i32> %1017, i32 %1008, i64 2, !dbg !58 + %1019 = insertelement <4 x i32> %1018, i32 %1009, i64 3, !dbg !58 + store <4 x i32> %1019, ptr addrspace(3) getelementptr inbounds nuw (i8, ptr addrspace(3) @global_smem, i32 16), align 16, !dbg !58 
+ tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !58 + %1020 = shl nuw nsw i32 %10, 4, !dbg !58 + %1021 = and i32 %1020, 16, !dbg !58 + %1022 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %1021, !dbg !58 + %1023 = ptrtoint ptr addrspace(3) %1022 to i32, !dbg !58 + %1024 = tail call i32 asm sideeffect "ldmatrix.sync.aligned.m8n8.x1.shared.b16 {$0}, [$1];", "=r,r"(i32 %1023) #4, !dbg !58 + %1025 = sext i32 %1024 to i64, !dbg !58 + %1026 = and i32 %10, 504, !dbg !58 + %1027 = icmp eq i32 %1026, 0, !dbg !58 + %1028 = and i1 %1027, %140, !dbg !58 + tail call void asm sideeffect "@$2 st.global.b64 [ $1 + 0 ], { $0 };", "l,l,b"(i64 %1025, ptr addrspace(1) %1011, i1 %1028) #4, !dbg !58 + ret void, !dbg !59 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3 + +attributes #0 = { nounwind "nvvm.reqntid"="512" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { convergent nocallback nounwind } +attributes #4 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: 
"c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_red_fused_argmax_1", linkageName: "triton_red_fused_argmax_1", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 22, column: 28, scope: !4) +!8 = !DILocation(line: 23, column: 44, scope: !4) +!9 = !DILocation(line: 22, column: 33, scope: !4) +!10 = !DILocation(line: 23, column: 23, scope: !4) +!11 = !DILocation(line: 24, column: 21, scope: !4) +!12 = !DILocation(line: 25, column: 37, scope: !4) +!13 = !DILocation(line: 27, column: 19, scope: !4) +!14 = !DILocation(line: 28, column: 19, scope: !4) +!15 = !DILocation(line: 38, column: 56, scope: !4) +!16 = !DILocation(line: 32, column: 40, scope: !4) +!17 = !DILocation(line: 33, column: 31, scope: !4) +!18 = !DILocation(line: 34, column: 29, scope: !4) +!19 = !DILocation(line: 38, column: 34, scope: !4) +!20 = !DILocation(line: 38, column: 61, scope: !4) +!21 = !DILocation(line: 147, column: 29, scope: !22, inlinedAt: !24) +!22 = distinct !DILexicalBlockFile(scope: !4, file: !23, discriminator: 0) +!23 = !DIFile(filename: "triton_helpers.py", directory: "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime") +!24 = !DILocation(line: 41, column: 38, scope: !4) +!25 = !DILocation(line: 155, column: 69, scope: !22, inlinedAt: !24) +!26 = !DILocation(line: 38, column: 71, scope: !4) +!27 = !DILocation(line: 144, column: 21, scope: !22, inlinedAt: !24) +!28 = !DILocation(line: 145, column: 23, scope: !22, inlinedAt: !24) +!29 = !DILocation(line: 148, column: 29, scope: !22, inlinedAt: !24) +!30 = !DILocation(line: 149, column: 31, scope: !22, inlinedAt: !24) +!31 = !DILocation(line: 149, column: 
27, scope: !22, inlinedAt: !24) +!32 = !DILocation(line: 149, column: 16, scope: !22, inlinedAt: !24) +!33 = !DILocation(line: 151, column: 27, scope: !22, inlinedAt: !24) +!34 = !DILocation(line: 151, column: 17, scope: !22, inlinedAt: !24) +!35 = !DILocation(line: 154, column: 31, scope: !22, inlinedAt: !24) +!36 = !DILocation(line: 154, column: 21, scope: !22, inlinedAt: !24) +!37 = !DILocation(line: 154, column: 12, scope: !22, inlinedAt: !24) +!38 = !DILocation(line: 155, column: 35, scope: !22, inlinedAt: !24) +!39 = !DILocation(line: 43, column: 54, scope: !4) +!40 = !DILocation(line: 44, column: 66, scope: !4) +!41 = !DILocation(line: 165, column: 42, scope: !22, inlinedAt: !42) +!42 = !DILocation(line: 45, column: 75, scope: !4) +!43 = !DILocation(line: 144, column: 21, scope: !22, inlinedAt: !42) +!44 = !DILocation(line: 145, column: 23, scope: !22, inlinedAt: !42) +!45 = !DILocation(line: 147, column: 29, scope: !22, inlinedAt: !42) +!46 = !DILocation(line: 148, column: 29, scope: !22, inlinedAt: !42) +!47 = !DILocation(line: 149, column: 31, scope: !22, inlinedAt: !42) +!48 = !DILocation(line: 151, column: 27, scope: !22, inlinedAt: !42) +!49 = !DILocation(line: 149, column: 27, scope: !22, inlinedAt: !42) +!50 = !DILocation(line: 149, column: 16, scope: !22, inlinedAt: !42) +!51 = !DILocation(line: 151, column: 17, scope: !22, inlinedAt: !42) +!52 = !DILocation(line: 154, column: 31, scope: !22, inlinedAt: !42) +!53 = !DILocation(line: 154, column: 21, scope: !22, inlinedAt: !42) +!54 = !DILocation(line: 154, column: 12, scope: !22, inlinedAt: !42) +!55 = !DILocation(line: 155, column: 35, scope: !22, inlinedAt: !42) +!56 = !DILocation(line: 155, column: 69, scope: !22, inlinedAt: !42) +!57 = !DILocation(line: 47, column: 25, scope: !4) +!58 = !DILocation(line: 47, column: 36, scope: !4) +!59 = !DILocation(line: 47, column: 4, scope: !4) diff --git 
a/SpecForge-ext/cache/compiled_kernels/triton/7/LB6GSIBNROJIH45A7E7GR2D3MKLQMRXRW4JCJKBX5IKCUYNTUE6Q/triton_red_fused_argmax_1.ptx b/SpecForge-ext/cache/compiled_kernels/triton/7/LB6GSIBNROJIH45A7E7GR2D3MKLQMRXRW4JCJKBX5IKCUYNTUE6Q/triton_red_fused_argmax_1.ptx new file mode 100644 index 0000000000000000000000000000000000000000..e2a5657a39732d09f26b7fc9236ab4c7c20b0ad6 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/LB6GSIBNROJIH45A7E7GR2D3MKLQMRXRW4JCJKBX5IKCUYNTUE6Q/triton_red_fused_argmax_1.ptx @@ -0,0 +1,2258 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_red_fused_argmax_1 // -- Begin function triton_red_fused_argmax_1 +.extern .shared .align 16 .b8 global_smem[]; + // @triton_red_fused_argmax_1 +.visible .entry triton_red_fused_argmax_1( + .param .u64 .ptr .global .align 1 triton_red_fused_argmax_1_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused_argmax_1_param_1, + .param .u64 triton_red_fused_argmax_1_param_2, + .param .u64 triton_red_fused_argmax_1_param_3, + .param .u32 triton_red_fused_argmax_1_param_4, + .param .u32 triton_red_fused_argmax_1_param_5, + .param .u64 .ptr .global .align 1 triton_red_fused_argmax_1_param_6, + .param .u64 .ptr .global .align 1 triton_red_fused_argmax_1_param_7 +) +.reqntid 512 +{ + .reg .pred %p<674>; + .reg .b32 %r<370>; + .reg .b64 %rd<206>; +$L__func_begin0: + +// %bb.0: + .loc 1 22 28 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:22:28 + mov.u32 %r36, %ctaid.x; + ld.param.b64 %rd81, [triton_red_fused_argmax_1_param_2]; + .loc 1 22 33 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:22:33 + shl.b32 %r9, %r36, 3; + .loc 1 23 23 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:23:23 + or.b32 %r8, %r9, 1; + or.b32 %r7, %r9, 2; + .loc 1 27 19 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:27:19 + cvt.s64.s32 %rd1, %r9; + cvt.s64.s32 %rd2, %r8; + .loc 1 28 19 // 
c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:28:19 + or.b64 %rd82, %rd1, %rd81; + and.b64 %rd83, %rd82, -4294967296; + setp.ne.b64 %p1, %rd83, 0; + cvt.u32.u64 %r361, %rd1; + @%p1 bra $L__BB0_2; + bra.uni $L__BB0_1; +$L__BB0_2: + div.s64 %rd185, %rd1, %rd81; + bra.uni $L__BB0_3; +$L__BB0_1: + cvt.u32.u64 %r37, %rd81; + div.u32 %r39, %r361, %r37; + cvt.u64.u32 %rd185, %r39; +$L__BB0_3: + .loc 1 0 0 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:0 + or.b32 %r6, %r9, 3; + .loc 1 27 19 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:27:19 + cvt.s64.s32 %rd3, %r7; + .loc 1 28 19 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:28:19 + or.b64 %rd85, %rd2, %rd81; + and.b64 %rd86, %rd85, -4294967296; + setp.ne.b64 %p2, %rd86, 0; + @%p2 bra $L__BB0_5; + bra.uni $L__BB0_4; +$L__BB0_5: + div.s64 %rd186, %rd2, %rd81; + bra.uni $L__BB0_6; +$L__BB0_4: + cvt.u32.u64 %r40, %rd81; + cvt.u32.u64 %r41, %rd2; + div.u32 %r42, %r41, %r40; + cvt.u64.u32 %rd186, %r42; +$L__BB0_6: + .loc 1 0 0 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:0 + or.b32 %r5, %r9, 4; + cvt.s64.s32 %rd4, %r6; + .loc 1 28 19 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:28:19 + or.b64 %rd87, %rd3, %rd81; + and.b64 %rd88, %rd87, -4294967296; + setp.ne.b64 %p3, %rd88, 0; + @%p3 bra $L__BB0_8; + bra.uni $L__BB0_7; +$L__BB0_8: + div.s64 %rd187, %rd3, %rd81; + bra.uni $L__BB0_9; +$L__BB0_7: + cvt.u32.u64 %r43, %rd81; + cvt.u32.u64 %r44, %rd3; + div.u32 %r45, %r44, %r43; + cvt.u64.u32 %rd187, %r45; +$L__BB0_9: + .loc 1 0 0 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:0 + or.b32 %r4, %r9, 5; + cvt.s64.s32 %rd5, %r5; + .loc 1 28 19 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:28:19 + or.b64 %rd89, %rd4, %rd81; + and.b64 %rd90, %rd89, -4294967296; + setp.ne.b64 %p4, %rd90, 0; + @%p4 bra $L__BB0_11; + bra.uni $L__BB0_10; +$L__BB0_11: + div.s64 %rd188, %rd4, %rd81; + bra.uni $L__BB0_12; +$L__BB0_10: + cvt.u32.u64 %r46, %rd81; + 
cvt.u32.u64 %r47, %rd4; + div.u32 %r48, %r47, %r46; + cvt.u64.u32 %rd188, %r48; +$L__BB0_12: + .loc 1 0 0 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:0 + or.b32 %r3, %r9, 6; + cvt.s64.s32 %rd6, %r4; + .loc 1 28 19 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:28:19 + or.b64 %rd91, %rd5, %rd81; + and.b64 %rd92, %rd91, -4294967296; + setp.ne.b64 %p5, %rd92, 0; + @%p5 bra $L__BB0_14; + bra.uni $L__BB0_13; +$L__BB0_14: + div.s64 %rd189, %rd5, %rd81; + bra.uni $L__BB0_15; +$L__BB0_13: + cvt.u32.u64 %r49, %rd81; + cvt.u32.u64 %r50, %rd5; + div.u32 %r51, %r50, %r49; + cvt.u64.u32 %rd189, %r51; +$L__BB0_15: + .loc 1 0 0 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:0 + or.b32 %r2, %r9, 7; + cvt.s64.s32 %rd7, %r3; + .loc 1 28 19 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:28:19 + or.b64 %rd94, %rd6, %rd81; + and.b64 %rd95, %rd94, -4294967296; + setp.ne.b64 %p6, %rd95, 0; + @%p6 bra $L__BB0_17; + bra.uni $L__BB0_16; +$L__BB0_17: + div.s64 %rd190, %rd6, %rd81; + bra.uni $L__BB0_18; +$L__BB0_16: + cvt.u32.u64 %r52, %rd81; + cvt.u32.u64 %r53, %rd6; + div.u32 %r54, %r53, %r52; + cvt.u64.u32 %rd190, %r54; +$L__BB0_18: + .loc 1 0 0 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:0 + mov.u32 %r1, %tid.x; + mul.lo.s64 %rd84, %rd185, %rd81; + mul.lo.s64 %rd93, %rd189, %rd81; + cvt.s64.s32 %rd8, %r2; + .loc 1 27 19 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:27:19 + mul.lo.s64 %rd96, %rd190, %rd81; + .loc 1 28 19 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:28:19 + or.b64 %rd97, %rd7, %rd81; + and.b64 %rd98, %rd97, -4294967296; + setp.ne.b64 %p7, %rd98, 0; + @%p7 bra $L__BB0_20; + bra.uni $L__BB0_19; +$L__BB0_20: + div.s64 %rd191, %rd7, %rd81; + bra.uni $L__BB0_21; +$L__BB0_19: + cvt.u32.u64 %r55, %rd81; + cvt.u32.u64 %r56, %rd7; + div.u32 %r57, %r56, %r55; + cvt.u64.u32 %rd191, %r57; +$L__BB0_21: + .loc 1 0 19 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:0:19 + ld.param.b32 
%r35, [triton_red_fused_argmax_1_param_4]; + ld.param.b64 %rd80, [triton_red_fused_argmax_1_param_3]; + ld.param.b64 %rd78, [triton_red_fused_argmax_1_param_0]; + and.b32 %r18, %r1, 511; + sub.s64 %rd13, %rd1, %rd84; + .loc 1 27 19 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:27:19 + sub.s64 %rd27, %rd5, %rd93; + sub.s64 %rd32, %rd6, %rd96; + mul.lo.s64 %rd99, %rd191, %rd81; + sub.s64 %rd37, %rd7, %rd99; + .loc 1 28 19 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:28:19 + or.b64 %rd100, %rd8, %rd81; + and.b64 %rd101, %rd100, -4294967296; + setp.ne.b64 %p8, %rd101, 0; + @%p8 bra $L__BB0_23; + bra.uni $L__BB0_22; +$L__BB0_23: + div.s64 %rd192, %rd8, %rd81; + bra.uni $L__BB0_24; +$L__BB0_22: + cvt.u32.u64 %r58, %rd81; + cvt.u32.u64 %r59, %rd8; + div.u32 %r60, %r59, %r58; + cvt.u64.u32 %rd192, %r60; +$L__BB0_24: + .loc 1 0 19 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:0:19 + ld.param.b64 %rd79, [triton_red_fused_argmax_1_param_1]; + .loc 1 27 19 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:27:19 + mul.lo.s64 %rd107, %rd192, %rd81; + sub.s64 %rd108, %rd8, %rd107; + .loc 1 38 56 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:38:56 + mul.lo.s64 %rd109, %rd185, %rd80; + mul.lo.s64 %rd110, %rd189, %rd80; + mul.lo.s64 %rd111, %rd190, %rd80; + mul.lo.s64 %rd112, %rd191, %rd80; + mul.lo.s64 %rd113, %rd192, %rd80; + mad.lo.s64 %rd114, %rd13, 128000, %rd78; + .loc 1 32 40 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:32:40 + shl.b64 %rd115, %rd109, 2; + add.s64 %rd193, %rd114, %rd115; + mad.lo.s64 %rd116, %rd27, 128000, %rd78; + shl.b64 %rd117, %rd110, 2; + add.s64 %rd197, %rd116, %rd117; + mad.lo.s64 %rd118, %rd32, 128000, %rd78; + shl.b64 %rd119, %rd111, 2; + add.s64 %rd198, %rd118, %rd119; + mad.lo.s64 %rd120, %rd37, 128000, %rd78; + shl.b64 %rd121, %rd112, 2; + add.s64 %rd199, %rd120, %rd121; + mad.lo.s64 %rd122, %rd108, 128000, %rd78; + shl.b64 %rd123, %rd113, 2; + add.s64 %rd200, %rd122, 
%rd123; + cvt.u64.u32 %rd47, %r18; + mul.wide.u32 %rd48, %r18, 4; + shl.b64 %rd124, %rd80, 2; + mul.lo.s64 %rd125, %rd81, 128000; + sub.s64 %rd126, %rd124, %rd125; + mul.lo.s64 %rd127, %rd1, 128000; + mad.lo.s64 %rd128, %rd188, %rd126, %rd127; + add.s64 %rd129, %rd128, %rd78; + add.s64 %rd196, %rd129, 384000; + mad.lo.s64 %rd130, %rd187, %rd126, %rd127; + add.s64 %rd131, %rd130, %rd78; + add.s64 %rd195, %rd131, 256000; + mad.lo.s64 %rd132, %rd186, %rd126, %rd127; + add.s64 %rd133, %rd132, %rd78; + add.s64 %rd194, %rd133, 128000; + mov.b32 %r69, 0fFF800000; + mov.b64 %rd202, {%r69, %r69}; + mov.b32 %r362, 2147483647; + mov.b64 %rd201, 0; + setp.lt.s32 %p17, %r9, %r35; + setp.lt.s32 %p18, %r8, %r35; + setp.lt.s32 %p19, %r7, %r35; + setp.lt.s32 %p20, %r6, %r35; + mov.b64 %rd203, %rd202; + mov.b64 %rd204, %rd202; + mov.b64 %rd205, %rd202; + mov.b32 %r363, %r362; + mov.b32 %r364, %r362; + mov.b32 %r365, %r362; + mov.b32 %r366, %r362; + mov.b32 %r367, %r362; + mov.b32 %r368, %r362; + mov.b32 %r369, %r362; +$L__BB0_25: // =>This Inner Loop Header: Depth=1 + .loc 1 24 21 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:24:21 + setp.lt.s32 %p21, %r5, %r35; + setp.lt.s32 %p22, %r4, %r35; + setp.lt.s32 %p23, %r3, %r35; + setp.lt.s32 %p24, %r2, %r35; + .loc 1 34 29 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:34:29 + add.s64 %rd158, %rd47, %rd201; + setp.lt.u64 %p25, %rd158, 32000; + .loc 1 38 34 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:38:34 + add.s64 %rd135, %rd193, %rd48; + add.s64 %rd138, %rd194, %rd48; + add.s64 %rd141, %rd195, %rd48; + add.s64 %rd144, %rd196, %rd48; + add.s64 %rd147, %rd197, %rd48; + add.s64 %rd150, %rd198, %rd48; + add.s64 %rd153, %rd199, %rd48; + .loc 1 38 61 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:38:61 + add.s64 %rd156, %rd200, %rd48; + // begin inline asm + mov.u64 %rd134, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd134, 1.0; + // end inline asm +$L__tmp0: + .loc 2 147 29 // 
triton_helpers.py:147:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:41:38 ] + mov.b64 {%r86, %r87}, %rd202; + setp.nan.f32 %p26, %r86, %r86; + setp.nan.f32 %p27, %r87, %r87; + mov.b64 {%r88, %r89}, %rd203; + setp.nan.f32 %p28, %r88, %r88; + setp.nan.f32 %p29, %r89, %r89; + mov.b64 {%r90, %r91}, %rd204; + setp.nan.f32 %p30, %r90, %r90; + setp.nan.f32 %p31, %r91, %r91; + mov.b64 {%r92, %r93}, %rd205; + setp.nan.f32 %p32, %r92, %r92; + setp.nan.f32 %p33, %r93, %r93; +$L__tmp1: + .loc 1 38 71 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:38:71 + and.pred %p16, %p24, %p25; + and.pred %p15, %p23, %p25; + and.pred %p14, %p22, %p25; + and.pred %p13, %p21, %p25; + and.pred %p12, %p20, %p25; + and.pred %p11, %p19, %p25; + and.pred %p10, %p18, %p25; + and.pred %p9, %p17, %p25; + mov.b32 %r71, 0; + .loc 1 38 61 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:38:61 + // begin inline asm + mov.u32 %r70, %r71; + @%p9 ld.global.L1::evict_first.L2::cache_hint.b32 { %r70 }, [ %rd135 + 0 ], %rd134; + // end inline asm + // begin inline asm + mov.u64 %rd137, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd137, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r72, %r71; + @%p10 ld.global.L1::evict_first.L2::cache_hint.b32 { %r72 }, [ %rd138 + 0 ], %rd137; + // end inline asm + // begin inline asm + mov.u64 %rd140, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd140, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r74, %r71; + @%p11 ld.global.L1::evict_first.L2::cache_hint.b32 { %r74 }, [ %rd141 + 0 ], %rd140; + // end inline asm + // begin inline asm + mov.u64 %rd143, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd143, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r76, %r71; + @%p12 ld.global.L1::evict_first.L2::cache_hint.b32 { %r76 }, [ %rd144 + 0 ], %rd143; + // end inline asm + // begin inline asm + mov.u64 %rd146, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd146, 1.0; + // end inline 
asm + // begin inline asm + mov.u32 %r78, %r71; + @%p13 ld.global.L1::evict_first.L2::cache_hint.b32 { %r78 }, [ %rd147 + 0 ], %rd146; + // end inline asm + // begin inline asm + mov.u64 %rd149, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd149, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r80, %r71; + @%p14 ld.global.L1::evict_first.L2::cache_hint.b32 { %r80 }, [ %rd150 + 0 ], %rd149; + // end inline asm + // begin inline asm + mov.u64 %rd152, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd152, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r82, %r71; + @%p15 ld.global.L1::evict_first.L2::cache_hint.b32 { %r82 }, [ %rd153 + 0 ], %rd152; + // end inline asm + // begin inline asm + mov.u64 %rd155, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd155, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r84, %r71; + @%p16 ld.global.L1::evict_first.L2::cache_hint.b32 { %r84 }, [ %rd156 + 0 ], %rd155; + // end inline asm + cvt.u64.u32 %rd159, %r82; + shl.b64 %rd160, %rd159, 32; + cvt.u64.u32 %rd161, %r84; + or.b64 %rd162, %rd161, %rd160; + cvt.u64.u32 %rd163, %r78; + shl.b64 %rd164, %rd163, 32; + cvt.u64.u32 %rd165, %r80; + or.b64 %rd166, %rd165, %rd164; + cvt.u64.u32 %rd167, %r74; + shl.b64 %rd168, %rd167, 32; + cvt.u64.u32 %rd169, %r76; + or.b64 %rd170, %rd169, %rd168; + cvt.u64.u32 %rd171, %r70; + shl.b64 %rd172, %rd171, 32; + cvt.u64.u32 %rd173, %r72; + or.b64 %rd174, %rd173, %rd172; +$L__tmp2: + .loc 2 148 29 // triton_helpers.py:148:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:41:38 ] + mov.b64 {%r94, %r95}, %rd174; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:41:38 ] + setp.gt.f32 %p34, %r93, %r95; + setp.gt.f32 %p35, %r92, %r94; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:41:38 ] + mov.b64 {%r96, %r97}, %rd170; + .loc 2 144 21 // triton_helpers.py:144:21 @[ 
c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:41:38 ] + setp.gt.f32 %p36, %r91, %r97; + setp.gt.f32 %p37, %r90, %r96; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:41:38 ] + mov.b64 {%r98, %r99}, %rd166; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:41:38 ] + setp.gt.f32 %p38, %r89, %r99; + setp.gt.f32 %p39, %r88, %r98; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:41:38 ] + mov.b64 {%r100, %r101}, %rd162; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:41:38 ] + setp.gt.f32 %p40, %r87, %r101; + setp.gt.f32 %p41, %r86, %r100; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:41:38 ] + setp.eq.f32 %p42, %r86, %r100; + setp.eq.f32 %p43, %r87, %r101; + setp.eq.f32 %p44, %r88, %r98; + setp.eq.f32 %p45, %r89, %r99; + setp.eq.f32 %p46, %r90, %r96; + setp.eq.f32 %p47, %r91, %r97; + setp.eq.f32 %p48, %r92, %r94; + setp.eq.f32 %p49, %r93, %r95; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:41:38 ] + setp.nan.f32 %p50, %r95, %r95; + setp.nan.f32 %p51, %r94, %r94; + setp.nan.f32 %p52, %r97, %r97; + setp.nan.f32 %p53, %r96, %r96; + setp.nan.f32 %p54, %r99, %r99; + setp.nan.f32 %p55, %r98, %r98; + setp.nan.f32 %p56, %r101, %r101; + setp.nan.f32 %p57, %r100, %r100; + setp.num.f32 %p58, %r100, %r100; + setp.num.f32 %p59, %r101, %r101; + setp.num.f32 %p60, %r98, %r98; + setp.num.f32 %p61, %r99, %r99; + setp.num.f32 %p62, %r96, %r96; + setp.num.f32 %p63, %r97, %r97; + setp.num.f32 %p64, %r94, %r94; + setp.num.f32 %p65, %r95, %r95; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:41:38 ] + and.pred %p66, %p33, %p65; + and.pred %p67, %p32, %p64; + and.pred %p68, %p31, %p63; + and.pred %p69, 
%p30, %p62; + and.pred %p70, %p29, %p61; + and.pred %p71, %p28, %p60; + and.pred %p72, %p27, %p59; + and.pred %p73, %p26, %p58; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:41:38 ] + or.pred %p74, %p41, %p73; + or.pred %p75, %p40, %p72; + or.pred %p76, %p39, %p71; + or.pred %p77, %p38, %p70; + or.pred %p78, %p37, %p69; + or.pred %p79, %p36, %p68; + or.pred %p80, %p35, %p67; + or.pred %p81, %p34, %p66; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:41:38 ] + and.pred %p82, %p26, %p57; + and.pred %p83, %p27, %p56; + and.pred %p84, %p28, %p55; + and.pred %p85, %p29, %p54; + and.pred %p86, %p30, %p53; + and.pred %p87, %p31, %p52; + and.pred %p88, %p32, %p51; + and.pred %p89, %p33, %p50; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:41:38 ] + or.pred %p90, %p49, %p89; + or.pred %p91, %p48, %p88; + or.pred %p92, %p47, %p87; + or.pred %p93, %p46, %p86; + or.pred %p94, %p45, %p85; + or.pred %p95, %p44, %p84; + or.pred %p96, %p43, %p83; + or.pred %p97, %p42, %p82; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:41:38 ] + cvt.s64.s32 %rd175, %r362; + cvt.s64.s32 %rd176, %r363; + cvt.s64.s32 %rd177, %r364; + cvt.s64.s32 %rd178, %r365; + cvt.s64.s32 %rd179, %r366; + cvt.s64.s32 %rd180, %r367; + cvt.s64.s32 %rd181, %r368; + cvt.s64.s32 %rd182, %r369; + setp.gt.s64 %p98, %rd158, %rd182; + setp.gt.s64 %p99, %rd158, %rd181; + setp.gt.s64 %p100, %rd158, %rd180; + setp.gt.s64 %p101, %rd158, %rd179; + setp.gt.s64 %p102, %rd158, %rd178; + setp.gt.s64 %p103, %rd158, %rd177; + setp.gt.s64 %p104, %rd158, %rd176; + setp.gt.s64 %p105, %rd158, %rd175; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:41:38 ] + and.pred %p106, %p105, %p97; + and.pred %p107, %p104, %p96; + and.pred %p108, %p103, %p95; + and.pred %p109, 
%p102, %p94; + and.pred %p110, %p101, %p93; + and.pred %p111, %p100, %p92; + and.pred %p112, %p99, %p91; + and.pred %p113, %p98, %p90; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:41:38 ] + or.pred %p114, %p81, %p113; + or.pred %p115, %p80, %p112; + or.pred %p116, %p79, %p111; + or.pred %p117, %p78, %p110; + or.pred %p118, %p77, %p109; + or.pred %p119, %p76, %p108; + or.pred %p120, %p75, %p107; + or.pred %p121, %p74, %p106; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:41:38 ] + selp.f32 %r102, %r86, %r100, %p121; + selp.f32 %r103, %r87, %r101, %p120; + selp.f32 %r104, %r88, %r98, %p119; + selp.f32 %r105, %r89, %r99, %p118; + selp.f32 %r106, %r90, %r96, %p117; + selp.f32 %r107, %r91, %r97, %p116; + selp.f32 %r108, %r92, %r94, %p115; + selp.f32 %r109, %r93, %r95, %p114; + cvt.u32.u64 %r110, %rd158; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:41:38 ] + selp.b32 %r111, %r362, %r110, %p121; + selp.b32 %r112, %r363, %r110, %p120; + selp.b32 %r113, %r364, %r110, %p119; + selp.b32 %r114, %r365, %r110, %p118; + selp.b32 %r115, %r366, %r110, %p117; + selp.b32 %r116, %r367, %r110, %p116; + selp.b32 %r117, %r368, %r110, %p115; + selp.b32 %r118, %r369, %r110, %p114; +$L__tmp3: + .loc 1 43 54 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:43:54 + selp.f32 %r119, %r109, %r93, %p9; + selp.f32 %r120, %r108, %r92, %p10; + mov.b64 %rd205, {%r120, %r119}; + selp.f32 %r121, %r107, %r91, %p11; + selp.f32 %r122, %r106, %r90, %p12; + mov.b64 %rd204, {%r122, %r121}; + selp.f32 %r123, %r105, %r89, %p13; + selp.f32 %r124, %r104, %r88, %p14; + mov.b64 %rd203, {%r124, %r123}; + selp.f32 %r125, %r103, %r87, %p15; + selp.f32 %r126, %r102, %r86, %p16; + mov.b64 %rd202, {%r126, %r125}; + .loc 1 44 66 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:44:66 + selp.b32 %r369, %r118, %r369, %p9; + selp.b32 
%r368, %r117, %r368, %p10; + selp.b32 %r367, %r116, %r367, %p11; + selp.b32 %r366, %r115, %r366, %p12; + selp.b32 %r365, %r114, %r365, %p13; + selp.b32 %r364, %r113, %r364, %p14; + selp.b32 %r363, %r112, %r363, %p15; + selp.b32 %r362, %r111, %r362, %p16; + .loc 1 32 40 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:32:40 + add.s64 %rd69, %rd201, 512; + add.s64 %rd200, %rd200, 2048; + add.s64 %rd199, %rd199, 2048; + add.s64 %rd198, %rd198, 2048; + add.s64 %rd197, %rd197, 2048; + add.s64 %rd196, %rd196, 2048; + add.s64 %rd195, %rd195, 2048; + add.s64 %rd194, %rd194, 2048; + add.s64 %rd193, %rd193, 2048; + setp.lt.u64 %p122, %rd201, 31488; + mov.b64 %rd201, %rd69; + @%p122 bra $L__BB0_25; +// %bb.26: + .loc 1 23 44 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:23:44 + and.b32 %r170, %r1, 7; + .loc 1 23 23 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:23:23 + or.b32 %r171, %r361, %r170; + .loc 1 24 21 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:24:21 + setp.lt.s32 %p144, %r171, %r35; + .loc 1 23 44 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:23:44 + and.b32 %r172, %r1, 31; +$L__tmp4: + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + mov.b64 {%r173, %r174}, %rd205; + shfl.sync.bfly.b32 %r175, %r174, 16, 31, -1; + shfl.sync.bfly.b32 %r176, %r369, 16, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.gt.f32 %p145, %r174, %r175; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.eq.f32 %p146, %r174, %r175; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + mov.b64 {%r177, %r178}, %rd202; + setp.nan.f32 %p147, %r177, %r177; + setp.nan.f32 %p148, %r178, %r178; + mov.b64 {%r179, %r180}, %rd203; + setp.nan.f32 %p149, %r179, %r179; + setp.nan.f32 
%p150, %r180, %r180; + mov.b64 {%r181, %r182}, %rd204; + setp.nan.f32 %p151, %r181, %r181; + setp.nan.f32 %p152, %r182, %r182; + setp.nan.f32 %p153, %r173, %r173; + setp.nan.f32 %p154, %r174, %r174; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p155, %r175, %r175; + setp.num.f32 %p156, %r175, %r175; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p157, %p154, %p156; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p158, %p145, %p157; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p159, %p154, %p155; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p160, %p146, %p159; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.lt.s32 %p161, %r369, %r176; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p162, %p161, %p160; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p163, %p158, %p162; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.f32 %r183, %r174, %r175, %p163; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r184, %r369, %r176, %p163; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + shfl.sync.bfly.b32 %r185, %r183, 8, 31, -1; + shfl.sync.bfly.b32 %r186, %r184, 8, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + 
setp.gt.f32 %p164, %r183, %r185; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.eq.f32 %p165, %r183, %r185; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p166, %r183, %r183; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p167, %r185, %r185; + setp.num.f32 %p168, %r185, %r185; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p169, %p166, %p168; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p170, %p164, %p169; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p171, %p167, %p166; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p172, %p165, %p171; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.lt.s32 %p173, %r184, %r186; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p174, %p173, %p172; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p175, %p170, %p174; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.f32 %r187, %r183, %r185, %p175; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r188, %r184, %r186, %p175; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + shfl.sync.bfly.b32 %r189, %r187, 4, 31, -1; + shfl.sync.bfly.b32 %r190, 
%r188, 4, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.gt.f32 %p176, %r187, %r189; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.eq.f32 %p177, %r187, %r189; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p178, %r187, %r187; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p179, %r189, %r189; + setp.num.f32 %p180, %r189, %r189; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p181, %p178, %p180; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p182, %p176, %p181; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p183, %p179, %p178; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p184, %p177, %p183; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.lt.s32 %p185, %r188, %r190; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p186, %p185, %p184; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p187, %p182, %p186; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.f32 %r191, %r187, %r189, %p187; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r192, %r188, %r190, %p187; + .loc 2 165 42 // triton_helpers.py:165:42 @[ 
c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + shfl.sync.bfly.b32 %r193, %r191, 2, 31, -1; + shfl.sync.bfly.b32 %r194, %r192, 2, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.gt.f32 %p188, %r191, %r193; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.eq.f32 %p189, %r191, %r193; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p190, %r191, %r191; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p191, %r193, %r193; + setp.num.f32 %p192, %r193, %r193; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p193, %p190, %p192; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p194, %p188, %p193; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p195, %p191, %p190; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p196, %p189, %p195; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.lt.s32 %p197, %r192, %r194; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p198, %p197, %p196; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p199, %p194, %p198; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.f32 %r195, %r191, %r193, %p199; + .loc 2 155 69 // triton_helpers.py:155:69 @[ 
c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r196, %r192, %r194, %p199; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + shfl.sync.bfly.b32 %r197, %r195, 1, 31, -1; + shfl.sync.bfly.b32 %r198, %r196, 1, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.gt.f32 %p200, %r195, %r197; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.eq.f32 %p201, %r195, %r197; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p202, %r195, %r195; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p203, %r197, %r197; + setp.num.f32 %p204, %r197, %r197; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p205, %p202, %p204; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p206, %p200, %p205; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p207, %p203, %p202; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p208, %p201, %p207; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.lt.s32 %p209, %r196, %r198; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p210, %p209, %p208; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p211, %p206, %p210; + .loc 2 155 69 // triton_helpers.py:155:69 @[ 
c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r130, %r196, %r198, %p211; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + shfl.sync.bfly.b32 %r199, %r173, 16, 31, -1; + shfl.sync.bfly.b32 %r200, %r368, 16, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.gt.f32 %p212, %r173, %r199; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.eq.f32 %p213, %r173, %r199; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p214, %r199, %r199; + setp.num.f32 %p215, %r199, %r199; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p216, %p153, %p215; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p217, %p212, %p216; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p218, %p153, %p214; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p219, %p213, %p218; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.lt.s32 %p220, %r368, %r200; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p221, %p220, %p219; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p222, %p217, %p221; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.f32 %r201, %r173, %r199, %p222; + .loc 2 155 69 // triton_helpers.py:155:69 @[ 
c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r202, %r368, %r200, %p222; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + shfl.sync.bfly.b32 %r203, %r201, 8, 31, -1; + shfl.sync.bfly.b32 %r204, %r202, 8, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.gt.f32 %p223, %r201, %r203; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.eq.f32 %p224, %r201, %r203; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p225, %r201, %r201; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p226, %r203, %r203; + setp.num.f32 %p227, %r203, %r203; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p228, %p225, %p227; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p229, %p223, %p228; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p230, %p226, %p225; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p231, %p224, %p230; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.lt.s32 %p232, %r202, %r204; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p233, %p232, %p231; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p234, %p229, %p233; + .loc 2 155 35 // triton_helpers.py:155:35 @[ 
c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.f32 %r205, %r201, %r203, %p234; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r206, %r202, %r204, %p234; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + shfl.sync.bfly.b32 %r207, %r205, 4, 31, -1; + shfl.sync.bfly.b32 %r208, %r206, 4, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.gt.f32 %p235, %r205, %r207; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.eq.f32 %p236, %r205, %r207; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p237, %r205, %r205; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p238, %r207, %r207; + setp.num.f32 %p239, %r207, %r207; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p240, %p237, %p239; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p241, %p235, %p240; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p242, %p238, %p237; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p243, %p236, %p242; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.lt.s32 %p244, %r206, %r208; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p245, %p244, %p243; + .loc 2 154 12 // triton_helpers.py:154:12 @[ 
c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p246, %p241, %p245; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.f32 %r209, %r205, %r207, %p246; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r210, %r206, %r208, %p246; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + shfl.sync.bfly.b32 %r211, %r209, 2, 31, -1; + shfl.sync.bfly.b32 %r212, %r210, 2, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.gt.f32 %p247, %r209, %r211; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.eq.f32 %p248, %r209, %r211; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p249, %r209, %r209; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p250, %r211, %r211; + setp.num.f32 %p251, %r211, %r211; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p252, %p249, %p251; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p253, %p247, %p252; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p254, %p250, %p249; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p255, %p248, %p254; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.lt.s32 %p256, %r210, %r212; + .loc 2 154 21 // triton_helpers.py:154:21 @[ 
c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p257, %p256, %p255; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p258, %p253, %p257; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.f32 %r213, %r209, %r211, %p258; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r214, %r210, %r212, %p258; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + shfl.sync.bfly.b32 %r215, %r213, 1, 31, -1; + shfl.sync.bfly.b32 %r216, %r214, 1, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.gt.f32 %p259, %r213, %r215; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.eq.f32 %p260, %r213, %r215; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p261, %r213, %r213; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p262, %r215, %r215; + setp.num.f32 %p263, %r215, %r215; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p264, %p261, %p263; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p265, %p259, %p264; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p266, %p262, %p261; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p267, %p260, %p266; + .loc 2 154 31 // triton_helpers.py:154:31 @[ 
c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.lt.s32 %p268, %r214, %r216; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p269, %p268, %p267; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p270, %p265, %p269; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r134, %r214, %r216, %p270; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + shfl.sync.bfly.b32 %r217, %r182, 16, 31, -1; + shfl.sync.bfly.b32 %r218, %r367, 16, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.gt.f32 %p271, %r182, %r217; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.eq.f32 %p272, %r182, %r217; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p273, %r217, %r217; + setp.num.f32 %p274, %r217, %r217; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p275, %p152, %p274; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p276, %p271, %p275; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p277, %p152, %p273; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p278, %p272, %p277; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.lt.s32 %p279, %r367, %r218; + .loc 2 154 21 // triton_helpers.py:154:21 @[ 
c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p280, %p279, %p278; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p281, %p276, %p280; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.f32 %r219, %r182, %r217, %p281; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r220, %r367, %r218, %p281; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + shfl.sync.bfly.b32 %r221, %r219, 8, 31, -1; + shfl.sync.bfly.b32 %r222, %r220, 8, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.gt.f32 %p282, %r219, %r221; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.eq.f32 %p283, %r219, %r221; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p284, %r219, %r219; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p285, %r221, %r221; + setp.num.f32 %p286, %r221, %r221; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p287, %p284, %p286; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p288, %p282, %p287; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p289, %p285, %p284; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p290, %p283, %p289; + .loc 2 154 31 // triton_helpers.py:154:31 @[ 
c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.lt.s32 %p291, %r220, %r222; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p292, %p291, %p290; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p293, %p288, %p292; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.f32 %r223, %r219, %r221, %p293; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r224, %r220, %r222, %p293; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + shfl.sync.bfly.b32 %r225, %r223, 4, 31, -1; + shfl.sync.bfly.b32 %r226, %r224, 4, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.gt.f32 %p294, %r223, %r225; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.eq.f32 %p295, %r223, %r225; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p296, %r223, %r223; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p297, %r225, %r225; + setp.num.f32 %p298, %r225, %r225; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p299, %p296, %p298; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p300, %p294, %p299; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p301, %p297, %p296; + .loc 2 151 17 // triton_helpers.py:151:17 @[ 
c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p302, %p295, %p301; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.lt.s32 %p303, %r224, %r226; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p304, %p303, %p302; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p305, %p300, %p304; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.f32 %r227, %r223, %r225, %p305; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r228, %r224, %r226, %p305; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + shfl.sync.bfly.b32 %r229, %r227, 2, 31, -1; + shfl.sync.bfly.b32 %r230, %r228, 2, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.gt.f32 %p306, %r227, %r229; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.eq.f32 %p307, %r227, %r229; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p308, %r227, %r227; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p309, %r229, %r229; + setp.num.f32 %p310, %r229, %r229; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p311, %p308, %p310; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p312, %p306, %p311; + .loc 2 151 27 // triton_helpers.py:151:27 @[ 
c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p313, %p309, %p308; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p314, %p307, %p313; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.lt.s32 %p315, %r228, %r230; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p316, %p315, %p314; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p317, %p312, %p316; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.f32 %r231, %r227, %r229, %p317; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r232, %r228, %r230, %p317; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + shfl.sync.bfly.b32 %r233, %r231, 1, 31, -1; + shfl.sync.bfly.b32 %r234, %r232, 1, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.gt.f32 %p318, %r231, %r233; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.eq.f32 %p319, %r231, %r233; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p320, %r231, %r231; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p321, %r233, %r233; + setp.num.f32 %p322, %r233, %r233; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p323, %p320, %p322; + .loc 2 149 16 // triton_helpers.py:149:16 @[ 
c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p324, %p318, %p323; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p325, %p321, %p320; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p326, %p319, %p325; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.lt.s32 %p327, %r232, %r234; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p328, %p327, %p326; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p329, %p324, %p328; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r138, %r232, %r234, %p329; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + shfl.sync.bfly.b32 %r235, %r181, 16, 31, -1; + shfl.sync.bfly.b32 %r236, %r366, 16, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.gt.f32 %p330, %r181, %r235; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.eq.f32 %p331, %r181, %r235; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p332, %r235, %r235; + setp.num.f32 %p333, %r235, %r235; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p334, %p151, %p333; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p335, %p330, %p334; + .loc 2 151 27 // triton_helpers.py:151:27 @[ 
c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p336, %p151, %p332; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p337, %p331, %p336; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.lt.s32 %p338, %r366, %r236; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p339, %p338, %p337; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p340, %p335, %p339; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.f32 %r237, %r181, %r235, %p340; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r238, %r366, %r236, %p340; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + shfl.sync.bfly.b32 %r239, %r237, 8, 31, -1; + shfl.sync.bfly.b32 %r240, %r238, 8, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.gt.f32 %p341, %r237, %r239; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.eq.f32 %p342, %r237, %r239; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p343, %r237, %r237; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p344, %r239, %r239; + setp.num.f32 %p345, %r239, %r239; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p346, %p343, %p345; + .loc 2 149 16 // triton_helpers.py:149:16 @[ 
c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p347, %p341, %p346; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p348, %p344, %p343; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p349, %p342, %p348; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.lt.s32 %p350, %r238, %r240; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p351, %p350, %p349; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p352, %p347, %p351; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.f32 %r241, %r237, %r239, %p352; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r242, %r238, %r240, %p352; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + shfl.sync.bfly.b32 %r243, %r241, 4, 31, -1; + shfl.sync.bfly.b32 %r244, %r242, 4, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.gt.f32 %p353, %r241, %r243; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.eq.f32 %p354, %r241, %r243; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p355, %r241, %r241; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p356, %r243, %r243; + setp.num.f32 %p357, %r243, %r243; + .loc 2 149 27 // triton_helpers.py:149:27 @[ 
c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p358, %p355, %p357; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p359, %p353, %p358; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p360, %p356, %p355; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p361, %p354, %p360; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.lt.s32 %p362, %r242, %r244; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p363, %p362, %p361; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p364, %p359, %p363; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.f32 %r245, %r241, %r243, %p364; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r246, %r242, %r244, %p364; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + shfl.sync.bfly.b32 %r247, %r245, 2, 31, -1; + shfl.sync.bfly.b32 %r248, %r246, 2, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.gt.f32 %p365, %r245, %r247; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.eq.f32 %p366, %r245, %r247; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p367, %r245, %r245; + .loc 2 148 29 // triton_helpers.py:148:29 @[ 
c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p368, %r247, %r247; + setp.num.f32 %p369, %r247, %r247; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p370, %p367, %p369; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p371, %p365, %p370; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p372, %p368, %p367; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p373, %p366, %p372; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.lt.s32 %p374, %r246, %r248; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p375, %p374, %p373; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p376, %p371, %p375; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.f32 %r249, %r245, %r247, %p376; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r250, %r246, %r248, %p376; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + shfl.sync.bfly.b32 %r251, %r249, 1, 31, -1; + shfl.sync.bfly.b32 %r252, %r250, 1, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.gt.f32 %p377, %r249, %r251; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.eq.f32 %p378, %r249, %r251; + .loc 2 147 29 // triton_helpers.py:147:29 @[ 
c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p379, %r249, %r249; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p380, %r251, %r251; + setp.num.f32 %p381, %r251, %r251; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p382, %p379, %p381; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p383, %p377, %p382; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p384, %p380, %p379; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p385, %p378, %p384; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.lt.s32 %p386, %r250, %r252; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p387, %p386, %p385; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p388, %p383, %p387; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r142, %r250, %r252, %p388; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + shfl.sync.bfly.b32 %r253, %r180, 16, 31, -1; + shfl.sync.bfly.b32 %r254, %r365, 16, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.gt.f32 %p389, %r180, %r253; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.eq.f32 %p390, %r180, %r253; + .loc 2 148 29 // triton_helpers.py:148:29 @[ 
c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p391, %r253, %r253; + setp.num.f32 %p392, %r253, %r253; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p393, %p150, %p392; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p394, %p389, %p393; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p395, %p150, %p391; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p396, %p390, %p395; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.lt.s32 %p397, %r365, %r254; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p398, %p397, %p396; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p399, %p394, %p398; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.f32 %r255, %r180, %r253, %p399; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r256, %r365, %r254, %p399; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + shfl.sync.bfly.b32 %r257, %r255, 8, 31, -1; + shfl.sync.bfly.b32 %r258, %r256, 8, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.gt.f32 %p400, %r255, %r257; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.eq.f32 %p401, %r255, %r257; + .loc 2 147 29 // triton_helpers.py:147:29 @[ 
c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p402, %r255, %r255; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p403, %r257, %r257; + setp.num.f32 %p404, %r257, %r257; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p405, %p402, %p404; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p406, %p400, %p405; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p407, %p403, %p402; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p408, %p401, %p407; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.lt.s32 %p409, %r256, %r258; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p410, %p409, %p408; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p411, %p406, %p410; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.f32 %r259, %r255, %r257, %p411; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r260, %r256, %r258, %p411; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + shfl.sync.bfly.b32 %r261, %r259, 4, 31, -1; + shfl.sync.bfly.b32 %r262, %r260, 4, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.gt.f32 %p412, %r259, %r261; + .loc 2 145 23 // triton_helpers.py:145:23 @[ 
c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.eq.f32 %p413, %r259, %r261; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p414, %r259, %r259; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p415, %r261, %r261; + setp.num.f32 %p416, %r261, %r261; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p417, %p414, %p416; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p418, %p412, %p417; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p419, %p415, %p414; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p420, %p413, %p419; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.lt.s32 %p421, %r260, %r262; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p422, %p421, %p420; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p423, %p418, %p422; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.f32 %r263, %r259, %r261, %p423; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r264, %r260, %r262, %p423; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + shfl.sync.bfly.b32 %r265, %r263, 2, 31, -1; + shfl.sync.bfly.b32 %r266, %r264, 2, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ 
c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.gt.f32 %p424, %r263, %r265; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.eq.f32 %p425, %r263, %r265; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p426, %r263, %r263; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p427, %r265, %r265; + setp.num.f32 %p428, %r265, %r265; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p429, %p426, %p428; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p430, %p424, %p429; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p431, %p427, %p426; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p432, %p425, %p431; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.lt.s32 %p433, %r264, %r266; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p434, %p433, %p432; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p435, %p430, %p434; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.f32 %r267, %r263, %r265, %p435; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r268, %r264, %r266, %p435; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + 
shfl.sync.bfly.b32 %r269, %r267, 1, 31, -1; + shfl.sync.bfly.b32 %r270, %r268, 1, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.gt.f32 %p436, %r267, %r269; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.eq.f32 %p437, %r267, %r269; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p438, %r267, %r267; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p439, %r269, %r269; + setp.num.f32 %p440, %r269, %r269; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p441, %p438, %p440; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p442, %p436, %p441; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p443, %p439, %p438; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p444, %p437, %p443; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.lt.s32 %p445, %r268, %r270; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p446, %p445, %p444; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p447, %p442, %p446; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r146, %r268, %r270, %p447; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + shfl.sync.bfly.b32 
%r271, %r179, 16, 31, -1; + shfl.sync.bfly.b32 %r272, %r364, 16, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.gt.f32 %p448, %r179, %r271; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.eq.f32 %p449, %r179, %r271; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p450, %r271, %r271; + setp.num.f32 %p451, %r271, %r271; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p452, %p149, %p451; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p453, %p448, %p452; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p454, %p149, %p450; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p455, %p449, %p454; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.lt.s32 %p456, %r364, %r272; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p457, %p456, %p455; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p458, %p453, %p457; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.f32 %r273, %r179, %r271, %p458; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r274, %r364, %r272, %p458; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + shfl.sync.bfly.b32 %r275, %r273, 8, 
31, -1; + shfl.sync.bfly.b32 %r276, %r274, 8, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.gt.f32 %p459, %r273, %r275; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.eq.f32 %p460, %r273, %r275; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p461, %r273, %r273; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p462, %r275, %r275; + setp.num.f32 %p463, %r275, %r275; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p464, %p461, %p463; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p465, %p459, %p464; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p466, %p462, %p461; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p467, %p460, %p466; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.lt.s32 %p468, %r274, %r276; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p469, %p468, %p467; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p470, %p465, %p469; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.f32 %r277, %r273, %r275, %p470; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r278, %r274, %r276, %p470; + .loc 2 165 42 // 
triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + shfl.sync.bfly.b32 %r279, %r277, 4, 31, -1; + shfl.sync.bfly.b32 %r280, %r278, 4, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.gt.f32 %p471, %r277, %r279; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.eq.f32 %p472, %r277, %r279; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p473, %r277, %r277; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p474, %r279, %r279; + setp.num.f32 %p475, %r279, %r279; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p476, %p473, %p475; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p477, %p471, %p476; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p478, %p474, %p473; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p479, %p472, %p478; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.lt.s32 %p480, %r278, %r280; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p481, %p480, %p479; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p482, %p477, %p481; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.f32 %r281, %r277, %r279, %p482; + .loc 2 155 69 // 
triton_helpers.py:155:69 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r282, %r278, %r280, %p482; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + shfl.sync.bfly.b32 %r283, %r281, 2, 31, -1; + shfl.sync.bfly.b32 %r284, %r282, 2, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.gt.f32 %p483, %r281, %r283; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.eq.f32 %p484, %r281, %r283; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p485, %r281, %r281; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p486, %r283, %r283; + setp.num.f32 %p487, %r283, %r283; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p488, %p485, %p487; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p489, %p483, %p488; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p490, %p486, %p485; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p491, %p484, %p490; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.lt.s32 %p492, %r282, %r284; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p493, %p492, %p491; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p494, %p489, %p493; + .loc 2 155 35 // 
triton_helpers.py:155:35 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.f32 %r285, %r281, %r283, %p494; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r286, %r282, %r284, %p494; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + shfl.sync.bfly.b32 %r287, %r285, 1, 31, -1; + shfl.sync.bfly.b32 %r288, %r286, 1, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.gt.f32 %p495, %r285, %r287; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.eq.f32 %p496, %r285, %r287; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p497, %r285, %r285; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p498, %r287, %r287; + setp.num.f32 %p499, %r287, %r287; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p500, %p497, %p499; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p501, %p495, %p500; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p502, %p498, %p497; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p503, %p496, %p502; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.lt.s32 %p504, %r286, %r288; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p505, %p504, %p503; + .loc 2 154 12 // 
triton_helpers.py:154:12 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p506, %p501, %p505; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r150, %r286, %r288, %p506; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + shfl.sync.bfly.b32 %r289, %r178, 16, 31, -1; + shfl.sync.bfly.b32 %r290, %r363, 16, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.gt.f32 %p507, %r178, %r289; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.eq.f32 %p508, %r178, %r289; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p509, %r289, %r289; + setp.num.f32 %p510, %r289, %r289; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p511, %p148, %p510; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p512, %p507, %p511; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p513, %p148, %p509; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p514, %p508, %p513; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.lt.s32 %p515, %r363, %r290; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p516, %p515, %p514; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p517, %p512, %p516; + .loc 2 155 35 // triton_helpers.py:155:35 
@[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.f32 %r291, %r178, %r289, %p517; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r292, %r363, %r290, %p517; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + shfl.sync.bfly.b32 %r293, %r291, 8, 31, -1; + shfl.sync.bfly.b32 %r294, %r292, 8, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.gt.f32 %p518, %r291, %r293; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.eq.f32 %p519, %r291, %r293; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p520, %r291, %r291; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p521, %r293, %r293; + setp.num.f32 %p522, %r293, %r293; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p523, %p520, %p522; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p524, %p518, %p523; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p525, %p521, %p520; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p526, %p519, %p525; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.lt.s32 %p527, %r292, %r294; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p528, %p527, %p526; + .loc 2 154 12 // triton_helpers.py:154:12 @[ 
c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p529, %p524, %p528; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.f32 %r295, %r291, %r293, %p529; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r296, %r292, %r294, %p529; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + shfl.sync.bfly.b32 %r297, %r295, 4, 31, -1; + shfl.sync.bfly.b32 %r298, %r296, 4, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.gt.f32 %p530, %r295, %r297; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.eq.f32 %p531, %r295, %r297; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p532, %r295, %r295; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p533, %r297, %r297; + setp.num.f32 %p534, %r297, %r297; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p535, %p532, %p534; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p536, %p530, %p535; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p537, %p533, %p532; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p538, %p531, %p537; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.lt.s32 %p539, %r296, %r298; + .loc 2 154 21 // triton_helpers.py:154:21 @[ 
c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p540, %p539, %p538; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p541, %p536, %p540; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.f32 %r299, %r295, %r297, %p541; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r300, %r296, %r298, %p541; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + shfl.sync.bfly.b32 %r301, %r299, 2, 31, -1; + shfl.sync.bfly.b32 %r302, %r300, 2, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.gt.f32 %p542, %r299, %r301; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.eq.f32 %p543, %r299, %r301; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p544, %r299, %r299; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p545, %r301, %r301; + setp.num.f32 %p546, %r301, %r301; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p547, %p544, %p546; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p548, %p542, %p547; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p549, %p545, %p544; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p550, %p543, %p549; + .loc 2 154 31 // triton_helpers.py:154:31 @[ 
c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.lt.s32 %p551, %r300, %r302; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p552, %p551, %p550; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p553, %p548, %p552; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.f32 %r303, %r299, %r301, %p553; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r304, %r300, %r302, %p553; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + shfl.sync.bfly.b32 %r305, %r303, 1, 31, -1; + shfl.sync.bfly.b32 %r306, %r304, 1, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.gt.f32 %p554, %r303, %r305; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.eq.f32 %p555, %r303, %r305; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p556, %r303, %r303; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p557, %r305, %r305; + setp.num.f32 %p558, %r305, %r305; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p559, %p556, %p558; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p560, %p554, %p559; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p561, %p557, %p556; + .loc 2 151 17 // triton_helpers.py:151:17 @[ 
c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p562, %p555, %p561; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.lt.s32 %p563, %r304, %r306; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p564, %p563, %p562; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p565, %p560, %p564; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r154, %r304, %r306, %p565; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + shfl.sync.bfly.b32 %r307, %r177, 16, 31, -1; + shfl.sync.bfly.b32 %r308, %r362, 16, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.gt.f32 %p566, %r177, %r307; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.eq.f32 %p567, %r177, %r307; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p568, %r307, %r307; + setp.num.f32 %p569, %r307, %r307; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p570, %p147, %p569; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p571, %p566, %p570; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p572, %p147, %p568; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p573, %p567, %p572; + .loc 2 154 31 // triton_helpers.py:154:31 @[ 
c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.lt.s32 %p574, %r362, %r308; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p575, %p574, %p573; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p576, %p571, %p575; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.f32 %r309, %r177, %r307, %p576; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r310, %r362, %r308, %p576; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + shfl.sync.bfly.b32 %r311, %r309, 8, 31, -1; + shfl.sync.bfly.b32 %r312, %r310, 8, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.gt.f32 %p577, %r309, %r311; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.eq.f32 %p578, %r309, %r311; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p579, %r309, %r309; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p580, %r311, %r311; + setp.num.f32 %p581, %r311, %r311; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p582, %p579, %p581; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p583, %p577, %p582; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p584, %p580, %p579; + .loc 2 151 17 // triton_helpers.py:151:17 @[ 
c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p585, %p578, %p584; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.lt.s32 %p586, %r310, %r312; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p587, %p586, %p585; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p588, %p583, %p587; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.f32 %r313, %r309, %r311, %p588; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r314, %r310, %r312, %p588; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + shfl.sync.bfly.b32 %r315, %r313, 4, 31, -1; + shfl.sync.bfly.b32 %r316, %r314, 4, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.gt.f32 %p589, %r313, %r315; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.eq.f32 %p590, %r313, %r315; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p591, %r313, %r313; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p592, %r315, %r315; + setp.num.f32 %p593, %r315, %r315; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p594, %p591, %p593; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p595, %p589, %p594; + .loc 2 151 27 // triton_helpers.py:151:27 @[ 
c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p596, %p592, %p591; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p597, %p590, %p596; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.lt.s32 %p598, %r314, %r316; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p599, %p598, %p597; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p600, %p595, %p599; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.f32 %r317, %r313, %r315, %p600; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r318, %r314, %r316, %p600; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + shfl.sync.bfly.b32 %r319, %r317, 2, 31, -1; + shfl.sync.bfly.b32 %r320, %r318, 2, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.gt.f32 %p601, %r317, %r319; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.eq.f32 %p602, %r317, %r319; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p603, %r317, %r317; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p604, %r319, %r319; + setp.num.f32 %p605, %r319, %r319; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p606, %p603, %p605; + .loc 2 149 16 // triton_helpers.py:149:16 @[ 
c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p607, %p601, %p606; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p608, %p604, %p603; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p609, %p602, %p608; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.lt.s32 %p610, %r318, %r320; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p611, %p610, %p609; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p612, %p607, %p611; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.f32 %r321, %r317, %r319, %p612; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r322, %r318, %r320, %p612; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + shfl.sync.bfly.b32 %r323, %r321, 1, 31, -1; + shfl.sync.bfly.b32 %r324, %r322, 1, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.gt.f32 %p613, %r321, %r323; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.eq.f32 %p614, %r321, %r323; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p615, %r321, %r321; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p616, %r323, %r323; + setp.num.f32 %p617, %r323, %r323; + .loc 2 149 27 // triton_helpers.py:149:27 @[ 
c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p618, %p615, %p617; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p619, %p613, %p618; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p620, %p616, %p615; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p621, %p614, %p620; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.lt.s32 %p622, %r322, %r324; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p623, %p622, %p621; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p624, %p619, %p623; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r158, %r322, %r324, %p624; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.eq.b32 %p123, %r172, 0; + shr.u32 %r325, %r1, 3; + and.b32 %r326, %r325, 60; + mov.b32 %r327, global_smem; + add.s32 %r127, %r327, %r326; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r128, %r195, %r197, %p211; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + // begin inline asm + @%p123 st.shared.b32 [ %r127 + 0 ], %r128; + // end inline asm + add.s32 %r328, %r327, 512; + add.s32 %r129, %r328, %r326; + // begin inline asm + @%p123 st.shared.b32 [ %r129 + 0 ], %r130; + // end inline asm + and.b32 %r329, %r325, 124; + or.b32 %r330, %r329, 64; + add.s32 %r131, %r327, %r330; + .loc 2 155 35 // triton_helpers.py:155:35 @[ 
c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r132, %r213, %r215, %p270; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + // begin inline asm + @%p123 st.shared.b32 [ %r131 + 0 ], %r132; + // end inline asm + add.s32 %r133, %r328, %r330; + // begin inline asm + @%p123 st.shared.b32 [ %r133 + 0 ], %r134; + // end inline asm + add.s32 %r135, %r127, 128; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r136, %r231, %r233, %p329; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + // begin inline asm + @%p123 st.shared.b32 [ %r135 + 0 ], %r136; + // end inline asm + add.s32 %r137, %r129, 128; + // begin inline asm + @%p123 st.shared.b32 [ %r137 + 0 ], %r138; + // end inline asm + or.b32 %r331, %r329, 192; + add.s32 %r139, %r327, %r331; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r140, %r249, %r251, %p388; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + // begin inline asm + @%p123 st.shared.b32 [ %r139 + 0 ], %r140; + // end inline asm + add.s32 %r141, %r328, %r331; + // begin inline asm + @%p123 st.shared.b32 [ %r141 + 0 ], %r142; + // end inline asm + add.s32 %r143, %r127, 256; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r144, %r267, %r269, %p447; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + // begin inline asm + @%p123 st.shared.b32 [ %r143 + 0 ], %r144; + // end inline asm + add.s32 %r145, %r129, 256; + // begin inline asm + @%p123 st.shared.b32 [ %r145 + 0 ], %r146; + // end inline asm + or.b32 %r332, %r329, 320; + add.s32 %r147, %r327, %r332; + .loc 2 
155 35 // triton_helpers.py:155:35 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r148, %r285, %r287, %p506; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + // begin inline asm + @%p123 st.shared.b32 [ %r147 + 0 ], %r148; + // end inline asm + add.s32 %r149, %r328, %r332; + // begin inline asm + @%p123 st.shared.b32 [ %r149 + 0 ], %r150; + // end inline asm + add.s32 %r151, %r127, 384; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r152, %r303, %r305, %p565; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + // begin inline asm + @%p123 st.shared.b32 [ %r151 + 0 ], %r152; + // end inline asm + add.s32 %r153, %r129, 384; + // begin inline asm + @%p123 st.shared.b32 [ %r153 + 0 ], %r154; + // end inline asm + or.b32 %r333, %r329, 448; + add.s32 %r155, %r327, %r333; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r156, %r321, %r323, %p624; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + // begin inline asm + @%p123 st.shared.b32 [ %r155 + 0 ], %r156; + // end inline asm + add.s32 %r157, %r328, %r333; + // begin inline asm + @%p123 st.shared.b32 [ %r157 + 0 ], %r158; + // end inline asm + bar.sync 0; + setp.lt.u32 %p139, %r1, 128; + shl.b32 %r334, %r1, 2; + add.s32 %r160, %r327, %r334; + // begin inline asm + @%p139 ld.shared.b32 %r159, [ %r160 + 0 ]; + // end inline asm + add.s32 %r162, %r328, %r334; + // begin inline asm + @%p139 ld.shared.b32 %r161, [ %r162 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r335, %r159, 8, 31, -1; + shfl.sync.bfly.b32 %r336, %r161, 8, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.gt.f32 
%p625, %r159, %r335; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.eq.f32 %p626, %r159, %r335; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p627, %r159, %r159; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p628, %r335, %r335; + setp.num.f32 %p629, %r335, %r335; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p630, %p627, %p629; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p631, %p625, %p630; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p632, %p627, %p628; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p633, %p626, %p632; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.lt.s32 %p634, %r161, %r336; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p635, %p634, %p633; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p636, %p631, %p635; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.f32 %r337, %r159, %r335, %p636; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r338, %r161, %r336, %p636; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + shfl.sync.bfly.b32 %r339, %r337, 4, 31, -1; + shfl.sync.bfly.b32 %r340, %r338, 4, 31, 
-1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.gt.f32 %p637, %r337, %r339; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.eq.f32 %p638, %r337, %r339; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p639, %r337, %r337; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p640, %r339, %r339; + setp.num.f32 %p641, %r339, %r339; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p642, %p639, %p641; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p643, %p637, %p642; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p644, %p640, %p639; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p645, %p638, %p644; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.lt.s32 %p646, %r338, %r340; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p647, %p646, %p645; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p648, %p643, %p647; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.f32 %r341, %r337, %r339, %p648; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r342, %r338, %r340, %p648; + .loc 2 165 42 // triton_helpers.py:165:42 @[ 
c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + shfl.sync.bfly.b32 %r343, %r341, 2, 31, -1; + shfl.sync.bfly.b32 %r344, %r342, 2, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.gt.f32 %p649, %r341, %r343; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.eq.f32 %p650, %r341, %r343; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p651, %r341, %r341; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p652, %r343, %r343; + setp.num.f32 %p653, %r343, %r343; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p654, %p651, %p653; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p655, %p649, %p654; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p656, %p652, %p651; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p657, %p650, %p656; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.lt.s32 %p658, %r342, %r344; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p659, %p658, %p657; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p660, %p655, %p659; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.f32 %r345, %r341, %r343, %p660; + .loc 2 155 69 // triton_helpers.py:155:69 @[ 
c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r346, %r342, %r344, %p660; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + shfl.sync.bfly.b32 %r347, %r345, 1, 31, -1; + shfl.sync.bfly.b32 %r348, %r346, 1, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.gt.f32 %p661, %r345, %r347; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.eq.f32 %p662, %r345, %r347; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p663, %r345, %r345; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.nan.f32 %p664, %r347, %r347; + setp.num.f32 %p665, %r347, %r347; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p666, %p663, %p665; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p667, %p661, %p666; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p668, %p664, %p663; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p669, %p662, %p668; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + setp.lt.s32 %p670, %r346, %r348; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.pred %p671, %p670, %p669; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + or.pred %p672, %p667, %p671; + .loc 2 155 69 // triton_helpers.py:155:69 @[ 
c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r166, %r346, %r348, %p672; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + and.b32 %r349, %r1, 911; + setp.eq.b32 %p141, %r349, 0; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + selp.b32 %r164, %r345, %r347, %p672; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ] + // begin inline asm + @%p141 st.shared.b32 [ %r160 + 0 ], %r164; + // end inline asm + // begin inline asm + @%p141 st.shared.b32 [ %r162 + 0 ], %r166; + // end inline asm + bar.sync 0; + ld.shared.b32 %r350, [global_smem+512]; + ld.shared.b32 %r351, [global_smem+576]; + ld.shared.b32 %r352, [global_smem+640]; + ld.shared.b32 %r353, [global_smem+704]; + ld.shared.b32 %r354, [global_smem+768]; + ld.shared.b32 %r355, [global_smem+832]; + ld.shared.b32 %r356, [global_smem+896]; + ld.shared.b32 %r357, [global_smem+960]; +$L__tmp5: + .loc 1 47 25 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:47:25 + mad.wide.s32 %rd184, %r171, 8, %rd79; + .loc 1 47 36 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:47:36 + bar.sync 0; + st.shared.v4.b32 [global_smem], {%r350, %r351, %r352, %r353}; + st.shared.v4.b32 [global_smem+16], {%r354, %r355, %r356, %r357}; + bar.sync 0; + shl.b32 %r358, %r1, 4; + and.b32 %r359, %r358, 16; + add.s32 %r168, %r327, %r359; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x1.shared.b16 {%r167}, [%r168]; + // end inline asm + cvt.s64.s32 %rd183, %r167; + and.b32 %r360, %r1, 504; + setp.eq.b32 %p673, %r360, 0; + and.pred %p143, %p673, %p144; + // begin inline asm + @%p143 st.global.b64 [ %rd184 + 0 ], { %rd183 }; + // end inline asm + .loc 1 47 4 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:47:4 + ret; +$L__tmp6: +$L__func_end0: + // -- End function +} + .file 1 
"/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 234 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. 
Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0xe3 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 52 +.b8 119 +.b8 100 +.b8 104 +.b8 119 +.b8 108 +.b8 117 +.b8 54 +.b8 121 +.b8 98 +.b8 51 +.b8 119 +.b8 99 +.b8 119 +.b8 97 +.b8 122 +.b8 100 +.b8 110 +.b8 122 +.b8 109 +.b8 103 +.b8 122 +.b8 101 +.b8 119 +.b8 105 +.b8 101 +.b8 109 +.b8 118 +.b8 122 +.b8 110 +.b8 120 +.b8 118 +.b8 114 +.b8 114 +.b8 51 +.b8 53 +.b8 50 +.b8 53 +.b8 101 +.b8 111 +.b8 106 +.b8 117 +.b8 112 +.b8 113 +.b8 106 +.b8 108 +.b8 100 +.b8 111 +.b8 53 +.b8 112 +.b8 116 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 52 +.b8 119 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x1c DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 97 +.b8 114 +.b8 103 +.b8 109 +.b8 97 +.b8 120 +.b8 95 +.b8 49 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0xa7:0x46 DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0xbc:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp0 // DW_AT_low_pc +.b64 $L__tmp3 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 41 // DW_AT_call_line +.b8 38 // DW_AT_call_column +.b8 4 // Abbrev [4] 0xd4:0x18 
DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp4 // DW_AT_low_pc +.b64 $L__tmp5 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 45 // DW_AT_call_line +.b8 75 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/LB6GSIBNROJIH45A7E7GR2D3MKLQMRXRW4JCJKBX5IKCUYNTUE6Q/triton_red_fused_argmax_1.source b/SpecForge-ext/cache/compiled_kernels/triton/7/LB6GSIBNROJIH45A7E7GR2D3MKLQMRXRW4JCJKBX5IKCUYNTUE6Q/triton_red_fused_argmax_1.source new file mode 100644 index 0000000000000000000000000000000000000000..72dfe1c51a0c941a8938df84954359dc60ab9f36 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/LB6GSIBNROJIH45A7E7GR2D3MKLQMRXRW4JCJKBX5IKCUYNTUE6Q/triton_red_fused_argmax_1.source @@ -0,0 +1,323 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":18:0) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":143:0) +#loc47 = loc(unknown) +#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":86:0) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":63:0) +#loc68 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":164:0) +#loc72 = loc("in_ptr0"(#loc)) +#loc73 = loc("out_ptr0"(#loc)) +#loc74 = loc("ks0"(#loc)) +#loc75 = loc("ks1"(#loc)) +#loc76 = loc("xnumel"(#loc)) +#loc77 = loc("r0_numel"(#loc)) +#loc106 = loc("a_value"(#loc35)) +#loc107 = loc("a_index"(#loc35)) +#loc108 = loc("b_value"(#loc35)) +#loc109 = loc("b_index"(#loc35)) +#loc122 = loc("x"(#loc55)) +#loc123 = loc("x"(#loc59)) +#loc124 = loc("value"(#loc68)) +#loc125 = loc("index"(#loc68)) +module { + tt.func public @triton_red_fused_argmax_1(%in_ptr0: !tt.ptr {tt.divisibility 
= 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %r0_numel_0 = arith.constant 32000 : i32 loc(#loc78) + %xoffset = tt.get_program_id x : i32 loc(#loc79) + %xoffset_1 = arith.constant 8 : i32 loc(#loc80) + %xoffset_2 = arith.constant 8 : i32 loc(#loc80) + %xoffset_3 = arith.muli %xoffset, %xoffset_2 : i32 loc(#loc80) + %xindex = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32> loc(#loc81) + %xindex_4 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<8xi32> -> tensor<8x1xi32> loc(#loc82) + %xindex_5 = tt.splat %xoffset_3 : i32 -> tensor<8x1xi32> loc(#loc83) + %xindex_6 = arith.addi %xindex_5, %xindex_4 : tensor<8x1xi32> loc(#loc83) + %xmask = tt.splat %xnumel : i32 -> tensor<8x1xi32> loc(#loc84) + %xmask_7 = arith.cmpi slt, %xindex_6, %xmask : tensor<8x1xi32> loc(#loc84) + %r0_base = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32> loc(#loc85) + %r0_base_8 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<512xi32> -> tensor<1x512xi32> loc(#loc86) + %x0 = arith.extsi %xindex_6 : tensor<8x1xi32> to tensor<8x1xi64> loc(#loc87) + %x0_9 = tt.splat %ks0 : i64 -> tensor<8x1xi64> loc(#loc87) + %x0_10 = arith.remsi %x0, %x0_9 : tensor<8x1xi64> loc(#loc87) + %x1 = arith.extsi %xindex_6 : tensor<8x1xi32> to tensor<8x1xi64> loc(#loc88) + %x1_11 = tt.splat %ks0 : i64 -> tensor<8x1xi64> loc(#loc88) + %x1_12 = arith.divsi %x1, %x1_11 : tensor<8x1xi64> loc(#loc88) + %_tmp2 = arith.constant 0xFF800000 : f32 loc(#loc89) + %_tmp2_13 = arith.constant dense<0xFF800000> : tensor<8x512xf32> loc(#loc89) + %_tmp2_index = arith.constant 2147483647 : i32 loc(#loc90) + %_tmp2_index_14 = arith.constant dense<2147483647> : tensor<8x512xi32> loc(#loc90) + %c0_i32 = arith.constant 0 : i32 loc(#loc14) + %c512_i32 = 
arith.constant 512 : i32 loc(#loc14) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc14) + %1 = arith.bitcast %r0_numel_0 : i32 to i32 loc(#loc14) + %2 = arith.bitcast %c512_i32 : i32 to i32 loc(#loc14) + %3 = ub.poison : i32 loc(#loc14) + %_tmp2_index_15:2 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp2_16 = %_tmp2_13, %_tmp2_index_17 = %_tmp2_index_14) -> (tensor<8x512xf32>, tensor<8x512xi32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x512xi32> loc(#loc92) + %r0_index_18 = arith.addi %r0_index, %r0_base_8 : tensor<1x512xi32> loc(#loc92) + %r0_mask = arith.constant dense<32000> : tensor<1x512xi32> loc(#loc93) + %r0_mask_19 = arith.cmpi slt, %r0_index_18, %r0_mask : tensor<1x512xi32> loc(#loc93) + %tmp0 = arith.constant 32000 : i32 loc(#loc94) + %tmp0_20 = arith.constant 32000 : i64 loc(#loc94) + %tmp0_21 = arith.constant dense<32000> : tensor<8x1xi64> loc(#loc94) + %tmp0_22 = arith.muli %tmp0_21, %x0_10 : tensor<8x1xi64> loc(#loc94) + %tmp0_23 = arith.extsi %r0_index_18 : tensor<1x512xi32> to tensor<1x512xi64> loc(#loc95) + %tmp0_24 = tt.broadcast %tmp0_23 : tensor<1x512xi64> -> tensor<8x512xi64> loc(#loc95) + %tmp0_25 = tt.broadcast %tmp0_22 : tensor<8x1xi64> -> tensor<8x512xi64> loc(#loc95) + %tmp0_26 = arith.addi %tmp0_24, %tmp0_25 : tensor<8x512xi64> loc(#loc95) + %tmp0_27 = tt.splat %ks1 : i64 -> tensor<8x1xi64> loc(#loc96) + %tmp0_28 = arith.muli %tmp0_27, %x1_12 : tensor<8x1xi64> loc(#loc96) + %tmp0_29 = tt.broadcast %tmp0_28 : tensor<8x1xi64> -> tensor<8x512xi64> loc(#loc97) + %tmp0_30 = arith.addi %tmp0_26, %tmp0_29 : tensor<8x512xi64> loc(#loc97) + %tmp0_31 = tt.splat %in_ptr0 : !tt.ptr -> tensor<8x512x!tt.ptr> loc(#loc98) + %tmp0_32 = tt.addptr %tmp0_31, %tmp0_30 : tensor<8x512x!tt.ptr>, tensor<8x512xi64> loc(#loc98) + %tmp0_33 = tt.broadcast %r0_mask_19 : tensor<1x512xi1> -> tensor<8x512xi1> loc(#loc99) + %tmp0_34 = tt.broadcast %xmask_7 : tensor<8x1xi1> -> tensor<8x512xi1> loc(#loc99) + %tmp0_35 = arith.andi %tmp0_33, 
%tmp0_34 : tensor<8x512xi1> loc(#loc99) + %tmp0_36 = arith.constant 0.000000e+00 : f32 loc(#loc100) + %tmp0_37 = arith.constant dense<0.000000e+00> : tensor<8x512xf32> loc(#loc100) + %tmp0_38 = tt.load %tmp0_32, %tmp0_35, %tmp0_37 evictionPolicy = evict_first : tensor<8x512x!tt.ptr> loc(#loc100) + %8:2 = tt.call @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32S8_512S_i32S8_512S_fp32S8_512S_i32S1_512S__(%_tmp2_16, %_tmp2_index_17, %tmp0_38, %r0_index_18) : (tensor<8x512xf32>, tensor<8x512xi32>, tensor<8x512xf32>, tensor<1x512xi32>) -> (tensor<8x512xf32>, tensor<8x512xi32>) loc(#loc24) + %_tmp2_39 = tt.broadcast %r0_mask_19 : tensor<1x512xi1> -> tensor<8x512xi1> loc(#loc101) + %_tmp2_40 = tt.broadcast %xmask_7 : tensor<8x1xi1> -> tensor<8x512xi1> loc(#loc101) + %_tmp2_41 = arith.andi %_tmp2_39, %_tmp2_40 : tensor<8x512xi1> loc(#loc101) + %_tmp2_42 = arith.select %_tmp2_41, %8#0, %_tmp2_16 : tensor<8x512xi1>, tensor<8x512xf32> loc(#loc102) + %_tmp2_index_43 = tt.broadcast %r0_mask_19 : tensor<1x512xi1> -> tensor<8x512xi1> loc(#loc103) + %_tmp2_index_44 = tt.broadcast %xmask_7 : tensor<8x1xi1> -> tensor<8x512xi1> loc(#loc103) + %_tmp2_index_45 = arith.andi %_tmp2_index_43, %_tmp2_index_44 : tensor<8x512xi1> loc(#loc103) + %_tmp2_index_46 = arith.select %_tmp2_index_45, %8#1, %_tmp2_index_17 : tensor<8x512xi1>, tensor<8x512xi32> loc(#loc104) + scf.yield %_tmp2_42, %_tmp2_index_46 : tensor<8x512xf32>, tensor<8x512xi32> loc(#loc29) + } loc(#loc126) + %4:2 = tt.call @"torch._inductor.runtime.triton_helpers.max_with_index__fp32S8_512S_i32S8_512S__(2,)cconstexpr_1_"(%_tmp2_index_15#0, %_tmp2_index_15#1) : (tensor<8x512xf32>, tensor<8x512xi32>) -> (tensor<8xf32>, tensor<8xi32>) loc(#loc30) + %tmp2 = tt.expand_dims %4#1 {axis = 1 : i32} : tensor<8xi32> -> tensor<8x1xi32> loc(#loc105) + %5 = tt.splat %out_ptr0 : !tt.ptr -> tensor<8x1x!tt.ptr> loc(#loc32) + %6 = tt.addptr %5, %xindex_6 : tensor<8x1x!tt.ptr>, tensor<8x1xi32> loc(#loc32) + %7 = arith.extsi %tmp2 : 
tensor<8x1xi32> to tensor<8x1xi64> loc(#loc33) + tt.store %6, %7, %xmask_7 : tensor<8x1x!tt.ptr> loc(#loc33) + tt.return loc(#loc34) + } loc(#loc) + tt.func private @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32S8_512S_i32S8_512S_fp32S8_512S_i32S1_512S__(%a_value: tensor<8x512xf32> loc("a_value"(#loc35)), %a_index: tensor<8x512xi32> loc("a_index"(#loc35)), %b_value: tensor<8x512xf32> loc("b_value"(#loc35)), %b_index: tensor<1x512xi32> loc("b_index"(#loc35))) -> (tensor<8x512xf32>, tensor<8x512xi32>) attributes {noinline = false} { + %mask = arith.cmpf ogt, %a_value, %b_value : tensor<8x512xf32> loc(#loc127) + %equal = arith.cmpf oeq, %a_value, %b_value : tensor<8x512xf32> loc(#loc128) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__fp32S8_512S__(%a_value) : (tensor<8x512xf32>) -> i1 loc(#loc38) + %1:2 = scf.if %0 -> (tensor<8x512xi1>, tensor<8x512xi1>) { + %a_isnan = arith.cmpf une, %a_value, %a_value : tensor<8x512xf32> loc(#loc112) + %b_isnan = arith.cmpf une, %b_value, %b_value : tensor<8x512xf32> loc(#loc113) + %mask_4 = arith.constant true loc(#loc114) + %mask_5 = arith.constant dense : tensor<8x512xi1> loc(#loc114) + %mask_6 = arith.xori %b_isnan, %mask_5 : tensor<8x512xi1> loc(#loc114) + %mask_7 = arith.andi %a_isnan, %mask_6 : tensor<8x512xi1> loc(#loc115) + %mask_8 = arith.ori %mask, %mask_7 : tensor<8x512xi1> loc(#loc129) + %equal_9 = arith.andi %a_isnan, %b_isnan : tensor<8x512xi1> loc(#loc117) + %equal_10 = arith.ori %equal, %equal_9 : tensor<8x512xi1> loc(#loc130) + scf.yield %mask_8, %equal_10 : tensor<8x512xi1>, tensor<8x512xi1> loc(#loc130) + } else { + scf.yield %mask, %equal : tensor<8x512xi1>, tensor<8x512xi1> loc(#loc47) + } loc(#loc39) + %mask_0 = tt.broadcast %b_index : tensor<1x512xi32> -> tensor<8x512xi32> loc(#loc119) + %mask_1 = arith.cmpi slt, %a_index, %mask_0 : tensor<8x512xi32> loc(#loc119) + %mask_2 = arith.andi %1#1, %mask_1 : tensor<8x512xi1> loc(#loc120) + %mask_3 = arith.ori %1#0, %mask_2 : 
tensor<8x512xi1> loc(#loc121) + %2 = arith.select %mask_3, %a_value, %b_value : tensor<8x512xi1>, tensor<8x512xf32> loc(#loc51) + %3 = tt.broadcast %b_index : tensor<1x512xi32> -> tensor<8x512xi32> loc(#loc52) + %4 = arith.select %mask_3, %a_index, %3 : tensor<8x512xi1>, tensor<8x512xi32> loc(#loc52) + tt.return %2, %4 : tensor<8x512xf32>, tensor<8x512xi32> loc(#loc53) + ^bb1: // no predecessors + %5 = ub.poison : tensor<8x512xf32> loc(#loc54) + %6 = ub.poison : tensor<8x512xi32> loc(#loc54) + tt.return %5, %6 : tensor<8x512xf32>, tensor<8x512xi32> loc(#loc54) + } loc(#loc35) + tt.func private @torch._inductor.runtime.triton_helpers.is_floating__fp32S8_512S__(%x: tensor<8x512xf32> loc("x"(#loc55))) -> i1 attributes {noinline = false} { + %0 = tt.call @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32S8_512S__(%x) : (tensor<8x512xf32>) -> tensor<8x512xf32> loc(#loc56) + %true = arith.constant true loc(#loc57) + tt.return %true : i1 loc(#loc57) + ^bb1: // no predecessors + %1 = ub.poison : i1 loc(#loc58) + tt.return %1 : i1 loc(#loc58) + } loc(#loc55) + tt.func private @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32S8_512S__(%x: tensor<8x512xf32> loc("x"(#loc59))) -> tensor<8x512xf32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc60) + %1 = arith.uitofp %0 : tensor<1xi1> to tensor<1xf32> loc(#loc61) + %2 = tt.expand_dims %1 {axis = 0 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc61) + %3 = tt.broadcast %2 : tensor<1x1xf32> -> tensor<8x512xf32> loc(#loc61) + %4 = arith.addf %x, %3 : tensor<8x512xf32> loc(#loc61) + tt.return %4 : tensor<8x512xf32> loc(#loc62) + ^bb1: // no predecessors + %5 = ub.poison : tensor<8x512xf32> loc(#loc63) + tt.return %5 : tensor<8x512xf32> loc(#loc63) + } loc(#loc59) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() -> tensor<1xi1> attributes {noinline 
= false} { + %false = arith.constant false loc(#loc65) + %cst = arith.constant dense : tensor<1xi1> loc(#loc65) + tt.return %cst : tensor<1xi1> loc(#loc66) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1xi1> loc(#loc67) + tt.return %0 : tensor<1xi1> loc(#loc67) + } loc(#loc64) + tt.func private @"torch._inductor.runtime.triton_helpers.max_with_index__fp32S8_512S_i32S8_512S__(2,)cconstexpr_1_"(%value: tensor<8x512xf32> loc("value"(#loc68)), %index: tensor<8x512xi32> loc("index"(#loc68))) -> (tensor<8xf32>, tensor<8xi32>) attributes {noinline = false} { + %0:2 = "tt.reduce"(%value, %index) <{axis = 1 : i32}> ({ + ^bb0(%arg2: f32 loc(unknown), %arg3: i32 loc(unknown), %arg4: f32 loc(unknown), %arg5: i32 loc(unknown)): + %3:2 = tt.call @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32_i32_fp32_i32__(%arg2, %arg3, %arg4, %arg5) : (f32, i32, f32, i32) -> (f32, i32) loc(#loc69) + tt.reduce.return %3#0, %3#1 : f32, i32 loc(#loc69) + }) : (tensor<8x512xf32>, tensor<8x512xi32>) -> (tensor<8xf32>, tensor<8xi32>) loc(#loc69) + tt.return %0#0, %0#1 : tensor<8xf32>, tensor<8xi32> loc(#loc70) + ^bb1: // no predecessors + %1 = ub.poison : tensor<8xf32> loc(#loc71) + %2 = ub.poison : tensor<8xi32> loc(#loc71) + tt.return %1, %2 : tensor<8xf32>, tensor<8xi32> loc(#loc71) + } loc(#loc68) + tt.func private @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32_i32_fp32_i32__(%a_value: f32 loc("a_value"(#loc35)), %a_index: i32 loc("a_index"(#loc35)), %b_value: f32 loc("b_value"(#loc35)), %b_index: i32 loc("b_index"(#loc35))) -> (f32, i32) attributes {noinline = false} { + %mask = arith.cmpf ogt, %a_value, %b_value : f32 loc(#loc127) + %equal = arith.cmpf oeq, %a_value, %b_value : f32 loc(#loc128) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__fp32__(%a_value) : (f32) -> i1 loc(#loc38) + %1:2 = scf.if %0 -> (i1, i1) { + %a_isnan = arith.cmpf une, %a_value, %a_value : f32 loc(#loc112) + %b_isnan = arith.cmpf une, %b_value, %b_value : 
f32 loc(#loc113) + %mask_3 = arith.constant true loc(#loc114) + %mask_4 = arith.xori %b_isnan, %mask_3 : i1 loc(#loc114) + %mask_5 = arith.andi %a_isnan, %mask_4 : i1 loc(#loc115) + %mask_6 = arith.ori %mask, %mask_5 : i1 loc(#loc129) + %equal_7 = arith.andi %a_isnan, %b_isnan : i1 loc(#loc117) + %equal_8 = arith.ori %equal, %equal_7 : i1 loc(#loc130) + scf.yield %mask_6, %equal_8 : i1, i1 loc(#loc130) + } else { + scf.yield %mask, %equal : i1, i1 loc(#loc47) + } loc(#loc39) + %mask_0 = arith.cmpi slt, %a_index, %b_index : i32 loc(#loc119) + %mask_1 = arith.andi %1#1, %mask_0 : i1 loc(#loc120) + %mask_2 = arith.ori %1#0, %mask_1 : i1 loc(#loc121) + %2 = arith.select %mask_2, %a_value, %b_value : f32 loc(#loc51) + %3 = arith.select %mask_2, %a_index, %b_index : i32 loc(#loc52) + tt.return %2, %3 : f32, i32 loc(#loc53) + ^bb1: // no predecessors + %4 = ub.poison : f32 loc(#loc54) + %5 = ub.poison : i32 loc(#loc54) + tt.return %4, %5 : f32, i32 loc(#loc54) + } loc(#loc35) + tt.func private @torch._inductor.runtime.triton_helpers.is_floating__fp32__(%x: f32 loc("x"(#loc55))) -> i1 attributes {noinline = false} { + %0 = tt.call @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32__(%x) : (f32) -> tensor<1xf32> loc(#loc56) + %true = arith.constant true loc(#loc57) + tt.return %true : i1 loc(#loc57) + ^bb1: // no predecessors + %1 = ub.poison : i1 loc(#loc58) + tt.return %1 : i1 loc(#loc58) + } loc(#loc55) + tt.func private @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32__(%x: f32 loc("x"(#loc59))) -> tensor<1xf32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc60) + %1 = arith.uitofp %0 : tensor<1xi1> to tensor<1xf32> loc(#loc61) + %2 = tt.splat %x : f32 -> tensor<1xf32> loc(#loc61) + %3 = arith.addf %2, %1 : tensor<1xf32> loc(#loc61) + tt.return %3 : tensor<1xf32> loc(#loc62) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1xf32> 
loc(#loc63) + tt.return %4 : tensor<1xf32> loc(#loc63) + } loc(#loc59) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":19:15) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":22:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":22:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":23:36) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":23:44) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":23:23) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":24:21) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":25:27) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":25:37) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":27:19) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":28:19) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":29:55) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":30:58) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":32:40) 
+#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":33:31) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":34:29) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:47) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:41) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:56) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:52) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:34) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:71) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:61) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":41:38) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":43:35) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":43:54) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":44:41) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":44:66) +#loc29 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":44:8) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":45:75) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":46:20) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":47:25) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":47:36) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":47:4) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":144:21) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":145:23) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":146:19) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":146:7) +#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":147:29) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":148:29) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:31) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:27) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:16) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:27) +#loc46 = 
loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:17) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:31) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:21) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:12) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:35) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:69) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:11) +#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:4) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:29) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:11) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:4) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:30) +#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:15) +#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:11) +#loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:4) +#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":118:0) +#loc65 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:31) +#loc66 = 
loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:11) +#loc67 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:4) +#loc69 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:42) +#loc70 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:11) +#loc71 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:4) +#loc78 = loc("r0_numel"(#loc1)) +#loc79 = loc("xoffset"(#loc2)) +#loc80 = loc("xoffset"(#loc3)) +#loc81 = loc("xindex"(#loc4)) +#loc82 = loc("xindex"(#loc5)) +#loc83 = loc("xindex"(#loc6)) +#loc84 = loc("xmask"(#loc7)) +#loc85 = loc("r0_base"(#loc8)) +#loc86 = loc("r0_base"(#loc9)) +#loc87 = loc("x0"(#loc10)) +#loc88 = loc("x1"(#loc11)) +#loc89 = loc("_tmp2"(#loc12)) +#loc90 = loc("_tmp2_index"(#loc13)) +#loc91 = loc("_tmp2"(#loc14)) +#loc92 = loc("r0_index"(#loc15)) +#loc93 = loc("r0_mask"(#loc16)) +#loc94 = loc("tmp0"(#loc17)) +#loc95 = loc("tmp0"(#loc18)) +#loc96 = loc("tmp0"(#loc19)) +#loc97 = loc("tmp0"(#loc20)) +#loc98 = loc("tmp0"(#loc21)) +#loc99 = loc("tmp0"(#loc22)) +#loc100 = loc("tmp0"(#loc23)) +#loc101 = loc("_tmp2"(#loc25)) +#loc102 = loc("_tmp2"(#loc26)) +#loc103 = loc("_tmp2_index"(#loc27)) +#loc104 = loc("_tmp2_index"(#loc28)) +#loc105 = loc("tmp2"(#loc31)) +#loc110 = loc("mask"(#loc36)) +#loc111 = loc("equal"(#loc37)) +#loc112 = loc("a_isnan"(#loc40)) +#loc113 = loc("b_isnan"(#loc41)) +#loc114 = loc("mask"(#loc42)) +#loc115 = loc("mask"(#loc43)) +#loc116 = loc("mask"(#loc44)) +#loc117 = loc("equal"(#loc45)) +#loc118 = loc("equal"(#loc46)) +#loc119 = loc("mask"(#loc48)) +#loc120 = loc("mask"(#loc49)) +#loc121 = loc("mask"(#loc50)) +#loc126 = loc("_tmp2_index"(#loc91)) +#loc127 = loc("mask"(#loc110)) +#loc128 = loc("equal"(#loc111)) +#loc129 = loc("mask"(#loc116)) +#loc130 = loc("equal"(#loc118)) diff --git 
a/SpecForge-ext/cache/compiled_kernels/triton/7/LB6GSIBNROJIH45A7E7GR2D3MKLQMRXRW4JCJKBX5IKCUYNTUE6Q/triton_red_fused_argmax_1.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/7/LB6GSIBNROJIH45A7E7GR2D3MKLQMRXRW4JCJKBX5IKCUYNTUE6Q/triton_red_fused_argmax_1.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..15902a63e9834c56a1c2f7791ddae410432d4663 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/LB6GSIBNROJIH45A7E7GR2D3MKLQMRXRW4JCJKBX5IKCUYNTUE6Q/triton_red_fused_argmax_1.ttgir @@ -0,0 +1,218 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 16], order = [1, 0]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [8, 4], warpsPerCTA = [1, 16], order = [0, 1]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":18:0) +#loc1 = loc(unknown) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":45:75) +#loc44 = loc("in_ptr0"(#loc)) +#loc45 = loc("out_ptr0"(#loc)) +#loc46 = loc("ks0"(#loc)) +#loc47 = loc("ks1"(#loc)) +#loc48 = loc("xnumel"(#loc)) +#loc49 = loc("r0_numel"(#loc)) +#loc85 = loc(callsite(#loc1 at #loc39)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 16 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused_argmax_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<32000> : tensor<8x1xi64, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<8x512xf32, #blocked> loc(#loc1) + %c0_i32 = arith.constant 0 : i32 
loc(#loc1) + %c32000_i32 = arith.constant 32000 : i32 loc(#loc1) + %c512_i32 = arith.constant 512 : i32 loc(#loc1) + %cst_1 = arith.constant dense : tensor<8x512xi1, #blocked> loc(#loc1) + %true = arith.constant true loc(#loc1) + %cst_2 = arith.constant dense<32000> : tensor<1x512xi32, #blocked> loc(#loc1) + %cst_3 = arith.constant dense<2147483647> : tensor<8x512xi32, #blocked> loc(#loc1) + %cst_4 = arith.constant dense<0xFF800000> : tensor<8x512xf32, #blocked> loc(#loc1) + %c8_i32 = arith.constant 8 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc50) + %xoffset_5 = arith.muli %xoffset, %c8_i32 : i32 loc(#loc51) + %xindex = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc52) + %xindex_6 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc52) + %xindex_7 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<8x1xi32, #blocked> loc(#loc52) + %xindex_8 = tt.expand_dims %xindex_6 {axis = 1 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<8x1xi32, #blocked1> loc(#loc52) + %xindex_9 = tt.splat %xoffset_5 : i32 -> tensor<8x1xi32, #blocked> loc(#loc53) + %xindex_10 = tt.splat %xoffset_5 : i32 -> tensor<8x1xi32, #blocked1> loc(#loc53) + %xindex_11 = arith.addi %xindex_9, %xindex_7 : tensor<8x1xi32, #blocked> loc(#loc53) + %xindex_12 = arith.addi %xindex_10, %xindex_8 : tensor<8x1xi32, #blocked1> loc(#loc53) + %xmask = tt.splat %xnumel : i32 -> tensor<8x1xi32, #blocked> loc(#loc54) + %xmask_13 = tt.splat %xnumel : i32 -> tensor<8x1xi32, #blocked1> loc(#loc54) + %xmask_14 = arith.cmpi slt, %xindex_11, %xmask : tensor<8x1xi32, #blocked> loc(#loc54) + %xmask_15 = arith.cmpi slt, %xindex_12, %xmask_13 : tensor<8x1xi32, #blocked1> loc(#loc54) + %r0_base = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32, #ttg.slice<{dim = 0, parent = 
#blocked}>> loc(#loc55) + %r0_base_16 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<512xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x512xi32, #blocked> loc(#loc55) + %x0 = arith.extsi %xindex_11 : tensor<8x1xi32, #blocked> to tensor<8x1xi64, #blocked> loc(#loc56) + %x0_17 = tt.splat %ks0 : i64 -> tensor<8x1xi64, #blocked> loc(#loc56) + %x0_18 = arith.remsi %x0, %x0_17 : tensor<8x1xi64, #blocked> loc(#loc56) + %x1 = arith.divsi %x0, %x0_17 : tensor<8x1xi64, #blocked> loc(#loc57) + %tmp0 = arith.muli %x0_18, %cst : tensor<8x1xi64, #blocked> loc(#loc58) + %tmp0_19 = tt.broadcast %tmp0 : tensor<8x1xi64, #blocked> -> tensor<8x512xi64, #blocked> loc(#loc59) + %tmp0_20 = tt.splat %ks1 : i64 -> tensor<8x1xi64, #blocked> loc(#loc60) + %tmp0_21 = arith.muli %tmp0_20, %x1 : tensor<8x1xi64, #blocked> loc(#loc60) + %tmp0_22 = tt.broadcast %tmp0_21 : tensor<8x1xi64, #blocked> -> tensor<8x512xi64, #blocked> loc(#loc61) + %tmp0_23 = tt.splat %in_ptr0 : !tt.ptr -> tensor<8x512x!tt.ptr, #blocked> loc(#loc62) + %tmp0_24 = tt.broadcast %xmask_14 : tensor<8x1xi1, #blocked> -> tensor<8x512xi1, #blocked> loc(#loc63) + %_tmp2_index:2 = scf.for %r0_offset = %c0_i32 to %c32000_i32 step %c512_i32 iter_args(%_tmp2 = %cst_4, %_tmp2_index_25 = %cst_3) -> (tensor<8x512xf32, #blocked>, tensor<8x512xi32, #blocked>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x512xi32, #blocked> loc(#loc65) + %r0_index_26 = arith.addi %r0_index, %r0_base_16 : tensor<1x512xi32, #blocked> loc(#loc65) + %r0_mask = arith.cmpi slt, %r0_index_26, %cst_2 : tensor<1x512xi32, #blocked> loc(#loc66) + %tmp0_27 = arith.extsi %r0_index_26 : tensor<1x512xi32, #blocked> to tensor<1x512xi64, #blocked> loc(#loc59) + %tmp0_28 = tt.broadcast %tmp0_27 : tensor<1x512xi64, #blocked> -> tensor<8x512xi64, #blocked> loc(#loc59) + %tmp0_29 = arith.addi %tmp0_28, %tmp0_19 : tensor<8x512xi64, #blocked> loc(#loc59) + %tmp0_30 = arith.addi %tmp0_29, %tmp0_22 : tensor<8x512xi64, #blocked> loc(#loc61) + %tmp0_31 
= tt.addptr %tmp0_23, %tmp0_30 : tensor<8x512x!tt.ptr, #blocked>, tensor<8x512xi64, #blocked> loc(#loc62) + %tmp0_32 = tt.broadcast %r0_mask : tensor<1x512xi1, #blocked> -> tensor<8x512xi1, #blocked> loc(#loc63) + %tmp0_33 = arith.andi %tmp0_32, %tmp0_24 : tensor<8x512xi1, #blocked> loc(#loc63) + %tmp0_34 = tt.load %tmp0_31, %tmp0_33, %cst_0 evictionPolicy = evict_first : tensor<8x512x!tt.ptr, #blocked> loc(#loc67) + %mask = arith.cmpf ogt, %_tmp2, %tmp0_34 : tensor<8x512xf32, #blocked> loc(#loc110) + %equal = arith.cmpf oeq, %_tmp2, %tmp0_34 : tensor<8x512xf32, #blocked> loc(#loc111) + %a_isnan = arith.cmpf une, %_tmp2, %_tmp2 : tensor<8x512xf32, #blocked> loc(#loc90) + %b_isnan = arith.cmpf une, %tmp0_34, %tmp0_34 : tensor<8x512xf32, #blocked> loc(#loc91) + %mask_35 = arith.xori %b_isnan, %cst_1 : tensor<8x512xi1, #blocked> loc(#loc92) + %mask_36 = arith.andi %a_isnan, %mask_35 : tensor<8x512xi1, #blocked> loc(#loc93) + %mask_37 = arith.ori %mask, %mask_36 : tensor<8x512xi1, #blocked> loc(#loc112) + %equal_38 = arith.andi %a_isnan, %b_isnan : tensor<8x512xi1, #blocked> loc(#loc95) + %equal_39 = arith.ori %equal, %equal_38 : tensor<8x512xi1, #blocked> loc(#loc113) + %mask_40 = tt.broadcast %r0_index_26 : tensor<1x512xi32, #blocked> -> tensor<8x512xi32, #blocked> loc(#loc97) + %mask_41 = arith.cmpi slt, %_tmp2_index_25, %mask_40 : tensor<8x512xi32, #blocked> loc(#loc97) + %mask_42 = arith.andi %equal_39, %mask_41 : tensor<8x512xi1, #blocked> loc(#loc98) + %mask_43 = arith.ori %mask_37, %mask_42 : tensor<8x512xi1, #blocked> loc(#loc99) + %5 = arith.select %mask_43, %_tmp2, %tmp0_34 : tensor<8x512xi1, #blocked>, tensor<8x512xf32, #blocked> loc(#loc80) + %6 = arith.select %mask_43, %_tmp2_index_25, %mask_40 : tensor<8x512xi1, #blocked>, tensor<8x512xi32, #blocked> loc(#loc81) + %_tmp2_44 = arith.select %tmp0_33, %5, %_tmp2 : tensor<8x512xi1, #blocked>, tensor<8x512xf32, #blocked> loc(#loc82) + %_tmp2_index_45 = arith.select %tmp0_33, %6, %_tmp2_index_25 : 
tensor<8x512xi1, #blocked>, tensor<8x512xi32, #blocked> loc(#loc83) + scf.yield %_tmp2_44, %_tmp2_index_45 : tensor<8x512xf32, #blocked>, tensor<8x512xi32, #blocked> loc(#loc37) + } loc(#loc87) + %0:2 = "tt.reduce"(%_tmp2_index#0, %_tmp2_index#1) <{axis = 1 : i32}> ({ + ^bb0(%arg6: f32 loc(callsite(#loc1 at #loc39)), %arg7: i32 loc(callsite(#loc1 at #loc39)), %arg8: f32 loc(callsite(#loc1 at #loc39)), %arg9: i32 loc(callsite(#loc1 at #loc39))): + %mask = arith.cmpf ogt, %arg6, %arg8 : f32 loc(#loc114) + %equal = arith.cmpf oeq, %arg6, %arg8 : f32 loc(#loc115) + %a_isnan = arith.cmpf une, %arg6, %arg6 : f32 loc(#loc100) + %b_isnan = arith.cmpf une, %arg8, %arg8 : f32 loc(#loc101) + %mask_25 = arith.xori %b_isnan, %true : i1 loc(#loc102) + %mask_26 = arith.andi %a_isnan, %mask_25 : i1 loc(#loc103) + %mask_27 = arith.ori %mask, %mask_26 : i1 loc(#loc116) + %equal_28 = arith.andi %a_isnan, %b_isnan : i1 loc(#loc104) + %equal_29 = arith.ori %equal, %equal_28 : i1 loc(#loc117) + %mask_30 = arith.cmpi slt, %arg7, %arg9 : i32 loc(#loc105) + %mask_31 = arith.andi %equal_29, %mask_30 : i1 loc(#loc106) + %mask_32 = arith.ori %mask_27, %mask_31 : i1 loc(#loc107) + %5 = arith.select %mask_32, %arg6, %arg8 : f32 loc(#loc108) + %6 = arith.select %mask_32, %arg7, %arg9 : i32 loc(#loc109) + tt.reduce.return %5, %6 : f32, i32 loc(#loc84) + }) : (tensor<8x512xf32, #blocked>, tensor<8x512xi32, #blocked>) -> (tensor<8xf32, #ttg.slice<{dim = 1, parent = #blocked}>>, tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked}>>) loc(#loc84) + %tmp2 = tt.expand_dims %0#1 {axis = 1 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<8x1xi32, #blocked> loc(#loc86) + %1 = tt.splat %out_ptr0 : !tt.ptr -> tensor<8x1x!tt.ptr, #blocked1> loc(#loc41) + %2 = tt.addptr %1, %xindex_12 : tensor<8x1x!tt.ptr, #blocked1>, tensor<8x1xi32, #blocked1> loc(#loc41) + %3 = ttg.convert_layout %tmp2 : tensor<8x1xi32, #blocked> -> tensor<8x1xi32, #blocked1> loc(#loc42) + %4 = arith.extsi %3 : 
tensor<8x1xi32, #blocked1> to tensor<8x1xi64, #blocked1> loc(#loc42) + tt.store %2, %4, %xmask_15 : tensor<8x1x!tt.ptr, #blocked1> loc(#loc42) + tt.return loc(#loc43) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":22:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":22:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":23:44) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":23:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":24:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":25:37) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":27:19) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":28:19) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:47) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:41) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:56) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:52) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:34) +#loc15 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:71) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":32:40) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":33:31) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":34:29) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:61) +#loc20 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":144:21) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":41:38) +#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":145:23) +#loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":147:29) +#loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":148:29) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:31) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:27) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:16) +#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:27) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:17) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:31) +#loc31 = 
loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:21) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:12) +#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:35) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:69) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":43:54) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":44:66) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":44:8) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:42) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":46:20) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":47:25) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":47:36) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":47:4) +#loc50 = loc("xoffset"(#loc2)) +#loc51 = loc("xoffset"(#loc3)) +#loc52 = loc("xindex"(#loc4)) +#loc53 = loc("xindex"(#loc5)) +#loc54 = loc("xmask"(#loc6)) +#loc55 = loc("r0_base"(#loc7)) +#loc56 = loc("x0"(#loc8)) +#loc57 = loc("x1"(#loc9)) +#loc58 = loc("tmp0"(#loc10)) +#loc59 = loc("tmp0"(#loc11)) +#loc60 = loc("tmp0"(#loc12)) +#loc61 = loc("tmp0"(#loc13)) +#loc62 = loc("tmp0"(#loc14)) +#loc63 = loc("tmp0"(#loc15)) +#loc64 = loc("_tmp2"(#loc16)) +#loc65 = 
loc("r0_index"(#loc17)) +#loc66 = loc("r0_mask"(#loc18)) +#loc67 = loc("tmp0"(#loc19)) +#loc68 = loc("mask"(#loc20)) +#loc69 = loc("equal"(#loc22)) +#loc70 = loc("a_isnan"(#loc23)) +#loc71 = loc("b_isnan"(#loc24)) +#loc72 = loc("mask"(#loc25)) +#loc73 = loc("mask"(#loc26)) +#loc74 = loc("mask"(#loc27)) +#loc75 = loc("equal"(#loc28)) +#loc76 = loc("equal"(#loc29)) +#loc77 = loc("mask"(#loc30)) +#loc78 = loc("mask"(#loc31)) +#loc79 = loc("mask"(#loc32)) +#loc80 = loc(callsite(#loc33 at #loc21)) +#loc81 = loc(callsite(#loc34 at #loc21)) +#loc82 = loc("_tmp2"(#loc35)) +#loc83 = loc("_tmp2_index"(#loc36)) +#loc84 = loc(callsite(#loc38 at #loc39)) +#loc86 = loc("tmp2"(#loc40)) +#loc87 = loc("_tmp2_index"(#loc64)) +#loc88 = loc("mask"(#loc68)) +#loc89 = loc("equal"(#loc69)) +#loc90 = loc(callsite(#loc70 at #loc21)) +#loc91 = loc(callsite(#loc71 at #loc21)) +#loc92 = loc(callsite(#loc72 at #loc21)) +#loc93 = loc(callsite(#loc73 at #loc21)) +#loc94 = loc("mask"(#loc74)) +#loc95 = loc(callsite(#loc75 at #loc21)) +#loc96 = loc("equal"(#loc76)) +#loc97 = loc(callsite(#loc77 at #loc21)) +#loc98 = loc(callsite(#loc78 at #loc21)) +#loc99 = loc(callsite(#loc79 at #loc21)) +#loc100 = loc(callsite(#loc70 at #loc84)) +#loc101 = loc(callsite(#loc71 at #loc84)) +#loc102 = loc(callsite(#loc72 at #loc84)) +#loc103 = loc(callsite(#loc73 at #loc84)) +#loc104 = loc(callsite(#loc75 at #loc84)) +#loc105 = loc(callsite(#loc77 at #loc84)) +#loc106 = loc(callsite(#loc78 at #loc84)) +#loc107 = loc(callsite(#loc79 at #loc84)) +#loc108 = loc(callsite(#loc33 at #loc84)) +#loc109 = loc(callsite(#loc34 at #loc84)) +#loc110 = loc(callsite(#loc88 at #loc21)) +#loc111 = loc(callsite(#loc89 at #loc21)) +#loc112 = loc(callsite(#loc94 at #loc21)) +#loc113 = loc(callsite(#loc96 at #loc21)) +#loc114 = loc(callsite(#loc88 at #loc84)) +#loc115 = loc(callsite(#loc89 at #loc84)) +#loc116 = loc(callsite(#loc94 at #loc84)) +#loc117 = loc(callsite(#loc96 at #loc84)) diff --git 
a/SpecForge-ext/cache/compiled_kernels/triton/7/LB6GSIBNROJIH45A7E7GR2D3MKLQMRXRW4JCJKBX5IKCUYNTUE6Q/triton_red_fused_argmax_1.ttir b/SpecForge-ext/cache/compiled_kernels/triton/7/LB6GSIBNROJIH45A7E7GR2D3MKLQMRXRW4JCJKBX5IKCUYNTUE6Q/triton_red_fused_argmax_1.ttir new file mode 100644 index 0000000000000000000000000000000000000000..87d34f9d3a85620a70e9978dbe2f2007ab744baa --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/LB6GSIBNROJIH45A7E7GR2D3MKLQMRXRW4JCJKBX5IKCUYNTUE6Q/triton_red_fused_argmax_1.ttir @@ -0,0 +1,217 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":18:0) +#loc1 = loc(unknown) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":45:75) +#loc48 = loc("in_ptr0"(#loc)) +#loc49 = loc("out_ptr0"(#loc)) +#loc50 = loc("ks0"(#loc)) +#loc51 = loc("ks1"(#loc)) +#loc52 = loc("xnumel"(#loc)) +#loc53 = loc("r0_numel"(#loc)) +#loc54 = loc(callsite(#loc1 at #loc2)) +module { + tt.func public @triton_red_fused_argmax_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %true = arith.constant true loc(#loc54) + %cst = arith.constant dense : tensor<8x512xi1> loc(#loc1) + %c512_i32 = arith.constant 512 : i32 loc(#loc3) + %c32000_i32 = arith.constant 32000 : i32 loc(#loc3) + %c0_i32 = arith.constant 0 : i32 loc(#loc3) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<8x512xf32> loc(#loc1) + %cst_1 = arith.constant dense<32000> : tensor<8x1xi64> loc(#loc1) + %cst_2 = arith.constant dense<32000> : tensor<1x512xi32> loc(#loc1) + %_tmp2_index = arith.constant dense<2147483647> : tensor<8x512xi32> loc(#loc55) + %_tmp2 = 
arith.constant dense<0xFF800000> : tensor<8x512xf32> loc(#loc56) + %c8_i32 = arith.constant 8 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc57) + %xoffset_3 = arith.muli %xoffset, %c8_i32 : i32 loc(#loc58) + %xindex = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32> loc(#loc59) + %xindex_4 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<8xi32> -> tensor<8x1xi32> loc(#loc60) + %xindex_5 = tt.splat %xoffset_3 : i32 -> tensor<8x1xi32> loc(#loc61) + %xindex_6 = arith.addi %xindex_5, %xindex_4 : tensor<8x1xi32> loc(#loc61) + %xmask = tt.splat %xnumel : i32 -> tensor<8x1xi32> loc(#loc62) + %xmask_7 = arith.cmpi slt, %xindex_6, %xmask : tensor<8x1xi32> loc(#loc62) + %r0_base = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32> loc(#loc63) + %r0_base_8 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<512xi32> -> tensor<1x512xi32> loc(#loc64) + %x0 = arith.extsi %xindex_6 : tensor<8x1xi32> to tensor<8x1xi64> loc(#loc65) + %x0_9 = tt.splat %ks0 : i64 -> tensor<8x1xi64> loc(#loc65) + %x0_10 = arith.remsi %x0, %x0_9 : tensor<8x1xi64> loc(#loc65) + %x1 = arith.divsi %x0, %x0_9 : tensor<8x1xi64> loc(#loc66) + %_tmp2_index_11:2 = scf.for %r0_offset = %c0_i32 to %c32000_i32 step %c512_i32 iter_args(%_tmp2_12 = %_tmp2, %_tmp2_index_13 = %_tmp2_index) -> (tensor<8x512xf32>, tensor<8x512xi32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x512xi32> loc(#loc68) + %r0_index_14 = arith.addi %r0_index, %r0_base_8 : tensor<1x512xi32> loc(#loc68) + %r0_mask = arith.cmpi slt, %r0_index_14, %cst_2 : tensor<1x512xi32> loc(#loc69) + %tmp0 = arith.muli %x0_10, %cst_1 : tensor<8x1xi64> loc(#loc70) + %tmp0_15 = arith.extsi %r0_index_14 : tensor<1x512xi32> to tensor<1x512xi64> loc(#loc71) + %tmp0_16 = tt.broadcast %tmp0_15 : tensor<1x512xi64> -> tensor<8x512xi64> loc(#loc71) + %tmp0_17 = tt.broadcast %tmp0 : tensor<8x1xi64> -> tensor<8x512xi64> loc(#loc71) + %tmp0_18 = arith.addi %tmp0_16, %tmp0_17 : tensor<8x512xi64> loc(#loc71) + 
%tmp0_19 = tt.splat %ks1 : i64 -> tensor<8x1xi64> loc(#loc72) + %tmp0_20 = arith.muli %tmp0_19, %x1 : tensor<8x1xi64> loc(#loc72) + %tmp0_21 = tt.broadcast %tmp0_20 : tensor<8x1xi64> -> tensor<8x512xi64> loc(#loc73) + %tmp0_22 = arith.addi %tmp0_18, %tmp0_21 : tensor<8x512xi64> loc(#loc73) + %tmp0_23 = tt.splat %in_ptr0 : !tt.ptr -> tensor<8x512x!tt.ptr> loc(#loc74) + %tmp0_24 = tt.addptr %tmp0_23, %tmp0_22 : tensor<8x512x!tt.ptr>, tensor<8x512xi64> loc(#loc74) + %tmp0_25 = tt.broadcast %r0_mask : tensor<1x512xi1> -> tensor<8x512xi1> loc(#loc75) + %tmp0_26 = tt.broadcast %xmask_7 : tensor<8x1xi1> -> tensor<8x512xi1> loc(#loc75) + %tmp0_27 = arith.andi %tmp0_25, %tmp0_26 : tensor<8x512xi1> loc(#loc75) + %tmp0_28 = tt.load %tmp0_24, %tmp0_27, %cst_0 evictionPolicy = evict_first : tensor<8x512x!tt.ptr> loc(#loc76) + %mask = arith.cmpf ogt, %_tmp2_12, %tmp0_28 : tensor<8x512xf32> loc(#loc118) + %equal = arith.cmpf oeq, %_tmp2_12, %tmp0_28 : tensor<8x512xf32> loc(#loc119) + %a_isnan = arith.cmpf une, %_tmp2_12, %_tmp2_12 : tensor<8x512xf32> loc(#loc98) + %b_isnan = arith.cmpf une, %tmp0_28, %tmp0_28 : tensor<8x512xf32> loc(#loc99) + %mask_29 = arith.xori %b_isnan, %cst : tensor<8x512xi1> loc(#loc100) + %mask_30 = arith.andi %a_isnan, %mask_29 : tensor<8x512xi1> loc(#loc101) + %mask_31 = arith.ori %mask, %mask_30 : tensor<8x512xi1> loc(#loc120) + %equal_32 = arith.andi %a_isnan, %b_isnan : tensor<8x512xi1> loc(#loc103) + %equal_33 = arith.ori %equal, %equal_32 : tensor<8x512xi1> loc(#loc121) + %mask_34 = tt.broadcast %r0_index_14 : tensor<1x512xi32> -> tensor<8x512xi32> loc(#loc105) + %mask_35 = arith.cmpi slt, %_tmp2_index_13, %mask_34 : tensor<8x512xi32> loc(#loc105) + %mask_36 = arith.andi %equal_33, %mask_35 : tensor<8x512xi1> loc(#loc106) + %mask_37 = arith.ori %mask_31, %mask_36 : tensor<8x512xi1> loc(#loc107) + %4 = arith.select %mask_37, %_tmp2_12, %tmp0_28 : tensor<8x512xi1>, tensor<8x512xf32> loc(#loc89) + %5 = arith.select %mask_37, %_tmp2_index_13, %mask_34 : 
tensor<8x512xi1>, tensor<8x512xi32> loc(#loc90) + %_tmp2_38 = arith.select %tmp0_27, %4, %_tmp2_12 : tensor<8x512xi1>, tensor<8x512xf32> loc(#loc91) + %_tmp2_index_39 = arith.select %tmp0_27, %5, %_tmp2_index_13 : tensor<8x512xi1>, tensor<8x512xi32> loc(#loc92) + scf.yield %_tmp2_38, %_tmp2_index_39 : tensor<8x512xf32>, tensor<8x512xi32> loc(#loc42) + } loc(#loc95) + %0:2 = "tt.reduce"(%_tmp2_index_11#0, %_tmp2_index_11#1) <{axis = 1 : i32}> ({ + ^bb0(%arg6: f32 loc(callsite(#loc1 at #loc2)), %arg7: i32 loc(callsite(#loc1 at #loc2)), %arg8: f32 loc(callsite(#loc1 at #loc2)), %arg9: i32 loc(callsite(#loc1 at #loc2))): + %mask = arith.cmpf ogt, %arg6, %arg8 : f32 loc(#loc122) + %equal = arith.cmpf oeq, %arg6, %arg8 : f32 loc(#loc123) + %a_isnan = arith.cmpf une, %arg6, %arg6 : f32 loc(#loc108) + %b_isnan = arith.cmpf une, %arg8, %arg8 : f32 loc(#loc109) + %mask_12 = arith.xori %b_isnan, %true : i1 loc(#loc110) + %mask_13 = arith.andi %a_isnan, %mask_12 : i1 loc(#loc111) + %mask_14 = arith.ori %mask, %mask_13 : i1 loc(#loc124) + %equal_15 = arith.andi %a_isnan, %b_isnan : i1 loc(#loc112) + %equal_16 = arith.ori %equal, %equal_15 : i1 loc(#loc125) + %mask_17 = arith.cmpi slt, %arg7, %arg9 : i32 loc(#loc113) + %mask_18 = arith.andi %equal_16, %mask_17 : i1 loc(#loc114) + %mask_19 = arith.ori %mask_14, %mask_18 : i1 loc(#loc115) + %4 = arith.select %mask_19, %arg6, %arg8 : f32 loc(#loc116) + %5 = arith.select %mask_19, %arg7, %arg9 : i32 loc(#loc117) + tt.reduce.return %4, %5 : f32, i32 loc(#loc93) + }) : (tensor<8x512xf32>, tensor<8x512xi32>) -> (tensor<8xf32>, tensor<8xi32>) loc(#loc93) + %tmp2 = tt.expand_dims %0#1 {axis = 1 : i32} : tensor<8xi32> -> tensor<8x1xi32> loc(#loc94) + %1 = tt.splat %out_ptr0 : !tt.ptr -> tensor<8x1x!tt.ptr> loc(#loc45) + %2 = tt.addptr %1, %xindex_6 : tensor<8x1x!tt.ptr>, tensor<8x1xi32> loc(#loc45) + %3 = arith.extsi %tmp2 : tensor<8x1xi32> to tensor<8x1xi64> loc(#loc46) + tt.store %2, %3, %xmask_7 : tensor<8x1x!tt.ptr> loc(#loc46) + 
tt.return loc(#loc47) + } loc(#loc) +} loc(#loc) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":32:40) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":30:58) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":29:55) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":22:28) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":22:33) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":23:36) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":23:44) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":23:23) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":24:21) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":25:27) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":25:37) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":27:19) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":28:19) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":33:31) +#loc17 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":34:29) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:47) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:41) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:56) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:52) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:34) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:71) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:61) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":144:21) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":41:38) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":145:23) +#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":147:29) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":148:29) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:31) +#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:27) +#loc32 = 
loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:16) +#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:27) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:17) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:31) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:21) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:12) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:35) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:69) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":43:54) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":44:66) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":44:8) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:42) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":46:20) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":47:25) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":47:36) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":47:4) +#loc55 
= loc("_tmp2_index"(#loc4)) +#loc56 = loc("_tmp2"(#loc5)) +#loc57 = loc("xoffset"(#loc6)) +#loc58 = loc("xoffset"(#loc7)) +#loc59 = loc("xindex"(#loc8)) +#loc60 = loc("xindex"(#loc9)) +#loc61 = loc("xindex"(#loc10)) +#loc62 = loc("xmask"(#loc11)) +#loc63 = loc("r0_base"(#loc12)) +#loc64 = loc("r0_base"(#loc13)) +#loc65 = loc("x0"(#loc14)) +#loc66 = loc("x1"(#loc15)) +#loc67 = loc("_tmp2"(#loc3)) +#loc68 = loc("r0_index"(#loc16)) +#loc69 = loc("r0_mask"(#loc17)) +#loc70 = loc("tmp0"(#loc18)) +#loc71 = loc("tmp0"(#loc19)) +#loc72 = loc("tmp0"(#loc20)) +#loc73 = loc("tmp0"(#loc21)) +#loc74 = loc("tmp0"(#loc22)) +#loc75 = loc("tmp0"(#loc23)) +#loc76 = loc("tmp0"(#loc24)) +#loc77 = loc("mask"(#loc25)) +#loc78 = loc("equal"(#loc27)) +#loc79 = loc("a_isnan"(#loc28)) +#loc80 = loc("b_isnan"(#loc29)) +#loc81 = loc("mask"(#loc30)) +#loc82 = loc("mask"(#loc31)) +#loc83 = loc("mask"(#loc32)) +#loc84 = loc("equal"(#loc33)) +#loc85 = loc("equal"(#loc34)) +#loc86 = loc("mask"(#loc35)) +#loc87 = loc("mask"(#loc36)) +#loc88 = loc("mask"(#loc37)) +#loc89 = loc(callsite(#loc38 at #loc26)) +#loc90 = loc(callsite(#loc39 at #loc26)) +#loc91 = loc("_tmp2"(#loc40)) +#loc92 = loc("_tmp2_index"(#loc41)) +#loc93 = loc(callsite(#loc43 at #loc2)) +#loc94 = loc("tmp2"(#loc44)) +#loc95 = loc("_tmp2_index"(#loc67)) +#loc96 = loc("mask"(#loc77)) +#loc97 = loc("equal"(#loc78)) +#loc98 = loc(callsite(#loc79 at #loc26)) +#loc99 = loc(callsite(#loc80 at #loc26)) +#loc100 = loc(callsite(#loc81 at #loc26)) +#loc101 = loc(callsite(#loc82 at #loc26)) +#loc102 = loc("mask"(#loc83)) +#loc103 = loc(callsite(#loc84 at #loc26)) +#loc104 = loc("equal"(#loc85)) +#loc105 = loc(callsite(#loc86 at #loc26)) +#loc106 = loc(callsite(#loc87 at #loc26)) +#loc107 = loc(callsite(#loc88 at #loc26)) +#loc108 = loc(callsite(#loc79 at #loc93)) +#loc109 = loc(callsite(#loc80 at #loc93)) +#loc110 = loc(callsite(#loc81 at #loc93)) +#loc111 = loc(callsite(#loc82 at #loc93)) +#loc112 = loc(callsite(#loc84 at #loc93)) +#loc113 = 
loc(callsite(#loc86 at #loc93)) +#loc114 = loc(callsite(#loc87 at #loc93)) +#loc115 = loc(callsite(#loc88 at #loc93)) +#loc116 = loc(callsite(#loc38 at #loc93)) +#loc117 = loc(callsite(#loc39 at #loc93)) +#loc118 = loc(callsite(#loc96 at #loc26)) +#loc119 = loc(callsite(#loc97 at #loc26)) +#loc120 = loc(callsite(#loc102 at #loc26)) +#loc121 = loc(callsite(#loc104 at #loc26)) +#loc122 = loc(callsite(#loc96 at #loc93)) +#loc123 = loc(callsite(#loc97 at #loc93)) +#loc124 = loc(callsite(#loc102 at #loc93)) +#loc125 = loc(callsite(#loc104 at #loc93)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/QFKUJZNM4B4WZ6YCXLCLTXWGUKQRBKMH2LTZ7GNQ5C5WM6AH5V6A/__grp__log_softmax_backward_kernel.json b/SpecForge-ext/cache/compiled_kernels/triton/7/QFKUJZNM4B4WZ6YCXLCLTXWGUKQRBKMH2LTZ7GNQ5C5WM6AH5V6A/__grp__log_softmax_backward_kernel.json new file mode 100644 index 0000000000000000000000000000000000000000..f511bf04224ec317188b62a468760d35f4a11509 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/QFKUJZNM4B4WZ6YCXLCLTXWGUKQRBKMH2LTZ7GNQ5C5WM6AH5V6A/__grp__log_softmax_backward_kernel.json @@ -0,0 +1 @@ +{"child_paths": {"log_softmax_backward_kernel.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/QFKUJZNM4B4WZ6YCXLCLTXWGUKQRBKMH2LTZ7GNQ5C5WM6AH5V6A/log_softmax_backward_kernel.source", "log_softmax_backward_kernel.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/QFKUJZNM4B4WZ6YCXLCLTXWGUKQRBKMH2LTZ7GNQ5C5WM6AH5V6A/log_softmax_backward_kernel.ttir", "log_softmax_backward_kernel.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/QFKUJZNM4B4WZ6YCXLCLTXWGUKQRBKMH2LTZ7GNQ5C5WM6AH5V6A/log_softmax_backward_kernel.ttgir", "log_softmax_backward_kernel.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/QFKUJZNM4B4WZ6YCXLCLTXWGUKQRBKMH2LTZ7GNQ5C5WM6AH5V6A/log_softmax_backward_kernel.llir", "log_softmax_backward_kernel.ptx": 
"/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/QFKUJZNM4B4WZ6YCXLCLTXWGUKQRBKMH2LTZ7GNQ5C5WM6AH5V6A/log_softmax_backward_kernel.ptx", "log_softmax_backward_kernel.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/QFKUJZNM4B4WZ6YCXLCLTXWGUKQRBKMH2LTZ7GNQ5C5WM6AH5V6A/log_softmax_backward_kernel.cubin", "log_softmax_backward_kernel.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/QFKUJZNM4B4WZ6YCXLCLTXWGUKQRBKMH2LTZ7GNQ5C5WM6AH5V6A/log_softmax_backward_kernel.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/QFKUJZNM4B4WZ6YCXLCLTXWGUKQRBKMH2LTZ7GNQ5C5WM6AH5V6A/log_softmax_backward_kernel.cubin b/SpecForge-ext/cache/compiled_kernels/triton/7/QFKUJZNM4B4WZ6YCXLCLTXWGUKQRBKMH2LTZ7GNQ5C5WM6AH5V6A/log_softmax_backward_kernel.cubin new file mode 100644 index 0000000000000000000000000000000000000000..aba8281374f0cc56c16c368ea11488f641cde1fb Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/7/QFKUJZNM4B4WZ6YCXLCLTXWGUKQRBKMH2LTZ7GNQ5C5WM6AH5V6A/log_softmax_backward_kernel.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/QFKUJZNM4B4WZ6YCXLCLTXWGUKQRBKMH2LTZ7GNQ5C5WM6AH5V6A/log_softmax_backward_kernel.json b/SpecForge-ext/cache/compiled_kernels/triton/7/QFKUJZNM4B4WZ6YCXLCLTXWGUKQRBKMH2LTZ7GNQ5C5WM6AH5V6A/log_softmax_backward_kernel.json new file mode 100644 index 0000000000000000000000000000000000000000..66a0758e7b9c6e3d7165485f360bc678675a9d10 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/QFKUJZNM4B4WZ6YCXLCLTXWGUKQRBKMH2LTZ7GNQ5C5WM6AH5V6A/log_softmax_backward_kernel.json @@ -0,0 +1 @@ +{"hash": "815544e5ace0796cfb02bac4b9dec6a2a110a987d2e79f99b0e8bb667807ed7c", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 32, "num_ctas": 1, "num_stages": 3, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, 
"enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 128, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "log_softmax_backward_kernel"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/QFKUJZNM4B4WZ6YCXLCLTXWGUKQRBKMH2LTZ7GNQ5C5WM6AH5V6A/log_softmax_backward_kernel.llir b/SpecForge-ext/cache/compiled_kernels/triton/7/QFKUJZNM4B4WZ6YCXLCLTXWGUKQRBKMH2LTZ7GNQ5C5WM6AH5V6A/log_softmax_backward_kernel.llir new file mode 100644 index 0000000000000000000000000000000000000000..b1a6c62618b2e35b24ed837485a66d2bfc116e35 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/QFKUJZNM4B4WZ6YCXLCLTXWGUKQRBKMH2LTZ7GNQ5C5WM6AH5V6A/log_softmax_backward_kernel.llir @@ -0,0 +1,913 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@global_smem = external addrspace(3) global [0 x i8], align 16 + +; Function Attrs: nounwind +define ptx_kernel void @log_softmax_backward_kernel(ptr addrspace(1) %0, i32 %1, ptr addrspace(1) %2, i32 %3, ptr addrspace(1) %4, ptr addrspace(1) %5, float %6, ptr addrspace(1) %7, ptr addrspace(1) %8, i32 %9, ptr addrspace(1) readnone captures(none) %10, ptr addrspace(1) readnone captures(none) %11) local_unnamed_addr #0 !dbg 
!4 { + %13 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %14 = zext nneg i32 %13 to i64, !dbg !8 + %15 = sext i32 %1 to i64, !dbg !9 + %16 = mul nsw i64 %15, %14, !dbg !9 + %17 = getelementptr bfloat, ptr addrspace(1) %0, i64 %16, !dbg !10 + %18 = sext i32 %3 to i64, !dbg !11 + %19 = mul nsw i64 %18, %14, !dbg !11 + %20 = getelementptr float, ptr addrspace(1) %2, i64 %19, !dbg !12 + %21 = getelementptr i1, ptr addrspace(1) %4, i64 %14, !dbg !13 + %22 = tail call i8 asm sideeffect "mov.u16 $0, 0x0;\0A\09ld.global.b8 { $0 }, [ $1 + 0 ];", "=c,l"(ptr addrspace(1) %21) #5, !dbg !14 + %.not = icmp eq i8 %22, 0, !dbg !14 + br i1 %.not, label %23, label %46, !dbg !15 + +23: ; preds = %12 + %24 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !16 + %25 = shl nuw nsw i32 %24, 3, !dbg !16 + %26 = icmp sgt i32 %9, 0, !dbg !17 + br i1 %26, label %.lr.ph6, label %.loopexit, !dbg !17 + +.lr.ph6: ; preds = %23, %.lr.ph6 + %27 = phi i32 [ %44, %.lr.ph6 ], [ 0, %23 ] + %28 = or disjoint i32 %27, %25, !dbg !18 + %29 = or disjoint i32 %28, 8192, !dbg !18 + %30 = or disjoint i32 %28, 16384, !dbg !18 + %31 = or disjoint i32 %28, 24576, !dbg !18 + %32 = icmp slt i32 %28, %9, !dbg !19 + %33 = icmp slt i32 %29, %9, !dbg !19 + %34 = icmp slt i32 %30, %9, !dbg !19 + %35 = icmp slt i32 %31, %9, !dbg !19 + %36 = sext i32 %28 to i64, !dbg !20 + %37 = getelementptr bfloat, ptr addrspace(1) %17, i64 %36, !dbg !20 + %38 = sext i32 %29 to i64, !dbg !20 + %39 = getelementptr bfloat, ptr addrspace(1) %17, i64 %38, !dbg !20 + %40 = sext i32 %30 to i64, !dbg !20 + %41 = getelementptr bfloat, ptr addrspace(1) %17, i64 %40, !dbg !20 + %42 = sext i32 %31 to i64, !dbg !20 + %43 = getelementptr bfloat, ptr addrspace(1) %17, i64 %42, !dbg !20 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %37, i1 %32) #5, !dbg !21 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, 
$1, $2, $3 };", "r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %39, i1 %33) #5, !dbg !21 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %41, i1 %34) #5, !dbg !21 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %43, i1 %35) #5, !dbg !21 + %44 = add i32 %27, 32768, !dbg !17 + %45 = icmp slt i32 %44, %9, !dbg !17 + br i1 %45, label %.lr.ph6, label %.loopexit, !dbg !17 + +.loopexit: ; preds = %.lr.ph5, %.lr.ph6, %46, %23 + ret void, !dbg !22 + +46: ; preds = %12 + %47 = getelementptr float, ptr addrspace(1) %7, i64 %14, !dbg !23 + %48 = getelementptr float, ptr addrspace(1) %8, i64 %14, !dbg !24 + %49 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l"(ptr addrspace(1) %47) #5, !dbg !25 + %50 = bitcast i32 %49 to float, !dbg !25 + %51 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l"(ptr addrspace(1) %48) #5, !dbg !26 + %52 = bitcast i32 %51 to float, !dbg !26 + %53 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l"(ptr addrspace(1) %5) #5, !dbg !27 + %54 = bitcast i32 %53 to float, !dbg !27 + %55 = fmul float %6, %54, !dbg !28 + %56 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !29 + %57 = shl nuw nsw i32 %56, 3, !dbg !29 + %58 = or disjoint i32 %57, 4, !dbg !29 + %59 = or disjoint i32 %57, 8192, !dbg !29 + %60 = or disjoint i32 %57, 8196, !dbg !29 + %61 = or disjoint i32 %57, 16384, !dbg !29 + %62 = or disjoint i32 %57, 16388, !dbg !29 + %63 = or disjoint i32 %57, 24576, !dbg !29 + %64 = or disjoint i32 %57, 24580, !dbg !29 + %65 = icmp sgt i32 %9, 0, !dbg !30 + br i1 %65, label %.lr.ph, label %.loopexit, !dbg !30 + +.lr.ph: ; preds = %46 + %66 = lshr i32 %56, 5, !dbg !29 + %67 = and i32 %56, 31, !dbg !29 + %68 = 
icmp eq i32 %67, 0 + %69 = getelementptr float, ptr addrspace(3) @global_smem, i32 %66 + %70 = icmp samesign ult i32 %56, 32 + %71 = getelementptr float, ptr addrspace(3) @global_smem, i32 %56 + %72 = icmp eq i32 %56, 0 + br label %73, !dbg !30 + +73: ; preds = %.lr.ph, %73 + %74 = phi float [ 0.000000e+00, %.lr.ph ], [ %308, %73 ] + %75 = phi i32 [ 0, %.lr.ph ], [ %309, %73 ] + %76 = or disjoint i32 %75, %57, !dbg !31 + %77 = or disjoint i32 %75, %58, !dbg !31 + %78 = or disjoint i32 %75, %59, !dbg !31 + %79 = or disjoint i32 %75, %60, !dbg !31 + %80 = or disjoint i32 %75, %61, !dbg !31 + %81 = or disjoint i32 %75, %62, !dbg !31 + %82 = or disjoint i32 %75, %63, !dbg !31 + %83 = or disjoint i32 %75, %64, !dbg !31 + %84 = icmp slt i32 %76, %9, !dbg !32 + %85 = icmp slt i32 %78, %9, !dbg !32 + %86 = icmp slt i32 %80, %9, !dbg !32 + %87 = icmp slt i32 %82, %9, !dbg !32 + %88 = sext i32 %76 to i64, !dbg !33 + %89 = getelementptr float, ptr addrspace(1) %20, i64 %88, !dbg !33 + %90 = sext i32 %77 to i64, !dbg !33 + %91 = getelementptr float, ptr addrspace(1) %20, i64 %90, !dbg !33 + %92 = sext i32 %78 to i64, !dbg !33 + %93 = getelementptr float, ptr addrspace(1) %20, i64 %92, !dbg !33 + %94 = sext i32 %79 to i64, !dbg !33 + %95 = getelementptr float, ptr addrspace(1) %20, i64 %94, !dbg !33 + %96 = sext i32 %80 to i64, !dbg !33 + %97 = getelementptr float, ptr addrspace(1) %20, i64 %96, !dbg !33 + %98 = sext i32 %81 to i64, !dbg !33 + %99 = getelementptr float, ptr addrspace(1) %20, i64 %98, !dbg !33 + %100 = sext i32 %82 to i64, !dbg !33 + %101 = getelementptr float, ptr addrspace(1) %20, i64 %100, !dbg !33 + %102 = sext i32 %83 to i64, !dbg !33 + %103 = getelementptr float, ptr addrspace(1) %20, i64 %102, !dbg !33 + %104 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr 
addrspace(1) %89, i1 %84) #5, !dbg !34 + %105 = extractvalue { i32, i32, i32, i32 } %104, 0, !dbg !34 + %106 = extractvalue { i32, i32, i32, i32 } %104, 1, !dbg !34 + %107 = extractvalue { i32, i32, i32, i32 } %104, 2, !dbg !34 + %108 = extractvalue { i32, i32, i32, i32 } %104, 3, !dbg !34 + %109 = bitcast i32 %105 to float, !dbg !34 + %110 = bitcast i32 %106 to float, !dbg !34 + %111 = bitcast i32 %107 to float, !dbg !34 + %112 = bitcast i32 %108 to float, !dbg !34 + %113 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %91, i1 %84) #5, !dbg !34 + %114 = extractvalue { i32, i32, i32, i32 } %113, 0, !dbg !34 + %115 = extractvalue { i32, i32, i32, i32 } %113, 1, !dbg !34 + %116 = extractvalue { i32, i32, i32, i32 } %113, 2, !dbg !34 + %117 = extractvalue { i32, i32, i32, i32 } %113, 3, !dbg !34 + %118 = bitcast i32 %114 to float, !dbg !34 + %119 = bitcast i32 %115 to float, !dbg !34 + %120 = bitcast i32 %116 to float, !dbg !34 + %121 = bitcast i32 %117 to float, !dbg !34 + %122 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %93, i1 %85) #5, !dbg !34 + %123 = extractvalue { i32, i32, i32, i32 } %122, 0, !dbg !34 + %124 = extractvalue { i32, i32, i32, i32 } %122, 1, !dbg !34 + %125 = extractvalue { i32, i32, i32, i32 } %122, 2, !dbg !34 + %126 = extractvalue { i32, i32, i32, i32 } %122, 3, !dbg !34 + %127 = bitcast i32 %123 to float, !dbg !34 + %128 = bitcast i32 %124 to float, !dbg !34 + %129 = bitcast i32 %125 to float, !dbg !34 + %130 = bitcast i32 %126 to float, !dbg !34 + %131 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 
$4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %95, i1 %85) #5, !dbg !34 + %132 = extractvalue { i32, i32, i32, i32 } %131, 0, !dbg !34 + %133 = extractvalue { i32, i32, i32, i32 } %131, 1, !dbg !34 + %134 = extractvalue { i32, i32, i32, i32 } %131, 2, !dbg !34 + %135 = extractvalue { i32, i32, i32, i32 } %131, 3, !dbg !34 + %136 = bitcast i32 %132 to float, !dbg !34 + %137 = bitcast i32 %133 to float, !dbg !34 + %138 = bitcast i32 %134 to float, !dbg !34 + %139 = bitcast i32 %135 to float, !dbg !34 + %140 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %97, i1 %86) #5, !dbg !34 + %141 = extractvalue { i32, i32, i32, i32 } %140, 0, !dbg !34 + %142 = extractvalue { i32, i32, i32, i32 } %140, 1, !dbg !34 + %143 = extractvalue { i32, i32, i32, i32 } %140, 2, !dbg !34 + %144 = extractvalue { i32, i32, i32, i32 } %140, 3, !dbg !34 + %145 = bitcast i32 %141 to float, !dbg !34 + %146 = bitcast i32 %142 to float, !dbg !34 + %147 = bitcast i32 %143 to float, !dbg !34 + %148 = bitcast i32 %144 to float, !dbg !34 + %149 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %99, i1 %86) #5, !dbg !34 + %150 = extractvalue { i32, i32, i32, i32 } %149, 0, !dbg !34 + %151 = extractvalue { i32, i32, i32, i32 } %149, 1, !dbg !34 + %152 = extractvalue { i32, i32, i32, i32 } %149, 2, !dbg !34 + %153 = extractvalue { i32, i32, i32, i32 } %149, 3, !dbg !34 + %154 = bitcast i32 %150 to float, !dbg !34 + %155 = bitcast i32 %151 
to float, !dbg !34 + %156 = bitcast i32 %152 to float, !dbg !34 + %157 = bitcast i32 %153 to float, !dbg !34 + %158 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %101, i1 %87) #5, !dbg !34 + %159 = extractvalue { i32, i32, i32, i32 } %158, 0, !dbg !34 + %160 = extractvalue { i32, i32, i32, i32 } %158, 1, !dbg !34 + %161 = extractvalue { i32, i32, i32, i32 } %158, 2, !dbg !34 + %162 = extractvalue { i32, i32, i32, i32 } %158, 3, !dbg !34 + %163 = bitcast i32 %159 to float, !dbg !34 + %164 = bitcast i32 %160 to float, !dbg !34 + %165 = bitcast i32 %161 to float, !dbg !34 + %166 = bitcast i32 %162 to float, !dbg !34 + %167 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %103, i1 %87) #5, !dbg !34 + %168 = extractvalue { i32, i32, i32, i32 } %167, 0, !dbg !34 + %169 = extractvalue { i32, i32, i32, i32 } %167, 1, !dbg !34 + %170 = extractvalue { i32, i32, i32, i32 } %167, 2, !dbg !34 + %171 = extractvalue { i32, i32, i32, i32 } %167, 3, !dbg !34 + %172 = bitcast i32 %168 to float, !dbg !34 + %173 = bitcast i32 %169 to float, !dbg !34 + %174 = bitcast i32 %170 to float, !dbg !34 + %175 = bitcast i32 %171 to float, !dbg !34 + %176 = fmul float %55, %109, !dbg !35 + %177 = fmul float %55, %110, !dbg !35 + %178 = fmul float %55, %111, !dbg !35 + %179 = fmul float %55, %112, !dbg !35 + %180 = fmul float %55, %118, !dbg !35 + %181 = fmul float %55, %119, !dbg !35 + %182 = fmul float %55, %120, !dbg !35 + %183 = fmul float %55, %121, !dbg !35 + %184 = fmul float %55, %127, !dbg !35 + %185 = fmul float %55, %128, !dbg !35 + %186 = fmul float %55, %129, !dbg 
!35 + %187 = fmul float %55, %130, !dbg !35 + %188 = fmul float %55, %136, !dbg !35 + %189 = fmul float %55, %137, !dbg !35 + %190 = fmul float %55, %138, !dbg !35 + %191 = fmul float %55, %139, !dbg !35 + %192 = fmul float %55, %145, !dbg !35 + %193 = fmul float %55, %146, !dbg !35 + %194 = fmul float %55, %147, !dbg !35 + %195 = fmul float %55, %148, !dbg !35 + %196 = fmul float %55, %154, !dbg !35 + %197 = fmul float %55, %155, !dbg !35 + %198 = fmul float %55, %156, !dbg !35 + %199 = fmul float %55, %157, !dbg !35 + %200 = fmul float %55, %163, !dbg !35 + %201 = fmul float %55, %164, !dbg !35 + %202 = fmul float %55, %165, !dbg !35 + %203 = fmul float %55, %166, !dbg !35 + %204 = fmul float %55, %172, !dbg !35 + %205 = fmul float %55, %173, !dbg !35 + %206 = fmul float %55, %174, !dbg !35 + %207 = fmul float %55, %175, !dbg !35 + %208 = select i1 %85, float %184, float 0.000000e+00, !dbg !36 + %209 = select i1 %85, float %185, float 0.000000e+00, !dbg !36 + %210 = select i1 %85, float %186, float 0.000000e+00, !dbg !36 + %211 = select i1 %85, float %187, float 0.000000e+00, !dbg !36 + %212 = select i1 %85, float %188, float 0.000000e+00, !dbg !36 + %213 = select i1 %85, float %189, float 0.000000e+00, !dbg !36 + %214 = select i1 %85, float %190, float 0.000000e+00, !dbg !36 + %215 = select i1 %85, float %191, float 0.000000e+00, !dbg !36 + %216 = select i1 %86, float %192, float 0.000000e+00, !dbg !36 + %217 = select i1 %86, float %193, float 0.000000e+00, !dbg !36 + %218 = select i1 %86, float %194, float 0.000000e+00, !dbg !36 + %219 = select i1 %86, float %195, float 0.000000e+00, !dbg !36 + %220 = select i1 %86, float %196, float 0.000000e+00, !dbg !36 + %221 = select i1 %86, float %197, float 0.000000e+00, !dbg !36 + %222 = select i1 %86, float %198, float 0.000000e+00, !dbg !36 + %223 = select i1 %86, float %199, float 0.000000e+00, !dbg !36 + %224 = select i1 %87, float %200, float 0.000000e+00, !dbg !36 + %225 = select i1 %87, float %201, float 
0.000000e+00, !dbg !36 + %226 = select i1 %87, float %202, float 0.000000e+00, !dbg !36 + %227 = select i1 %87, float %203, float 0.000000e+00, !dbg !36 + %228 = select i1 %87, float %204, float 0.000000e+00, !dbg !36 + %229 = select i1 %87, float %205, float 0.000000e+00, !dbg !36 + %230 = select i1 %87, float %206, float 0.000000e+00, !dbg !36 + %231 = select i1 %87, float %207, float 0.000000e+00, !dbg !36 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !37 + %232 = fadd float %176, %177, !dbg !41 + %233 = fadd float %178, %232, !dbg !41 + %234 = fadd float %179, %233, !dbg !41 + %235 = fadd float %180, %234, !dbg !41 + %236 = fadd float %181, %235, !dbg !41 + %237 = fadd float %182, %236, !dbg !41 + %238 = fadd float %183, %237, !dbg !41 + %239 = select i1 %84, float %238, float 0.000000e+00, !dbg !41 + %240 = fadd float %208, %239, !dbg !41 + %241 = fadd float %209, %240, !dbg !41 + %242 = fadd float %210, %241, !dbg !41 + %243 = fadd float %211, %242, !dbg !41 + %244 = fadd float %212, %243, !dbg !41 + %245 = fadd float %213, %244, !dbg !41 + %246 = fadd float %214, %245, !dbg !41 + %247 = fadd float %215, %246, !dbg !41 + %248 = fadd float %216, %247, !dbg !41 + %249 = fadd float %217, %248, !dbg !41 + %250 = fadd float %218, %249, !dbg !41 + %251 = fadd float %219, %250, !dbg !41 + %252 = fadd float %220, %251, !dbg !41 + %253 = fadd float %221, %252, !dbg !41 + %254 = fadd float %222, %253, !dbg !41 + %255 = fadd float %223, %254, !dbg !41 + %256 = fadd float %224, %255, !dbg !41 + %257 = fadd float %225, %256, !dbg !41 + %258 = fadd float %226, %257, !dbg !41 + %259 = fadd float %227, %258, !dbg !41 + %260 = fadd float %228, %259, !dbg !41 + %261 = fadd float %229, %260, !dbg !41 + %262 = fadd float %230, %261, !dbg !41 + %263 = fadd float %231, %262, !dbg !41 + %264 = bitcast float %263 to i32, !dbg !37 + %265 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %264, i32 16, i32 31), !dbg !37 + %266 = bitcast i32 %265 to 
float, !dbg !37 + %267 = fadd float %263, %266, !dbg !41 + %268 = bitcast float %267 to i32, !dbg !37 + %269 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %268, i32 8, i32 31), !dbg !37 + %270 = bitcast i32 %269 to float, !dbg !37 + %271 = fadd float %267, %270, !dbg !41 + %272 = bitcast float %271 to i32, !dbg !37 + %273 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %272, i32 4, i32 31), !dbg !37 + %274 = bitcast i32 %273 to float, !dbg !37 + %275 = fadd float %271, %274, !dbg !41 + %276 = bitcast float %275 to i32, !dbg !37 + %277 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %276, i32 2, i32 31), !dbg !37 + %278 = bitcast i32 %277 to float, !dbg !37 + %279 = fadd float %275, %278, !dbg !41 + %280 = bitcast float %279 to i32, !dbg !37 + %281 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %280, i32 1, i32 31), !dbg !37 + %282 = bitcast i32 %281 to float, !dbg !37 + %283 = fadd float %279, %282, !dbg !41 + %284 = bitcast float %283 to <1 x i32>, !dbg !37 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %69, <1 x i32> %284, i1 %68) #5, !dbg !37 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !37 + %285 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %71, i1 %70) #5, !dbg !37 + %286 = bitcast i32 %285 to float, !dbg !37 + %287 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %285, i32 16, i32 31), !dbg !37 + %288 = bitcast i32 %287 to float, !dbg !37 + %289 = fadd float %286, %288, !dbg !41 + %290 = bitcast float %289 to i32, !dbg !37 + %291 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %290, i32 8, i32 31), !dbg !37 + %292 = bitcast i32 %291 to float, !dbg !37 + %293 = fadd float %289, %292, !dbg !41 + %294 = bitcast float %293 to i32, !dbg !37 + %295 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %294, i32 4, i32 31), !dbg !37 + %296 = bitcast i32 %295 to float, !dbg !37 + 
%297 = fadd float %293, %296, !dbg !41 + %298 = bitcast float %297 to i32, !dbg !37 + %299 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %298, i32 2, i32 31), !dbg !37 + %300 = bitcast i32 %299 to float, !dbg !37 + %301 = fadd float %297, %300, !dbg !41 + %302 = bitcast float %301 to i32, !dbg !37 + %303 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %302, i32 1, i32 31), !dbg !37 + %304 = bitcast i32 %303 to float, !dbg !37 + %305 = fadd float %301, %304, !dbg !41 + %306 = bitcast float %305 to <1 x i32>, !dbg !37 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %71, <1 x i32> %306, i1 %72) #5, !dbg !37 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !37 + %307 = load float, ptr addrspace(3) @global_smem, align 16, !dbg !37 + %308 = fadd float %74, %307, !dbg !42 + %309 = add i32 %75, 32768, !dbg !30 + %310 = icmp slt i32 %309, %9, !dbg !30 + br i1 %310, label %73, label %.lr.ph5.preheader, !dbg !30 + +.lr.ph5.preheader: ; preds = %73 + %311 = insertelement <2 x float> poison, float %308, i64 0 + %312 = shufflevector <2 x float> %311, <2 x float> poison, <2 x i32> zeroinitializer + %313 = insertelement <2 x float> poison, float %55, i64 0 + %314 = shufflevector <2 x float> %313, <2 x float> poison, <2 x i32> zeroinitializer + br label %.lr.ph5, !dbg !43 + +.lr.ph5: ; preds = %.lr.ph5.preheader, %.lr.ph5 + %315 = phi i32 [ %792, %.lr.ph5 ], [ 0, %.lr.ph5.preheader ] + %316 = or disjoint i32 %315, %57, !dbg !44 + %317 = or disjoint i32 %315, %58, !dbg !44 + %318 = or disjoint i32 %315, %59, !dbg !44 + %319 = or disjoint i32 %315, %60, !dbg !44 + %320 = or disjoint i32 %315, %61, !dbg !44 + %321 = or disjoint i32 %315, %62, !dbg !44 + %322 = or disjoint i32 %315, %63, !dbg !44 + %323 = or disjoint i32 %315, %64, !dbg !44 + %324 = icmp slt i32 %316, %9, !dbg !45 + %325 = icmp slt i32 %318, %9, !dbg !45 + %326 = icmp slt i32 %320, %9, !dbg !45 + %327 = icmp slt i32 %322, %9, !dbg 
!45 + %328 = sext i32 %316 to i64, !dbg !46 + %329 = getelementptr bfloat, ptr addrspace(1) %17, i64 %328, !dbg !46 + %330 = sext i32 %318 to i64, !dbg !46 + %331 = getelementptr bfloat, ptr addrspace(1) %17, i64 %330, !dbg !46 + %332 = sext i32 %320 to i64, !dbg !46 + %333 = getelementptr bfloat, ptr addrspace(1) %17, i64 %332, !dbg !46 + %334 = sext i32 %322 to i64, !dbg !46 + %335 = getelementptr bfloat, ptr addrspace(1) %17, i64 %334, !dbg !46 + %336 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %329, i1 %324) #5, !dbg !47 + %337 = extractvalue { i32, i32, i32, i32 } %336, 0, !dbg !47 + %338 = bitcast i32 %337 to <2 x bfloat>, !dbg !47 + %339 = extractvalue { i32, i32, i32, i32 } %336, 1, !dbg !47 + %340 = bitcast i32 %339 to <2 x bfloat>, !dbg !47 + %341 = extractvalue { i32, i32, i32, i32 } %336, 2, !dbg !47 + %342 = bitcast i32 %341 to <2 x bfloat>, !dbg !47 + %343 = extractvalue { i32, i32, i32, i32 } %336, 3, !dbg !47 + %344 = bitcast i32 %343 to <2 x bfloat>, !dbg !47 + %345 = extractelement <2 x bfloat> %338, i64 0, !dbg !47 + %346 = extractelement <2 x bfloat> %338, i64 1, !dbg !47 + %347 = extractelement <2 x bfloat> %340, i64 0, !dbg !47 + %348 = extractelement <2 x bfloat> %340, i64 1, !dbg !47 + %349 = extractelement <2 x bfloat> %342, i64 0, !dbg !47 + %350 = extractelement <2 x bfloat> %342, i64 1, !dbg !47 + %351 = extractelement <2 x bfloat> %344, i64 0, !dbg !47 + %352 = extractelement <2 x bfloat> %344, i64 1, !dbg !47 + %353 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %331, i1 %325) #5, !dbg !47 + %354 = extractvalue { 
i32, i32, i32, i32 } %353, 0, !dbg !47 + %355 = bitcast i32 %354 to <2 x bfloat>, !dbg !47 + %356 = extractvalue { i32, i32, i32, i32 } %353, 1, !dbg !47 + %357 = bitcast i32 %356 to <2 x bfloat>, !dbg !47 + %358 = extractvalue { i32, i32, i32, i32 } %353, 2, !dbg !47 + %359 = bitcast i32 %358 to <2 x bfloat>, !dbg !47 + %360 = extractvalue { i32, i32, i32, i32 } %353, 3, !dbg !47 + %361 = bitcast i32 %360 to <2 x bfloat>, !dbg !47 + %362 = extractelement <2 x bfloat> %355, i64 0, !dbg !47 + %363 = extractelement <2 x bfloat> %355, i64 1, !dbg !47 + %364 = extractelement <2 x bfloat> %357, i64 0, !dbg !47 + %365 = extractelement <2 x bfloat> %357, i64 1, !dbg !47 + %366 = extractelement <2 x bfloat> %359, i64 0, !dbg !47 + %367 = extractelement <2 x bfloat> %359, i64 1, !dbg !47 + %368 = extractelement <2 x bfloat> %361, i64 0, !dbg !47 + %369 = extractelement <2 x bfloat> %361, i64 1, !dbg !47 + %370 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %333, i1 %326) #5, !dbg !47 + %371 = extractvalue { i32, i32, i32, i32 } %370, 0, !dbg !47 + %372 = bitcast i32 %371 to <2 x bfloat>, !dbg !47 + %373 = extractvalue { i32, i32, i32, i32 } %370, 1, !dbg !47 + %374 = bitcast i32 %373 to <2 x bfloat>, !dbg !47 + %375 = extractvalue { i32, i32, i32, i32 } %370, 2, !dbg !47 + %376 = bitcast i32 %375 to <2 x bfloat>, !dbg !47 + %377 = extractvalue { i32, i32, i32, i32 } %370, 3, !dbg !47 + %378 = bitcast i32 %377 to <2 x bfloat>, !dbg !47 + %379 = extractelement <2 x bfloat> %372, i64 0, !dbg !47 + %380 = extractelement <2 x bfloat> %372, i64 1, !dbg !47 + %381 = extractelement <2 x bfloat> %374, i64 0, !dbg !47 + %382 = extractelement <2 x bfloat> %374, i64 1, !dbg !47 + %383 = extractelement <2 x bfloat> %376, i64 0, !dbg !47 + %384 = extractelement <2 x bfloat> 
%376, i64 1, !dbg !47 + %385 = extractelement <2 x bfloat> %378, i64 0, !dbg !47 + %386 = extractelement <2 x bfloat> %378, i64 1, !dbg !47 + %387 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %335, i1 %327) #5, !dbg !47 + %388 = extractvalue { i32, i32, i32, i32 } %387, 0, !dbg !47 + %389 = bitcast i32 %388 to <2 x bfloat>, !dbg !47 + %390 = extractvalue { i32, i32, i32, i32 } %387, 1, !dbg !47 + %391 = bitcast i32 %390 to <2 x bfloat>, !dbg !47 + %392 = extractvalue { i32, i32, i32, i32 } %387, 2, !dbg !47 + %393 = bitcast i32 %392 to <2 x bfloat>, !dbg !47 + %394 = extractvalue { i32, i32, i32, i32 } %387, 3, !dbg !47 + %395 = bitcast i32 %394 to <2 x bfloat>, !dbg !47 + %396 = extractelement <2 x bfloat> %389, i64 0, !dbg !47 + %397 = extractelement <2 x bfloat> %389, i64 1, !dbg !47 + %398 = extractelement <2 x bfloat> %391, i64 0, !dbg !47 + %399 = extractelement <2 x bfloat> %391, i64 1, !dbg !47 + %400 = extractelement <2 x bfloat> %393, i64 0, !dbg !47 + %401 = extractelement <2 x bfloat> %393, i64 1, !dbg !47 + %402 = extractelement <2 x bfloat> %395, i64 0, !dbg !47 + %403 = extractelement <2 x bfloat> %395, i64 1, !dbg !47 + %404 = fpext bfloat %345 to float, !dbg !48 + %405 = fpext bfloat %346 to float, !dbg !48 + %406 = fpext bfloat %347 to float, !dbg !48 + %407 = fpext bfloat %348 to float, !dbg !48 + %408 = fpext bfloat %349 to float, !dbg !48 + %409 = fpext bfloat %350 to float, !dbg !48 + %410 = fpext bfloat %351 to float, !dbg !48 + %411 = fpext bfloat %352 to float, !dbg !48 + %412 = fpext bfloat %362 to float, !dbg !48 + %413 = fpext bfloat %363 to float, !dbg !48 + %414 = fpext bfloat %364 to float, !dbg !48 + %415 = fpext bfloat %365 to float, !dbg !48 + %416 = fpext bfloat %366 to float, !dbg !48 + %417 = fpext bfloat %367 to 
float, !dbg !48 + %418 = fpext bfloat %368 to float, !dbg !48 + %419 = fpext bfloat %369 to float, !dbg !48 + %420 = fpext bfloat %379 to float, !dbg !48 + %421 = fpext bfloat %380 to float, !dbg !48 + %422 = fpext bfloat %381 to float, !dbg !48 + %423 = fpext bfloat %382 to float, !dbg !48 + %424 = fpext bfloat %383 to float, !dbg !48 + %425 = fpext bfloat %384 to float, !dbg !48 + %426 = fpext bfloat %385 to float, !dbg !48 + %427 = fpext bfloat %386 to float, !dbg !48 + %428 = fpext bfloat %396 to float, !dbg !48 + %429 = fpext bfloat %397 to float, !dbg !48 + %430 = fpext bfloat %398 to float, !dbg !48 + %431 = fpext bfloat %399 to float, !dbg !48 + %432 = fpext bfloat %400 to float, !dbg !48 + %433 = fpext bfloat %401 to float, !dbg !48 + %434 = fpext bfloat %402 to float, !dbg !48 + %435 = fpext bfloat %403 to float, !dbg !48 + %436 = getelementptr float, ptr addrspace(1) %20, i64 %328, !dbg !49 + %437 = sext i32 %317 to i64, !dbg !49 + %438 = getelementptr float, ptr addrspace(1) %20, i64 %437, !dbg !49 + %439 = getelementptr float, ptr addrspace(1) %20, i64 %330, !dbg !49 + %440 = sext i32 %319 to i64, !dbg !49 + %441 = getelementptr float, ptr addrspace(1) %20, i64 %440, !dbg !49 + %442 = getelementptr float, ptr addrspace(1) %20, i64 %332, !dbg !49 + %443 = sext i32 %321 to i64, !dbg !49 + %444 = getelementptr float, ptr addrspace(1) %20, i64 %443, !dbg !49 + %445 = getelementptr float, ptr addrspace(1) %20, i64 %334, !dbg !49 + %446 = sext i32 %323 to i64, !dbg !49 + %447 = getelementptr float, ptr addrspace(1) %20, i64 %446, !dbg !49 + %448 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %436, i1 %324) #5, !dbg !50 + %449 = extractvalue { i32, i32, i32, i32 } %448, 0, !dbg !50 + %450 = extractvalue { i32, i32, i32, i32 } %448, 1, !dbg !50 + 
%451 = extractvalue { i32, i32, i32, i32 } %448, 2, !dbg !50 + %452 = extractvalue { i32, i32, i32, i32 } %448, 3, !dbg !50 + %453 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %438, i1 %324) #5, !dbg !50 + %454 = extractvalue { i32, i32, i32, i32 } %453, 0, !dbg !50 + %455 = extractvalue { i32, i32, i32, i32 } %453, 1, !dbg !50 + %456 = extractvalue { i32, i32, i32, i32 } %453, 2, !dbg !50 + %457 = extractvalue { i32, i32, i32, i32 } %453, 3, !dbg !50 + %458 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %439, i1 %325) #5, !dbg !50 + %459 = extractvalue { i32, i32, i32, i32 } %458, 0, !dbg !50 + %460 = extractvalue { i32, i32, i32, i32 } %458, 1, !dbg !50 + %461 = extractvalue { i32, i32, i32, i32 } %458, 2, !dbg !50 + %462 = extractvalue { i32, i32, i32, i32 } %458, 3, !dbg !50 + %463 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %441, i1 %325) #5, !dbg !50 + %464 = extractvalue { i32, i32, i32, i32 } %463, 0, !dbg !50 + %465 = extractvalue { i32, i32, i32, i32 } %463, 1, !dbg !50 + %466 = extractvalue { i32, i32, i32, i32 } %463, 2, !dbg !50 + %467 = extractvalue { i32, i32, i32, i32 } %463, 3, !dbg !50 + %468 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, 
i32 0, i32 0, i32 0, ptr addrspace(1) %442, i1 %326) #5, !dbg !50 + %469 = extractvalue { i32, i32, i32, i32 } %468, 0, !dbg !50 + %470 = extractvalue { i32, i32, i32, i32 } %468, 1, !dbg !50 + %471 = extractvalue { i32, i32, i32, i32 } %468, 2, !dbg !50 + %472 = extractvalue { i32, i32, i32, i32 } %468, 3, !dbg !50 + %473 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %444, i1 %326) #5, !dbg !50 + %474 = extractvalue { i32, i32, i32, i32 } %473, 0, !dbg !50 + %475 = extractvalue { i32, i32, i32, i32 } %473, 1, !dbg !50 + %476 = extractvalue { i32, i32, i32, i32 } %473, 2, !dbg !50 + %477 = extractvalue { i32, i32, i32, i32 } %473, 3, !dbg !50 + %478 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %445, i1 %327) #5, !dbg !50 + %479 = extractvalue { i32, i32, i32, i32 } %478, 0, !dbg !50 + %480 = extractvalue { i32, i32, i32, i32 } %478, 1, !dbg !50 + %481 = extractvalue { i32, i32, i32, i32 } %478, 2, !dbg !50 + %482 = extractvalue { i32, i32, i32, i32 } %478, 3, !dbg !50 + %483 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %447, i1 %327) #5, !dbg !50 + %484 = extractvalue { i32, i32, i32, i32 } %483, 0, !dbg !50 + %485 = extractvalue { i32, i32, i32, i32 } %483, 1, !dbg !50 + %486 = extractvalue { i32, i32, i32, i32 } %483, 2, !dbg !50 + %487 = extractvalue { i32, i32, i32, i32 } %483, 3, !dbg !50 + %488 = fsub float %404, %50, !dbg 
!51 + %489 = fsub float %405, %50, !dbg !51 + %490 = fsub float %406, %50, !dbg !51 + %491 = fsub float %407, %50, !dbg !51 + %492 = fsub float %408, %50, !dbg !51 + %493 = fsub float %409, %50, !dbg !51 + %494 = fsub float %410, %50, !dbg !51 + %495 = fsub float %411, %50, !dbg !51 + %496 = fsub float %412, %50, !dbg !51 + %497 = fsub float %413, %50, !dbg !51 + %498 = fsub float %414, %50, !dbg !51 + %499 = fsub float %415, %50, !dbg !51 + %500 = fsub float %416, %50, !dbg !51 + %501 = fsub float %417, %50, !dbg !51 + %502 = fsub float %418, %50, !dbg !51 + %503 = fsub float %419, %50, !dbg !51 + %504 = fsub float %420, %50, !dbg !51 + %505 = fsub float %421, %50, !dbg !51 + %506 = fsub float %422, %50, !dbg !51 + %507 = fsub float %423, %50, !dbg !51 + %508 = fsub float %424, %50, !dbg !51 + %509 = fsub float %425, %50, !dbg !51 + %510 = fsub float %426, %50, !dbg !51 + %511 = fsub float %427, %50, !dbg !51 + %512 = fsub float %428, %50, !dbg !51 + %513 = fsub float %429, %50, !dbg !51 + %514 = fsub float %430, %50, !dbg !51 + %515 = fsub float %431, %50, !dbg !51 + %516 = fsub float %432, %50, !dbg !51 + %517 = fsub float %433, %50, !dbg !51 + %518 = fsub float %434, %50, !dbg !51 + %519 = fsub float %435, %50, !dbg !51 + %520 = fmul float %488, 0x3FF7154760000000, !dbg !52 + %521 = tail call float @llvm.nvvm.ex2.approx.f(float %520), !dbg !52 + %522 = fmul float %489, 0x3FF7154760000000, !dbg !52 + %523 = tail call float @llvm.nvvm.ex2.approx.f(float %522), !dbg !52 + %524 = fmul float %490, 0x3FF7154760000000, !dbg !52 + %525 = tail call float @llvm.nvvm.ex2.approx.f(float %524), !dbg !52 + %526 = fmul float %491, 0x3FF7154760000000, !dbg !52 + %527 = tail call float @llvm.nvvm.ex2.approx.f(float %526), !dbg !52 + %528 = fmul float %492, 0x3FF7154760000000, !dbg !52 + %529 = tail call float @llvm.nvvm.ex2.approx.f(float %528), !dbg !52 + %530 = fmul float %493, 0x3FF7154760000000, !dbg !52 + %531 = tail call float @llvm.nvvm.ex2.approx.f(float %530), !dbg !52 
+ %532 = fmul float %494, 0x3FF7154760000000, !dbg !52 + %533 = tail call float @llvm.nvvm.ex2.approx.f(float %532), !dbg !52 + %534 = fmul float %495, 0x3FF7154760000000, !dbg !52 + %535 = tail call float @llvm.nvvm.ex2.approx.f(float %534), !dbg !52 + %536 = fmul float %496, 0x3FF7154760000000, !dbg !52 + %537 = tail call float @llvm.nvvm.ex2.approx.f(float %536), !dbg !52 + %538 = fmul float %497, 0x3FF7154760000000, !dbg !52 + %539 = tail call float @llvm.nvvm.ex2.approx.f(float %538), !dbg !52 + %540 = fmul float %498, 0x3FF7154760000000, !dbg !52 + %541 = tail call float @llvm.nvvm.ex2.approx.f(float %540), !dbg !52 + %542 = fmul float %499, 0x3FF7154760000000, !dbg !52 + %543 = tail call float @llvm.nvvm.ex2.approx.f(float %542), !dbg !52 + %544 = fmul float %500, 0x3FF7154760000000, !dbg !52 + %545 = tail call float @llvm.nvvm.ex2.approx.f(float %544), !dbg !52 + %546 = fmul float %501, 0x3FF7154760000000, !dbg !52 + %547 = tail call float @llvm.nvvm.ex2.approx.f(float %546), !dbg !52 + %548 = fmul float %502, 0x3FF7154760000000, !dbg !52 + %549 = tail call float @llvm.nvvm.ex2.approx.f(float %548), !dbg !52 + %550 = fmul float %503, 0x3FF7154760000000, !dbg !52 + %551 = tail call float @llvm.nvvm.ex2.approx.f(float %550), !dbg !52 + %552 = fmul float %504, 0x3FF7154760000000, !dbg !52 + %553 = tail call float @llvm.nvvm.ex2.approx.f(float %552), !dbg !52 + %554 = fmul float %505, 0x3FF7154760000000, !dbg !52 + %555 = tail call float @llvm.nvvm.ex2.approx.f(float %554), !dbg !52 + %556 = fmul float %506, 0x3FF7154760000000, !dbg !52 + %557 = tail call float @llvm.nvvm.ex2.approx.f(float %556), !dbg !52 + %558 = fmul float %507, 0x3FF7154760000000, !dbg !52 + %559 = tail call float @llvm.nvvm.ex2.approx.f(float %558), !dbg !52 + %560 = fmul float %508, 0x3FF7154760000000, !dbg !52 + %561 = tail call float @llvm.nvvm.ex2.approx.f(float %560), !dbg !52 + %562 = fmul float %509, 0x3FF7154760000000, !dbg !52 + %563 = tail call float @llvm.nvvm.ex2.approx.f(float 
%562), !dbg !52 + %564 = fmul float %510, 0x3FF7154760000000, !dbg !52 + %565 = tail call float @llvm.nvvm.ex2.approx.f(float %564), !dbg !52 + %566 = fmul float %511, 0x3FF7154760000000, !dbg !52 + %567 = tail call float @llvm.nvvm.ex2.approx.f(float %566), !dbg !52 + %568 = fmul float %512, 0x3FF7154760000000, !dbg !52 + %569 = tail call float @llvm.nvvm.ex2.approx.f(float %568), !dbg !52 + %570 = fmul float %513, 0x3FF7154760000000, !dbg !52 + %571 = tail call float @llvm.nvvm.ex2.approx.f(float %570), !dbg !52 + %572 = fmul float %514, 0x3FF7154760000000, !dbg !52 + %573 = tail call float @llvm.nvvm.ex2.approx.f(float %572), !dbg !52 + %574 = fmul float %515, 0x3FF7154760000000, !dbg !52 + %575 = tail call float @llvm.nvvm.ex2.approx.f(float %574), !dbg !52 + %576 = fmul float %516, 0x3FF7154760000000, !dbg !52 + %577 = tail call float @llvm.nvvm.ex2.approx.f(float %576), !dbg !52 + %578 = fmul float %517, 0x3FF7154760000000, !dbg !52 + %579 = tail call float @llvm.nvvm.ex2.approx.f(float %578), !dbg !52 + %580 = fmul float %518, 0x3FF7154760000000, !dbg !52 + %581 = tail call float @llvm.nvvm.ex2.approx.f(float %580), !dbg !52 + %582 = fmul float %519, 0x3FF7154760000000, !dbg !52 + %583 = tail call float @llvm.nvvm.ex2.approx.f(float %582), !dbg !52 + %584 = tail call float @llvm.nvvm.div.full(float %521, float %52), !dbg !53 + %585 = tail call float @llvm.nvvm.div.full(float %523, float %52), !dbg !53 + %586 = tail call float @llvm.nvvm.div.full(float %525, float %52), !dbg !53 + %587 = tail call float @llvm.nvvm.div.full(float %527, float %52), !dbg !53 + %588 = tail call float @llvm.nvvm.div.full(float %529, float %52), !dbg !53 + %589 = tail call float @llvm.nvvm.div.full(float %531, float %52), !dbg !53 + %590 = tail call float @llvm.nvvm.div.full(float %533, float %52), !dbg !53 + %591 = tail call float @llvm.nvvm.div.full(float %535, float %52), !dbg !53 + %592 = tail call float @llvm.nvvm.div.full(float %537, float %52), !dbg !53 + %593 = tail call 
float @llvm.nvvm.div.full(float %539, float %52), !dbg !53 + %594 = tail call float @llvm.nvvm.div.full(float %541, float %52), !dbg !53 + %595 = tail call float @llvm.nvvm.div.full(float %543, float %52), !dbg !53 + %596 = tail call float @llvm.nvvm.div.full(float %545, float %52), !dbg !53 + %597 = tail call float @llvm.nvvm.div.full(float %547, float %52), !dbg !53 + %598 = tail call float @llvm.nvvm.div.full(float %549, float %52), !dbg !53 + %599 = tail call float @llvm.nvvm.div.full(float %551, float %52), !dbg !53 + %600 = tail call float @llvm.nvvm.div.full(float %553, float %52), !dbg !53 + %601 = tail call float @llvm.nvvm.div.full(float %555, float %52), !dbg !53 + %602 = tail call float @llvm.nvvm.div.full(float %557, float %52), !dbg !53 + %603 = tail call float @llvm.nvvm.div.full(float %559, float %52), !dbg !53 + %604 = tail call float @llvm.nvvm.div.full(float %561, float %52), !dbg !53 + %605 = tail call float @llvm.nvvm.div.full(float %563, float %52), !dbg !53 + %606 = tail call float @llvm.nvvm.div.full(float %565, float %52), !dbg !53 + %607 = tail call float @llvm.nvvm.div.full(float %567, float %52), !dbg !53 + %608 = tail call float @llvm.nvvm.div.full(float %569, float %52), !dbg !53 + %609 = tail call float @llvm.nvvm.div.full(float %571, float %52), !dbg !53 + %610 = tail call float @llvm.nvvm.div.full(float %573, float %52), !dbg !53 + %611 = tail call float @llvm.nvvm.div.full(float %575, float %52), !dbg !53 + %612 = tail call float @llvm.nvvm.div.full(float %577, float %52), !dbg !53 + %613 = tail call float @llvm.nvvm.div.full(float %579, float %52), !dbg !53 + %614 = tail call float @llvm.nvvm.div.full(float %581, float %52), !dbg !53 + %615 = tail call float @llvm.nvvm.div.full(float %583, float %52), !dbg !53 + %616 = insertelement <2 x i32> poison, i32 %449, i64 0, !dbg !50 + %617 = insertelement <2 x i32> %616, i32 %450, i64 1, !dbg !50 + %618 = bitcast <2 x i32> %617 to <2 x float>, !dbg !50 + %619 = insertelement <2 x float> 
poison, float %584, i64 0, !dbg !54 + %620 = insertelement <2 x float> %619, float %585, i64 1, !dbg !54 + %621 = fmul <2 x float> %312, %620, !dbg !54 + %622 = fmul <2 x float> %314, %618, !dbg !55 + %623 = fsub <2 x float> %621, %622, !dbg !56 + %624 = fadd <2 x float> %623, zeroinitializer, !dbg !56 + %625 = fptrunc <2 x float> %624 to <2 x bfloat>, !dbg !57 + %626 = insertelement <2 x i32> poison, i32 %451, i64 0, !dbg !50 + %627 = insertelement <2 x i32> %626, i32 %452, i64 1, !dbg !50 + %628 = bitcast <2 x i32> %627 to <2 x float>, !dbg !50 + %629 = insertelement <2 x float> poison, float %586, i64 0, !dbg !54 + %630 = insertelement <2 x float> %629, float %587, i64 1, !dbg !54 + %631 = fmul <2 x float> %312, %630, !dbg !54 + %632 = fmul <2 x float> %314, %628, !dbg !55 + %633 = fsub <2 x float> %631, %632, !dbg !56 + %634 = fadd <2 x float> %633, zeroinitializer, !dbg !56 + %635 = fptrunc <2 x float> %634 to <2 x bfloat>, !dbg !57 + %636 = insertelement <2 x i32> poison, i32 %454, i64 0, !dbg !50 + %637 = insertelement <2 x i32> %636, i32 %455, i64 1, !dbg !50 + %638 = bitcast <2 x i32> %637 to <2 x float>, !dbg !50 + %639 = insertelement <2 x float> poison, float %588, i64 0, !dbg !54 + %640 = insertelement <2 x float> %639, float %589, i64 1, !dbg !54 + %641 = fmul <2 x float> %312, %640, !dbg !54 + %642 = fmul <2 x float> %314, %638, !dbg !55 + %643 = fsub <2 x float> %641, %642, !dbg !56 + %644 = fadd <2 x float> %643, zeroinitializer, !dbg !56 + %645 = fptrunc <2 x float> %644 to <2 x bfloat>, !dbg !57 + %646 = insertelement <2 x i32> poison, i32 %456, i64 0, !dbg !50 + %647 = insertelement <2 x i32> %646, i32 %457, i64 1, !dbg !50 + %648 = bitcast <2 x i32> %647 to <2 x float>, !dbg !50 + %649 = insertelement <2 x float> poison, float %590, i64 0, !dbg !54 + %650 = insertelement <2 x float> %649, float %591, i64 1, !dbg !54 + %651 = fmul <2 x float> %312, %650, !dbg !54 + %652 = fmul <2 x float> %314, %648, !dbg !55 + %653 = fsub <2 x float> %651, 
%652, !dbg !56 + %654 = fadd <2 x float> %653, zeroinitializer, !dbg !56 + %655 = fptrunc <2 x float> %654 to <2 x bfloat>, !dbg !57 + %656 = insertelement <2 x i32> poison, i32 %459, i64 0, !dbg !50 + %657 = insertelement <2 x i32> %656, i32 %460, i64 1, !dbg !50 + %658 = bitcast <2 x i32> %657 to <2 x float>, !dbg !50 + %659 = insertelement <2 x float> poison, float %592, i64 0, !dbg !54 + %660 = insertelement <2 x float> %659, float %593, i64 1, !dbg !54 + %661 = fmul <2 x float> %312, %660, !dbg !54 + %662 = fmul <2 x float> %314, %658, !dbg !55 + %663 = fsub <2 x float> %661, %662, !dbg !56 + %664 = fadd <2 x float> %663, zeroinitializer, !dbg !56 + %665 = fptrunc <2 x float> %664 to <2 x bfloat>, !dbg !57 + %666 = insertelement <2 x i32> poison, i32 %461, i64 0, !dbg !50 + %667 = insertelement <2 x i32> %666, i32 %462, i64 1, !dbg !50 + %668 = bitcast <2 x i32> %667 to <2 x float>, !dbg !50 + %669 = insertelement <2 x float> poison, float %594, i64 0, !dbg !54 + %670 = insertelement <2 x float> %669, float %595, i64 1, !dbg !54 + %671 = fmul <2 x float> %312, %670, !dbg !54 + %672 = fmul <2 x float> %314, %668, !dbg !55 + %673 = fsub <2 x float> %671, %672, !dbg !56 + %674 = fadd <2 x float> %673, zeroinitializer, !dbg !56 + %675 = fptrunc <2 x float> %674 to <2 x bfloat>, !dbg !57 + %676 = insertelement <2 x i32> poison, i32 %464, i64 0, !dbg !50 + %677 = insertelement <2 x i32> %676, i32 %465, i64 1, !dbg !50 + %678 = bitcast <2 x i32> %677 to <2 x float>, !dbg !50 + %679 = insertelement <2 x float> poison, float %596, i64 0, !dbg !54 + %680 = insertelement <2 x float> %679, float %597, i64 1, !dbg !54 + %681 = fmul <2 x float> %312, %680, !dbg !54 + %682 = fmul <2 x float> %314, %678, !dbg !55 + %683 = fsub <2 x float> %681, %682, !dbg !56 + %684 = fadd <2 x float> %683, zeroinitializer, !dbg !56 + %685 = fptrunc <2 x float> %684 to <2 x bfloat>, !dbg !57 + %686 = insertelement <2 x i32> poison, i32 %466, i64 0, !dbg !50 + %687 = insertelement <2 x i32> 
%686, i32 %467, i64 1, !dbg !50 + %688 = bitcast <2 x i32> %687 to <2 x float>, !dbg !50 + %689 = insertelement <2 x float> poison, float %598, i64 0, !dbg !54 + %690 = insertelement <2 x float> %689, float %599, i64 1, !dbg !54 + %691 = fmul <2 x float> %312, %690, !dbg !54 + %692 = fmul <2 x float> %314, %688, !dbg !55 + %693 = fsub <2 x float> %691, %692, !dbg !56 + %694 = fadd <2 x float> %693, zeroinitializer, !dbg !56 + %695 = fptrunc <2 x float> %694 to <2 x bfloat>, !dbg !57 + %696 = insertelement <2 x i32> poison, i32 %469, i64 0, !dbg !50 + %697 = insertelement <2 x i32> %696, i32 %470, i64 1, !dbg !50 + %698 = bitcast <2 x i32> %697 to <2 x float>, !dbg !50 + %699 = insertelement <2 x float> poison, float %600, i64 0, !dbg !54 + %700 = insertelement <2 x float> %699, float %601, i64 1, !dbg !54 + %701 = fmul <2 x float> %312, %700, !dbg !54 + %702 = fmul <2 x float> %314, %698, !dbg !55 + %703 = fsub <2 x float> %701, %702, !dbg !56 + %704 = fadd <2 x float> %703, zeroinitializer, !dbg !56 + %705 = fptrunc <2 x float> %704 to <2 x bfloat>, !dbg !57 + %706 = insertelement <2 x i32> poison, i32 %471, i64 0, !dbg !50 + %707 = insertelement <2 x i32> %706, i32 %472, i64 1, !dbg !50 + %708 = bitcast <2 x i32> %707 to <2 x float>, !dbg !50 + %709 = insertelement <2 x float> poison, float %602, i64 0, !dbg !54 + %710 = insertelement <2 x float> %709, float %603, i64 1, !dbg !54 + %711 = fmul <2 x float> %312, %710, !dbg !54 + %712 = fmul <2 x float> %314, %708, !dbg !55 + %713 = fsub <2 x float> %711, %712, !dbg !56 + %714 = fadd <2 x float> %713, zeroinitializer, !dbg !56 + %715 = fptrunc <2 x float> %714 to <2 x bfloat>, !dbg !57 + %716 = insertelement <2 x i32> poison, i32 %474, i64 0, !dbg !50 + %717 = insertelement <2 x i32> %716, i32 %475, i64 1, !dbg !50 + %718 = bitcast <2 x i32> %717 to <2 x float>, !dbg !50 + %719 = insertelement <2 x float> poison, float %604, i64 0, !dbg !54 + %720 = insertelement <2 x float> %719, float %605, i64 1, !dbg !54 + %721 
= fmul <2 x float> %312, %720, !dbg !54 + %722 = fmul <2 x float> %314, %718, !dbg !55 + %723 = fsub <2 x float> %721, %722, !dbg !56 + %724 = fadd <2 x float> %723, zeroinitializer, !dbg !56 + %725 = fptrunc <2 x float> %724 to <2 x bfloat>, !dbg !57 + %726 = insertelement <2 x i32> poison, i32 %476, i64 0, !dbg !50 + %727 = insertelement <2 x i32> %726, i32 %477, i64 1, !dbg !50 + %728 = bitcast <2 x i32> %727 to <2 x float>, !dbg !50 + %729 = insertelement <2 x float> poison, float %606, i64 0, !dbg !54 + %730 = insertelement <2 x float> %729, float %607, i64 1, !dbg !54 + %731 = fmul <2 x float> %312, %730, !dbg !54 + %732 = fmul <2 x float> %314, %728, !dbg !55 + %733 = fsub <2 x float> %731, %732, !dbg !56 + %734 = fadd <2 x float> %733, zeroinitializer, !dbg !56 + %735 = fptrunc <2 x float> %734 to <2 x bfloat>, !dbg !57 + %736 = insertelement <2 x i32> poison, i32 %479, i64 0, !dbg !50 + %737 = insertelement <2 x i32> %736, i32 %480, i64 1, !dbg !50 + %738 = bitcast <2 x i32> %737 to <2 x float>, !dbg !50 + %739 = insertelement <2 x float> poison, float %608, i64 0, !dbg !54 + %740 = insertelement <2 x float> %739, float %609, i64 1, !dbg !54 + %741 = fmul <2 x float> %312, %740, !dbg !54 + %742 = fmul <2 x float> %314, %738, !dbg !55 + %743 = fsub <2 x float> %741, %742, !dbg !56 + %744 = fadd <2 x float> %743, zeroinitializer, !dbg !56 + %745 = fptrunc <2 x float> %744 to <2 x bfloat>, !dbg !57 + %746 = insertelement <2 x i32> poison, i32 %481, i64 0, !dbg !50 + %747 = insertelement <2 x i32> %746, i32 %482, i64 1, !dbg !50 + %748 = bitcast <2 x i32> %747 to <2 x float>, !dbg !50 + %749 = insertelement <2 x float> poison, float %610, i64 0, !dbg !54 + %750 = insertelement <2 x float> %749, float %611, i64 1, !dbg !54 + %751 = fmul <2 x float> %312, %750, !dbg !54 + %752 = fmul <2 x float> %314, %748, !dbg !55 + %753 = fsub <2 x float> %751, %752, !dbg !56 + %754 = fadd <2 x float> %753, zeroinitializer, !dbg !56 + %755 = fptrunc <2 x float> %754 to <2 x 
bfloat>, !dbg !57 + %756 = insertelement <2 x i32> poison, i32 %484, i64 0, !dbg !50 + %757 = insertelement <2 x i32> %756, i32 %485, i64 1, !dbg !50 + %758 = bitcast <2 x i32> %757 to <2 x float>, !dbg !50 + %759 = insertelement <2 x float> poison, float %612, i64 0, !dbg !54 + %760 = insertelement <2 x float> %759, float %613, i64 1, !dbg !54 + %761 = fmul <2 x float> %312, %760, !dbg !54 + %762 = fmul <2 x float> %314, %758, !dbg !55 + %763 = fsub <2 x float> %761, %762, !dbg !56 + %764 = fadd <2 x float> %763, zeroinitializer, !dbg !56 + %765 = fptrunc <2 x float> %764 to <2 x bfloat>, !dbg !57 + %766 = insertelement <2 x i32> poison, i32 %486, i64 0, !dbg !50 + %767 = insertelement <2 x i32> %766, i32 %487, i64 1, !dbg !50 + %768 = bitcast <2 x i32> %767 to <2 x float>, !dbg !50 + %769 = insertelement <2 x float> poison, float %614, i64 0, !dbg !54 + %770 = insertelement <2 x float> %769, float %615, i64 1, !dbg !54 + %771 = fmul <2 x float> %312, %770, !dbg !54 + %772 = fmul <2 x float> %314, %768, !dbg !55 + %773 = fsub <2 x float> %771, %772, !dbg !56 + %774 = fadd <2 x float> %773, zeroinitializer, !dbg !56 + %775 = fptrunc <2 x float> %774 to <2 x bfloat>, !dbg !57 + %776 = bitcast <2 x bfloat> %625 to i32, !dbg !57 + %777 = bitcast <2 x bfloat> %635 to i32, !dbg !57 + %778 = bitcast <2 x bfloat> %645 to i32, !dbg !57 + %779 = bitcast <2 x bfloat> %655 to i32, !dbg !57 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %776, i32 %777, i32 %778, i32 %779, ptr addrspace(1) %329, i1 %324) #5, !dbg !57 + %780 = bitcast <2 x bfloat> %665 to i32, !dbg !57 + %781 = bitcast <2 x bfloat> %675 to i32, !dbg !57 + %782 = bitcast <2 x bfloat> %685 to i32, !dbg !57 + %783 = bitcast <2 x bfloat> %695 to i32, !dbg !57 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %780, i32 %781, i32 %782, i32 %783, ptr addrspace(1) %331, i1 %325) #5, !dbg !57 + %784 = 
bitcast <2 x bfloat> %705 to i32, !dbg !57 + %785 = bitcast <2 x bfloat> %715 to i32, !dbg !57 + %786 = bitcast <2 x bfloat> %725 to i32, !dbg !57 + %787 = bitcast <2 x bfloat> %735 to i32, !dbg !57 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %784, i32 %785, i32 %786, i32 %787, ptr addrspace(1) %333, i1 %326) #5, !dbg !57 + %788 = bitcast <2 x bfloat> %745 to i32, !dbg !57 + %789 = bitcast <2 x bfloat> %755 to i32, !dbg !57 + %790 = bitcast <2 x bfloat> %765 to i32, !dbg !57 + %791 = bitcast <2 x bfloat> %775 to i32, !dbg !57 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %788, i32 %789, i32 %790, i32 %791, ptr addrspace(1) %335, i1 %327) #5, !dbg !57 + %792 = add i32 %315, 32768, !dbg !43 + %793 = icmp slt i32 %792, %9, !dbg !43 + br i1 %793, label %.lr.ph5, label %.loopexit, !dbg !43 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.ex2.approx.f(float) #2 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.div.full(float, float) #2 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #4 + +attributes #0 = { nounwind "nvvm.reqntid"="1024" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) 
} +attributes #2 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) } +attributes #3 = { convergent nocallback nounwind } +attributes #4 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #5 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "loss.py", directory: "/workspace/hanrui/SpecForge-ext/specforge/core") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "log_softmax_backward_kernel", linkageName: "log_softmax_backward_kernel", scope: !1, file: !1, line: 114, type: !5, scopeLine: 114, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 127, column: 31, scope: !4) +!8 = !DILocation(line: 127, column: 37, scope: !4) +!9 = !DILocation(line: 128, column: 31, scope: !4) +!10 = !DILocation(line: 128, column: 18, scope: !4) +!11 = !DILocation(line: 129, column: 31, scope: !4) +!12 = !DILocation(line: 129, column: 18, scope: !4) +!13 = !DILocation(line: 130, column: 25, scope: !4) +!14 = !DILocation(line: 132, column: 28, scope: !4) +!15 = !DILocation(line: 133, column: 24, scope: !4) +!16 = !DILocation(line: 135, column: 39, scope: !4) +!17 = !DILocation(line: 134, column: 34, scope: !4) +!18 = !DILocation(line: 135, column: 26, scope: !4) +!19 = !DILocation(line: 136, column: 29, scope: !4) +!20 = !DILocation(line: 137, column: 34, scope: !4) +!21 = !DILocation(line: 137, column: 43, scope: !4) +!22 = !DILocation(line: 138, column: 8, scope: !4) +!23 = !DILocation(line: 140, column: 13, scope: !4) +!24 = !DILocation(line: 141, column: 13, scope: !4) +!25 = !DILocation(line: 142, column: 16, scope: !4) +!26 = !DILocation(line: 143, column: 16, scope: !4) 
+!27 = !DILocation(line: 144, column: 26, scope: !4) +!28 = !DILocation(line: 145, column: 32, scope: !4) +!29 = !DILocation(line: 150, column: 35, scope: !4) +!30 = !DILocation(line: 149, column: 30, scope: !4) +!31 = !DILocation(line: 150, column: 22, scope: !4) +!32 = !DILocation(line: 151, column: 25, scope: !4) +!33 = !DILocation(line: 152, column: 44, scope: !4) +!34 = !DILocation(line: 152, column: 31, scope: !4) +!35 = !DILocation(line: 155, column: 64, scope: !4) +!36 = !DILocation(line: 155, column: 77, scope: !4) +!37 = !DILocation(line: 291, column: 36, scope: !38, inlinedAt: !40) +!38 = distinct !DILexicalBlockFile(scope: !4, file: !39, discriminator: 0) +!39 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language") +!40 = !DILocation(line: 155, column: 34, scope: !4) +!41 = !DILocation(line: 261, column: 15, scope: !38, inlinedAt: !40) +!42 = !DILocation(line: 155, column: 27, scope: !4) +!43 = !DILocation(line: 158, column: 30, scope: !4) +!44 = !DILocation(line: 159, column: 22, scope: !4) +!45 = !DILocation(line: 160, column: 25, scope: !4) +!46 = !DILocation(line: 161, column: 44, scope: !4) +!47 = !DILocation(line: 161, column: 31, scope: !4) +!48 = !DILocation(line: 162, column: 12, scope: !4) +!49 = !DILocation(line: 164, column: 44, scope: !4) +!50 = !DILocation(line: 164, column: 31, scope: !4) +!51 = !DILocation(line: 167, column: 45, scope: !4) +!52 = !DILocation(line: 167, column: 30, scope: !4) +!53 = !DILocation(line: 167, column: 50, scope: !4) +!54 = !DILocation(line: 168, column: 41, scope: !4) +!55 = !DILocation(line: 169, column: 38, scope: !4) +!56 = !DILocation(line: 169, column: 23, scope: !4) +!57 = !DILocation(line: 170, column: 39, scope: !4) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/QFKUJZNM4B4WZ6YCXLCLTXWGUKQRBKMH2LTZ7GNQ5C5WM6AH5V6A/log_softmax_backward_kernel.ptx 
b/SpecForge-ext/cache/compiled_kernels/triton/7/QFKUJZNM4B4WZ6YCXLCLTXWGUKQRBKMH2LTZ7GNQ5C5WM6AH5V6A/log_softmax_backward_kernel.ptx new file mode 100644 index 0000000000000000000000000000000000000000..a901c983491862323e4953ef991691b692d488a4 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/QFKUJZNM4B4WZ6YCXLCLTXWGUKQRBKMH2LTZ7GNQ5C5WM6AH5V6A/log_softmax_backward_kernel.ptx @@ -0,0 +1,1182 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl log_softmax_backward_kernel // -- Begin function log_softmax_backward_kernel +.extern .shared .align 16 .b8 global_smem[]; + // @log_softmax_backward_kernel +.visible .entry log_softmax_backward_kernel( + .param .u64 .ptr .global .align 1 log_softmax_backward_kernel_param_0, + .param .u32 log_softmax_backward_kernel_param_1, + .param .u64 .ptr .global .align 1 log_softmax_backward_kernel_param_2, + .param .u32 log_softmax_backward_kernel_param_3, + .param .u64 .ptr .global .align 1 log_softmax_backward_kernel_param_4, + .param .u64 .ptr .global .align 1 log_softmax_backward_kernel_param_5, + .param .f32 log_softmax_backward_kernel_param_6, + .param .u64 .ptr .global .align 1 log_softmax_backward_kernel_param_7, + .param .u64 .ptr .global .align 1 log_softmax_backward_kernel_param_8, + .param .u32 log_softmax_backward_kernel_param_9, + .param .u64 .ptr .global .align 1 log_softmax_backward_kernel_param_10, + .param .u64 .ptr .global .align 1 log_softmax_backward_kernel_param_11 +) +.reqntid 1024 +{ + .reg .pred %p<38>; + .reg .b16 %rs<35>; + .reg .b32 %r<671>; + .reg .b64 %rd<126>; + .loc 1 114 0 // loss.py:114:0 +$L__func_begin0: + .loc 1 114 0 // loss.py:114:0 + +// %bb.0: + ld.param.b32 %r19, [log_softmax_backward_kernel_param_9]; + ld.param.b64 %rd11, [log_softmax_backward_kernel_param_0]; + ld.param.s32 %rd12, [log_softmax_backward_kernel_param_1]; +$L__tmp0: + .loc 1 127 31 // loss.py:127:31 + mov.u32 %r20, %ctaid.x; + .loc 1 127 37 // 
loss.py:127:37 + cvt.u64.u32 %rd1, %r20; + .loc 1 128 31 // loss.py:128:31 + mul.lo.s64 %rd15, %rd12, %rd1; + ld.param.b64 %rd16, [log_softmax_backward_kernel_param_4]; + .loc 1 128 18 // loss.py:128:18 + shl.b64 %rd17, %rd15, 1; + add.s64 %rd2, %rd11, %rd17; + .loc 1 130 25 // loss.py:130:25 + add.s64 %rd10, %rd16, %rd1; + .loc 1 132 28 // loss.py:132:28 + // begin inline asm + mov.u16 %rs1, 0x0; + ld.global.b8 { %rs1 }, [ %rd10 + 0 ]; + // end inline asm + and.b16 %rs2, %rs1, 255; + setp.ne.b16 %p1, %rs2, 0; + .loc 1 133 24 // loss.py:133:24 + @%p1 bra $L__BB0_5; + bra.uni $L__BB0_1; +$L__BB0_5: + .loc 1 0 24 // loss.py:0:24 + ld.param.b64 %rd9, [log_softmax_backward_kernel_param_8]; + ld.param.b64 %rd8, [log_softmax_backward_kernel_param_7]; + ld.param.b64 %rd7, [log_softmax_backward_kernel_param_5]; + .loc 1 140 13 // loss.py:140:13 + shl.b64 %rd23, %rd1, 2; + add.s64 %rd20, %rd8, %rd23; + .loc 1 141 13 // loss.py:141:13 + add.s64 %rd21, %rd9, %rd23; + .loc 1 142 16 // loss.py:142:16 + // begin inline asm + mov.u32 %r21, 0x0; + ld.global.b32 { %r21 }, [ %rd20 + 0 ]; + // end inline asm + .loc 1 143 16 // loss.py:143:16 + // begin inline asm + mov.u32 %r22, 0x0; + ld.global.b32 { %r22 }, [ %rd21 + 0 ]; + // end inline asm + .loc 1 144 26 // loss.py:144:26 + // begin inline asm + mov.u32 %r23, 0x0; + ld.global.b32 { %r23 }, [ %rd7 + 0 ]; + // end inline asm + .loc 1 149 30 // loss.py:149:30 + setp.lt.s32 %p2, %r19, 1; + @%p2 bra $L__BB0_4; +// %bb.6: // %.lr.ph + .loc 1 0 30 // loss.py:0:30 + ld.param.b32 %r18, [log_softmax_backward_kernel_param_6]; + ld.param.b64 %rd13, [log_softmax_backward_kernel_param_2]; + ld.param.s32 %rd14, [log_softmax_backward_kernel_param_3]; + mul.lo.s64 %rd18, %rd14, %rd1; + shl.b64 %rd19, %rd18, 2; + add.s64 %rd3, %rd13, %rd19; + mul.f32 %r6, %r18, %r23; + .loc 1 150 35 // loss.py:150:35 + mov.u32 %r7, %tid.x; + shl.b32 %r8, %r7, 3; + and.b32 %r9, %r7, 31; + shr.u32 %r26, %r7, 3; + and.b32 %r27, %r26, 124; + mov.b32 %r28, 
global_smem; + add.s32 %r94, %r28, %r27; + shl.b32 %r29, %r7, 2; + add.s32 %r97, %r28, %r29; + mov.b32 %r34, 0; + mov.b32 %r668, 0f00000000; + setp.eq.b32 %p13, %r7, 0; + setp.lt.u32 %p12, %r7, 32; + setp.eq.b32 %p11, %r9, 0; + cvt.u64.u32 %rd4, %r8; + mov.b32 %r669, %r34; +$L__BB0_7: // =>This Inner Loop Header: Depth=1 + .loc 1 150 22 // loss.py:150:22 + add.s32 %r100, %r8, %r669; + add.s32 %r101, %r100, 8192; + add.s32 %r102, %r100, 16384; + .loc 1 151 25 // loss.py:151:25 + add.s32 %r103, %r100, 24576; + setp.lt.s32 %p3, %r100, %r19; + setp.lt.s32 %p5, %r101, %r19; + setp.lt.s32 %p7, %r102, %r19; + setp.lt.s32 %p9, %r103, %r19; + .loc 1 152 44 // loss.py:152:44 + mad.wide.s32 %rd24, %r100, 4, %rd3; + cvt.s64.s32 %rd32, %r669; + add.s64 %rd33, %rd32, %rd4; + shl.b64 %rd34, %rd33, 2; + add.s64 %rd35, %rd3, %rd34; + add.s64 %rd25, %rd35, 16; + add.s64 %rd26, %rd35, 32768; + add.s64 %rd27, %rd35, 32784; + add.s64 %rd28, %rd35, 65536; + add.s64 %rd29, %rd35, 65552; + add.s64 %rd30, %rd35, 98304; + add.s64 %rd31, %rd35, 98320; + .loc 1 152 31 // loss.py:152:31 + // begin inline asm + mov.u32 %r30, %r34; + mov.u32 %r31, %r34; + mov.u32 %r32, %r34; + mov.u32 %r33, %r34; + @%p3 ld.global.v4.b32 { %r30, %r31, %r32, %r33 }, [ %rd24 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r38, %r34; + mov.u32 %r39, %r34; + mov.u32 %r40, %r34; + mov.u32 %r41, %r34; + @%p3 ld.global.v4.b32 { %r38, %r39, %r40, %r41 }, [ %rd25 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r46, %r34; + mov.u32 %r47, %r34; + mov.u32 %r48, %r34; + mov.u32 %r49, %r34; + @%p5 ld.global.v4.b32 { %r46, %r47, %r48, %r49 }, [ %rd26 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r54, %r34; + mov.u32 %r55, %r34; + mov.u32 %r56, %r34; + mov.u32 %r57, %r34; + @%p5 ld.global.v4.b32 { %r54, %r55, %r56, %r57 }, [ %rd27 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r62, %r34; + mov.u32 %r63, %r34; + mov.u32 %r64, %r34; + mov.u32 %r65, %r34; + @%p7 ld.global.v4.b32 { 
%r62, %r63, %r64, %r65 }, [ %rd28 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r70, %r34; + mov.u32 %r71, %r34; + mov.u32 %r72, %r34; + mov.u32 %r73, %r34; + @%p7 ld.global.v4.b32 { %r70, %r71, %r72, %r73 }, [ %rd29 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r78, %r34; + mov.u32 %r79, %r34; + mov.u32 %r80, %r34; + mov.u32 %r81, %r34; + @%p9 ld.global.v4.b32 { %r78, %r79, %r80, %r81 }, [ %rd30 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r86, %r34; + mov.u32 %r87, %r34; + mov.u32 %r88, %r34; + mov.u32 %r89, %r34; + @%p9 ld.global.v4.b32 { %r86, %r87, %r88, %r89 }, [ %rd31 + 0 ]; + // end inline asm + .loc 1 155 64 // loss.py:155:64 + mul.f32 %r104, %r6, %r31; + mul.f32 %r105, %r6, %r46; + mul.f32 %r106, %r6, %r47; + mul.f32 %r107, %r6, %r48; + mul.f32 %r108, %r6, %r49; + mul.f32 %r109, %r6, %r54; + mul.f32 %r110, %r6, %r55; + mul.f32 %r111, %r6, %r56; + mul.f32 %r112, %r6, %r57; + mul.f32 %r113, %r6, %r62; + mul.f32 %r114, %r6, %r63; + mul.f32 %r115, %r6, %r64; + mul.f32 %r116, %r6, %r65; + mul.f32 %r117, %r6, %r70; + mul.f32 %r118, %r6, %r71; + mul.f32 %r119, %r6, %r72; + mul.f32 %r120, %r6, %r73; + mul.f32 %r121, %r6, %r78; + mul.f32 %r122, %r6, %r79; + mul.f32 %r123, %r6, %r80; + mul.f32 %r124, %r6, %r81; + mul.f32 %r125, %r6, %r86; + mul.f32 %r126, %r6, %r87; + mul.f32 %r127, %r6, %r88; + mul.f32 %r128, %r6, %r89; + .loc 1 155 77 // loss.py:155:77 + selp.f32 %r129, %r105, 0f00000000, %p5; + selp.f32 %r130, %r106, 0f00000000, %p5; + selp.f32 %r131, %r107, 0f00000000, %p5; + selp.f32 %r132, %r108, 0f00000000, %p5; + selp.f32 %r133, %r109, 0f00000000, %p5; + selp.f32 %r134, %r110, 0f00000000, %p5; + selp.f32 %r135, %r111, 0f00000000, %p5; + selp.f32 %r136, %r112, 0f00000000, %p5; + selp.f32 %r137, %r113, 0f00000000, %p7; + selp.f32 %r138, %r114, 0f00000000, %p7; + selp.f32 %r139, %r115, 0f00000000, %p7; + selp.f32 %r140, %r116, 0f00000000, %p7; + selp.f32 %r141, %r117, 0f00000000, %p7; + selp.f32 %r142, %r118, 
0f00000000, %p7; + selp.f32 %r143, %r119, 0f00000000, %p7; + selp.f32 %r144, %r120, 0f00000000, %p7; + selp.f32 %r145, %r121, 0f00000000, %p9; + selp.f32 %r146, %r122, 0f00000000, %p9; + selp.f32 %r147, %r123, 0f00000000, %p9; + selp.f32 %r148, %r124, 0f00000000, %p9; + selp.f32 %r149, %r125, 0f00000000, %p9; + selp.f32 %r150, %r126, 0f00000000, %p9; + selp.f32 %r151, %r127, 0f00000000, %p9; + selp.f32 %r152, %r128, 0f00000000, %p9; +$L__tmp1: + .loc 2 291 36 // standard.py:291:36 @[ loss.py:155:34 ] + bar.sync 0; + .loc 2 261 15 // standard.py:261:15 @[ loss.py:155:34 ] + fma.rn.f32 %r153, %r6, %r30, %r104; + fma.rn.f32 %r154, %r6, %r32, %r153; + fma.rn.f32 %r155, %r6, %r33, %r154; + fma.rn.f32 %r156, %r6, %r38, %r155; + fma.rn.f32 %r157, %r6, %r39, %r156; + fma.rn.f32 %r158, %r6, %r40, %r157; + fma.rn.f32 %r159, %r6, %r41, %r158; + selp.f32 %r160, %r159, 0f00000000, %p3; + add.f32 %r161, %r129, %r160; + add.f32 %r162, %r130, %r161; + add.f32 %r163, %r131, %r162; + add.f32 %r164, %r132, %r163; + add.f32 %r165, %r133, %r164; + add.f32 %r166, %r134, %r165; + add.f32 %r167, %r135, %r166; + add.f32 %r168, %r136, %r167; + add.f32 %r169, %r137, %r168; + add.f32 %r170, %r138, %r169; + add.f32 %r171, %r139, %r170; + add.f32 %r172, %r140, %r171; + add.f32 %r173, %r141, %r172; + add.f32 %r174, %r142, %r173; + add.f32 %r175, %r143, %r174; + add.f32 %r176, %r144, %r175; + add.f32 %r177, %r145, %r176; + add.f32 %r178, %r146, %r177; + add.f32 %r179, %r147, %r178; + add.f32 %r180, %r148, %r179; + add.f32 %r181, %r149, %r180; + add.f32 %r182, %r150, %r181; + add.f32 %r183, %r151, %r182; + add.f32 %r184, %r152, %r183; + .loc 2 291 36 // standard.py:291:36 @[ loss.py:155:34 ] + shfl.sync.bfly.b32 %r185, %r184, 16, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ loss.py:155:34 ] + add.f32 %r186, %r184, %r185; + .loc 2 291 36 // standard.py:291:36 @[ loss.py:155:34 ] + shfl.sync.bfly.b32 %r187, %r186, 8, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ loss.py:155:34 ] + add.f32 
%r188, %r186, %r187; + .loc 2 291 36 // standard.py:291:36 @[ loss.py:155:34 ] + shfl.sync.bfly.b32 %r189, %r188, 4, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ loss.py:155:34 ] + add.f32 %r190, %r188, %r189; + .loc 2 291 36 // standard.py:291:36 @[ loss.py:155:34 ] + shfl.sync.bfly.b32 %r191, %r190, 2, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ loss.py:155:34 ] + add.f32 %r192, %r190, %r191; + .loc 2 291 36 // standard.py:291:36 @[ loss.py:155:34 ] + shfl.sync.bfly.b32 %r193, %r192, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ loss.py:155:34 ] + add.f32 %r95, %r192, %r193; + .loc 2 291 36 // standard.py:291:36 @[ loss.py:155:34 ] + // begin inline asm + @%p11 st.shared.b32 [ %r94 + 0 ], %r95; + // end inline asm + bar.sync 0; + // begin inline asm + @%p12 ld.shared.b32 %r96, [ %r97 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r194, %r96, 16, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ loss.py:155:34 ] + add.f32 %r195, %r96, %r194; + .loc 2 291 36 // standard.py:291:36 @[ loss.py:155:34 ] + shfl.sync.bfly.b32 %r196, %r195, 8, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ loss.py:155:34 ] + add.f32 %r197, %r195, %r196; + .loc 2 291 36 // standard.py:291:36 @[ loss.py:155:34 ] + shfl.sync.bfly.b32 %r198, %r197, 4, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ loss.py:155:34 ] + add.f32 %r199, %r197, %r198; + .loc 2 291 36 // standard.py:291:36 @[ loss.py:155:34 ] + shfl.sync.bfly.b32 %r200, %r199, 2, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ loss.py:155:34 ] + add.f32 %r201, %r199, %r200; + .loc 2 291 36 // standard.py:291:36 @[ loss.py:155:34 ] + shfl.sync.bfly.b32 %r202, %r201, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ loss.py:155:34 ] + add.f32 %r99, %r201, %r202; + .loc 2 291 36 // standard.py:291:36 @[ loss.py:155:34 ] + // begin inline asm + @%p13 st.shared.b32 [ %r97 + 0 ], %r99; + // end inline asm + bar.sync 0; + ld.shared.b32 %r203, [global_smem]; +$L__tmp2: + .loc 1 155 27 // loss.py:155:27 + add.f32 %r668, 
%r668, %r203; + .loc 1 149 30 // loss.py:149:30 + add.s32 %r669, %r669, 32768; + setp.lt.s32 %p14, %r669, %r19; + @%p14 bra $L__BB0_7; +// %bb.8: // %.lr.ph5.preheader + .loc 1 0 30 // loss.py:0:30 + mov.b64 %rd5, {%r668, %r668}; + mov.b64 %rd6, {%r6, %r6}; + mov.b32 %r209, 0; + mov.b32 %r670, %r209; +$L__BB0_9: // %.lr.ph5 + // =>This Inner Loop Header: Depth=1 + .loc 1 159 22 // loss.py:159:22 + add.s32 %r317, %r8, %r670; + add.s32 %r318, %r317, 8192; + add.s32 %r319, %r317, 16384; + .loc 1 160 25 // loss.py:160:25 + add.s32 %r320, %r317, 24576; + setp.lt.s32 %p15, %r317, %r19; + setp.lt.s32 %p16, %r318, %r19; + setp.lt.s32 %p17, %r319, %r19; + setp.lt.s32 %p18, %r320, %r19; + .loc 1 161 44 // loss.py:161:44 + mad.wide.s32 %rd36, %r317, 2, %rd2; + cvt.s64.s32 %rd52, %r670; + add.s64 %rd53, %rd52, %rd4; + shl.b64 %rd54, %rd53, 1; + add.s64 %rd55, %rd2, %rd54; + add.s64 %rd37, %rd55, 16384; + add.s64 %rd38, %rd55, 32768; + add.s64 %rd39, %rd55, 49152; + .loc 1 161 31 // loss.py:161:31 + // begin inline asm + mov.u32 %r205, %r209; + mov.u32 %r206, %r209; + mov.u32 %r207, %r209; + mov.u32 %r208, %r209; + @%p15 ld.global.v4.b32 { %r205, %r206, %r207, %r208 }, [ %rd36 + 0 ]; + // end inline asm + mov.b32 {%rs3, %rs4}, %r205; + mov.b32 {%rs5, %rs6}, %r206; + mov.b32 {%rs7, %rs8}, %r207; + mov.b32 {%rs9, %rs10}, %r208; + // begin inline asm + mov.u32 %r213, %r209; + mov.u32 %r214, %r209; + mov.u32 %r215, %r209; + mov.u32 %r216, %r209; + @%p16 ld.global.v4.b32 { %r213, %r214, %r215, %r216 }, [ %rd37 + 0 ]; + // end inline asm + mov.b32 {%rs11, %rs12}, %r213; + mov.b32 {%rs13, %rs14}, %r214; + mov.b32 {%rs15, %rs16}, %r215; + mov.b32 {%rs17, %rs18}, %r216; + // begin inline asm + mov.u32 %r221, %r209; + mov.u32 %r222, %r209; + mov.u32 %r223, %r209; + mov.u32 %r224, %r209; + @%p17 ld.global.v4.b32 { %r221, %r222, %r223, %r224 }, [ %rd38 + 0 ]; + // end inline asm + mov.b32 {%rs19, %rs20}, %r221; + mov.b32 {%rs21, %rs22}, %r222; + mov.b32 {%rs23, %rs24}, %r223; + mov.b32 
{%rs25, %rs26}, %r224; + // begin inline asm + mov.u32 %r229, %r209; + mov.u32 %r230, %r209; + mov.u32 %r231, %r209; + mov.u32 %r232, %r209; + @%p18 ld.global.v4.b32 { %r229, %r230, %r231, %r232 }, [ %rd39 + 0 ]; + // end inline asm + mov.b32 {%rs27, %rs28}, %r229; + mov.b32 {%rs29, %rs30}, %r230; + mov.b32 {%rs31, %rs32}, %r231; + mov.b32 {%rs33, %rs34}, %r232; + .loc 1 162 12 // loss.py:162:12 + cvt.f32.bf16 %r321, %rs3; + cvt.f32.bf16 %r322, %rs4; + cvt.f32.bf16 %r323, %rs5; + cvt.f32.bf16 %r324, %rs6; + cvt.f32.bf16 %r325, %rs7; + cvt.f32.bf16 %r326, %rs8; + cvt.f32.bf16 %r327, %rs9; + cvt.f32.bf16 %r328, %rs10; + cvt.f32.bf16 %r329, %rs11; + cvt.f32.bf16 %r330, %rs12; + cvt.f32.bf16 %r331, %rs13; + cvt.f32.bf16 %r332, %rs14; + cvt.f32.bf16 %r333, %rs15; + cvt.f32.bf16 %r334, %rs16; + cvt.f32.bf16 %r335, %rs17; + cvt.f32.bf16 %r336, %rs18; + cvt.f32.bf16 %r337, %rs19; + cvt.f32.bf16 %r338, %rs20; + cvt.f32.bf16 %r339, %rs21; + cvt.f32.bf16 %r340, %rs22; + cvt.f32.bf16 %r341, %rs23; + cvt.f32.bf16 %r342, %rs24; + cvt.f32.bf16 %r343, %rs25; + cvt.f32.bf16 %r344, %rs26; + cvt.f32.bf16 %r345, %rs27; + cvt.f32.bf16 %r346, %rs28; + cvt.f32.bf16 %r347, %rs29; + cvt.f32.bf16 %r348, %rs30; + cvt.f32.bf16 %r349, %rs31; + cvt.f32.bf16 %r350, %rs32; + cvt.f32.bf16 %r351, %rs33; + cvt.f32.bf16 %r352, %rs34; + .loc 1 164 44 // loss.py:164:44 + mad.wide.s32 %rd40, %r317, 4, %rd3; + shl.b64 %rd56, %rd53, 2; + add.s64 %rd57, %rd3, %rd56; + add.s64 %rd41, %rd57, 16; + add.s64 %rd42, %rd57, 32768; + add.s64 %rd43, %rd57, 32784; + add.s64 %rd44, %rd57, 65536; + add.s64 %rd45, %rd57, 65552; + add.s64 %rd46, %rd57, 98304; + add.s64 %rd47, %rd57, 98320; + .loc 1 164 31 // loss.py:164:31 + // begin inline asm + mov.u32 %r237, %r209; + mov.u32 %r238, %r209; + mov.u32 %r239, %r209; + mov.u32 %r240, %r209; + @%p15 ld.global.v4.b32 { %r237, %r238, %r239, %r240 }, [ %rd40 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r245, %r209; + mov.u32 %r246, %r209; + mov.u32 %r247, 
%r209; + mov.u32 %r248, %r209; + @%p15 ld.global.v4.b32 { %r245, %r246, %r247, %r248 }, [ %rd41 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r253, %r209; + mov.u32 %r254, %r209; + mov.u32 %r255, %r209; + mov.u32 %r256, %r209; + @%p16 ld.global.v4.b32 { %r253, %r254, %r255, %r256 }, [ %rd42 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r261, %r209; + mov.u32 %r262, %r209; + mov.u32 %r263, %r209; + mov.u32 %r264, %r209; + @%p16 ld.global.v4.b32 { %r261, %r262, %r263, %r264 }, [ %rd43 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r269, %r209; + mov.u32 %r270, %r209; + mov.u32 %r271, %r209; + mov.u32 %r272, %r209; + @%p17 ld.global.v4.b32 { %r269, %r270, %r271, %r272 }, [ %rd44 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r277, %r209; + mov.u32 %r278, %r209; + mov.u32 %r279, %r209; + mov.u32 %r280, %r209; + @%p17 ld.global.v4.b32 { %r277, %r278, %r279, %r280 }, [ %rd45 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r285, %r209; + mov.u32 %r286, %r209; + mov.u32 %r287, %r209; + mov.u32 %r288, %r209; + @%p18 ld.global.v4.b32 { %r285, %r286, %r287, %r288 }, [ %rd46 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r293, %r209; + mov.u32 %r294, %r209; + mov.u32 %r295, %r209; + mov.u32 %r296, %r209; + @%p18 ld.global.v4.b32 { %r293, %r294, %r295, %r296 }, [ %rd47 + 0 ]; + // end inline asm + .loc 1 167 45 // loss.py:167:45 + sub.f32 %r353, %r321, %r21; + sub.f32 %r354, %r322, %r21; + sub.f32 %r355, %r323, %r21; + sub.f32 %r356, %r324, %r21; + sub.f32 %r357, %r325, %r21; + sub.f32 %r358, %r326, %r21; + sub.f32 %r359, %r327, %r21; + sub.f32 %r360, %r328, %r21; + sub.f32 %r361, %r329, %r21; + sub.f32 %r362, %r330, %r21; + sub.f32 %r363, %r331, %r21; + sub.f32 %r364, %r332, %r21; + sub.f32 %r365, %r333, %r21; + sub.f32 %r366, %r334, %r21; + sub.f32 %r367, %r335, %r21; + sub.f32 %r368, %r336, %r21; + sub.f32 %r369, %r337, %r21; + sub.f32 %r370, %r338, %r21; + sub.f32 %r371, %r339, %r21; + sub.f32 
%r372, %r340, %r21; + sub.f32 %r373, %r341, %r21; + sub.f32 %r374, %r342, %r21; + sub.f32 %r375, %r343, %r21; + sub.f32 %r376, %r344, %r21; + sub.f32 %r377, %r345, %r21; + sub.f32 %r378, %r346, %r21; + sub.f32 %r379, %r347, %r21; + sub.f32 %r380, %r348, %r21; + sub.f32 %r381, %r349, %r21; + sub.f32 %r382, %r350, %r21; + sub.f32 %r383, %r351, %r21; + sub.f32 %r384, %r352, %r21; + .loc 1 167 30 // loss.py:167:30 + mul.f32 %r385, %r353, 0f3FB8AA3B; + ex2.approx.f32 %r386, %r385; + mul.f32 %r387, %r354, 0f3FB8AA3B; + ex2.approx.f32 %r388, %r387; + mul.f32 %r389, %r355, 0f3FB8AA3B; + ex2.approx.f32 %r390, %r389; + mul.f32 %r391, %r356, 0f3FB8AA3B; + ex2.approx.f32 %r392, %r391; + mul.f32 %r393, %r357, 0f3FB8AA3B; + ex2.approx.f32 %r394, %r393; + mul.f32 %r395, %r358, 0f3FB8AA3B; + ex2.approx.f32 %r396, %r395; + mul.f32 %r397, %r359, 0f3FB8AA3B; + ex2.approx.f32 %r398, %r397; + mul.f32 %r399, %r360, 0f3FB8AA3B; + ex2.approx.f32 %r400, %r399; + mul.f32 %r401, %r361, 0f3FB8AA3B; + ex2.approx.f32 %r402, %r401; + mul.f32 %r403, %r362, 0f3FB8AA3B; + ex2.approx.f32 %r404, %r403; + mul.f32 %r405, %r363, 0f3FB8AA3B; + ex2.approx.f32 %r406, %r405; + mul.f32 %r407, %r364, 0f3FB8AA3B; + ex2.approx.f32 %r408, %r407; + mul.f32 %r409, %r365, 0f3FB8AA3B; + ex2.approx.f32 %r410, %r409; + mul.f32 %r411, %r366, 0f3FB8AA3B; + ex2.approx.f32 %r412, %r411; + mul.f32 %r413, %r367, 0f3FB8AA3B; + ex2.approx.f32 %r414, %r413; + mul.f32 %r415, %r368, 0f3FB8AA3B; + ex2.approx.f32 %r416, %r415; + mul.f32 %r417, %r369, 0f3FB8AA3B; + ex2.approx.f32 %r418, %r417; + mul.f32 %r419, %r370, 0f3FB8AA3B; + ex2.approx.f32 %r420, %r419; + mul.f32 %r421, %r371, 0f3FB8AA3B; + ex2.approx.f32 %r422, %r421; + mul.f32 %r423, %r372, 0f3FB8AA3B; + ex2.approx.f32 %r424, %r423; + mul.f32 %r425, %r373, 0f3FB8AA3B; + ex2.approx.f32 %r426, %r425; + mul.f32 %r427, %r374, 0f3FB8AA3B; + ex2.approx.f32 %r428, %r427; + mul.f32 %r429, %r375, 0f3FB8AA3B; + ex2.approx.f32 %r430, %r429; + mul.f32 %r431, %r376, 0f3FB8AA3B; + 
ex2.approx.f32 %r432, %r431; + mul.f32 %r433, %r377, 0f3FB8AA3B; + ex2.approx.f32 %r434, %r433; + mul.f32 %r435, %r378, 0f3FB8AA3B; + ex2.approx.f32 %r436, %r435; + mul.f32 %r437, %r379, 0f3FB8AA3B; + ex2.approx.f32 %r438, %r437; + mul.f32 %r439, %r380, 0f3FB8AA3B; + ex2.approx.f32 %r440, %r439; + mul.f32 %r441, %r381, 0f3FB8AA3B; + ex2.approx.f32 %r442, %r441; + mul.f32 %r443, %r382, 0f3FB8AA3B; + ex2.approx.f32 %r444, %r443; + mul.f32 %r445, %r383, 0f3FB8AA3B; + ex2.approx.f32 %r446, %r445; + mul.f32 %r447, %r384, 0f3FB8AA3B; + ex2.approx.f32 %r448, %r447; + .loc 1 167 50 // loss.py:167:50 + div.full.f32 %r449, %r386, %r22; + div.full.f32 %r450, %r388, %r22; + div.full.f32 %r451, %r390, %r22; + div.full.f32 %r452, %r392, %r22; + div.full.f32 %r453, %r394, %r22; + div.full.f32 %r454, %r396, %r22; + div.full.f32 %r455, %r398, %r22; + div.full.f32 %r456, %r400, %r22; + div.full.f32 %r457, %r402, %r22; + div.full.f32 %r458, %r404, %r22; + div.full.f32 %r459, %r406, %r22; + div.full.f32 %r460, %r408, %r22; + div.full.f32 %r461, %r410, %r22; + div.full.f32 %r462, %r412, %r22; + div.full.f32 %r463, %r414, %r22; + div.full.f32 %r464, %r416, %r22; + div.full.f32 %r465, %r418, %r22; + div.full.f32 %r466, %r420, %r22; + div.full.f32 %r467, %r422, %r22; + div.full.f32 %r468, %r424, %r22; + div.full.f32 %r469, %r426, %r22; + div.full.f32 %r470, %r428, %r22; + div.full.f32 %r471, %r430, %r22; + div.full.f32 %r472, %r432, %r22; + div.full.f32 %r473, %r434, %r22; + div.full.f32 %r474, %r436, %r22; + div.full.f32 %r475, %r438, %r22; + div.full.f32 %r476, %r440, %r22; + div.full.f32 %r477, %r442, %r22; + div.full.f32 %r478, %r444, %r22; + div.full.f32 %r479, %r446, %r22; + div.full.f32 %r480, %r448, %r22; + .loc 1 164 31 // loss.py:164:31 + cvt.u64.u32 %rd58, %r237; + cvt.u64.u32 %rd59, %r238; + shl.b64 %rd60, %rd59, 32; + or.b64 %rd61, %rd58, %rd60; + .loc 1 169 38 // loss.py:169:38 + mov.b64 {%r481, %r482}, %rd61; + mov.b64 {%r483, %r484}, %rd6; + mul.f32 %r485, %r483, %r481; + 
mul.f32 %r486, %r484, %r482; + .loc 1 169 23 // loss.py:169:23 + neg.f32 %r487, %r486; + mov.b64 {%r488, %r489}, %rd5; + fma.rn.f32 %r490, %r489, %r450, %r487; + neg.f32 %r491, %r485; + fma.rn.f32 %r492, %r488, %r449, %r491; + add.f32 %r493, %r492, 0f00000000; + add.f32 %r494, %r490, 0f00000000; + .loc 1 170 39 // loss.py:170:39 + cvt.rn.bf16x2.f32 %r301, %r494, %r493; + .loc 1 164 31 // loss.py:164:31 + cvt.u64.u32 %rd62, %r239; + cvt.u64.u32 %rd63, %r240; + shl.b64 %rd64, %rd63, 32; + or.b64 %rd65, %rd62, %rd64; + .loc 1 169 38 // loss.py:169:38 + mov.b64 {%r495, %r496}, %rd65; + mul.f32 %r497, %r483, %r495; + mul.f32 %r498, %r484, %r496; + .loc 1 169 23 // loss.py:169:23 + neg.f32 %r499, %r498; + fma.rn.f32 %r500, %r489, %r452, %r499; + neg.f32 %r501, %r497; + fma.rn.f32 %r502, %r488, %r451, %r501; + add.f32 %r503, %r502, 0f00000000; + add.f32 %r504, %r500, 0f00000000; + .loc 1 170 39 // loss.py:170:39 + cvt.rn.bf16x2.f32 %r302, %r504, %r503; + .loc 1 164 31 // loss.py:164:31 + cvt.u64.u32 %rd66, %r245; + cvt.u64.u32 %rd67, %r246; + shl.b64 %rd68, %rd67, 32; + or.b64 %rd69, %rd66, %rd68; + .loc 1 169 38 // loss.py:169:38 + mov.b64 {%r505, %r506}, %rd69; + mul.f32 %r507, %r483, %r505; + mul.f32 %r508, %r484, %r506; + .loc 1 169 23 // loss.py:169:23 + neg.f32 %r509, %r508; + fma.rn.f32 %r510, %r489, %r454, %r509; + neg.f32 %r511, %r507; + fma.rn.f32 %r512, %r488, %r453, %r511; + add.f32 %r513, %r512, 0f00000000; + add.f32 %r514, %r510, 0f00000000; + .loc 1 170 39 // loss.py:170:39 + cvt.rn.bf16x2.f32 %r303, %r514, %r513; + .loc 1 164 31 // loss.py:164:31 + cvt.u64.u32 %rd70, %r247; + cvt.u64.u32 %rd71, %r248; + shl.b64 %rd72, %rd71, 32; + or.b64 %rd73, %rd70, %rd72; + .loc 1 169 38 // loss.py:169:38 + mov.b64 {%r515, %r516}, %rd73; + mul.f32 %r517, %r483, %r515; + mul.f32 %r518, %r484, %r516; + .loc 1 169 23 // loss.py:169:23 + neg.f32 %r519, %r518; + fma.rn.f32 %r520, %r489, %r456, %r519; + neg.f32 %r521, %r517; + fma.rn.f32 %r522, %r488, %r455, %r521; + add.f32 
%r523, %r522, 0f00000000; + add.f32 %r524, %r520, 0f00000000; + .loc 1 170 39 // loss.py:170:39 + cvt.rn.bf16x2.f32 %r304, %r524, %r523; + .loc 1 164 31 // loss.py:164:31 + cvt.u64.u32 %rd74, %r253; + cvt.u64.u32 %rd75, %r254; + shl.b64 %rd76, %rd75, 32; + or.b64 %rd77, %rd74, %rd76; + .loc 1 169 38 // loss.py:169:38 + mov.b64 {%r525, %r526}, %rd77; + mul.f32 %r527, %r483, %r525; + mul.f32 %r528, %r484, %r526; + .loc 1 169 23 // loss.py:169:23 + neg.f32 %r529, %r528; + fma.rn.f32 %r530, %r489, %r458, %r529; + neg.f32 %r531, %r527; + fma.rn.f32 %r532, %r488, %r457, %r531; + add.f32 %r533, %r532, 0f00000000; + add.f32 %r534, %r530, 0f00000000; + .loc 1 170 39 // loss.py:170:39 + cvt.rn.bf16x2.f32 %r305, %r534, %r533; + .loc 1 164 31 // loss.py:164:31 + cvt.u64.u32 %rd78, %r255; + cvt.u64.u32 %rd79, %r256; + shl.b64 %rd80, %rd79, 32; + or.b64 %rd81, %rd78, %rd80; + .loc 1 169 38 // loss.py:169:38 + mov.b64 {%r535, %r536}, %rd81; + mul.f32 %r537, %r483, %r535; + mul.f32 %r538, %r484, %r536; + .loc 1 169 23 // loss.py:169:23 + neg.f32 %r539, %r538; + fma.rn.f32 %r540, %r489, %r460, %r539; + neg.f32 %r541, %r537; + fma.rn.f32 %r542, %r488, %r459, %r541; + add.f32 %r543, %r542, 0f00000000; + add.f32 %r544, %r540, 0f00000000; + .loc 1 170 39 // loss.py:170:39 + cvt.rn.bf16x2.f32 %r306, %r544, %r543; + .loc 1 164 31 // loss.py:164:31 + cvt.u64.u32 %rd82, %r261; + cvt.u64.u32 %rd83, %r262; + shl.b64 %rd84, %rd83, 32; + or.b64 %rd85, %rd82, %rd84; + .loc 1 169 38 // loss.py:169:38 + mov.b64 {%r545, %r546}, %rd85; + mul.f32 %r547, %r483, %r545; + mul.f32 %r548, %r484, %r546; + .loc 1 169 23 // loss.py:169:23 + neg.f32 %r549, %r548; + fma.rn.f32 %r550, %r489, %r462, %r549; + neg.f32 %r551, %r547; + fma.rn.f32 %r552, %r488, %r461, %r551; + add.f32 %r553, %r552, 0f00000000; + add.f32 %r554, %r550, 0f00000000; + .loc 1 170 39 // loss.py:170:39 + cvt.rn.bf16x2.f32 %r307, %r554, %r553; + .loc 1 164 31 // loss.py:164:31 + cvt.u64.u32 %rd86, %r263; + cvt.u64.u32 %rd87, %r264; + 
shl.b64 %rd88, %rd87, 32; + or.b64 %rd89, %rd86, %rd88; + .loc 1 169 38 // loss.py:169:38 + mov.b64 {%r555, %r556}, %rd89; + mul.f32 %r557, %r483, %r555; + mul.f32 %r558, %r484, %r556; + .loc 1 169 23 // loss.py:169:23 + neg.f32 %r559, %r558; + fma.rn.f32 %r560, %r489, %r464, %r559; + neg.f32 %r561, %r557; + fma.rn.f32 %r562, %r488, %r463, %r561; + add.f32 %r563, %r562, 0f00000000; + add.f32 %r564, %r560, 0f00000000; + .loc 1 170 39 // loss.py:170:39 + cvt.rn.bf16x2.f32 %r308, %r564, %r563; + .loc 1 164 31 // loss.py:164:31 + cvt.u64.u32 %rd90, %r269; + cvt.u64.u32 %rd91, %r270; + shl.b64 %rd92, %rd91, 32; + or.b64 %rd93, %rd90, %rd92; + .loc 1 169 38 // loss.py:169:38 + mov.b64 {%r565, %r566}, %rd93; + mul.f32 %r567, %r483, %r565; + mul.f32 %r568, %r484, %r566; + .loc 1 169 23 // loss.py:169:23 + neg.f32 %r569, %r568; + fma.rn.f32 %r570, %r489, %r466, %r569; + neg.f32 %r571, %r567; + fma.rn.f32 %r572, %r488, %r465, %r571; + add.f32 %r573, %r572, 0f00000000; + add.f32 %r574, %r570, 0f00000000; + .loc 1 170 39 // loss.py:170:39 + cvt.rn.bf16x2.f32 %r309, %r574, %r573; + .loc 1 164 31 // loss.py:164:31 + cvt.u64.u32 %rd94, %r271; + cvt.u64.u32 %rd95, %r272; + shl.b64 %rd96, %rd95, 32; + or.b64 %rd97, %rd94, %rd96; + .loc 1 169 38 // loss.py:169:38 + mov.b64 {%r575, %r576}, %rd97; + mul.f32 %r577, %r483, %r575; + mul.f32 %r578, %r484, %r576; + .loc 1 169 23 // loss.py:169:23 + neg.f32 %r579, %r578; + fma.rn.f32 %r580, %r489, %r468, %r579; + neg.f32 %r581, %r577; + fma.rn.f32 %r582, %r488, %r467, %r581; + add.f32 %r583, %r582, 0f00000000; + add.f32 %r584, %r580, 0f00000000; + .loc 1 170 39 // loss.py:170:39 + cvt.rn.bf16x2.f32 %r310, %r584, %r583; + .loc 1 164 31 // loss.py:164:31 + cvt.u64.u32 %rd98, %r277; + cvt.u64.u32 %rd99, %r278; + shl.b64 %rd100, %rd99, 32; + or.b64 %rd101, %rd98, %rd100; + .loc 1 169 38 // loss.py:169:38 + mov.b64 {%r585, %r586}, %rd101; + mul.f32 %r587, %r483, %r585; + mul.f32 %r588, %r484, %r586; + .loc 1 169 23 // loss.py:169:23 + neg.f32 
%r589, %r588; + fma.rn.f32 %r590, %r489, %r470, %r589; + neg.f32 %r591, %r587; + fma.rn.f32 %r592, %r488, %r469, %r591; + add.f32 %r593, %r592, 0f00000000; + add.f32 %r594, %r590, 0f00000000; + .loc 1 170 39 // loss.py:170:39 + cvt.rn.bf16x2.f32 %r311, %r594, %r593; + .loc 1 164 31 // loss.py:164:31 + cvt.u64.u32 %rd102, %r279; + cvt.u64.u32 %rd103, %r280; + shl.b64 %rd104, %rd103, 32; + or.b64 %rd105, %rd102, %rd104; + .loc 1 169 38 // loss.py:169:38 + mov.b64 {%r595, %r596}, %rd105; + mul.f32 %r597, %r483, %r595; + mul.f32 %r598, %r484, %r596; + .loc 1 169 23 // loss.py:169:23 + neg.f32 %r599, %r598; + fma.rn.f32 %r600, %r489, %r472, %r599; + neg.f32 %r601, %r597; + fma.rn.f32 %r602, %r488, %r471, %r601; + add.f32 %r603, %r602, 0f00000000; + add.f32 %r604, %r600, 0f00000000; + .loc 1 170 39 // loss.py:170:39 + cvt.rn.bf16x2.f32 %r312, %r604, %r603; + .loc 1 164 31 // loss.py:164:31 + cvt.u64.u32 %rd106, %r285; + cvt.u64.u32 %rd107, %r286; + shl.b64 %rd108, %rd107, 32; + or.b64 %rd109, %rd106, %rd108; + .loc 1 169 38 // loss.py:169:38 + mov.b64 {%r605, %r606}, %rd109; + mul.f32 %r607, %r483, %r605; + mul.f32 %r608, %r484, %r606; + .loc 1 169 23 // loss.py:169:23 + neg.f32 %r609, %r608; + fma.rn.f32 %r610, %r489, %r474, %r609; + neg.f32 %r611, %r607; + fma.rn.f32 %r612, %r488, %r473, %r611; + add.f32 %r613, %r612, 0f00000000; + add.f32 %r614, %r610, 0f00000000; + .loc 1 170 39 // loss.py:170:39 + cvt.rn.bf16x2.f32 %r313, %r614, %r613; + .loc 1 164 31 // loss.py:164:31 + cvt.u64.u32 %rd110, %r287; + cvt.u64.u32 %rd111, %r288; + shl.b64 %rd112, %rd111, 32; + or.b64 %rd113, %rd110, %rd112; + .loc 1 169 38 // loss.py:169:38 + mov.b64 {%r615, %r616}, %rd113; + mul.f32 %r617, %r483, %r615; + mul.f32 %r618, %r484, %r616; + .loc 1 169 23 // loss.py:169:23 + neg.f32 %r619, %r618; + fma.rn.f32 %r620, %r489, %r476, %r619; + neg.f32 %r621, %r617; + fma.rn.f32 %r622, %r488, %r475, %r621; + add.f32 %r623, %r622, 0f00000000; + add.f32 %r624, %r620, 0f00000000; + .loc 1 170 39 // 
loss.py:170:39 + cvt.rn.bf16x2.f32 %r314, %r624, %r623; + .loc 1 164 31 // loss.py:164:31 + cvt.u64.u32 %rd114, %r293; + cvt.u64.u32 %rd115, %r294; + shl.b64 %rd116, %rd115, 32; + or.b64 %rd117, %rd114, %rd116; + .loc 1 169 38 // loss.py:169:38 + mov.b64 {%r625, %r626}, %rd117; + mul.f32 %r627, %r483, %r625; + mul.f32 %r628, %r484, %r626; + .loc 1 169 23 // loss.py:169:23 + neg.f32 %r629, %r628; + fma.rn.f32 %r630, %r489, %r478, %r629; + neg.f32 %r631, %r627; + fma.rn.f32 %r632, %r488, %r477, %r631; + add.f32 %r633, %r632, 0f00000000; + add.f32 %r634, %r630, 0f00000000; + .loc 1 170 39 // loss.py:170:39 + cvt.rn.bf16x2.f32 %r315, %r634, %r633; + .loc 1 164 31 // loss.py:164:31 + cvt.u64.u32 %rd118, %r295; + cvt.u64.u32 %rd119, %r296; + shl.b64 %rd120, %rd119, 32; + or.b64 %rd121, %rd118, %rd120; + .loc 1 169 38 // loss.py:169:38 + mov.b64 {%r635, %r636}, %rd121; + mul.f32 %r637, %r483, %r635; + mul.f32 %r638, %r484, %r636; + .loc 1 169 23 // loss.py:169:23 + neg.f32 %r639, %r638; + fma.rn.f32 %r640, %r489, %r480, %r639; + neg.f32 %r641, %r637; + fma.rn.f32 %r642, %r488, %r479, %r641; + add.f32 %r643, %r642, 0f00000000; + add.f32 %r644, %r640, 0f00000000; + .loc 1 170 39 // loss.py:170:39 + cvt.rn.bf16x2.f32 %r316, %r644, %r643; + // begin inline asm + @%p15 st.global.v4.b32 [ %rd36 + 0 ], { %r301, %r302, %r303, %r304 }; + // end inline asm + // begin inline asm + @%p16 st.global.v4.b32 [ %rd37 + 0 ], { %r305, %r306, %r307, %r308 }; + // end inline asm + // begin inline asm + @%p17 st.global.v4.b32 [ %rd38 + 0 ], { %r309, %r310, %r311, %r312 }; + // end inline asm + // begin inline asm + @%p18 st.global.v4.b32 [ %rd39 + 0 ], { %r313, %r314, %r315, %r316 }; + // end inline asm + .loc 1 158 30 // loss.py:158:30 + add.s32 %r670, %r670, 32768; + setp.lt.s32 %p31, %r670, %r19; + @%p31 bra $L__BB0_9; + bra.uni $L__BB0_4; +$L__BB0_1: + .loc 1 134 34 // loss.py:134:34 + setp.lt.s32 %p32, %r19, 1; + @%p32 bra $L__BB0_4; +// %bb.2: // %.lr.ph6.preheader + .loc 1 0 34 // 
loss.py:0:34 + mov.u32 %r645, %tid.x; + shl.b32 %r1, %r645, 3; + mov.b32 %r647, 0; + mov.b32 %r667, %r647; +$L__BB0_3: // %.lr.ph6 + // =>This Inner Loop Header: Depth=1 + .loc 1 135 26 // loss.py:135:26 + add.s32 %r663, %r1, %r667; + add.s32 %r664, %r663, 8192; + add.s32 %r665, %r663, 16384; + .loc 1 136 29 // loss.py:136:29 + add.s32 %r666, %r663, 24576; + setp.lt.s32 %p33, %r663, %r19; + setp.lt.s32 %p34, %r664, %r19; + setp.lt.s32 %p35, %r665, %r19; + setp.lt.s32 %p36, %r666, %r19; + .loc 1 137 34 // loss.py:137:34 + mad.wide.s32 %rd122, %r663, 2, %rd2; + add.s64 %rd123, %rd122, 16384; + add.s64 %rd124, %rd122, 32768; + add.s64 %rd125, %rd122, 49152; + .loc 1 137 43 // loss.py:137:43 + // begin inline asm + @%p33 st.global.v4.b32 [ %rd122 + 0 ], { %r647, %r647, %r647, %r647 }; + // end inline asm + // begin inline asm + @%p34 st.global.v4.b32 [ %rd123 + 0 ], { %r647, %r647, %r647, %r647 }; + // end inline asm + // begin inline asm + @%p35 st.global.v4.b32 [ %rd124 + 0 ], { %r647, %r647, %r647, %r647 }; + // end inline asm + // begin inline asm + @%p36 st.global.v4.b32 [ %rd125 + 0 ], { %r647, %r647, %r647, %r647 }; + // end inline asm + .loc 1 134 34 // loss.py:134:34 + add.s32 %r667, %r667, 32768; + setp.lt.s32 %p37, %r667, %r19; + @%p37 bra $L__BB0_3; +$L__BB0_4: // %.loopexit + .loc 1 138 8 // loss.py:138:8 + ret; +$L__tmp3: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/specforge/core/loss.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // 
DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 153 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. 
Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x92 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 108 // DW_AT_name +.b8 111 +.b8 115 +.b8 115 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 115 +.b8 112 +.b8 101 +.b8 99 +.b8 102 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 114 +.b8 101 +.b8 0 +.b8 2 // Abbrev [2] 0x50:0x1e DW_TAG_subprogram +.b8 108 // DW_AT_name +.b8 111 +.b8 103 +.b8 95 +.b8 115 +.b8 111 +.b8 102 +.b8 116 +.b8 109 +.b8 97 +.b8 120 +.b8 95 +.b8 98 +.b8 97 +.b8 99 +.b8 107 +.b8 119 +.b8 97 +.b8 114 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0x6e:0x2e DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 80 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0x83:0x18 DW_TAG_inlined_subroutine +.b32 80 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp2 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 155 // DW_AT_call_line +.b8 34 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/QFKUJZNM4B4WZ6YCXLCLTXWGUKQRBKMH2LTZ7GNQ5C5WM6AH5V6A/log_softmax_backward_kernel.source b/SpecForge-ext/cache/compiled_kernels/triton/7/QFKUJZNM4B4WZ6YCXLCLTXWGUKQRBKMH2LTZ7GNQ5C5WM6AH5V6A/log_softmax_backward_kernel.source new file mode 100644 index 0000000000000000000000000000000000000000..a4cd6994ae4e49ea89eadbe375c5bf395e5b9573 --- /dev/null +++ 
b/SpecForge-ext/cache/compiled_kernels/triton/7/QFKUJZNM4B4WZ6YCXLCLTXWGUKQRBKMH2LTZ7GNQ5C5WM6AH5V6A/log_softmax_backward_kernel.source @@ -0,0 +1,292 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":114:0) +#loc17 = loc(unknown) +#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0) +#loc63 = loc("logits_ptr"(#loc)) +#loc64 = loc("logits_stride"(#loc)) +#loc65 = loc("target_ptr"(#loc)) +#loc66 = loc("target_stride"(#loc)) +#loc67 = loc("position_mask_ptr"(#loc)) +#loc68 = loc("grad_output_ptr"(#loc)) +#loc69 = loc("scaling_factor"(#loc)) +#loc70 = loc("m_ptr"(#loc)) +#loc71 = loc("d_ptr"(#loc)) +#loc72 = loc("n_cols"(#loc)) +#loc116 = loc("input"(#loc55)) +#loc117 = loc("a"(#loc59)) +#loc118 = loc("b"(#loc59)) +module { + tt.func public @log_softmax_backward_kernel(%logits_ptr: !tt.ptr {tt.divisibility = 16 : i32} loc("logits_ptr"(#loc)), %logits_stride: i32 {tt.divisibility = 16 : i32} loc("logits_stride"(#loc)), %target_ptr: !tt.ptr {tt.divisibility = 16 : i32} loc("target_ptr"(#loc)), %target_stride: i32 {tt.divisibility = 16 : i32} loc("target_stride"(#loc)), %position_mask_ptr: !tt.ptr {tt.divisibility = 16 : i32} loc("position_mask_ptr"(#loc)), %grad_output_ptr: !tt.ptr {tt.divisibility = 16 : i32} loc("grad_output_ptr"(#loc)), %scaling_factor: f32 loc("scaling_factor"(#loc)), %m_ptr: !tt.ptr {tt.divisibility = 16 : i32} loc("m_ptr"(#loc)), %d_ptr: !tt.ptr {tt.divisibility = 16 : i32} loc("d_ptr"(#loc)), %n_cols: i32 {tt.divisibility = 16 : i32} loc("n_cols"(#loc))) attributes {noinline = false} { + %program_id = tt.get_program_id x : i32 loc(#loc73) + %program_id_0 = arith.extsi %program_id : i32 to i64 loc(#loc74) + %logits_ptr_1 = arith.extsi %logits_stride : i32 to i64 loc(#loc75) + %logits_ptr_2 = arith.muli %program_id_0, %logits_ptr_1 : i64 loc(#loc75) + %logits_ptr_3 = tt.addptr 
%logits_ptr, %logits_ptr_2 : !tt.ptr, i64 loc(#loc76) + %target_ptr_4 = arith.extsi %target_stride : i32 to i64 loc(#loc77) + %target_ptr_5 = arith.muli %program_id_0, %target_ptr_4 : i64 loc(#loc77) + %target_ptr_6 = tt.addptr %target_ptr, %target_ptr_5 : !tt.ptr, i64 loc(#loc78) + %position_mask_ptr_7 = tt.addptr %position_mask_ptr, %program_id_0 : !tt.ptr, i64 loc(#loc79) + %position_mask = tt.bitcast %position_mask_ptr_7 : !tt.ptr -> !tt.ptr loc(#loc80) + %position_mask_8 = tt.load %position_mask : !tt.ptr loc(#loc80) + %position_mask_9 = arith.constant 0 : i8 loc(#loc80) + %position_mask_10 = arith.cmpi ne, %position_mask_8, %position_mask_9 : i8 loc(#loc80) + %c0_i32 = arith.constant 0 : i32 loc(#loc9) + %0 = arith.extui %position_mask_10 : i1 to i32 loc(#loc9) + %1 = arith.cmpi eq, %0, %c0_i32 : i32 loc(#loc9) + cf.cond_br %1, ^bb1, ^bb2 loc(#loc9) + ^bb1: // pred: ^bb0 + %c0_i32_11 = arith.constant 0 : i32 loc(#loc10) + %c32768_i32 = arith.constant 32768 : i32 loc(#loc10) + %2 = arith.bitcast %c0_i32_11 : i32 to i32 loc(#loc10) + %3 = arith.bitcast %n_cols : i32 to i32 loc(#loc10) + %4 = arith.bitcast %c32768_i32 : i32 to i32 loc(#loc10) + %5 = ub.poison : i32 loc(#loc10) + scf.for %i = %2 to %3 step %4 : i32 { + %offsets = tt.make_range {end = 32768 : i32, start = 0 : i32} : tensor<32768xi32> loc(#loc81) + %offsets_20 = tt.splat %i : i32 -> tensor<32768xi32> loc(#loc82) + %offsets_21 = arith.extsi %offsets_20 : tensor<32768xi32> to tensor<32768xi64> loc(#loc82) + %offsets_22 = arith.extsi %offsets : tensor<32768xi32> to tensor<32768xi64> loc(#loc82) + %offsets_23 = arith.addi %offsets_21, %offsets_22 : tensor<32768xi64> loc(#loc82) + %offsets_24 = arith.constant 2147483647 : i64 loc(#loc82) + %offsets_25 = arith.constant -2147483648 : i64 loc(#loc82) + %offsets_26 = arith.constant dense<2147483647> : tensor<32768xi64> loc(#loc82) + %offsets_27 = arith.cmpi sle, %offsets_23, %offsets_26 : tensor<32768xi64> loc(#loc82) + %offsets_28 = arith.constant 
dense<-2147483648> : tensor<32768xi64> loc(#loc82) + %offsets_29 = arith.cmpi sge, %offsets_23, %offsets_28 : tensor<32768xi64> loc(#loc82) + %offsets_30 = arith.andi %offsets_27, %offsets_29 : tensor<32768xi1> loc(#loc82) + %offsets_31 = arith.addi %offsets_20, %offsets : tensor<32768xi32> loc(#loc82) + %mask = tt.splat %n_cols : i32 -> tensor<32768xi32> loc(#loc83) + %mask_32 = arith.cmpi slt, %offsets_31, %mask : tensor<32768xi32> loc(#loc83) + %14 = tt.splat %logits_ptr_3 : !tt.ptr -> tensor<32768x!tt.ptr> loc(#loc14) + %15 = tt.addptr %14, %offsets_31 : tensor<32768x!tt.ptr>, tensor<32768xi32> loc(#loc14) + %cst = arith.constant 0.000000e+00 : f32 loc(#loc15) + %cst_33 = arith.constant dense<0.000000e+00> : tensor<32768xf32> loc(#loc15) + %16 = arith.truncf %cst_33 : tensor<32768xf32> to tensor<32768xbf16> loc(#loc15) + tt.store %15, %16, %mask_32 : tensor<32768x!tt.ptr> loc(#loc15) + } loc(#loc10) + tt.return loc(#loc16) + ^bb2: // pred: ^bb0 + cf.br ^bb4 loc(#loc17) + ^bb3: // no predecessors + cf.br ^bb4 loc(#loc17) + ^bb4: // 2 preds: ^bb2, ^bb3 + %m_ptr_12 = tt.addptr %m_ptr, %program_id_0 : !tt.ptr, i64 loc(#loc84) + %d_ptr_13 = tt.addptr %d_ptr, %program_id_0 : !tt.ptr, i64 loc(#loc85) + %m = tt.load %m_ptr_12 : !tt.ptr loc(#loc86) + %d = tt.load %d_ptr_13 : !tt.ptr loc(#loc87) + %grad_output = tt.load %grad_output_ptr : !tt.ptr loc(#loc88) + %grad_output_14 = arith.mulf %grad_output, %scaling_factor : f32 loc(#loc89) + %target_grad_sum = arith.constant 0.000000e+00 : f32 loc(#loc90) + %c0_i32_15 = arith.constant 0 : i32 loc(#loc25) + %c32768_i32_16 = arith.constant 32768 : i32 loc(#loc25) + %6 = arith.bitcast %c0_i32_15 : i32 to i32 loc(#loc25) + %7 = arith.bitcast %n_cols : i32 to i32 loc(#loc25) + %8 = arith.bitcast %c32768_i32_16 : i32 to i32 loc(#loc25) + %9 = ub.poison : i32 loc(#loc25) + %target_grad_sum_17 = scf.for %i = %6 to %7 step %8 iter_args(%target_grad_sum_20 = %target_grad_sum) -> (f32) : i32 { + %offsets = tt.make_range {end = 32768 : 
i32, start = 0 : i32} : tensor<32768xi32> loc(#loc92) + %offsets_21 = tt.splat %i : i32 -> tensor<32768xi32> loc(#loc93) + %offsets_22 = arith.extsi %offsets_21 : tensor<32768xi32> to tensor<32768xi64> loc(#loc93) + %offsets_23 = arith.extsi %offsets : tensor<32768xi32> to tensor<32768xi64> loc(#loc93) + %offsets_24 = arith.addi %offsets_22, %offsets_23 : tensor<32768xi64> loc(#loc93) + %offsets_25 = arith.constant 2147483647 : i64 loc(#loc93) + %offsets_26 = arith.constant -2147483648 : i64 loc(#loc93) + %offsets_27 = arith.constant dense<2147483647> : tensor<32768xi64> loc(#loc93) + %offsets_28 = arith.cmpi sle, %offsets_24, %offsets_27 : tensor<32768xi64> loc(#loc93) + %offsets_29 = arith.constant dense<-2147483648> : tensor<32768xi64> loc(#loc93) + %offsets_30 = arith.cmpi sge, %offsets_24, %offsets_29 : tensor<32768xi64> loc(#loc93) + %offsets_31 = arith.andi %offsets_28, %offsets_30 : tensor<32768xi1> loc(#loc93) + %offsets_32 = arith.addi %offsets_21, %offsets : tensor<32768xi32> loc(#loc93) + %mask = tt.splat %n_cols : i32 -> tensor<32768xi32> loc(#loc94) + %mask_33 = arith.cmpi slt, %offsets_32, %mask : tensor<32768xi32> loc(#loc94) + %target_block = tt.splat %target_ptr_6 : !tt.ptr -> tensor<32768x!tt.ptr> loc(#loc95) + %target_block_34 = tt.addptr %target_block, %offsets_32 : tensor<32768x!tt.ptr>, tensor<32768xi32> loc(#loc95) + %target_block_35 = arith.constant 0.000000e+00 : f32 loc(#loc96) + %target_block_36 = arith.constant dense<0.000000e+00> : tensor<32768xf32> loc(#loc96) + %target_block_37 = tt.load %target_block_34, %mask_33, %target_block_36 : tensor<32768x!tt.ptr> loc(#loc96) + %target_grad_sum_38 = tt.splat %grad_output_14 : f32 -> tensor<32768xf32> loc(#loc97) + %target_grad_sum_39 = arith.mulf %target_block_37, %target_grad_sum_38 : tensor<32768xf32> loc(#loc97) + %target_grad_sum_40 = arith.constant 0.000000e+00 : f32 loc(#loc98) + %target_grad_sum_41 = arith.constant 0.000000e+00 : f32 loc(#loc98) + %target_grad_sum_42 = arith.constant 
dense<0.000000e+00> : tensor<32768xf32> loc(#loc98) + %target_grad_sum_43 = arith.select %mask_33, %target_grad_sum_39, %target_grad_sum_42 : tensor<32768xi1>, tensor<32768xf32> loc(#loc98) + %target_grad_sum_44 = tt.call @"triton.language.standard.sum__fp32S32768S__(1,)cNone_(2,)cconstexpr_False__(3,)cNone"(%target_grad_sum_43) : (tensor<32768xf32>) -> f32 loc(#loc99) + %target_grad_sum_45 = arith.addf %target_grad_sum_20, %target_grad_sum_44 : f32 loc(#loc100) + scf.yield %target_grad_sum_45 : f32 loc(#loc35) + } loc(#loc91) + %c0_i32_18 = arith.constant 0 : i32 loc(#loc36) + %c32768_i32_19 = arith.constant 32768 : i32 loc(#loc36) + %10 = arith.bitcast %c0_i32_18 : i32 to i32 loc(#loc36) + %11 = arith.bitcast %n_cols : i32 to i32 loc(#loc36) + %12 = arith.bitcast %c32768_i32_19 : i32 to i32 loc(#loc36) + %13 = ub.poison : i32 loc(#loc36) + scf.for %i = %10 to %11 step %12 : i32 { + %offsets = tt.make_range {end = 32768 : i32, start = 0 : i32} : tensor<32768xi32> loc(#loc101) + %offsets_20 = tt.splat %i : i32 -> tensor<32768xi32> loc(#loc102) + %offsets_21 = arith.extsi %offsets_20 : tensor<32768xi32> to tensor<32768xi64> loc(#loc102) + %offsets_22 = arith.extsi %offsets : tensor<32768xi32> to tensor<32768xi64> loc(#loc102) + %offsets_23 = arith.addi %offsets_21, %offsets_22 : tensor<32768xi64> loc(#loc102) + %offsets_24 = arith.constant 2147483647 : i64 loc(#loc102) + %offsets_25 = arith.constant -2147483648 : i64 loc(#loc102) + %offsets_26 = arith.constant dense<2147483647> : tensor<32768xi64> loc(#loc102) + %offsets_27 = arith.cmpi sle, %offsets_23, %offsets_26 : tensor<32768xi64> loc(#loc102) + %offsets_28 = arith.constant dense<-2147483648> : tensor<32768xi64> loc(#loc102) + %offsets_29 = arith.cmpi sge, %offsets_23, %offsets_28 : tensor<32768xi64> loc(#loc102) + %offsets_30 = arith.andi %offsets_27, %offsets_29 : tensor<32768xi1> loc(#loc102) + %offsets_31 = arith.addi %offsets_20, %offsets : tensor<32768xi32> loc(#loc102) + %mask = tt.splat %n_cols : i32 -> 
tensor<32768xi32> loc(#loc103) + %mask_32 = arith.cmpi slt, %offsets_31, %mask : tensor<32768xi32> loc(#loc103) + %logits_block = tt.splat %logits_ptr_3 : !tt.ptr -> tensor<32768x!tt.ptr> loc(#loc104) + %logits_block_33 = tt.addptr %logits_block, %offsets_31 : tensor<32768x!tt.ptr>, tensor<32768xi32> loc(#loc104) + %logits_block_34 = arith.constant 0.000000e+00 : f32 loc(#loc105) + %logits_block_35 = arith.constant dense<0.000000e+00> : tensor<32768xf32> loc(#loc105) + %logits_block_36 = arith.truncf %logits_block_35 : tensor<32768xf32> to tensor<32768xbf16> loc(#loc105) + %logits_block_37 = tt.load %logits_block_33, %mask_32, %logits_block_36 : tensor<32768x!tt.ptr> loc(#loc105) + %logits_block_38 = arith.extf %logits_block_37 : tensor<32768xbf16> to tensor<32768xf32> loc(#loc106) + %target_block = tt.splat %target_ptr_6 : !tt.ptr -> tensor<32768x!tt.ptr> loc(#loc107) + %target_block_39 = tt.addptr %target_block, %offsets_31 : tensor<32768x!tt.ptr>, tensor<32768xi32> loc(#loc107) + %target_block_40 = arith.constant 0.000000e+00 : f32 loc(#loc108) + %target_block_41 = arith.constant dense<0.000000e+00> : tensor<32768xf32> loc(#loc108) + %target_block_42 = tt.load %target_block_39, %mask_32, %target_block_41 : tensor<32768x!tt.ptr> loc(#loc108) + %softmax_prob = tt.splat %m : f32 -> tensor<32768xf32> loc(#loc109) + %softmax_prob_43 = arith.subf %logits_block_38, %softmax_prob : tensor<32768xf32> loc(#loc109) + %softmax_prob_44 = math.exp %softmax_prob_43 : tensor<32768xf32> loc(#loc110) + %softmax_prob_45 = tt.splat %d : f32 -> tensor<32768xf32> loc(#loc111) + %softmax_prob_46 = arith.divf %softmax_prob_44, %softmax_prob_45 : tensor<32768xf32> loc(#loc111) + %normalized_grad = tt.splat %target_grad_sum_17 : f32 -> tensor<32768xf32> loc(#loc112) + %normalized_grad_47 = arith.mulf %softmax_prob_46, %normalized_grad : tensor<32768xf32> loc(#loc112) + %grad_block = tt.splat %grad_output_14 : f32 -> tensor<32768xf32> loc(#loc113) + %grad_block_48 = arith.mulf 
%target_block_42, %grad_block : tensor<32768xf32> loc(#loc113) + %grad_block_49 = arith.subf %grad_block_48, %normalized_grad_47 : tensor<32768xf32> loc(#loc114) + %grad_block_50 = arith.constant 0.000000e+00 : f32 loc(#loc115) + %grad_block_51 = arith.constant dense<0.000000e+00> : tensor<32768xf32> loc(#loc115) + %grad_block_52 = arith.subf %grad_block_51, %grad_block_49 : tensor<32768xf32> loc(#loc115) + %14 = tt.splat %logits_ptr_3 : !tt.ptr -> tensor<32768x!tt.ptr> loc(#loc52) + %15 = tt.addptr %14, %offsets_31 : tensor<32768x!tt.ptr>, tensor<32768xi32> loc(#loc52) + %16 = arith.truncf %grad_block_52 : tensor<32768xf32> to tensor<32768xbf16> loc(#loc53) + tt.store %15, %16, %mask_32 : tensor<32768x!tt.ptr> loc(#loc53) + } loc(#loc36) + tt.return loc(#loc54) + } loc(#loc) + tt.func private @"triton.language.standard.sum__fp32S32768S__(1,)cNone_(2,)cconstexpr_False__(3,)cNone"(%input: tensor<32768xf32> loc("input"(#loc55))) -> f32 attributes {noinline = false} { + %0 = tt.reshape %input allow_reorder : tensor<32768xf32> -> tensor<32768xf32> loc(#loc56) + %1 = "tt.reduce"(%0) <{axis = 0 : i32}> ({ + ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)): + %3 = tt.call @triton.language.standard._sum_combine__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc56) + tt.reduce.return %3 : f32 loc(#loc56) + }) : (tensor<32768xf32>) -> f32 loc(#loc56) + tt.return %1 : f32 loc(#loc57) + ^bb1: // no predecessors + %2 = ub.poison : f32 loc(#loc58) + tt.return %2 : f32 loc(#loc58) + } loc(#loc55) + tt.func private @triton.language.standard._sum_combine__fp32_fp32__(%a: f32 loc("a"(#loc59)), %b: f32 loc("b"(#loc59))) -> f32 attributes {noinline = false} { + %0 = arith.addf %a, %b : f32 loc(#loc60) + tt.return %0 : f32 loc(#loc61) + ^bb1: // no predecessors + %1 = ub.poison : f32 loc(#loc62) + tt.return %1 : f32 loc(#loc62) + } loc(#loc59) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":127:31) +#loc2 = 
loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":127:37) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":128:31) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":128:18) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":129:31) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":129:18) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":130:25) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":132:28) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":133:24) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":134:34) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":135:39) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":135:26) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":136:29) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":137:34) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":137:43) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":138:8) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":140:13) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":141:13) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":142:16) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":143:16) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":144:26) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":145:32) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":148:22) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":149:30) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":150:35) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":150:22) +#loc28 = 
loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":151:25) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":152:44) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":152:31) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":155:64) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":155:77) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":155:34) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":155:27) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":155:8) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":158:30) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":159:35) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":159:22) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":160:25) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":161:44) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":161:31) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":162:12) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":164:44) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":164:31) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":167:45) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":167:30) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":167:50) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":168:41) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":169:38) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":169:52) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":169:23) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":170:30) +#loc53 = 
loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":170:39) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":158:4) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11) +#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4) +#loc73 = loc("program_id"(#loc1)) +#loc74 = loc("program_id"(#loc2)) +#loc75 = loc("logits_ptr"(#loc3)) +#loc76 = loc("logits_ptr"(#loc4)) +#loc77 = loc("target_ptr"(#loc5)) +#loc78 = loc("target_ptr"(#loc6)) +#loc79 = loc("position_mask_ptr"(#loc7)) +#loc80 = loc("position_mask"(#loc8)) +#loc81 = loc("offsets"(#loc11)) +#loc82 = loc("offsets"(#loc12)) +#loc83 = loc("mask"(#loc13)) +#loc84 = loc("m_ptr"(#loc18)) +#loc85 = loc("d_ptr"(#loc19)) +#loc86 = loc("m"(#loc20)) +#loc87 = loc("d"(#loc21)) +#loc88 = loc("grad_output"(#loc22)) +#loc89 = loc("grad_output"(#loc23)) +#loc90 = loc("target_grad_sum"(#loc24)) +#loc91 = loc("target_grad_sum"(#loc25)) +#loc92 = loc("offsets"(#loc26)) +#loc93 = loc("offsets"(#loc27)) +#loc94 = loc("mask"(#loc28)) +#loc95 = loc("target_block"(#loc29)) +#loc96 = loc("target_block"(#loc30)) +#loc97 = loc("target_grad_sum"(#loc31)) +#loc98 = loc("target_grad_sum"(#loc32)) +#loc99 = loc("target_grad_sum"(#loc33)) +#loc100 = loc("target_grad_sum"(#loc34)) +#loc101 = loc("offsets"(#loc37)) +#loc102 = loc("offsets"(#loc38)) +#loc103 = loc("mask"(#loc39)) +#loc104 = loc("logits_block"(#loc40)) +#loc105 = loc("logits_block"(#loc41)) +#loc106 = loc("logits_block"(#loc42)) +#loc107 = 
loc("target_block"(#loc43)) +#loc108 = loc("target_block"(#loc44)) +#loc109 = loc("softmax_prob"(#loc45)) +#loc110 = loc("softmax_prob"(#loc46)) +#loc111 = loc("softmax_prob"(#loc47)) +#loc112 = loc("normalized_grad"(#loc48)) +#loc113 = loc("grad_block"(#loc49)) +#loc114 = loc("grad_block"(#loc50)) +#loc115 = loc("grad_block"(#loc51)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/QFKUJZNM4B4WZ6YCXLCLTXWGUKQRBKMH2LTZ7GNQ5C5WM6AH5V6A/log_softmax_backward_kernel.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/7/QFKUJZNM4B4WZ6YCXLCLTXWGUKQRBKMH2LTZ7GNQ5C5WM6AH5V6A/log_softmax_backward_kernel.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..ddf881e117876196c4d5005c7f734e00878bc4e2 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/QFKUJZNM4B4WZ6YCXLCLTXWGUKQRBKMH2LTZ7GNQ5C5WM6AH5V6A/log_softmax_backward_kernel.ttgir @@ -0,0 +1,199 @@ +#blocked = #ttg.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [32], order = [0]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [32], order = [0]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":114:0) +#loc1 = loc(unknown) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":155:34) +#loc54 = loc("logits_ptr"(#loc)) +#loc55 = loc("logits_stride"(#loc)) +#loc56 = loc("target_ptr"(#loc)) +#loc57 = loc("target_stride"(#loc)) +#loc58 = loc("position_mask_ptr"(#loc)) +#loc59 = loc("grad_output_ptr"(#loc)) +#loc60 = loc("scaling_factor"(#loc)) +#loc61 = loc("m_ptr"(#loc)) +#loc62 = loc("d_ptr"(#loc)) +#loc63 = loc("n_cols"(#loc)) +#loc89 = loc("target_grad_sum"(#loc33)) +#loc106 = loc(callsite(#loc1 at #loc89)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 32 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @log_softmax_backward_kernel(%logits_ptr: !tt.ptr {tt.divisibility = 16 : i32} loc("logits_ptr"(#loc)), %logits_stride: i32 
{tt.divisibility = 16 : i32} loc("logits_stride"(#loc)), %target_ptr: !tt.ptr {tt.divisibility = 16 : i32} loc("target_ptr"(#loc)), %target_stride: i32 {tt.divisibility = 16 : i32} loc("target_stride"(#loc)), %position_mask_ptr: !tt.ptr {tt.divisibility = 16 : i32} loc("position_mask_ptr"(#loc)), %grad_output_ptr: !tt.ptr {tt.divisibility = 16 : i32} loc("grad_output_ptr"(#loc)), %scaling_factor: f32 loc("scaling_factor"(#loc)), %m_ptr: !tt.ptr {tt.divisibility = 16 : i32} loc("m_ptr"(#loc)), %d_ptr: !tt.ptr {tt.divisibility = 16 : i32} loc("d_ptr"(#loc)), %n_cols: i32 {tt.divisibility = 16 : i32} loc("n_cols"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0.000000e+00> : tensor<32768xf32, #blocked> loc(#loc1) + %c32768_i32 = arith.constant 32768 : i32 loc(#loc1) + %cst_0 = arith.constant 0.000000e+00 : f32 loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %c0_i8 = arith.constant 0 : i8 loc(#loc1) + %cst_1 = arith.constant dense<0.000000e+00> : tensor<32768xbf16, #blocked> loc(#loc1) + %program_id = tt.get_program_id x : i32 loc(#loc64) + %program_id_2 = arith.extsi %program_id : i32 to i64 loc(#loc65) + %logits_ptr_3 = arith.extsi %logits_stride : i32 to i64 loc(#loc66) + %logits_ptr_4 = arith.muli %program_id_2, %logits_ptr_3 : i64 loc(#loc66) + %logits_ptr_5 = tt.addptr %logits_ptr, %logits_ptr_4 : !tt.ptr, i64 loc(#loc67) + %target_ptr_6 = arith.extsi %target_stride : i32 to i64 loc(#loc68) + %target_ptr_7 = arith.muli %program_id_2, %target_ptr_6 : i64 loc(#loc68) + %target_ptr_8 = tt.addptr %target_ptr, %target_ptr_7 : !tt.ptr, i64 loc(#loc69) + %position_mask_ptr_9 = tt.addptr %position_mask_ptr, %program_id_2 : !tt.ptr, i64 loc(#loc70) + %position_mask = tt.bitcast %position_mask_ptr_9 : !tt.ptr -> !tt.ptr loc(#loc71) + %position_mask_10 = tt.load %position_mask : !tt.ptr loc(#loc71) + %position_mask_11 = arith.cmpi ne, %position_mask_10, %c0_i8 : i8 loc(#loc71) + %0 = arith.extui %position_mask_11 : i1 to i32 loc(#loc10) + 
%1 = arith.cmpi eq, %0, %c0_i32 : i32 loc(#loc10) + cf.cond_br %1, ^bb1, ^bb2 loc(#loc10) + ^bb1: // pred: ^bb0 + %offsets = tt.make_range {end = 32768 : i32, start = 0 : i32} : tensor<32768xi32, #blocked> loc(#loc72) + %mask = tt.splat %n_cols : i32 -> tensor<32768xi32, #blocked> loc(#loc73) + %2 = tt.splat %logits_ptr_5 : !tt.ptr -> tensor<32768x!tt.ptr, #blocked> loc(#loc13) + scf.for %i = %c0_i32 to %n_cols step %c32768_i32 : i32 { + %offsets_19 = tt.splat %i : i32 -> tensor<32768xi32, #blocked> loc(#loc74) + %offsets_20 = arith.addi %offsets_19, %offsets : tensor<32768xi32, #blocked> loc(#loc74) + %mask_21 = arith.cmpi slt, %offsets_20, %mask : tensor<32768xi32, #blocked> loc(#loc73) + %3 = tt.addptr %2, %offsets_20 : tensor<32768x!tt.ptr, #blocked>, tensor<32768xi32, #blocked> loc(#loc13) + tt.store %3, %cst_1, %mask_21 : tensor<32768x!tt.ptr, #blocked> loc(#loc16) + } loc(#loc14) + tt.return loc(#loc17) + ^bb2: // pred: ^bb0 + %m_ptr_12 = tt.addptr %m_ptr, %program_id_2 : !tt.ptr, i64 loc(#loc75) + %d_ptr_13 = tt.addptr %d_ptr, %program_id_2 : !tt.ptr, i64 loc(#loc76) + %m = tt.load %m_ptr_12 : !tt.ptr loc(#loc77) + %d = tt.load %d_ptr_13 : !tt.ptr loc(#loc78) + %grad_output = tt.load %grad_output_ptr : !tt.ptr loc(#loc79) + %grad_output_14 = arith.mulf %grad_output, %scaling_factor : f32 loc(#loc80) + %offsets_15 = tt.make_range {end = 32768 : i32, start = 0 : i32} : tensor<32768xi32, #blocked> loc(#loc81) + %mask_16 = tt.splat %n_cols : i32 -> tensor<32768xi32, #blocked> loc(#loc82) + %target_block = tt.splat %target_ptr_8 : !tt.ptr -> tensor<32768x!tt.ptr, #blocked> loc(#loc83) + %target_grad_sum = tt.splat %grad_output_14 : f32 -> tensor<32768xf32, #blocked> loc(#loc84) + %target_grad_sum_17 = scf.for %i = %c0_i32 to %n_cols step %c32768_i32 iter_args(%target_grad_sum_19 = %cst_0) -> (f32) : i32 { + %offsets_20 = tt.splat %i : i32 -> tensor<32768xi32, #blocked> loc(#loc86) + %offsets_21 = arith.addi %offsets_20, %offsets_15 : tensor<32768xi32, #blocked> 
loc(#loc86) + %mask_22 = arith.cmpi slt, %offsets_21, %mask_16 : tensor<32768xi32, #blocked> loc(#loc82) + %target_block_23 = tt.addptr %target_block, %offsets_21 : tensor<32768x!tt.ptr, #blocked>, tensor<32768xi32, #blocked> loc(#loc83) + %target_block_24 = tt.load %target_block_23, %mask_22, %cst : tensor<32768x!tt.ptr, #blocked> loc(#loc87) + %target_grad_sum_25 = arith.mulf %target_block_24, %target_grad_sum : tensor<32768xf32, #blocked> loc(#loc84) + %target_grad_sum_26 = arith.select %mask_22, %target_grad_sum_25, %cst : tensor<32768xi1, #blocked>, tensor<32768xf32, #blocked> loc(#loc88) + %target_grad_sum_27 = tt.reshape %target_grad_sum_26 allow_reorder : tensor<32768xf32, #blocked> -> tensor<32768xf32, #blocked1> loc(#loc105) + %target_grad_sum_28 = "tt.reduce"(%target_grad_sum_27) <{axis = 0 : i32}> ({ + ^bb0(%target_grad_sum_30: f32 loc(callsite(#loc1 at #loc89)), %target_grad_sum_31: f32 loc(callsite(#loc1 at #loc89))): + %target_grad_sum_32 = arith.addf %target_grad_sum_30, %target_grad_sum_31 : f32 loc(#loc107) + tt.reduce.return %target_grad_sum_32 : f32 loc(#loc105) + }) : (tensor<32768xf32, #blocked1>) -> f32 loc(#loc105) + %target_grad_sum_29 = arith.addf %target_grad_sum_19, %target_grad_sum_28 : f32 loc(#loc90) + scf.yield %target_grad_sum_29 : f32 loc(#loc36) + } loc(#loc85) + %logits_block = tt.splat %logits_ptr_5 : !tt.ptr -> tensor<32768x!tt.ptr, #blocked> loc(#loc91) + %softmax_prob = tt.splat %m : f32 -> tensor<32768xf32, #blocked> loc(#loc92) + %softmax_prob_18 = tt.splat %d : f32 -> tensor<32768xf32, #blocked> loc(#loc93) + %normalized_grad = tt.splat %target_grad_sum_17 : f32 -> tensor<32768xf32, #blocked> loc(#loc94) + scf.for %i = %c0_i32 to %n_cols step %c32768_i32 : i32 { + %offsets_19 = tt.splat %i : i32 -> tensor<32768xi32, #blocked> loc(#loc95) + %offsets_20 = arith.addi %offsets_19, %offsets_15 : tensor<32768xi32, #blocked> loc(#loc95) + %mask_21 = arith.cmpi slt, %offsets_20, %mask_16 : tensor<32768xi32, #blocked> loc(#loc96) + 
%logits_block_22 = tt.addptr %logits_block, %offsets_20 : tensor<32768x!tt.ptr, #blocked>, tensor<32768xi32, #blocked> loc(#loc91) + %logits_block_23 = tt.load %logits_block_22, %mask_21, %cst_1 : tensor<32768x!tt.ptr, #blocked> loc(#loc97) + %logits_block_24 = arith.extf %logits_block_23 : tensor<32768xbf16, #blocked> to tensor<32768xf32, #blocked> loc(#loc98) + %target_block_25 = tt.addptr %target_block, %offsets_20 : tensor<32768x!tt.ptr, #blocked>, tensor<32768xi32, #blocked> loc(#loc99) + %target_block_26 = tt.load %target_block_25, %mask_21, %cst : tensor<32768x!tt.ptr, #blocked> loc(#loc100) + %softmax_prob_27 = arith.subf %logits_block_24, %softmax_prob : tensor<32768xf32, #blocked> loc(#loc92) + %softmax_prob_28 = math.exp %softmax_prob_27 : tensor<32768xf32, #blocked> loc(#loc101) + %softmax_prob_29 = arith.divf %softmax_prob_28, %softmax_prob_18 : tensor<32768xf32, #blocked> loc(#loc93) + %normalized_grad_30 = arith.mulf %softmax_prob_29, %normalized_grad : tensor<32768xf32, #blocked> loc(#loc94) + %grad_block = arith.mulf %target_block_26, %target_grad_sum : tensor<32768xf32, #blocked> loc(#loc102) + %grad_block_31 = arith.subf %grad_block, %normalized_grad_30 : tensor<32768xf32, #blocked> loc(#loc103) + %grad_block_32 = arith.subf %cst, %grad_block_31 : tensor<32768xf32, #blocked> loc(#loc104) + %3 = arith.truncf %grad_block_32 : tensor<32768xf32, #blocked> to tensor<32768xbf16, #blocked> loc(#loc52) + tt.store %logits_block_22, %3, %mask_21 : tensor<32768x!tt.ptr, #blocked> loc(#loc52) + } loc(#loc41) + tt.return loc(#loc53) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":127:31) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":127:37) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":128:31) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":128:18) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":129:31) +#loc7 = 
loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":129:18) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":130:25) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":132:28) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":133:24) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":135:39) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":136:29) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":137:34) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":134:34) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":135:26) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":137:43) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":138:8) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":140:13) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":141:13) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":142:16) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":143:16) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":144:26) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":145:32) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":150:35) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":151:25) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":152:44) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":155:64) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":149:30) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":150:22) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":152:31) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":155:77) +#loc32 = 
loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":155:27) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":155:8) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":161:44) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":167:45) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":167:50) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":168:41) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":158:30) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":159:22) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":160:25) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":161:31) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":162:12) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":164:44) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":164:31) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":167:30) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":169:38) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":169:52) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":169:23) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":170:39) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":158:4) +#loc64 = loc("program_id"(#loc2)) +#loc65 = loc("program_id"(#loc3)) +#loc66 = loc("logits_ptr"(#loc4)) +#loc67 = loc("logits_ptr"(#loc5)) +#loc68 = loc("target_ptr"(#loc6)) +#loc69 = loc("target_ptr"(#loc7)) +#loc70 = loc("position_mask_ptr"(#loc8)) +#loc71 = loc("position_mask"(#loc9)) +#loc72 = 
loc("offsets"(#loc11)) +#loc73 = loc("mask"(#loc12)) +#loc74 = loc("offsets"(#loc15)) +#loc75 = loc("m_ptr"(#loc18)) +#loc76 = loc("d_ptr"(#loc19)) +#loc77 = loc("m"(#loc20)) +#loc78 = loc("d"(#loc21)) +#loc79 = loc("grad_output"(#loc22)) +#loc80 = loc("grad_output"(#loc23)) +#loc81 = loc("offsets"(#loc24)) +#loc82 = loc("mask"(#loc25)) +#loc83 = loc("target_block"(#loc26)) +#loc84 = loc("target_grad_sum"(#loc27)) +#loc85 = loc("target_grad_sum"(#loc28)) +#loc86 = loc("offsets"(#loc29)) +#loc87 = loc("target_block"(#loc30)) +#loc88 = loc("target_grad_sum"(#loc31)) +#loc90 = loc("target_grad_sum"(#loc35)) +#loc91 = loc("logits_block"(#loc37)) +#loc92 = loc("softmax_prob"(#loc38)) +#loc93 = loc("softmax_prob"(#loc39)) +#loc94 = loc("normalized_grad"(#loc40)) +#loc95 = loc("offsets"(#loc42)) +#loc96 = loc("mask"(#loc43)) +#loc97 = loc("logits_block"(#loc44)) +#loc98 = loc("logits_block"(#loc45)) +#loc99 = loc("target_block"(#loc46)) +#loc100 = loc("target_block"(#loc47)) +#loc101 = loc("softmax_prob"(#loc48)) +#loc102 = loc("grad_block"(#loc49)) +#loc103 = loc("grad_block"(#loc50)) +#loc104 = loc("grad_block"(#loc51)) +#loc105 = loc(callsite(#loc32 at #loc89)) +#loc107 = loc(callsite(#loc34 at #loc105)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/QFKUJZNM4B4WZ6YCXLCLTXWGUKQRBKMH2LTZ7GNQ5C5WM6AH5V6A/log_softmax_backward_kernel.ttir b/SpecForge-ext/cache/compiled_kernels/triton/7/QFKUJZNM4B4WZ6YCXLCLTXWGUKQRBKMH2LTZ7GNQ5C5WM6AH5V6A/log_softmax_backward_kernel.ttir new file mode 100644 index 0000000000000000000000000000000000000000..b5bc4459f3fc35c5d5c780663109ce8a4c361d3d --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/QFKUJZNM4B4WZ6YCXLCLTXWGUKQRBKMH2LTZ7GNQ5C5WM6AH5V6A/log_softmax_backward_kernel.ttir @@ -0,0 +1,203 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":114:0) +#loc1 = loc(unknown) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":155:34) +#loc55 = loc("logits_ptr"(#loc)) +#loc56 = 
loc("logits_stride"(#loc)) +#loc57 = loc("target_ptr"(#loc)) +#loc58 = loc("target_stride"(#loc)) +#loc59 = loc("position_mask_ptr"(#loc)) +#loc60 = loc("grad_output_ptr"(#loc)) +#loc61 = loc("scaling_factor"(#loc)) +#loc62 = loc("m_ptr"(#loc)) +#loc63 = loc("d_ptr"(#loc)) +#loc64 = loc("n_cols"(#loc)) +#loc90 = loc("target_grad_sum"(#loc33)) +#loc108 = loc(callsite(#loc1 at #loc90)) +module { + tt.func public @log_softmax_backward_kernel(%logits_ptr: !tt.ptr {tt.divisibility = 16 : i32} loc("logits_ptr"(#loc)), %logits_stride: i32 {tt.divisibility = 16 : i32} loc("logits_stride"(#loc)), %target_ptr: !tt.ptr {tt.divisibility = 16 : i32} loc("target_ptr"(#loc)), %target_stride: i32 {tt.divisibility = 16 : i32} loc("target_stride"(#loc)), %position_mask_ptr: !tt.ptr {tt.divisibility = 16 : i32} loc("position_mask_ptr"(#loc)), %grad_output_ptr: !tt.ptr {tt.divisibility = 16 : i32} loc("grad_output_ptr"(#loc)), %scaling_factor: f32 loc("scaling_factor"(#loc)), %m_ptr: !tt.ptr {tt.divisibility = 16 : i32} loc("m_ptr"(#loc)), %d_ptr: !tt.ptr {tt.divisibility = 16 : i32} loc("d_ptr"(#loc)), %n_cols: i32 {tt.divisibility = 16 : i32} loc("n_cols"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0.000000e+00> : tensor<32768xbf16> loc(#loc1) + %c32768_i32 = arith.constant 32768 : i32 loc(#loc1) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<32768xf32> loc(#loc1) + %cst_1 = arith.constant 0.000000e+00 : f32 loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %position_mask = arith.constant 0 : i8 loc(#loc65) + %program_id = tt.get_program_id x : i32 loc(#loc66) + %program_id_2 = arith.extsi %program_id : i32 to i64 loc(#loc67) + %logits_ptr_3 = arith.extsi %logits_stride : i32 to i64 loc(#loc68) + %logits_ptr_4 = arith.muli %program_id_2, %logits_ptr_3 : i64 loc(#loc68) + %logits_ptr_5 = tt.addptr %logits_ptr, %logits_ptr_4 : !tt.ptr, i64 loc(#loc69) + %target_ptr_6 = arith.extsi %target_stride : i32 to i64 loc(#loc70) + %target_ptr_7 = 
arith.muli %program_id_2, %target_ptr_6 : i64 loc(#loc70) + %target_ptr_8 = tt.addptr %target_ptr, %target_ptr_7 : !tt.ptr, i64 loc(#loc71) + %position_mask_ptr_9 = tt.addptr %position_mask_ptr, %program_id_2 : !tt.ptr, i64 loc(#loc72) + %position_mask_10 = tt.bitcast %position_mask_ptr_9 : !tt.ptr -> !tt.ptr loc(#loc65) + %position_mask_11 = tt.load %position_mask_10 : !tt.ptr loc(#loc65) + %position_mask_12 = arith.cmpi ne, %position_mask_11, %position_mask : i8 loc(#loc65) + %0 = arith.extui %position_mask_12 : i1 to i32 loc(#loc10) + %1 = arith.cmpi eq, %0, %c0_i32 : i32 loc(#loc10) + cf.cond_br %1, ^bb1, ^bb2 loc(#loc10) + ^bb1: // pred: ^bb0 + scf.for %i = %c0_i32 to %n_cols step %c32768_i32 : i32 { + %offsets = tt.make_range {end = 32768 : i32, start = 0 : i32} : tensor<32768xi32> loc(#loc73) + %offsets_16 = tt.splat %i : i32 -> tensor<32768xi32> loc(#loc74) + %offsets_17 = arith.addi %offsets_16, %offsets : tensor<32768xi32> loc(#loc74) + %mask = tt.splat %n_cols : i32 -> tensor<32768xi32> loc(#loc75) + %mask_18 = arith.cmpi slt, %offsets_17, %mask : tensor<32768xi32> loc(#loc75) + %2 = tt.splat %logits_ptr_5 : !tt.ptr -> tensor<32768x!tt.ptr> loc(#loc15) + %3 = tt.addptr %2, %offsets_17 : tensor<32768x!tt.ptr>, tensor<32768xi32> loc(#loc15) + tt.store %3, %cst, %mask_18 : tensor<32768x!tt.ptr> loc(#loc16) + } loc(#loc11) + tt.return loc(#loc17) + ^bb2: // pred: ^bb0 + %m_ptr_13 = tt.addptr %m_ptr, %program_id_2 : !tt.ptr, i64 loc(#loc76) + %d_ptr_14 = tt.addptr %d_ptr, %program_id_2 : !tt.ptr, i64 loc(#loc77) + %m = tt.load %m_ptr_13 : !tt.ptr loc(#loc78) + %d = tt.load %d_ptr_14 : !tt.ptr loc(#loc79) + %grad_output = tt.load %grad_output_ptr : !tt.ptr loc(#loc80) + %grad_output_15 = arith.mulf %grad_output, %scaling_factor : f32 loc(#loc81) + %target_grad_sum = scf.for %i = %c0_i32 to %n_cols step %c32768_i32 iter_args(%target_grad_sum_16 = %cst_1) -> (f32) : i32 { + %offsets = tt.make_range {end = 32768 : i32, start = 0 : i32} : tensor<32768xi32> 
loc(#loc83) + %offsets_17 = tt.splat %i : i32 -> tensor<32768xi32> loc(#loc84) + %offsets_18 = arith.addi %offsets_17, %offsets : tensor<32768xi32> loc(#loc84) + %mask = tt.splat %n_cols : i32 -> tensor<32768xi32> loc(#loc85) + %mask_19 = arith.cmpi slt, %offsets_18, %mask : tensor<32768xi32> loc(#loc85) + %target_block = tt.splat %target_ptr_8 : !tt.ptr -> tensor<32768x!tt.ptr> loc(#loc86) + %target_block_20 = tt.addptr %target_block, %offsets_18 : tensor<32768x!tt.ptr>, tensor<32768xi32> loc(#loc86) + %target_block_21 = tt.load %target_block_20, %mask_19, %cst_0 : tensor<32768x!tt.ptr> loc(#loc87) + %target_grad_sum_22 = tt.splat %grad_output_15 : f32 -> tensor<32768xf32> loc(#loc88) + %target_grad_sum_23 = arith.mulf %target_block_21, %target_grad_sum_22 : tensor<32768xf32> loc(#loc88) + %target_grad_sum_24 = arith.select %mask_19, %target_grad_sum_23, %cst_0 : tensor<32768xi1>, tensor<32768xf32> loc(#loc89) + %target_grad_sum_25 = tt.reshape %target_grad_sum_24 allow_reorder : tensor<32768xf32> -> tensor<32768xf32> loc(#loc107) + %target_grad_sum_26 = "tt.reduce"(%target_grad_sum_25) <{axis = 0 : i32}> ({ + ^bb0(%target_grad_sum_28: f32 loc(callsite(#loc1 at #loc90)), %target_grad_sum_29: f32 loc(callsite(#loc1 at #loc90))): + %target_grad_sum_30 = arith.addf %target_grad_sum_28, %target_grad_sum_29 : f32 loc(#loc109) + tt.reduce.return %target_grad_sum_30 : f32 loc(#loc107) + }) : (tensor<32768xf32>) -> f32 loc(#loc107) + %target_grad_sum_27 = arith.addf %target_grad_sum_16, %target_grad_sum_26 : f32 loc(#loc91) + scf.yield %target_grad_sum_27 : f32 loc(#loc36) + } loc(#loc82) + scf.for %i = %c0_i32 to %n_cols step %c32768_i32 : i32 { + %offsets = tt.make_range {end = 32768 : i32, start = 0 : i32} : tensor<32768xi32> loc(#loc92) + %offsets_16 = tt.splat %i : i32 -> tensor<32768xi32> loc(#loc93) + %offsets_17 = arith.addi %offsets_16, %offsets : tensor<32768xi32> loc(#loc93) + %mask = tt.splat %n_cols : i32 -> tensor<32768xi32> loc(#loc94) + %mask_18 = 
arith.cmpi slt, %offsets_17, %mask : tensor<32768xi32> loc(#loc94) + %logits_block = tt.splat %logits_ptr_5 : !tt.ptr -> tensor<32768x!tt.ptr> loc(#loc95) + %logits_block_19 = tt.addptr %logits_block, %offsets_17 : tensor<32768x!tt.ptr>, tensor<32768xi32> loc(#loc95) + %logits_block_20 = tt.load %logits_block_19, %mask_18, %cst : tensor<32768x!tt.ptr> loc(#loc96) + %logits_block_21 = arith.extf %logits_block_20 : tensor<32768xbf16> to tensor<32768xf32> loc(#loc97) + %target_block = tt.splat %target_ptr_8 : !tt.ptr -> tensor<32768x!tt.ptr> loc(#loc98) + %target_block_22 = tt.addptr %target_block, %offsets_17 : tensor<32768x!tt.ptr>, tensor<32768xi32> loc(#loc98) + %target_block_23 = tt.load %target_block_22, %mask_18, %cst_0 : tensor<32768x!tt.ptr> loc(#loc99) + %softmax_prob = tt.splat %m : f32 -> tensor<32768xf32> loc(#loc100) + %softmax_prob_24 = arith.subf %logits_block_21, %softmax_prob : tensor<32768xf32> loc(#loc100) + %softmax_prob_25 = math.exp %softmax_prob_24 : tensor<32768xf32> loc(#loc101) + %softmax_prob_26 = tt.splat %d : f32 -> tensor<32768xf32> loc(#loc102) + %softmax_prob_27 = arith.divf %softmax_prob_25, %softmax_prob_26 : tensor<32768xf32> loc(#loc102) + %normalized_grad = tt.splat %target_grad_sum : f32 -> tensor<32768xf32> loc(#loc103) + %normalized_grad_28 = arith.mulf %softmax_prob_27, %normalized_grad : tensor<32768xf32> loc(#loc103) + %grad_block = tt.splat %grad_output_15 : f32 -> tensor<32768xf32> loc(#loc104) + %grad_block_29 = arith.mulf %target_block_23, %grad_block : tensor<32768xf32> loc(#loc104) + %grad_block_30 = arith.subf %grad_block_29, %normalized_grad_28 : tensor<32768xf32> loc(#loc105) + %grad_block_31 = arith.subf %cst_0, %grad_block_30 : tensor<32768xf32> loc(#loc106) + %2 = arith.truncf %grad_block_31 : tensor<32768xf32> to tensor<32768xbf16> loc(#loc53) + tt.store %logits_block_19, %2, %mask_18 : tensor<32768x!tt.ptr> loc(#loc53) + } loc(#loc37) + tt.return loc(#loc54) + } loc(#loc) +} loc(#loc) +#loc2 = 
loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":132:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":127:31) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":127:37) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":128:31) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":128:18) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":129:31) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":129:18) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":130:25) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":133:24) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":134:34) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":135:39) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":135:26) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":136:29) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":137:34) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":137:43) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":138:8) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":140:13) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":141:13) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":142:16) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":143:16) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":144:26) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":145:32) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":149:30) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":150:35) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":150:22) +#loc27 = 
loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":151:25) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":152:44) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":152:31) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":155:64) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":155:77) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":155:27) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":155:8) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":158:30) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":159:35) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":159:22) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":160:25) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":161:44) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":161:31) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":162:12) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":164:44) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":164:31) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":167:45) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":167:30) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":167:50) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":168:41) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":169:38) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":169:52) +#loc52 = 
loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":169:23) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":170:39) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":158:4) +#loc65 = loc("position_mask"(#loc2)) +#loc66 = loc("program_id"(#loc3)) +#loc67 = loc("program_id"(#loc4)) +#loc68 = loc("logits_ptr"(#loc5)) +#loc69 = loc("logits_ptr"(#loc6)) +#loc70 = loc("target_ptr"(#loc7)) +#loc71 = loc("target_ptr"(#loc8)) +#loc72 = loc("position_mask_ptr"(#loc9)) +#loc73 = loc("offsets"(#loc12)) +#loc74 = loc("offsets"(#loc13)) +#loc75 = loc("mask"(#loc14)) +#loc76 = loc("m_ptr"(#loc18)) +#loc77 = loc("d_ptr"(#loc19)) +#loc78 = loc("m"(#loc20)) +#loc79 = loc("d"(#loc21)) +#loc80 = loc("grad_output"(#loc22)) +#loc81 = loc("grad_output"(#loc23)) +#loc82 = loc("target_grad_sum"(#loc24)) +#loc83 = loc("offsets"(#loc25)) +#loc84 = loc("offsets"(#loc26)) +#loc85 = loc("mask"(#loc27)) +#loc86 = loc("target_block"(#loc28)) +#loc87 = loc("target_block"(#loc29)) +#loc88 = loc("target_grad_sum"(#loc30)) +#loc89 = loc("target_grad_sum"(#loc31)) +#loc91 = loc("target_grad_sum"(#loc35)) +#loc92 = loc("offsets"(#loc38)) +#loc93 = loc("offsets"(#loc39)) +#loc94 = loc("mask"(#loc40)) +#loc95 = loc("logits_block"(#loc41)) +#loc96 = loc("logits_block"(#loc42)) +#loc97 = loc("logits_block"(#loc43)) +#loc98 = loc("target_block"(#loc44)) +#loc99 = loc("target_block"(#loc45)) +#loc100 = loc("softmax_prob"(#loc46)) +#loc101 = loc("softmax_prob"(#loc47)) +#loc102 = loc("softmax_prob"(#loc48)) +#loc103 = loc("normalized_grad"(#loc49)) +#loc104 = loc("grad_block"(#loc50)) +#loc105 = loc("grad_block"(#loc51)) +#loc106 = loc("grad_block"(#loc52)) +#loc107 = loc(callsite(#loc32 at #loc90)) +#loc109 = loc(callsite(#loc34 at #loc107)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/RLERDWVCUUZ5OAP52ESA35OJTILZ5NSISPJPZUCPMO7MZ6EV6FUA/__grp__triton_red_fused_zeros_0.json 
b/SpecForge-ext/cache/compiled_kernels/triton/7/RLERDWVCUUZ5OAP52ESA35OJTILZ5NSISPJPZUCPMO7MZ6EV6FUA/__grp__triton_red_fused_zeros_0.json new file mode 100644 index 0000000000000000000000000000000000000000..390b5cda51ba4bc4973e56711507ce42b3f0244e --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/RLERDWVCUUZ5OAP52ESA35OJTILZ5NSISPJPZUCPMO7MZ6EV6FUA/__grp__triton_red_fused_zeros_0.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused_zeros_0.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/RLERDWVCUUZ5OAP52ESA35OJTILZ5NSISPJPZUCPMO7MZ6EV6FUA/triton_red_fused_zeros_0.source", "triton_red_fused_zeros_0.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/RLERDWVCUUZ5OAP52ESA35OJTILZ5NSISPJPZUCPMO7MZ6EV6FUA/triton_red_fused_zeros_0.ttir", "triton_red_fused_zeros_0.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/RLERDWVCUUZ5OAP52ESA35OJTILZ5NSISPJPZUCPMO7MZ6EV6FUA/triton_red_fused_zeros_0.ttgir", "triton_red_fused_zeros_0.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/RLERDWVCUUZ5OAP52ESA35OJTILZ5NSISPJPZUCPMO7MZ6EV6FUA/triton_red_fused_zeros_0.llir", "triton_red_fused_zeros_0.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/RLERDWVCUUZ5OAP52ESA35OJTILZ5NSISPJPZUCPMO7MZ6EV6FUA/triton_red_fused_zeros_0.ptx", "triton_red_fused_zeros_0.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/RLERDWVCUUZ5OAP52ESA35OJTILZ5NSISPJPZUCPMO7MZ6EV6FUA/triton_red_fused_zeros_0.cubin", "triton_red_fused_zeros_0.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/RLERDWVCUUZ5OAP52ESA35OJTILZ5NSISPJPZUCPMO7MZ6EV6FUA/triton_red_fused_zeros_0.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/RLERDWVCUUZ5OAP52ESA35OJTILZ5NSISPJPZUCPMO7MZ6EV6FUA/triton_red_fused_zeros_0.cubin 
b/SpecForge-ext/cache/compiled_kernels/triton/7/RLERDWVCUUZ5OAP52ESA35OJTILZ5NSISPJPZUCPMO7MZ6EV6FUA/triton_red_fused_zeros_0.cubin new file mode 100644 index 0000000000000000000000000000000000000000..696e928d6e14ab57284029138acb5c7d729a923a Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/7/RLERDWVCUUZ5OAP52ESA35OJTILZ5NSISPJPZUCPMO7MZ6EV6FUA/triton_red_fused_zeros_0.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/RLERDWVCUUZ5OAP52ESA35OJTILZ5NSISPJPZUCPMO7MZ6EV6FUA/triton_red_fused_zeros_0.json b/SpecForge-ext/cache/compiled_kernels/triton/7/RLERDWVCUUZ5OAP52ESA35OJTILZ5NSISPJPZUCPMO7MZ6EV6FUA/triton_red_fused_zeros_0.json new file mode 100644 index 0000000000000000000000000000000000000000..662886a077c39e0c593a49265f74272afae8679c --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/RLERDWVCUUZ5OAP52ESA35OJTILZ5NSISPJPZUCPMO7MZ6EV6FUA/triton_red_fused_zeros_0.json @@ -0,0 +1 @@ +{"hash": "8ac911daa2a533d701fdd1240df5c99a179eb64893d2fcd04f63beccf895f168", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 16, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 256, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, 
"profile_scratch_align": 1, "name": "triton_red_fused_zeros_0"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/RLERDWVCUUZ5OAP52ESA35OJTILZ5NSISPJPZUCPMO7MZ6EV6FUA/triton_red_fused_zeros_0.llir b/SpecForge-ext/cache/compiled_kernels/triton/7/RLERDWVCUUZ5OAP52ESA35OJTILZ5NSISPJPZUCPMO7MZ6EV6FUA/triton_red_fused_zeros_0.llir new file mode 100644 index 0000000000000000000000000000000000000000..0e72e44276feadcec2248e15a09e5ff465274092 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/RLERDWVCUUZ5OAP52ESA35OJTILZ5NSISPJPZUCPMO7MZ6EV6FUA/triton_red_fused_zeros_0.llir @@ -0,0 +1,215 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8], align 16 + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused_zeros_0(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i32 %3, i32 %4, ptr addrspace(1) readnone captures(none) %5, ptr addrspace(1) readnone captures(none) %6) local_unnamed_addr #0 !dbg !4 { + %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %9 = shl i32 %8, 6, !dbg !8 + %10 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %11 = and i32 %10, 504, !dbg !9 + %12 = lshr exact i32 %11, 3, !dbg !9 + %13 = or disjoint i32 %12, %9, !dbg !10 + %14 = shl nuw nsw i32 %10, 3, !dbg !11 + %15 = and i32 %14, 56, !dbg !11 + %16 = sdiv i32 %13, 2048, !dbg !12 + %17 = mul i32 %16, 2048, !dbg !13 + %.decomposed = sub i32 %13, %17, !dbg !13 + %18 = srem i32 %16, 32, !dbg !14 + %19 = sdiv i32 %13, 65536, !dbg !15 + %20 = shl nsw i32 %18, 7, !dbg !16 + %21 = shl nsw i32 %.decomposed, 12, !dbg !17 + %22 = shl i32 %19, 23, !dbg !18 + %23 = shl i32 %13, 7, !dbg !19 + %24 = add i32 %22, %21 + %25 = add i32 %24, %20 + %26 = zext nneg i32 %15 to i64, !dbg !20 + %27 = sext i32 %23 to 
i64, !dbg !20 + %28 = or disjoint i32 %25, %15, !dbg !21 + %29 = sext i32 %28 to i64, !dbg !22 + %30 = getelementptr bfloat, ptr addrspace(1) %0, i64 %29, !dbg !22 + %31 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !23 + %32 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %30, i64 %31, i1 true) #4, !dbg !23 + %33 = extractvalue { i32, i32, i32, i32 } %32, 0, !dbg !23 + %34 = bitcast i32 %33 to <2 x bfloat>, !dbg !23 + %35 = extractvalue { i32, i32, i32, i32 } %32, 1, !dbg !23 + %36 = bitcast i32 %35 to <2 x bfloat>, !dbg !23 + %37 = extractvalue { i32, i32, i32, i32 } %32, 2, !dbg !23 + %38 = bitcast i32 %37 to <2 x bfloat>, !dbg !23 + %39 = extractvalue { i32, i32, i32, i32 } %32, 3, !dbg !23 + %40 = bitcast i32 %39 to <2 x bfloat>, !dbg !23 + %41 = getelementptr bfloat, ptr addrspace(1) %1, i64 %26, !dbg !24 + %42 = getelementptr bfloat, ptr addrspace(1) %41, i64 %27, !dbg !24 + %43 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !25 + %44 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %42, i64 %43, i1 true) #4, !dbg !25 + %45 = extractvalue { i32, i32, i32, i32 } %44, 0, !dbg !25 + %46 = bitcast i32 %45 to <2 x bfloat>, !dbg !25 + %47 = extractvalue { i32, i32, i32, i32 } %44, 1, !dbg !25 + %48 = bitcast i32 %47 to <2 x bfloat>, !dbg !25 + %49 = extractvalue { i32, i32, i32, i32 } %44, 2, !dbg !25 + %50 = bitcast 
i32 %49 to <2 x bfloat>, !dbg !25 + %51 = extractvalue { i32, i32, i32, i32 } %44, 3, !dbg !25 + %52 = bitcast i32 %51 to <2 x bfloat>, !dbg !25 + %53 = or disjoint i64 %26, 64, !dbg !26 + %54 = trunc nuw nsw i64 %53 to i32, !dbg !21 + %55 = or disjoint i32 %25, %54, !dbg !21 + %56 = sext i32 %55 to i64, !dbg !22 + %57 = getelementptr bfloat, ptr addrspace(1) %0, i64 %56, !dbg !22 + %58 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !23 + %59 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %57, i64 %58, i1 true) #4, !dbg !23 + %60 = extractvalue { i32, i32, i32, i32 } %59, 0, !dbg !23 + %61 = bitcast i32 %60 to <2 x bfloat>, !dbg !23 + %62 = extractvalue { i32, i32, i32, i32 } %59, 1, !dbg !23 + %63 = bitcast i32 %62 to <2 x bfloat>, !dbg !23 + %64 = extractvalue { i32, i32, i32, i32 } %59, 2, !dbg !23 + %65 = bitcast i32 %64 to <2 x bfloat>, !dbg !23 + %66 = extractvalue { i32, i32, i32, i32 } %59, 3, !dbg !23 + %67 = bitcast i32 %66 to <2 x bfloat>, !dbg !23 + %68 = getelementptr bfloat, ptr addrspace(1) %1, i64 %53, !dbg !24 + %69 = getelementptr bfloat, ptr addrspace(1) %68, i64 %27, !dbg !24 + %70 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !25 + %71 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %69, i64 %70, i1 true) #4, !dbg !25 + %72 = extractvalue { i32, i32, i32, i32 } %71, 0, !dbg !25 + %73 = bitcast i32 
%72 to <2 x bfloat>, !dbg !25 + %74 = extractvalue { i32, i32, i32, i32 } %71, 1, !dbg !25 + %75 = bitcast i32 %74 to <2 x bfloat>, !dbg !25 + %76 = extractvalue { i32, i32, i32, i32 } %71, 2, !dbg !25 + %77 = bitcast i32 %76 to <2 x bfloat>, !dbg !25 + %78 = extractvalue { i32, i32, i32, i32 } %71, 3, !dbg !25 + %79 = bitcast i32 %78 to <2 x bfloat>, !dbg !25 + %80 = fpext <2 x bfloat> %34 to <2 x float>, !dbg !27 + %81 = fpext <2 x bfloat> %46 to <2 x float>, !dbg !28 + %82 = fmul <2 x float> %80, %81, !dbg !29 + %83 = fadd <2 x float> %82, zeroinitializer, !dbg !30 + %84 = fpext <2 x bfloat> %61 to <2 x float>, !dbg !27 + %85 = fpext <2 x bfloat> %73 to <2 x float>, !dbg !28 + %86 = fmul <2 x float> %84, %85, !dbg !29 + %87 = fadd <2 x float> %83, %86, !dbg !30 + %88 = fpext <2 x bfloat> %36 to <2 x float>, !dbg !27 + %89 = fpext <2 x bfloat> %48 to <2 x float>, !dbg !28 + %90 = fmul <2 x float> %88, %89, !dbg !29 + %91 = fadd <2 x float> %90, zeroinitializer, !dbg !30 + %92 = fpext <2 x bfloat> %63 to <2 x float>, !dbg !27 + %93 = fpext <2 x bfloat> %75 to <2 x float>, !dbg !28 + %94 = fmul <2 x float> %92, %93, !dbg !29 + %95 = fadd <2 x float> %91, %94, !dbg !30 + %96 = fpext <2 x bfloat> %38 to <2 x float>, !dbg !27 + %97 = fpext <2 x bfloat> %50 to <2 x float>, !dbg !28 + %98 = fmul <2 x float> %96, %97, !dbg !29 + %99 = fadd <2 x float> %98, zeroinitializer, !dbg !30 + %100 = fpext <2 x bfloat> %65 to <2 x float>, !dbg !27 + %101 = fpext <2 x bfloat> %77 to <2 x float>, !dbg !28 + %102 = fmul <2 x float> %100, %101, !dbg !29 + %103 = fadd <2 x float> %99, %102, !dbg !30 + %104 = fpext <2 x bfloat> %40 to <2 x float>, !dbg !27 + %105 = fpext <2 x bfloat> %52 to <2 x float>, !dbg !28 + %106 = fmul <2 x float> %104, %105, !dbg !29 + %107 = fadd <2 x float> %106, zeroinitializer, !dbg !30 + %108 = fpext <2 x bfloat> %67 to <2 x float>, !dbg !27 + %109 = fpext <2 x bfloat> %79 to <2 x float>, !dbg !28 + %110 = fmul <2 x float> %108, %109, !dbg !29 + %111 = fadd 
<2 x float> %107, %110, !dbg !30 + %112 = and i32 %10, 63, !dbg !9 + %113 = or disjoint i32 %9, %112, !dbg !10 + %shift = shufflevector <2 x float> %87, <2 x float> poison, <2 x i32> , !dbg !31 + %foldExtExtBinop = fadd <2 x float> %87, %shift, !dbg !31 + %foldExtExtBinop9 = fadd <2 x float> %95, %foldExtExtBinop, !dbg !31 + %shift11 = shufflevector <2 x float> %95, <2 x float> poison, <2 x i32> , !dbg !31 + %foldExtExtBinop12 = fadd <2 x float> %shift11, %foldExtExtBinop9, !dbg !31 + %foldExtExtBinop14 = fadd <2 x float> %103, %foldExtExtBinop12, !dbg !31 + %shift16 = shufflevector <2 x float> %103, <2 x float> poison, <2 x i32> , !dbg !31 + %foldExtExtBinop17 = fadd <2 x float> %shift16, %foldExtExtBinop14, !dbg !31 + %foldExtExtBinop19 = fadd <2 x float> %111, %foldExtExtBinop17, !dbg !31 + %shift21 = shufflevector <2 x float> %111, <2 x float> poison, <2 x i32> , !dbg !31 + %foldExtExtBinop22 = fadd <2 x float> %shift21, %foldExtExtBinop19, !dbg !31 + %114 = extractelement <2 x float> %foldExtExtBinop22, i64 0, !dbg !31 + %115 = bitcast float %114 to i32, !dbg !35 + %116 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %115, i32 4, i32 31), !dbg !35 + %117 = bitcast i32 %116 to float, !dbg !35 + %118 = fadd float %114, %117, !dbg !31 + %119 = bitcast float %118 to i32, !dbg !35 + %120 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %119, i32 2, i32 31), !dbg !35 + %121 = bitcast i32 %120 to float, !dbg !35 + %122 = fadd float %118, %121, !dbg !31 + %123 = bitcast float %122 to i32, !dbg !35 + %124 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %123, i32 1, i32 31), !dbg !35 + %125 = bitcast i32 %124 to float, !dbg !35 + %126 = fadd float %122, %125, !dbg !31 + %127 = lshr exact i32 %11, 1, !dbg !36 + %128 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %127, !dbg !36 + store float %126, ptr addrspace(3) %128, align 4, !dbg !36 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !36 + %129 = shl 
nuw nsw i32 %112, 2, !dbg !36 + %130 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %129, !dbg !36 + %131 = load i32, ptr addrspace(3) %130, align 4, !dbg !36 + %132 = sext i32 %113 to i64, !dbg !37 + %133 = getelementptr float, ptr addrspace(1) %2, i64 %132, !dbg !37 + %134 = and i32 %10, 448, !dbg !38 + %135 = icmp eq i32 %134, 0, !dbg !38 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %131, ptr addrspace(1) %133, i1 %135) #4, !dbg !38 + ret void, !dbg !39 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3 + +attributes #0 = { nounwind "nvvm.reqntid"="512" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { convergent nocallback nounwind } +attributes #4 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: 
"triton_red_fused_zeros_0", linkageName: "triton_red_fused_zeros_0", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 23, column: 28, scope: !4) +!8 = !DILocation(line: 23, column: 33, scope: !4) +!9 = !DILocation(line: 24, column: 44, scope: !4) +!10 = !DILocation(line: 24, column: 23, scope: !4) +!11 = !DILocation(line: 26, column: 37, scope: !4) +!12 = !DILocation(line: 29, column: 21, scope: !4) +!13 = !DILocation(line: 28, column: 19, scope: !4) +!14 = !DILocation(line: 29, column: 29, scope: !4) +!15 = !DILocation(line: 30, column: 19, scope: !4) +!16 = !DILocation(line: 39, column: 45, scope: !4) +!17 = !DILocation(line: 39, column: 55, scope: !4) +!18 = !DILocation(line: 39, column: 68, scope: !4) +!19 = !DILocation(line: 40, column: 45, scope: !4) +!20 = !DILocation(line: 33, column: 40, scope: !4) +!21 = !DILocation(line: 39, column: 60, scope: !4) +!22 = !DILocation(line: 39, column: 34, scope: !4) +!23 = !DILocation(line: 39, column: 73, scope: !4) +!24 = !DILocation(line: 40, column: 34, scope: !4) +!25 = !DILocation(line: 40, column: 50, scope: !4) +!26 = !DILocation(line: 34, column: 31, scope: !4) +!27 = !DILocation(line: 39, column: 127, scope: !4) +!28 = !DILocation(line: 40, column: 104, scope: !4) +!29 = !DILocation(line: 41, column: 22, scope: !4) +!30 = !DILocation(line: 43, column: 23, scope: !4) +!31 = !DILocation(line: 261, column: 15, scope: !32, inlinedAt: !34) +!32 = distinct !DILexicalBlockFile(scope: !4, file: !33, discriminator: 0) +!33 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language") +!34 = !DILocation(line: 45, column: 25, scope: !4) +!35 = !DILocation(line: 291, column: 36, scope: !32, inlinedAt: !34) +!36 = !DILocation(line: 45, column: 28, scope: !4) +!37 = !DILocation(line: 49, column: 25, scope: !4) +!38 = 
!DILocation(line: 49, column: 36, scope: !4) +!39 = !DILocation(line: 49, column: 4, scope: !4) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/RLERDWVCUUZ5OAP52ESA35OJTILZ5NSISPJPZUCPMO7MZ6EV6FUA/triton_red_fused_zeros_0.ptx b/SpecForge-ext/cache/compiled_kernels/triton/7/RLERDWVCUUZ5OAP52ESA35OJTILZ5NSISPJPZUCPMO7MZ6EV6FUA/triton_red_fused_zeros_0.ptx new file mode 100644 index 0000000000000000000000000000000000000000..772b3c2578e89d95b36b67ee20f5fef3daae482a --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/RLERDWVCUUZ5OAP52ESA35OJTILZ5NSISPJPZUCPMO7MZ6EV6FUA/triton_red_fused_zeros_0.ptx @@ -0,0 +1,511 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_red_fused_zeros_0 // -- Begin function triton_red_fused_zeros_0 +.extern .shared .align 16 .b8 global_smem[]; + // @triton_red_fused_zeros_0 +.visible .entry triton_red_fused_zeros_0( + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_1, + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_2, + .param .u32 triton_red_fused_zeros_0_param_3, + .param .u32 triton_red_fused_zeros_0_param_4, + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_5, + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_6 +) +.reqntid 512 +{ + .reg .pred %p<6>; + .reg .b16 %rs<33>; + .reg .b32 %r<131>; + .reg .b64 %rd<23>; + .loc 1 18 0 // cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py:18:0 + +// %bb.0: + ld.param.b64 %rd14, [triton_red_fused_zeros_0_param_0]; + ld.param.b64 %rd15, [triton_red_fused_zeros_0_param_1]; +$L__tmp0: + .loc 1 23 28 // cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py:23:28 + mov.u32 %r34, %ctaid.x; + .loc 1 23 33 // cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py:23:33 + 
shl.b32 %r35, %r34, 6; + ld.param.b64 %rd16, [triton_red_fused_zeros_0_param_2]; + .loc 1 24 44 // cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py:24:44 + mov.u32 %r36, %tid.x; + and.b32 %r37, %r36, 504; + bfe.u32 %r38, %r36, 3, 6; + .loc 1 24 23 // cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py:24:23 + or.b32 %r39, %r38, %r35; + .loc 1 26 37 // cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py:26:37 + shl.b32 %r40, %r36, 3; + and.b32 %r41, %r40, 56; + .loc 1 29 21 // cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py:29:21 + bfe.s32 %r42, %r34, 25, 1; + shr.u32 %r43, %r42, 21; + add.s32 %r44, %r39, %r43; + shr.s32 %r45, %r44, 11; + .loc 1 28 19 // cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py:28:19 + and.b32 %r46, %r44, 1046528; + sub.s32 %r47, %r39, %r46; + .loc 1 29 29 // cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py:29:29 + shr.u32 %r48, %r45, 27; + add.s32 %r49, %r45, %r48; + and.b32 %r50, %r49, 33554400; + sub.s32 %r51, %r45, %r50; + .loc 1 30 19 // cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py:30:19 + shr.u32 %r52, %r42, 16; + add.s32 %r53, %r39, %r52; + .loc 1 39 45 // cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py:39:45 + shl.b32 %r54, %r51, 7; + .loc 1 39 55 // cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py:39:55 + shl.b32 %r55, %r47, 12; + .loc 1 39 68 // cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py:39:68 + shl.b32 %r56, %r53, 7; + and.b32 %r57, %r56, -8388608; + .loc 1 40 45 // cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py:40:45 + shl.b32 %r58, %r39, 7; + add.s32 %r59, %r57, %r55; + add.s32 %r60, %r59, %r54; + .loc 1 33 40 // cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py:33:40 + cvt.u64.u32 %rd17, %r41; + .loc 1 39 60 // cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py:39:60 + or.b32 %r61, %r60, %r41; + .loc 1 39 34 // cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py:39:34 + mad.wide.s32 %rd2, %r61, 2, %rd14; + .loc 1 39 73 // 
cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py:39:73 + // begin inline asm + mov.u64 %rd3, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd3, 1.0; + // end inline asm + mov.b32 %r5, 0; + mov.pred %p1, -1; + // begin inline asm + mov.u32 %r1, %r5; + mov.u32 %r2, %r5; + mov.u32 %r3, %r5; + mov.u32 %r4, %r5; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { %r1, %r2, %r3, %r4 }, [ %rd2 + 0 ], %rd3; + // end inline asm + .loc 1 40 34 // cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py:40:34 + mad.wide.u32 %rd18, %r41, 2, %rd15; + mad.wide.s32 %rd5, %r58, 2, %rd18; + .loc 1 40 50 // cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py:40:50 + // begin inline asm + mov.u64 %rd6, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd6, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r9, %r5; + mov.u32 %r10, %r5; + mov.u32 %r11, %r5; + mov.u32 %r12, %r5; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { %r9, %r10, %r11, %r12 }, [ %rd5 + 0 ], %rd6; + // end inline asm + .loc 1 39 34 // cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py:39:34 + cvt.s64.s32 %rd19, %r60; + or.b64 %rd20, %rd19, %rd17; + shl.b64 %rd21, %rd20, 1; + add.s64 %rd22, %rd14, %rd21; + add.s64 %rd8, %rd22, 128; + .loc 1 39 73 // cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py:39:73 + // begin inline asm + mov.u64 %rd9, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd9, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r17, %r5; + mov.u32 %r18, %r5; + mov.u32 %r19, %r5; + mov.u32 %r20, %r5; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { %r17, %r18, %r19, %r20 }, [ %rd8 + 0 ], %rd9; + // end inline asm + .loc 1 40 34 // cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py:40:34 + add.s64 %rd11, %rd5, 128; + .loc 1 40 50 // cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py:40:50 + // begin inline asm + mov.u64 %rd12, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd12, 1.0; + // end inline asm + // begin 
inline asm + mov.u32 %r25, %r5; + mov.u32 %r26, %r5; + mov.u32 %r27, %r5; + mov.u32 %r28, %r5; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { %r25, %r26, %r27, %r28 }, [ %rd11 + 0 ], %rd12; + // end inline asm + .loc 1 39 127 // cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py:39:127 + mov.b32 {%rs1, %rs2}, %r1; + cvt.f32.bf16 %r62, %rs2; + cvt.f32.bf16 %r63, %rs1; + .loc 1 40 104 // cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py:40:104 + mov.b32 {%rs3, %rs4}, %r9; + cvt.f32.bf16 %r64, %rs4; + cvt.f32.bf16 %r65, %rs3; + .loc 1 43 23 // cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py:43:23 + fma.rn.f32 %r66, %r63, %r65, 0f00000000; + fma.rn.f32 %r67, %r62, %r64, 0f00000000; + .loc 1 39 127 // cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py:39:127 + mov.b32 {%rs5, %rs6}, %r17; + cvt.f32.bf16 %r68, %rs5; + cvt.f32.bf16 %r69, %rs6; + .loc 1 40 104 // cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py:40:104 + mov.b32 {%rs7, %rs8}, %r25; + cvt.f32.bf16 %r70, %rs7; + cvt.f32.bf16 %r71, %rs8; + .loc 1 43 23 // cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py:43:23 + fma.rn.f32 %r72, %r69, %r71, %r67; + fma.rn.f32 %r73, %r68, %r70, %r66; + .loc 1 39 127 // cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py:39:127 + mov.b32 {%rs9, %rs10}, %r2; + cvt.f32.bf16 %r74, %rs10; + cvt.f32.bf16 %r75, %rs9; + .loc 1 40 104 // cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py:40:104 + mov.b32 {%rs11, %rs12}, %r10; + cvt.f32.bf16 %r76, %rs12; + cvt.f32.bf16 %r77, %rs11; + .loc 1 43 23 // cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py:43:23 + fma.rn.f32 %r78, %r75, %r77, 0f00000000; + fma.rn.f32 %r79, %r74, %r76, 0f00000000; + .loc 1 39 127 // cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py:39:127 + mov.b32 {%rs13, %rs14}, %r18; + cvt.f32.bf16 %r80, %rs13; + cvt.f32.bf16 %r81, %rs14; + .loc 1 40 104 // cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py:40:104 + mov.b32 {%rs15, %rs16}, %r26; + 
cvt.f32.bf16 %r82, %rs15; + cvt.f32.bf16 %r83, %rs16; + .loc 1 43 23 // cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py:43:23 + fma.rn.f32 %r84, %r81, %r83, %r79; + fma.rn.f32 %r85, %r80, %r82, %r78; + .loc 1 39 127 // cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py:39:127 + mov.b32 {%rs17, %rs18}, %r3; + cvt.f32.bf16 %r86, %rs18; + cvt.f32.bf16 %r87, %rs17; + .loc 1 40 104 // cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py:40:104 + mov.b32 {%rs19, %rs20}, %r11; + cvt.f32.bf16 %r88, %rs20; + cvt.f32.bf16 %r89, %rs19; + .loc 1 43 23 // cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py:43:23 + fma.rn.f32 %r90, %r87, %r89, 0f00000000; + fma.rn.f32 %r91, %r86, %r88, 0f00000000; + .loc 1 39 127 // cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py:39:127 + mov.b32 {%rs21, %rs22}, %r19; + cvt.f32.bf16 %r92, %rs21; + cvt.f32.bf16 %r93, %rs22; + .loc 1 40 104 // cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py:40:104 + mov.b32 {%rs23, %rs24}, %r27; + cvt.f32.bf16 %r94, %rs23; + cvt.f32.bf16 %r95, %rs24; + .loc 1 43 23 // cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py:43:23 + fma.rn.f32 %r96, %r93, %r95, %r91; + fma.rn.f32 %r97, %r92, %r94, %r90; + .loc 1 39 127 // cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py:39:127 + mov.b32 {%rs25, %rs26}, %r4; + cvt.f32.bf16 %r98, %rs26; + cvt.f32.bf16 %r99, %rs25; + .loc 1 40 104 // cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py:40:104 + mov.b32 {%rs27, %rs28}, %r12; + cvt.f32.bf16 %r100, %rs28; + cvt.f32.bf16 %r101, %rs27; + .loc 1 43 23 // cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py:43:23 + fma.rn.f32 %r102, %r99, %r101, 0f00000000; + fma.rn.f32 %r103, %r98, %r100, 0f00000000; + .loc 1 39 127 // cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py:39:127 + mov.b32 {%rs29, %rs30}, %r20; + cvt.f32.bf16 %r104, %rs29; + cvt.f32.bf16 %r105, %rs30; + .loc 1 40 104 // cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py:40:104 + mov.b32 {%rs31, %rs32}, 
%r28; + cvt.f32.bf16 %r106, %rs31; + cvt.f32.bf16 %r107, %rs32; + .loc 1 43 23 // cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py:43:23 + fma.rn.f32 %r108, %r105, %r107, %r103; + fma.rn.f32 %r109, %r104, %r106, %r102; + .loc 1 24 44 // cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py:24:44 + and.b32 %r110, %r36, 63; + .loc 1 24 23 // cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py:24:23 + or.b32 %r111, %r35, %r110; +$L__tmp1: + .loc 2 261 15 // standard.py:261:15 @[ cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py:45:25 ] + add.f32 %r112, %r73, %r72; + add.f32 %r113, %r85, %r112; + add.f32 %r114, %r84, %r113; + add.f32 %r115, %r97, %r114; + add.f32 %r116, %r96, %r115; + add.f32 %r117, %r109, %r116; + add.f32 %r118, %r108, %r117; + .loc 2 291 36 // standard.py:291:36 @[ cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py:45:25 ] + shfl.sync.bfly.b32 %r119, %r118, 4, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py:45:25 ] + add.f32 %r120, %r118, %r119; + .loc 2 291 36 // standard.py:291:36 @[ cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py:45:25 ] + shfl.sync.bfly.b32 %r121, %r120, 2, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py:45:25 ] + add.f32 %r122, %r120, %r121; + .loc 2 291 36 // standard.py:291:36 @[ cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py:45:25 ] + shfl.sync.bfly.b32 %r123, %r122, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py:45:25 ] + add.f32 %r124, %r122, %r123; +$L__tmp2: + .loc 1 45 28 // cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py:45:28 + shr.u32 %r125, %r37, 1; + mov.b32 %r126, global_smem; + add.s32 %r127, %r126, %r125; + st.shared.b32 [%r127], %r124; + bar.sync 0; + shl.b32 %r128, %r110, 2; + add.s32 %r129, %r126, %r128; + ld.shared.b32 %r33, [%r129]; + .loc 1 49 25 // 
cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py:49:25 + mad.wide.s32 %rd13, %r111, 4, %rd16; + .loc 1 49 36 // cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py:49:36 + and.b32 %r130, %r36, 448; + setp.eq.b32 %p5, %r130, 0; + // begin inline asm + @%p5 st.global.b32 [ %rd13 + 0 ], { %r33 }; + // end inline asm + .loc 1 49 4 // cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py:49:4 + ret; +$L__tmp3: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // 
EOM(3) + } + .section .debug_info + { +.b32 209 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0xca DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 99 +.b8 51 +.b8 103 +.b8 117 +.b8 119 +.b8 110 +.b8 119 +.b8 105 +.b8 111 +.b8 120 +.b8 51 +.b8 121 +.b8 122 +.b8 122 +.b8 106 +.b8 116 +.b8 97 +.b8 113 +.b8 117 +.b8 104 +.b8 54 +.b8 107 +.b8 52 +.b8 115 +.b8 109 +.b8 54 +.b8 110 +.b8 110 +.b8 52 +.b8 108 +.b8 99 +.b8 109 +.b8 107 +.b8 101 +.b8 112 +.b8 53 +.b8 54 +.b8 114 +.b8 111 +.b8 112 +.b8 51 +.b8 103 +.b8 114 +.b8 113 +.b8 114 +.b8 52 +.b8 52 +.b8 120 +.b8 111 +.b8 114 +.b8 104 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 99 +.b8 51 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x1b DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 122 +.b8 101 +.b8 114 +.b8 111 +.b8 115 +.b8 95 +.b8 48 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0xa6:0x2e DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0xbb:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 
$L__tmp2 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 45 // DW_AT_call_line +.b8 25 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/RLERDWVCUUZ5OAP52ESA35OJTILZ5NSISPJPZUCPMO7MZ6EV6FUA/triton_red_fused_zeros_0.source b/SpecForge-ext/cache/compiled_kernels/triton/7/RLERDWVCUUZ5OAP52ESA35OJTILZ5NSISPJPZUCPMO7MZ6EV6FUA/triton_red_fused_zeros_0.source new file mode 100644 index 0000000000000000000000000000000000000000..586a3a530ec43dbc5eaeb2552b0feb548cd2b7e2 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/RLERDWVCUUZ5OAP52ESA35OJTILZ5NSISPJPZUCPMO7MZ6EV6FUA/triton_red_fused_zeros_0.source @@ -0,0 +1,222 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":18:0) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0) +#loc46 = loc(unknown) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0) +#loc53 = loc("in_ptr0"(#loc)) +#loc54 = loc("in_ptr1"(#loc)) +#loc55 = loc("out_ptr1"(#loc)) +#loc56 = loc("xnumel"(#loc)) +#loc57 = loc("r0_numel"(#loc)) +#loc97 = loc("input"(#loc44)) +#loc98 = loc("a"(#loc49)) +#loc99 = loc("b"(#loc49)) +module { + tt.func public @triton_red_fused_zeros_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 131072 : i32 loc(#loc58) + %r0_numel_1 = arith.constant 128 : i32 loc(#loc59) + %xoffset = tt.get_program_id x : i32 loc(#loc60) + %xoffset_2 = arith.constant 64 : i32 loc(#loc61) + 
%xoffset_3 = arith.constant 64 : i32 loc(#loc61) + %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc61) + %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc62) + %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc63) + %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<64x1xi32> loc(#loc64) + %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<64x1xi32> loc(#loc64) + %xmask = arith.constant true loc(#loc65) + %xmask_8 = arith.constant dense : tensor<64x64xi1> loc(#loc65) + %r0_base = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc66) + %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc67) + %x0 = arith.constant 2048 : i32 loc(#loc68) + %x0_10 = arith.constant 2048 : i32 loc(#loc68) + %x0_11 = arith.constant dense<2048> : tensor<64x1xi32> loc(#loc68) + %x0_12 = arith.remsi %xindex_7, %x0_11 : tensor<64x1xi32> loc(#loc68) + %x1 = arith.constant 2048 : i32 loc(#loc69) + %x1_13 = arith.constant 2048 : i32 loc(#loc69) + %x1_14 = arith.constant dense<2048> : tensor<64x1xi32> loc(#loc69) + %x1_15 = arith.divsi %xindex_7, %x1_14 : tensor<64x1xi32> loc(#loc69) + %x1_16 = arith.constant 32 : i32 loc(#loc70) + %x1_17 = arith.constant 32 : i32 loc(#loc70) + %x1_18 = arith.constant dense<32> : tensor<64x1xi32> loc(#loc70) + %x1_19 = arith.remsi %x1_15, %x1_18 : tensor<64x1xi32> loc(#loc70) + %x2 = arith.constant 65536 : i32 loc(#loc71) + %x2_20 = arith.constant 65536 : i32 loc(#loc71) + %x2_21 = arith.constant dense<65536> : tensor<64x1xi32> loc(#loc71) + %x2_22 = arith.divsi %xindex_7, %x2_21 : tensor<64x1xi32> loc(#loc71) + %_tmp4 = arith.constant 0.000000e+00 : f32 loc(#loc72) + %_tmp4_23 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc72) + %c0_i32 = arith.constant 0 : i32 loc(#loc16) + %c64_i32 = arith.constant 64 : i32 loc(#loc16) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc16) + %1 = 
arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc16) + %2 = arith.bitcast %c64_i32 : i32 to i32 loc(#loc16) + %3 = ub.poison : i32 loc(#loc16) + %_tmp4_24 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp4_27 = %_tmp4_23) -> (tensor<64x64xf32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x64xi32> loc(#loc74) + %r0_index_28 = arith.addi %r0_index, %r0_base_9 : tensor<1x64xi32> loc(#loc74) + %r0_mask = arith.constant dense<128> : tensor<1x64xi32> loc(#loc75) + %r0_mask_29 = arith.cmpi slt, %r0_index_28, %r0_mask : tensor<1x64xi32> loc(#loc75) + %tmp0 = arith.constant 128 : i32 loc(#loc76) + %tmp0_30 = arith.constant 128 : i32 loc(#loc76) + %tmp0_31 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc76) + %tmp0_32 = arith.muli %tmp0_31, %x1_19 : tensor<64x1xi32> loc(#loc76) + %tmp0_33 = tt.broadcast %r0_index_28 : tensor<1x64xi32> -> tensor<64x64xi32> loc(#loc77) + %tmp0_34 = tt.broadcast %tmp0_32 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc77) + %tmp0_35 = arith.addi %tmp0_33, %tmp0_34 : tensor<64x64xi32> loc(#loc77) + %tmp0_36 = arith.constant 4096 : i32 loc(#loc78) + %tmp0_37 = arith.constant 4096 : i32 loc(#loc78) + %tmp0_38 = arith.constant dense<4096> : tensor<64x1xi32> loc(#loc78) + %tmp0_39 = arith.muli %tmp0_38, %x0_12 : tensor<64x1xi32> loc(#loc78) + %tmp0_40 = tt.broadcast %tmp0_39 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc79) + %tmp0_41 = arith.addi %tmp0_35, %tmp0_40 : tensor<64x64xi32> loc(#loc79) + %tmp0_42 = arith.constant 8388608 : i32 loc(#loc80) + %tmp0_43 = arith.constant 8388608 : i32 loc(#loc80) + %tmp0_44 = arith.constant dense<8388608> : tensor<64x1xi32> loc(#loc80) + %tmp0_45 = arith.muli %tmp0_44, %x2_22 : tensor<64x1xi32> loc(#loc80) + %tmp0_46 = tt.broadcast %tmp0_45 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc81) + %tmp0_47 = arith.addi %tmp0_41, %tmp0_46 : tensor<64x64xi32> loc(#loc81) + %tmp0_48 = tt.splat %in_ptr0 : !tt.ptr -> tensor<64x64x!tt.ptr> loc(#loc82) + %tmp0_49 = tt.addptr %tmp0_48, 
%tmp0_47 : tensor<64x64x!tt.ptr>, tensor<64x64xi32> loc(#loc82) + %tmp0_50 = arith.constant 0.000000e+00 : f32 loc(#loc83) + %tmp0_51 = tt.broadcast %r0_mask_29 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc83) + %tmp0_52 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc83) + %tmp0_53 = arith.truncf %tmp0_52 : tensor<64x64xf32> to tensor<64x64xbf16> loc(#loc83) + %tmp0_54 = tt.load %tmp0_49, %tmp0_51, %tmp0_53 evictionPolicy = evict_first : tensor<64x64x!tt.ptr> loc(#loc83) + %tmp0_55 = arith.extf %tmp0_54 : tensor<64x64xbf16> to tensor<64x64xf32> loc(#loc84) + %tmp1 = arith.constant 128 : i32 loc(#loc85) + %tmp1_56 = arith.constant 128 : i32 loc(#loc85) + %tmp1_57 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc85) + %tmp1_58 = arith.muli %tmp1_57, %xindex_7 : tensor<64x1xi32> loc(#loc85) + %tmp1_59 = tt.broadcast %r0_index_28 : tensor<1x64xi32> -> tensor<64x64xi32> loc(#loc86) + %tmp1_60 = tt.broadcast %tmp1_58 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc86) + %tmp1_61 = arith.addi %tmp1_59, %tmp1_60 : tensor<64x64xi32> loc(#loc86) + %tmp1_62 = tt.splat %in_ptr1 : !tt.ptr -> tensor<64x64x!tt.ptr> loc(#loc87) + %tmp1_63 = tt.addptr %tmp1_62, %tmp1_61 : tensor<64x64x!tt.ptr>, tensor<64x64xi32> loc(#loc87) + %tmp1_64 = arith.constant 0.000000e+00 : f32 loc(#loc88) + %tmp1_65 = tt.broadcast %r0_mask_29 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc88) + %tmp1_66 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc88) + %tmp1_67 = arith.truncf %tmp1_66 : tensor<64x64xf32> to tensor<64x64xbf16> loc(#loc88) + %tmp1_68 = tt.load %tmp1_63, %tmp1_65, %tmp1_67 evictionPolicy = evict_first : tensor<64x64x!tt.ptr> loc(#loc88) + %tmp1_69 = arith.extf %tmp1_68 : tensor<64x64xbf16> to tensor<64x64xf32> loc(#loc89) + %tmp2 = arith.mulf %tmp0_55, %tmp1_69 : tensor<64x64xf32> loc(#loc90) + %tmp5 = arith.addf %_tmp4_27, %tmp2 : tensor<64x64xf32> loc(#loc91) + %_tmp4_70 = tt.broadcast %r0_mask_29 : tensor<1x64xi1> -> tensor<64x64xi1> 
loc(#loc92) + %_tmp4_71 = arith.select %_tmp4_70, %tmp5, %_tmp4_27 : tensor<64x64xi1>, tensor<64x64xf32> loc(#loc92) + scf.yield %_tmp4_71 : tensor<64x64xf32> loc(#loc36) + } loc(#loc73) + %tmp4 = tt.call @"triton.language.standard.sum__fp32S64_64S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp4_24) : (tensor<64x64xf32>) -> tensor<64xf32> loc(#loc93) + %tmp4_25 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<64xf32> -> tensor<64x1xf32> loc(#loc94) + %tmp7 = arith.constant 0.000000e+00 : f32 loc(#loc95) + %tmp8 = arith.constant dense<0.000000e+00> : tensor<64x1xf32> loc(#loc96) + %tmp8_26 = arith.subf %tmp4_25, %tmp8 : tensor<64x1xf32> loc(#loc96) + %4 = tt.splat %out_ptr1 : !tt.ptr -> tensor<64x1x!tt.ptr> loc(#loc41) + %5 = tt.addptr %4, %xindex_7 : tensor<64x1x!tt.ptr>, tensor<64x1xi32> loc(#loc41) + tt.store %5, %tmp8_26 : tensor<64x1x!tt.ptr> loc(#loc42) + tt.return loc(#loc43) + } loc(#loc) + tt.func private @"triton.language.standard.sum__fp32S64_64S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<64x64xf32> loc("input"(#loc44))) -> tensor<64xf32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc45) + tt.reduce.return %2 : f32 loc(#loc45) + }) : (tensor<64x64xf32>) -> tensor<64xf32> loc(#loc45) + tt.return %0 : tensor<64xf32> loc(#loc47) + ^bb1: // no predecessors + %1 = ub.poison : tensor<64xf32> loc(#loc48) + tt.return %1 : tensor<64xf32> loc(#loc48) + } loc(#loc44) + tt.func private @triton.language.standard._sum_combine__fp32_fp32__(%a: f32 loc("a"(#loc49)), %b: f32 loc("b"(#loc49))) -> f32 attributes {noinline = false} { + %0 = arith.addf %a, %b : f32 loc(#loc50) + tt.return %0 : f32 loc(#loc51) + ^bb1: // no predecessors + %1 = ub.poison : f32 loc(#loc52) + tt.return %1 : f32 loc(#loc52) + } loc(#loc49) +} loc(#loc) +#loc1 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":19:13) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":20:15) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":23:28) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":23:33) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":24:36) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":24:44) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":24:23) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":25:46) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":26:27) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":26:37) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":28:19) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":29:21) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":29:29) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":30:19) +#loc15 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":32:43) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":33:40) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":34:31) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":35:29) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":39:45) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":39:41) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":39:55) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":39:50) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":39:68) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":39:60) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":39:34) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":39:73) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":39:127) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":40:45) +#loc29 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":40:41) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":40:34) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":40:50) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":40:104) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":41:22) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":43:23) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":44:40) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":44:8) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":45:25) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":45:28) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":47:11) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":48:18) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":49:25) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":49:36) +#loc43 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":49:4) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4) +#loc58 = loc("xnumel"(#loc1)) +#loc59 = loc("r0_numel"(#loc2)) +#loc60 = loc("xoffset"(#loc3)) +#loc61 = loc("xoffset"(#loc4)) +#loc62 = loc("xindex"(#loc5)) +#loc63 = loc("xindex"(#loc6)) +#loc64 = loc("xindex"(#loc7)) +#loc65 = loc("xmask"(#loc8)) +#loc66 = loc("r0_base"(#loc9)) +#loc67 = loc("r0_base"(#loc10)) +#loc68 = loc("x0"(#loc11)) +#loc69 = loc("x1"(#loc12)) +#loc70 = loc("x1"(#loc13)) +#loc71 = loc("x2"(#loc14)) +#loc72 = loc("_tmp4"(#loc15)) +#loc73 = loc("_tmp4"(#loc16)) +#loc74 = loc("r0_index"(#loc17)) +#loc75 = loc("r0_mask"(#loc18)) +#loc76 = loc("tmp0"(#loc19)) +#loc77 = loc("tmp0"(#loc20)) +#loc78 = loc("tmp0"(#loc21)) +#loc79 = loc("tmp0"(#loc22)) +#loc80 = loc("tmp0"(#loc23)) +#loc81 = loc("tmp0"(#loc24)) +#loc82 = loc("tmp0"(#loc25)) +#loc83 = loc("tmp0"(#loc26)) +#loc84 = loc("tmp0"(#loc27)) +#loc85 = loc("tmp1"(#loc28)) +#loc86 = loc("tmp1"(#loc29)) +#loc87 = loc("tmp1"(#loc30)) +#loc88 = loc("tmp1"(#loc31)) +#loc89 = loc("tmp1"(#loc32)) +#loc90 = loc("tmp2"(#loc33)) +#loc91 = loc("tmp5"(#loc34)) +#loc92 = loc("_tmp4"(#loc35)) +#loc93 = loc("tmp4"(#loc37)) +#loc94 = loc("tmp4"(#loc38)) +#loc95 = loc("tmp7"(#loc39)) +#loc96 = loc("tmp8"(#loc40)) diff --git 
a/SpecForge-ext/cache/compiled_kernels/triton/7/RLERDWVCUUZ5OAP52ESA35OJTILZ5NSISPJPZUCPMO7MZ6EV6FUA/triton_red_fused_zeros_0.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/7/RLERDWVCUUZ5OAP52ESA35OJTILZ5NSISPJPZUCPMO7MZ6EV6FUA/triton_red_fused_zeros_0.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..623d4fde1ebfdad501b9ef2a9372b2a06b622c4a --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/RLERDWVCUUZ5OAP52ESA35OJTILZ5NSISPJPZUCPMO7MZ6EV6FUA/triton_red_fused_zeros_0.ttgir @@ -0,0 +1,154 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 8], warpsPerCTA = [16, 1], order = [1, 0]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 8], order = [0, 1]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":18:0) +#loc1 = loc(unknown) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":45:25) +#loc39 = loc("in_ptr0"(#loc)) +#loc40 = loc("in_ptr1"(#loc)) +#loc41 = loc("out_ptr1"(#loc)) +#loc42 = loc("xnumel"(#loc)) +#loc43 = loc("r0_numel"(#loc)) +#loc73 = loc("tmp4"(#loc33)) +#loc76 = loc(callsite(#loc1 at #loc73)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 16 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused_zeros_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<128> : tensor<1x64xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<128> : tensor<64x1xi32, #blocked> loc(#loc1) + 
%cst_1 = arith.constant dense<4096> : tensor<64x1xi32, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<8388608> : tensor<64x1xi32, #blocked> loc(#loc1) + %cst_3 = arith.constant dense<65536> : tensor<64x1xi32, #blocked> loc(#loc1) + %cst_4 = arith.constant dense<32> : tensor<64x1xi32, #blocked> loc(#loc1) + %cst_5 = arith.constant dense<2048> : tensor<64x1xi32, #blocked> loc(#loc1) + %c64_i32 = arith.constant 64 : i32 loc(#loc1) + %cst_6 = arith.constant dense<0.000000e+00> : tensor<64x64xbf16, #blocked> loc(#loc1) + %c128_i32 = arith.constant 128 : i32 loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %cst_7 = arith.constant dense<0.000000e+00> : tensor<64x64xf32, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc44) + %xoffset_8 = arith.muli %xoffset, %c64_i32 : i32 loc(#loc45) + %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc46) + %xindex_9 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc46) + %xindex_10 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc46) + %xindex_11 = tt.expand_dims %xindex_9 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<64x1xi32, #blocked1> loc(#loc46) + %xindex_12 = tt.splat %xoffset_8 : i32 -> tensor<64x1xi32, #blocked> loc(#loc47) + %xindex_13 = tt.splat %xoffset_8 : i32 -> tensor<64x1xi32, #blocked1> loc(#loc47) + %xindex_14 = arith.addi %xindex_12, %xindex_10 : tensor<64x1xi32, #blocked> loc(#loc47) + %xindex_15 = arith.addi %xindex_13, %xindex_11 : tensor<64x1xi32, #blocked1> loc(#loc47) + %r0_base = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc48) + %r0_base_16 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked}>> 
-> tensor<1x64xi32, #blocked> loc(#loc48) + %x0 = arith.remsi %xindex_14, %cst_5 : tensor<64x1xi32, #blocked> loc(#loc49) + %x1 = arith.divsi %xindex_14, %cst_5 : tensor<64x1xi32, #blocked> loc(#loc50) + %x1_17 = arith.remsi %x1, %cst_4 : tensor<64x1xi32, #blocked> loc(#loc51) + %x2 = arith.divsi %xindex_14, %cst_3 : tensor<64x1xi32, #blocked> loc(#loc52) + %tmp0 = arith.muli %x1_17, %cst_0 : tensor<64x1xi32, #blocked> loc(#loc53) + %tmp0_18 = tt.broadcast %tmp0 : tensor<64x1xi32, #blocked> -> tensor<64x64xi32, #blocked> loc(#loc54) + %tmp0_19 = arith.muli %x0, %cst_1 : tensor<64x1xi32, #blocked> loc(#loc55) + %tmp0_20 = tt.broadcast %tmp0_19 : tensor<64x1xi32, #blocked> -> tensor<64x64xi32, #blocked> loc(#loc56) + %tmp0_21 = arith.muli %x2, %cst_2 : tensor<64x1xi32, #blocked> loc(#loc57) + %tmp0_22 = tt.broadcast %tmp0_21 : tensor<64x1xi32, #blocked> -> tensor<64x64xi32, #blocked> loc(#loc58) + %tmp0_23 = tt.splat %in_ptr0 : !tt.ptr -> tensor<64x64x!tt.ptr, #blocked> loc(#loc59) + %tmp1 = arith.muli %xindex_14, %cst_0 : tensor<64x1xi32, #blocked> loc(#loc60) + %tmp1_24 = tt.broadcast %tmp1 : tensor<64x1xi32, #blocked> -> tensor<64x64xi32, #blocked> loc(#loc61) + %tmp1_25 = tt.splat %in_ptr1 : !tt.ptr -> tensor<64x64x!tt.ptr, #blocked> loc(#loc62) + %_tmp4 = scf.for %_tmp4_28 = %c0_i32 to %c128_i32 step %c64_i32 iter_args(%arg6 = %cst_7) -> (tensor<64x64xf32, #blocked>) : i32 { + %r0_index = tt.splat %_tmp4_28 : i32 -> tensor<1x64xi32, #blocked> loc(#loc64) + %r0_index_29 = arith.addi %r0_index, %r0_base_16 : tensor<1x64xi32, #blocked> loc(#loc64) + %r0_mask = arith.cmpi slt, %r0_index_29, %cst : tensor<1x64xi32, #blocked> loc(#loc65) + %tmp0_30 = tt.broadcast %r0_index_29 : tensor<1x64xi32, #blocked> -> tensor<64x64xi32, #blocked> loc(#loc54) + %tmp0_31 = arith.addi %tmp0_30, %tmp0_18 : tensor<64x64xi32, #blocked> loc(#loc54) + %tmp0_32 = arith.addi %tmp0_31, %tmp0_20 : tensor<64x64xi32, #blocked> loc(#loc56) + %tmp0_33 = arith.addi %tmp0_32, %tmp0_22 : 
tensor<64x64xi32, #blocked> loc(#loc58) + %tmp0_34 = tt.addptr %tmp0_23, %tmp0_33 : tensor<64x64x!tt.ptr, #blocked>, tensor<64x64xi32, #blocked> loc(#loc59) + %tmp0_35 = tt.broadcast %r0_mask : tensor<1x64xi1, #blocked> -> tensor<64x64xi1, #blocked> loc(#loc66) + %tmp0_36 = tt.load %tmp0_34, %tmp0_35, %cst_6 evictionPolicy = evict_first : tensor<64x64x!tt.ptr, #blocked> loc(#loc66) + %tmp0_37 = arith.extf %tmp0_36 : tensor<64x64xbf16, #blocked> to tensor<64x64xf32, #blocked> loc(#loc67) + %tmp1_38 = arith.addi %tmp0_30, %tmp1_24 : tensor<64x64xi32, #blocked> loc(#loc61) + %tmp1_39 = tt.addptr %tmp1_25, %tmp1_38 : tensor<64x64x!tt.ptr, #blocked>, tensor<64x64xi32, #blocked> loc(#loc62) + %tmp1_40 = tt.load %tmp1_39, %tmp0_35, %cst_6 evictionPolicy = evict_first : tensor<64x64x!tt.ptr, #blocked> loc(#loc68) + %tmp1_41 = arith.extf %tmp1_40 : tensor<64x64xbf16, #blocked> to tensor<64x64xf32, #blocked> loc(#loc69) + %tmp2 = arith.mulf %tmp0_37, %tmp1_41 : tensor<64x64xf32, #blocked> loc(#loc70) + %tmp5 = arith.addf %arg6, %tmp2 : tensor<64x64xf32, #blocked> loc(#loc71) + %_tmp4_42 = arith.select %tmp0_35, %tmp5, %arg6 : tensor<64x64xi1, #blocked>, tensor<64x64xf32, #blocked> loc(#loc72) + scf.yield %_tmp4_42 : tensor<64x64xf32, #blocked> loc(#loc31) + } loc(#loc63) + %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({ + ^bb0(%tmp4_28: f32 loc(callsite(#loc1 at #loc73)), %tmp4_29: f32 loc(callsite(#loc1 at #loc73))): + %tmp4_30 = arith.addf %tmp4_28, %tmp4_29 : f32 loc(#loc77) + tt.reduce.return %tmp4_30 : f32 loc(#loc75) + }) : (tensor<64x64xf32, #blocked>) -> tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc75) + %tmp4_26 = ttg.convert_layout %tmp4 : tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc74) + %tmp4_27 = tt.expand_dims %tmp4_26 {axis = 1 : i32} : tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<64x1xf32, #blocked1> loc(#loc74) + %0 = tt.splat 
%out_ptr1 : !tt.ptr -> tensor<64x1x!tt.ptr, #blocked1> loc(#loc36) + %1 = tt.addptr %0, %xindex_15 : tensor<64x1x!tt.ptr, #blocked1>, tensor<64x1xi32, #blocked1> loc(#loc36) + tt.store %1, %tmp4_27 : tensor<64x1x!tt.ptr, #blocked1> loc(#loc37) + tt.return loc(#loc38) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":23:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":23:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":24:44) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":24:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":26:37) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":28:19) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":29:21) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":29:29) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":30:19) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":39:45) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":39:41) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":39:55) +#loc14 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":39:50) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":39:68) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":39:60) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":39:34) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":40:45) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":40:41) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":40:34) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":33:40) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":34:31) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":35:29) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":39:73) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":39:127) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":40:50) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":40:104) +#loc28 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":41:22) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":43:23) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":44:40) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":44:8) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":45:28) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":49:25) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":49:36) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":49:4) +#loc44 = loc("xoffset"(#loc2)) +#loc45 = loc("xoffset"(#loc3)) +#loc46 = loc("xindex"(#loc4)) +#loc47 = loc("xindex"(#loc5)) +#loc48 = loc("r0_base"(#loc6)) +#loc49 = loc("x0"(#loc7)) +#loc50 = loc("x1"(#loc8)) +#loc51 = loc("x1"(#loc9)) +#loc52 = loc("x2"(#loc10)) +#loc53 = loc("tmp0"(#loc11)) +#loc54 = loc("tmp0"(#loc12)) +#loc55 = loc("tmp0"(#loc13)) +#loc56 = loc("tmp0"(#loc14)) +#loc57 = loc("tmp0"(#loc15)) +#loc58 = loc("tmp0"(#loc16)) +#loc59 = loc("tmp0"(#loc17)) +#loc60 = loc("tmp1"(#loc18)) +#loc61 = loc("tmp1"(#loc19)) +#loc62 = loc("tmp1"(#loc20)) +#loc63 = loc("_tmp4"(#loc21)) +#loc64 = loc("r0_index"(#loc22)) +#loc65 = loc("r0_mask"(#loc23)) +#loc66 = loc("tmp0"(#loc24)) +#loc67 = 
loc("tmp0"(#loc25)) +#loc68 = loc("tmp1"(#loc26)) +#loc69 = loc("tmp1"(#loc27)) +#loc70 = loc("tmp2"(#loc28)) +#loc71 = loc("tmp5"(#loc29)) +#loc72 = loc("_tmp4"(#loc30)) +#loc74 = loc("tmp4"(#loc35)) +#loc75 = loc(callsite(#loc32 at #loc73)) +#loc77 = loc(callsite(#loc34 at #loc75)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/RLERDWVCUUZ5OAP52ESA35OJTILZ5NSISPJPZUCPMO7MZ6EV6FUA/triton_red_fused_zeros_0.ttir b/SpecForge-ext/cache/compiled_kernels/triton/7/RLERDWVCUUZ5OAP52ESA35OJTILZ5NSISPJPZUCPMO7MZ6EV6FUA/triton_red_fused_zeros_0.ttir new file mode 100644 index 0000000000000000000000000000000000000000..7faeaa02efcdd0e88e92784eded29dc82059e345 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/RLERDWVCUUZ5OAP52ESA35OJTILZ5NSISPJPZUCPMO7MZ6EV6FUA/triton_red_fused_zeros_0.ttir @@ -0,0 +1,148 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":18:0) +#loc1 = loc(unknown) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":45:25) +#loc40 = loc("in_ptr0"(#loc)) +#loc41 = loc("in_ptr1"(#loc)) +#loc42 = loc("out_ptr1"(#loc)) +#loc43 = loc("xnumel"(#loc)) +#loc44 = loc("r0_numel"(#loc)) +#loc75 = loc("tmp4"(#loc34)) +#loc78 = loc(callsite(#loc1 at #loc75)) +module { + tt.func public @triton_red_fused_zeros_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0.000000e+00> : tensor<64x64xbf16> loc(#loc1) + %c128_i32 = arith.constant 128 : i32 loc(#loc2) + %c0_i32 = arith.constant 0 : i32 loc(#loc2) + %cst_0 = arith.constant dense<8388608> : 
tensor<64x1xi32> loc(#loc1) + %cst_1 = arith.constant dense<4096> : tensor<64x1xi32> loc(#loc1) + %cst_2 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc1) + %cst_3 = arith.constant dense<128> : tensor<1x64xi32> loc(#loc1) + %cst_4 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc1) + %x2 = arith.constant dense<65536> : tensor<64x1xi32> loc(#loc45) + %x1 = arith.constant dense<32> : tensor<64x1xi32> loc(#loc46) + %cst_5 = arith.constant dense<2048> : tensor<64x1xi32> loc(#loc1) + %c64_i32 = arith.constant 64 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc47) + %xoffset_6 = arith.muli %xoffset, %c64_i32 : i32 loc(#loc48) + %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc49) + %xindex_7 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc50) + %xindex_8 = tt.splat %xoffset_6 : i32 -> tensor<64x1xi32> loc(#loc51) + %xindex_9 = arith.addi %xindex_8, %xindex_7 : tensor<64x1xi32> loc(#loc51) + %r0_base = tt.expand_dims %xindex {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc52) + %x0 = arith.remsi %xindex_9, %cst_5 : tensor<64x1xi32> loc(#loc53) + %x1_10 = arith.divsi %xindex_9, %cst_5 : tensor<64x1xi32> loc(#loc54) + %x1_11 = arith.remsi %x1_10, %x1 : tensor<64x1xi32> loc(#loc46) + %x2_12 = arith.divsi %xindex_9, %x2 : tensor<64x1xi32> loc(#loc45) + %_tmp4 = scf.for %r0_offset = %c0_i32 to %c128_i32 step %c64_i32 iter_args(%_tmp4_14 = %cst_4) -> (tensor<64x64xf32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x64xi32> loc(#loc56) + %r0_index_15 = arith.addi %r0_index, %r0_base : tensor<1x64xi32> loc(#loc56) + %r0_mask = arith.cmpi slt, %r0_index_15, %cst_3 : tensor<1x64xi32> loc(#loc57) + %tmp0 = arith.muli %x1_11, %cst_2 : tensor<64x1xi32> loc(#loc58) + %tmp0_16 = tt.broadcast %r0_index_15 : tensor<1x64xi32> -> tensor<64x64xi32> loc(#loc59) + %tmp0_17 = tt.broadcast %tmp0 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc59) + %tmp0_18 
= arith.addi %tmp0_16, %tmp0_17 : tensor<64x64xi32> loc(#loc59) + %tmp0_19 = arith.muli %x0, %cst_1 : tensor<64x1xi32> loc(#loc60) + %tmp0_20 = tt.broadcast %tmp0_19 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc61) + %tmp0_21 = arith.addi %tmp0_18, %tmp0_20 : tensor<64x64xi32> loc(#loc61) + %tmp0_22 = arith.muli %x2_12, %cst_0 : tensor<64x1xi32> loc(#loc62) + %tmp0_23 = tt.broadcast %tmp0_22 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc63) + %tmp0_24 = arith.addi %tmp0_21, %tmp0_23 : tensor<64x64xi32> loc(#loc63) + %tmp0_25 = tt.splat %in_ptr0 : !tt.ptr -> tensor<64x64x!tt.ptr> loc(#loc64) + %tmp0_26 = tt.addptr %tmp0_25, %tmp0_24 : tensor<64x64x!tt.ptr>, tensor<64x64xi32> loc(#loc64) + %tmp0_27 = tt.broadcast %r0_mask : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc65) + %tmp0_28 = tt.load %tmp0_26, %tmp0_27, %cst evictionPolicy = evict_first : tensor<64x64x!tt.ptr> loc(#loc65) + %tmp0_29 = arith.extf %tmp0_28 : tensor<64x64xbf16> to tensor<64x64xf32> loc(#loc66) + %tmp1 = arith.muli %xindex_9, %cst_2 : tensor<64x1xi32> loc(#loc67) + %tmp1_30 = tt.broadcast %tmp1 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc68) + %tmp1_31 = arith.addi %tmp0_16, %tmp1_30 : tensor<64x64xi32> loc(#loc68) + %tmp1_32 = tt.splat %in_ptr1 : !tt.ptr -> tensor<64x64x!tt.ptr> loc(#loc69) + %tmp1_33 = tt.addptr %tmp1_32, %tmp1_31 : tensor<64x64x!tt.ptr>, tensor<64x64xi32> loc(#loc69) + %tmp1_34 = tt.load %tmp1_33, %tmp0_27, %cst evictionPolicy = evict_first : tensor<64x64x!tt.ptr> loc(#loc70) + %tmp1_35 = arith.extf %tmp1_34 : tensor<64x64xbf16> to tensor<64x64xf32> loc(#loc71) + %tmp2 = arith.mulf %tmp0_29, %tmp1_35 : tensor<64x64xf32> loc(#loc72) + %tmp5 = arith.addf %_tmp4_14, %tmp2 : tensor<64x64xf32> loc(#loc73) + %_tmp4_36 = arith.select %tmp0_27, %tmp5, %_tmp4_14 : tensor<64x64xi1>, tensor<64x64xf32> loc(#loc74) + scf.yield %_tmp4_36 : tensor<64x64xf32> loc(#loc32) + } loc(#loc55) + %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({ + ^bb0(%tmp4_14: f32 loc(callsite(#loc1 at 
#loc75)), %tmp4_15: f32 loc(callsite(#loc1 at #loc75))): + %tmp4_16 = arith.addf %tmp4_14, %tmp4_15 : f32 loc(#loc79) + tt.reduce.return %tmp4_16 : f32 loc(#loc77) + }) : (tensor<64x64xf32>) -> tensor<64xf32> loc(#loc77) + %tmp4_13 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<64xf32> -> tensor<64x1xf32> loc(#loc76) + %0 = tt.splat %out_ptr1 : !tt.ptr -> tensor<64x1x!tt.ptr> loc(#loc37) + %1 = tt.addptr %0, %xindex_9 : tensor<64x1x!tt.ptr>, tensor<64x1xi32> loc(#loc37) + tt.store %1, %tmp4_13 : tensor<64x1x!tt.ptr> loc(#loc38) + tt.return loc(#loc39) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":33:40) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":30:19) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":29:29) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":23:28) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":23:33) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":24:36) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":24:44) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":24:23) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":26:37) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":28:19) +#loc12 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":29:21) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":34:31) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":35:29) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":39:45) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":39:41) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":39:55) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":39:50) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":39:68) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":39:60) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":39:34) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":39:73) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":39:127) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":40:45) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":40:41) +#loc26 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":40:34) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":40:50) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":40:104) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":41:22) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":43:23) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":44:40) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":44:8) +#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":45:28) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":49:25) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":49:36) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/c3/cc3guwnwiox3yzzjtaquh6k4sm6nn4lcmkep56rop3grqr44xorh.py":49:4) +#loc45 = loc("x2"(#loc3)) +#loc46 = loc("x1"(#loc4)) +#loc47 = loc("xoffset"(#loc5)) +#loc48 = loc("xoffset"(#loc6)) +#loc49 = loc("xindex"(#loc7)) +#loc50 = loc("xindex"(#loc8)) +#loc51 = loc("xindex"(#loc9)) +#loc52 = loc("r0_base"(#loc10)) +#loc53 = loc("x0"(#loc11)) +#loc54 = loc("x1"(#loc12)) 
+#loc55 = loc("_tmp4"(#loc2)) +#loc56 = loc("r0_index"(#loc13)) +#loc57 = loc("r0_mask"(#loc14)) +#loc58 = loc("tmp0"(#loc15)) +#loc59 = loc("tmp0"(#loc16)) +#loc60 = loc("tmp0"(#loc17)) +#loc61 = loc("tmp0"(#loc18)) +#loc62 = loc("tmp0"(#loc19)) +#loc63 = loc("tmp0"(#loc20)) +#loc64 = loc("tmp0"(#loc21)) +#loc65 = loc("tmp0"(#loc22)) +#loc66 = loc("tmp0"(#loc23)) +#loc67 = loc("tmp1"(#loc24)) +#loc68 = loc("tmp1"(#loc25)) +#loc69 = loc("tmp1"(#loc26)) +#loc70 = loc("tmp1"(#loc27)) +#loc71 = loc("tmp1"(#loc28)) +#loc72 = loc("tmp2"(#loc29)) +#loc73 = loc("tmp5"(#loc30)) +#loc74 = loc("_tmp4"(#loc31)) +#loc76 = loc("tmp4"(#loc36)) +#loc77 = loc(callsite(#loc33 at #loc75)) +#loc79 = loc(callsite(#loc35 at #loc77)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/VBVRCEQLKQI4X4GYXD4JC6UEYZT2F7LIKNA2UR4GNVIWAPM6GKFA/__grp__triton_red_fused_argmax_1.json b/SpecForge-ext/cache/compiled_kernels/triton/7/VBVRCEQLKQI4X4GYXD4JC6UEYZT2F7LIKNA2UR4GNVIWAPM6GKFA/__grp__triton_red_fused_argmax_1.json new file mode 100644 index 0000000000000000000000000000000000000000..b92550feb9e369b5301851de32d2d228aa0309a4 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/VBVRCEQLKQI4X4GYXD4JC6UEYZT2F7LIKNA2UR4GNVIWAPM6GKFA/__grp__triton_red_fused_argmax_1.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused_argmax_1.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/VBVRCEQLKQI4X4GYXD4JC6UEYZT2F7LIKNA2UR4GNVIWAPM6GKFA/triton_red_fused_argmax_1.source", "triton_red_fused_argmax_1.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/VBVRCEQLKQI4X4GYXD4JC6UEYZT2F7LIKNA2UR4GNVIWAPM6GKFA/triton_red_fused_argmax_1.ttir", "triton_red_fused_argmax_1.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/VBVRCEQLKQI4X4GYXD4JC6UEYZT2F7LIKNA2UR4GNVIWAPM6GKFA/triton_red_fused_argmax_1.ttgir", "triton_red_fused_argmax_1.llir": 
"/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/VBVRCEQLKQI4X4GYXD4JC6UEYZT2F7LIKNA2UR4GNVIWAPM6GKFA/triton_red_fused_argmax_1.llir", "triton_red_fused_argmax_1.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/VBVRCEQLKQI4X4GYXD4JC6UEYZT2F7LIKNA2UR4GNVIWAPM6GKFA/triton_red_fused_argmax_1.ptx", "triton_red_fused_argmax_1.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/VBVRCEQLKQI4X4GYXD4JC6UEYZT2F7LIKNA2UR4GNVIWAPM6GKFA/triton_red_fused_argmax_1.cubin", "triton_red_fused_argmax_1.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/VBVRCEQLKQI4X4GYXD4JC6UEYZT2F7LIKNA2UR4GNVIWAPM6GKFA/triton_red_fused_argmax_1.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/VBVRCEQLKQI4X4GYXD4JC6UEYZT2F7LIKNA2UR4GNVIWAPM6GKFA/triton_red_fused_argmax_1.cubin b/SpecForge-ext/cache/compiled_kernels/triton/7/VBVRCEQLKQI4X4GYXD4JC6UEYZT2F7LIKNA2UR4GNVIWAPM6GKFA/triton_red_fused_argmax_1.cubin new file mode 100644 index 0000000000000000000000000000000000000000..7860a41b954adb1b91a3f1cadd8c5fb4897521d6 Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/7/VBVRCEQLKQI4X4GYXD4JC6UEYZT2F7LIKNA2UR4GNVIWAPM6GKFA/triton_red_fused_argmax_1.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/VBVRCEQLKQI4X4GYXD4JC6UEYZT2F7LIKNA2UR4GNVIWAPM6GKFA/triton_red_fused_argmax_1.json b/SpecForge-ext/cache/compiled_kernels/triton/7/VBVRCEQLKQI4X4GYXD4JC6UEYZT2F7LIKNA2UR4GNVIWAPM6GKFA/triton_red_fused_argmax_1.json new file mode 100644 index 0000000000000000000000000000000000000000..e61b1473136112cdb2c008c73e5ff893eb6238e4 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/VBVRCEQLKQI4X4GYXD4JC6UEYZT2F7LIKNA2UR4GNVIWAPM6GKFA/triton_red_fused_argmax_1.json @@ -0,0 +1 @@ +{"hash": "a86b11120b5411cbf0d8b8f8917a84c667a2fd685341aa47866d51603d9e328a", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 16, 
"num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 128, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused_argmax_1"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/VBVRCEQLKQI4X4GYXD4JC6UEYZT2F7LIKNA2UR4GNVIWAPM6GKFA/triton_red_fused_argmax_1.llir b/SpecForge-ext/cache/compiled_kernels/triton/7/VBVRCEQLKQI4X4GYXD4JC6UEYZT2F7LIKNA2UR4GNVIWAPM6GKFA/triton_red_fused_argmax_1.llir new file mode 100644 index 0000000000000000000000000000000000000000..d00531652f41fff1c40bbc7a2a68aea0a33b611b --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/VBVRCEQLKQI4X4GYXD4JC6UEYZT2F7LIKNA2UR4GNVIWAPM6GKFA/triton_red_fused_argmax_1.llir @@ -0,0 +1,450 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@global_smem = external addrspace(3) global [0 x i8], align 16 + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused_argmax_1(ptr addrspace(1) %0, ptr addrspace(1) %1, i64 %2, i64 %3, i32 %4, i32 %5, ptr addrspace(1) readnone captures(none) %6, ptr 
addrspace(1) readnone captures(none) %7) local_unnamed_addr #0 !dbg !4 { + %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %10 = icmp slt i32 %9, %4, !dbg !8 + %11 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %12 = and i32 %11, 511, !dbg !9 + %13 = zext nneg i32 %9 to i64, !dbg !10 + %.frozen = freeze i64 %2, !dbg !11 + %14 = sdiv i64 %13, %.frozen, !dbg !11 + %15 = mul i64 %14, %.frozen, !dbg !10 + %.decomposed = sub i64 %13, %15, !dbg !10 + %16 = mul i64 %14, %3, !dbg !12 + %.idx = mul nuw nsw i64 %.decomposed, 128000 + %17 = getelementptr i8, ptr addrspace(1) %0, i64 %.idx + %invariant.gep = getelementptr float, ptr addrspace(1) %17, i64 %16, !dbg !13 + %18 = zext nneg i32 %12 to i64, !dbg !13 + %19 = insertelement <2 x i1> poison, i1 %10, i64 0, !dbg !14 + %20 = shufflevector <2 x i1> %19, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !15 + br label %21, !dbg !13 + +21: ; preds = %8, %21 + %indvars.iv = phi i64 [ 0, %8 ], [ %indvars.iv.next, %21 ] + %22 = phi i32 [ 2147483647, %8 ], [ %106, %21 ] + %23 = phi i32 [ 2147483647, %8 ], [ %107, %21 ] + %24 = phi <2 x float> [ splat (float 0xFFF0000000000000), %8 ], [ %104, %21 ] + %25 = phi <2 x i32> [ splat (i32 2147483647), %8 ], [ %108, %21 ] + %26 = phi <2 x float> [ splat (float 0xFFF0000000000000), %8 ], [ %105, %21 ] + %27 = or disjoint i64 %indvars.iv, %18, !dbg !16 + %28 = trunc nuw nsw i64 %indvars.iv to i32, !dbg !16 + %29 = or disjoint i32 %11, %28, !dbg !16 + %30 = or i32 %29, 512, !dbg !16 + %31 = or disjoint i64 %27, 1024, !dbg !16 + %32 = or i32 %29, 1536, !dbg !16 + %33 = icmp samesign ult i32 %30, 32000, !dbg !17 + %34 = icmp samesign ult i64 %31, 32000, !dbg !17 + %35 = icmp samesign ult i32 %32, 32000, !dbg !17 + %36 = zext nneg i32 %30 to i64, !dbg !18 + %37 = zext nneg i32 %32 to i64, !dbg !18 + %gep = getelementptr float, ptr addrspace(1) %invariant.gep, i64 %27, !dbg !19 + %gep3 = getelementptr float, ptr addrspace(1) %invariant.gep, i64 %36, !dbg !19 + 
%gep5 = getelementptr float, ptr addrspace(1) %invariant.gep, i64 %31, !dbg !19 + %gep7 = getelementptr float, ptr addrspace(1) %invariant.gep, i64 %37, !dbg !19 + %38 = and i1 %10, %33, !dbg !15 + %39 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !20 + %40 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %gep, i64 %39, i1 %10) #4, !dbg !20 + %41 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !20 + %42 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %gep3, i64 %41, i1 %38) #4, !dbg !20 + %43 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !20 + %44 = fcmp uno <2 x float> %24, zeroinitializer, !dbg !21 + %45 = fcmp uno <2 x float> %26, zeroinitializer, !dbg !21 + %46 = sext i32 %22 to i64, !dbg !25 + %47 = icmp sgt i64 %27, %46, !dbg !25 + %48 = icmp slt i32 %23, %30, !dbg !25 + %49 = extractelement <2 x i32> %25, i64 0, !dbg !25 + %50 = sext i32 %49 to i64, !dbg !25 + %51 = icmp sgt i64 %31, %50, !dbg !25 + %52 = extractelement <2 x i32> %25, i64 1, !dbg !25 + %53 = icmp slt i32 %52, %32, !dbg !25 + %54 = insertelement <2 x i32> poison, i32 %40, i64 0, !dbg !20 + %55 = insertelement <2 x i32> %54, i32 %42, i64 1, !dbg !20 + %56 = bitcast <2 x i32> %55 to <2 x float>, !dbg !20 + %57 = fcmp ogt <2 x float> %24, %56, !dbg !26 + %58 = fcmp oeq <2 x float> %24, %56, !dbg !27 + %59 = fcmp uno <2 x float> %56, zeroinitializer, !dbg !28 + %60 = xor <2 x i1> %59, splat (i1 true), !dbg !29 + %61 = and <2 x i1> %44, %60, !dbg !30 + %62 = or <2 x i1> %57, %61, !dbg !31 + %63 = and <2 x i1> %44, %59, !dbg !32 + %64 = or 
<2 x i1> %58, %63, !dbg !33 + %65 = insertelement <2 x i1> poison, i1 %47, i64 0, !dbg !34 + %66 = insertelement <2 x i1> %65, i1 %48, i64 1, !dbg !34 + %67 = and <2 x i1> %66, %64, !dbg !34 + %68 = or <2 x i1> %62, %67, !dbg !35 + %69 = select <2 x i1> %68, <2 x float> %24, <2 x float> %56, !dbg !36 + %70 = trunc nuw nsw i64 %27 to i32, !dbg !37 + %71 = extractelement <2 x i1> %68, i64 0, !dbg !37 + %72 = select i1 %71, i32 %22, i32 %70, !dbg !37 + %73 = extractelement <2 x i1> %68, i64 1, !dbg !37 + %74 = select i1 %73, i32 %23, i32 %30, !dbg !37 + %75 = trunc nuw nsw i64 %31 to i32, !dbg !37 + %76 = insertelement <2 x i1> poison, i1 %34, i64 0, !dbg !15 + %77 = insertelement <2 x i1> %76, i1 %35, i64 1, !dbg !15 + %78 = and <2 x i1> %20, %77, !dbg !15 + %79 = extractelement <2 x i1> %78, i64 0, !dbg !20 + %80 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %gep5, i64 %43, i1 %79) #4, !dbg !20 + %81 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !20 + %82 = extractelement <2 x i1> %78, i64 1, !dbg !20 + %83 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %gep7, i64 %81, i1 %82) #4, !dbg !20 + %84 = insertelement <2 x i32> poison, i32 %80, i64 0, !dbg !20 + %85 = insertelement <2 x i32> %84, i32 %83, i64 1, !dbg !20 + %86 = bitcast <2 x i32> %85 to <2 x float>, !dbg !20 + %87 = fcmp ogt <2 x float> %26, %86, !dbg !26 + %88 = fcmp oeq <2 x float> %26, %86, !dbg !27 + %89 = fcmp uno <2 x float> %86, zeroinitializer, !dbg !28 + %90 = xor <2 x i1> %89, splat (i1 true), !dbg !29 + %91 = and <2 x i1> %45, %90, !dbg !30 + %92 = or <2 x i1> %87, %91, !dbg !31 + %93 = and <2 x i1> %45, %89, !dbg !32 + %94 = or <2 x i1> %88, %93, !dbg !33 + %95 = insertelement <2 x 
i1> poison, i1 %51, i64 0, !dbg !34 + %96 = insertelement <2 x i1> %95, i1 %53, i64 1, !dbg !34 + %97 = and <2 x i1> %96, %94, !dbg !34 + %98 = or <2 x i1> %92, %97, !dbg !35 + %99 = select <2 x i1> %98, <2 x float> %26, <2 x float> %86, !dbg !36 + %100 = insertelement <2 x i32> poison, i32 %75, i64 0, !dbg !37 + %101 = insertelement <2 x i32> %100, i32 %32, i64 1, !dbg !37 + %102 = select <2 x i1> %98, <2 x i32> %25, <2 x i32> %101, !dbg !37 + %103 = insertelement <2 x i1> %19, i1 %38, i64 1, !dbg !14 + %104 = select <2 x i1> %103, <2 x float> %69, <2 x float> %24, !dbg !14 + %105 = select <2 x i1> %78, <2 x float> %99, <2 x float> %26, !dbg !14 + %106 = select i1 %10, i32 %72, i32 %22, !dbg !38 + %107 = select i1 %38, i32 %74, i32 %23, !dbg !38 + %108 = select <2 x i1> %78, <2 x i32> %102, <2 x i32> %25, !dbg !38 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2048, !dbg !13 + %109 = icmp samesign ult i64 %indvars.iv, 29952, !dbg !13 + br i1 %109, label %21, label %110, !dbg !13 + +110: ; preds = %21 + %111 = and i32 %11, 31, !dbg !9 + %112 = lshr i32 %11, 5, !dbg !9 + %113 = shufflevector <2 x float> %104, <2 x float> poison, <2 x i32> , !dbg !39 + %114 = fcmp ogt <2 x float> %104, %113, !dbg !39 + %115 = fcmp oeq <2 x float> %104, %113, !dbg !39 + %116 = shufflevector <2 x i1> %114, <2 x i1> %115, <2 x i32> , !dbg !39 + %117 = extractelement <2 x float> %104, i64 0, !dbg !41 + %118 = fcmp uno float %117, 0.000000e+00, !dbg !41 + %119 = extractelement <2 x float> %104, i64 1, !dbg !42 + %120 = fcmp uno float %119, 0.000000e+00, !dbg !42 + %121 = xor i1 %120, true, !dbg !43 + %122 = insertelement <2 x i1> poison, i1 %118, i64 0, !dbg !44 + %123 = shufflevector <2 x i1> %122, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !44 + %124 = insertelement <2 x i1> poison, i1 %121, i64 0, !dbg !44 + %125 = insertelement <2 x i1> %124, i1 %120, i64 1, !dbg !44 + %126 = and <2 x i1> %123, %125, !dbg !44 + %127 = or <2 x i1> %116, %126, !dbg !45 + %128 = icmp slt i32 
%106, %107, !dbg !46 + %129 = extractelement <2 x i1> %127, i64 1, !dbg !47 + %130 = and i1 %128, %129, !dbg !47 + %131 = extractelement <2 x i1> %127, i64 0, !dbg !48 + %132 = or i1 %131, %130, !dbg !48 + %133 = select i1 %132, float %117, float %119, !dbg !49 + %134 = select i1 %132, i32 %106, i32 %107, !dbg !50 + %135 = extractelement <2 x float> %105, i64 0, !dbg !39 + %136 = fcmp ogt float %133, %135, !dbg !39 + %137 = fcmp oeq float %133, %135, !dbg !51 + %138 = fcmp uno float %133, 0.000000e+00, !dbg !41 + %139 = fcmp uno float %135, 0.000000e+00, !dbg !42 + %140 = xor i1 %139, true, !dbg !43 + %141 = and i1 %138, %140, !dbg !44 + %142 = or i1 %136, %141, !dbg !45 + %143 = and i1 %139, %138, !dbg !52 + %144 = or i1 %137, %143, !dbg !53 + %145 = extractelement <2 x i32> %108, i64 0, !dbg !46 + %146 = icmp slt i32 %134, %145, !dbg !46 + %147 = and i1 %146, %144, !dbg !47 + %148 = or i1 %142, %147, !dbg !48 + %149 = select i1 %148, float %133, float %135, !dbg !49 + %150 = select i1 %148, i32 %134, i32 %145, !dbg !50 + %151 = extractelement <2 x float> %105, i64 1, !dbg !39 + %152 = fcmp ogt float %149, %151, !dbg !39 + %153 = fcmp oeq float %149, %151, !dbg !51 + %154 = fcmp uno float %149, 0.000000e+00, !dbg !41 + %155 = fcmp uno float %151, 0.000000e+00, !dbg !42 + %156 = xor i1 %155, true, !dbg !43 + %157 = and i1 %154, %156, !dbg !44 + %158 = or i1 %152, %157, !dbg !45 + %159 = and i1 %155, %154, !dbg !52 + %160 = or i1 %153, %159, !dbg !53 + %161 = extractelement <2 x i32> %108, i64 1, !dbg !46 + %162 = icmp slt i32 %150, %161, !dbg !46 + %163 = and i1 %162, %160, !dbg !47 + %164 = or i1 %158, %163, !dbg !48 + %165 = select i1 %164, float %149, float %151, !dbg !49 + %166 = select i1 %164, i32 %150, i32 %161, !dbg !50 + %167 = bitcast float %165 to i32, !dbg !54 + %168 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %167, i32 16, i32 31), !dbg !54 + %169 = bitcast i32 %168 to float, !dbg !54 + %170 = tail call i32 
@llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %166, i32 16, i32 31), !dbg !54 + %171 = fcmp ogt float %165, %169, !dbg !39 + %172 = fcmp oeq float %165, %169, !dbg !51 + %173 = fcmp uno float %165, 0.000000e+00, !dbg !41 + %174 = fcmp uno float %169, 0.000000e+00, !dbg !42 + %175 = xor i1 %174, true, !dbg !43 + %176 = and i1 %173, %175, !dbg !44 + %177 = or i1 %171, %176, !dbg !45 + %178 = and i1 %173, %174, !dbg !52 + %179 = or i1 %172, %178, !dbg !53 + %180 = icmp slt i32 %166, %170, !dbg !46 + %181 = and i1 %180, %179, !dbg !47 + %182 = or i1 %177, %181, !dbg !48 + %183 = select i1 %182, float %165, float %169, !dbg !49 + %184 = select i1 %182, i32 %166, i32 %170, !dbg !50 + %185 = bitcast float %183 to i32, !dbg !54 + %186 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %185, i32 8, i32 31), !dbg !54 + %187 = bitcast i32 %186 to float, !dbg !54 + %188 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %184, i32 8, i32 31), !dbg !54 + %189 = fcmp ogt float %183, %187, !dbg !39 + %190 = fcmp oeq float %183, %187, !dbg !51 + %191 = fcmp uno float %183, 0.000000e+00, !dbg !41 + %192 = fcmp uno float %187, 0.000000e+00, !dbg !42 + %193 = xor i1 %192, true, !dbg !43 + %194 = and i1 %191, %193, !dbg !44 + %195 = or i1 %189, %194, !dbg !45 + %196 = and i1 %192, %191, !dbg !52 + %197 = or i1 %190, %196, !dbg !53 + %198 = icmp slt i32 %184, %188, !dbg !46 + %199 = and i1 %198, %197, !dbg !47 + %200 = or i1 %195, %199, !dbg !48 + %201 = select i1 %200, float %183, float %187, !dbg !49 + %202 = select i1 %200, i32 %184, i32 %188, !dbg !50 + %203 = bitcast float %201 to i32, !dbg !54 + %204 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %203, i32 4, i32 31), !dbg !54 + %205 = bitcast i32 %204 to float, !dbg !54 + %206 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %202, i32 4, i32 31), !dbg !54 + %207 = fcmp ogt float %201, %205, !dbg !39 + %208 = fcmp oeq float %201, %205, !dbg !51 + %209 = fcmp uno float %201, 0.000000e+00, !dbg !41 + 
%210 = fcmp uno float %205, 0.000000e+00, !dbg !42 + %211 = xor i1 %210, true, !dbg !43 + %212 = and i1 %209, %211, !dbg !44 + %213 = or i1 %207, %212, !dbg !45 + %214 = and i1 %210, %209, !dbg !52 + %215 = or i1 %208, %214, !dbg !53 + %216 = icmp slt i32 %202, %206, !dbg !46 + %217 = and i1 %216, %215, !dbg !47 + %218 = or i1 %213, %217, !dbg !48 + %219 = select i1 %218, float %201, float %205, !dbg !49 + %220 = select i1 %218, i32 %202, i32 %206, !dbg !50 + %221 = bitcast float %219 to i32, !dbg !54 + %222 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %221, i32 2, i32 31), !dbg !54 + %223 = bitcast i32 %222 to float, !dbg !54 + %224 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %220, i32 2, i32 31), !dbg !54 + %225 = fcmp ogt float %219, %223, !dbg !39 + %226 = fcmp oeq float %219, %223, !dbg !51 + %227 = fcmp uno float %219, 0.000000e+00, !dbg !41 + %228 = fcmp uno float %223, 0.000000e+00, !dbg !42 + %229 = xor i1 %228, true, !dbg !43 + %230 = and i1 %227, %229, !dbg !44 + %231 = or i1 %225, %230, !dbg !45 + %232 = and i1 %228, %227, !dbg !52 + %233 = or i1 %226, %232, !dbg !53 + %234 = icmp slt i32 %220, %224, !dbg !46 + %235 = and i1 %234, %233, !dbg !47 + %236 = or i1 %231, %235, !dbg !48 + %237 = select i1 %236, float %219, float %223, !dbg !49 + %238 = select i1 %236, i32 %220, i32 %224, !dbg !50 + %239 = bitcast float %237 to i32, !dbg !54 + %240 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %239, i32 1, i32 31), !dbg !54 + %241 = bitcast i32 %240 to float, !dbg !54 + %242 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %238, i32 1, i32 31), !dbg !54 + %243 = fcmp ogt float %237, %241, !dbg !39 + %244 = fcmp oeq float %237, %241, !dbg !51 + %245 = fcmp uno float %237, 0.000000e+00, !dbg !41 + %246 = fcmp uno float %241, 0.000000e+00, !dbg !42 + %247 = xor i1 %246, true, !dbg !43 + %248 = and i1 %245, %247, !dbg !44 + %249 = or i1 %243, %248, !dbg !45 + %250 = and i1 %246, %245, !dbg !52 + %251 = or i1 %244, 
%250, !dbg !53 + %252 = icmp slt i32 %238, %242, !dbg !46 + %253 = and i1 %252, %251, !dbg !47 + %254 = or i1 %249, %253, !dbg !48 + %255 = select i1 %254, i32 %238, i32 %242, !dbg !50 + %256 = and i32 %112, 15, !dbg !54 + %257 = icmp eq i32 %111, 0, !dbg !54 + %258 = getelementptr float, ptr addrspace(3) @global_smem, i32 %256, !dbg !54 + %259 = select i1 %254, i32 %239, i32 %240, !dbg !49 + %260 = insertelement <1 x i32> poison, i32 %259, i64 0, !dbg !54 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %258, <1 x i32> %260, i1 %257) #4, !dbg !54 + %261 = getelementptr i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 64), i32 %256, !dbg !54 + %262 = insertelement <1 x i32> poison, i32 %255, i64 0, !dbg !54 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %261, <1 x i32> %262, i1 %257) #4, !dbg !54 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !54 + %263 = icmp samesign ult i32 %11, 16, !dbg !54 + %264 = getelementptr float, ptr addrspace(3) @global_smem, i32 %11, !dbg !54 + %265 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %264, i1 %263) #4, !dbg !54 + %266 = bitcast i32 %265 to float, !dbg !54 + %267 = getelementptr i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 64), i32 %11, !dbg !54 + %268 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %267, i1 %263) #4, !dbg !54 + %269 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %265, i32 8, i32 31), !dbg !54 + %270 = bitcast i32 %269 to float, !dbg !54 + %271 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %268, i32 8, i32 31), !dbg !54 + %272 = fcmp ogt float %266, %270, !dbg !39 + %273 = fcmp oeq float %266, %270, !dbg !51 + %274 = fcmp uno float %266, 0.000000e+00, !dbg !41 + %275 = fcmp uno float %270, 0.000000e+00, !dbg !42 + %276 = xor 
i1 %275, true, !dbg !43 + %277 = and i1 %274, %276, !dbg !44 + %278 = or i1 %272, %277, !dbg !45 + %279 = and i1 %274, %275, !dbg !52 + %280 = or i1 %273, %279, !dbg !53 + %281 = icmp slt i32 %268, %271, !dbg !46 + %282 = and i1 %281, %280, !dbg !47 + %283 = or i1 %278, %282, !dbg !48 + %284 = select i1 %283, float %266, float %270, !dbg !49 + %285 = select i1 %283, i32 %268, i32 %271, !dbg !50 + %286 = bitcast float %284 to i32, !dbg !54 + %287 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %286, i32 4, i32 31), !dbg !54 + %288 = bitcast i32 %287 to float, !dbg !54 + %289 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %285, i32 4, i32 31), !dbg !54 + %290 = fcmp ogt float %284, %288, !dbg !39 + %291 = fcmp oeq float %284, %288, !dbg !51 + %292 = fcmp uno float %284, 0.000000e+00, !dbg !41 + %293 = fcmp uno float %288, 0.000000e+00, !dbg !42 + %294 = xor i1 %293, true, !dbg !43 + %295 = and i1 %292, %294, !dbg !44 + %296 = or i1 %290, %295, !dbg !45 + %297 = and i1 %293, %292, !dbg !52 + %298 = or i1 %291, %297, !dbg !53 + %299 = icmp slt i32 %285, %289, !dbg !46 + %300 = and i1 %299, %298, !dbg !47 + %301 = or i1 %296, %300, !dbg !48 + %302 = select i1 %301, float %284, float %288, !dbg !49 + %303 = select i1 %301, i32 %285, i32 %289, !dbg !50 + %304 = bitcast float %302 to i32, !dbg !54 + %305 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %304, i32 2, i32 31), !dbg !54 + %306 = bitcast i32 %305 to float, !dbg !54 + %307 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %303, i32 2, i32 31), !dbg !54 + %308 = fcmp ogt float %302, %306, !dbg !39 + %309 = fcmp oeq float %302, %306, !dbg !51 + %310 = fcmp uno float %302, 0.000000e+00, !dbg !41 + %311 = fcmp uno float %306, 0.000000e+00, !dbg !42 + %312 = xor i1 %311, true, !dbg !43 + %313 = and i1 %310, %312, !dbg !44 + %314 = or i1 %308, %313, !dbg !45 + %315 = and i1 %311, %310, !dbg !52 + %316 = or i1 %309, %315, !dbg !53 + %317 = icmp slt i32 %303, %307, !dbg !46 + %318 = 
and i1 %317, %316, !dbg !47 + %319 = or i1 %314, %318, !dbg !48 + %320 = select i1 %319, float %302, float %306, !dbg !49 + %321 = select i1 %319, i32 %303, i32 %307, !dbg !50 + %322 = bitcast float %320 to i32, !dbg !54 + %323 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %322, i32 1, i32 31), !dbg !54 + %324 = bitcast i32 %323 to float, !dbg !54 + %325 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %321, i32 1, i32 31), !dbg !54 + %326 = fcmp ogt float %320, %324, !dbg !39 + %327 = fcmp oeq float %320, %324, !dbg !51 + %328 = fcmp uno float %320, 0.000000e+00, !dbg !41 + %329 = fcmp uno float %324, 0.000000e+00, !dbg !42 + %330 = xor i1 %329, true, !dbg !43 + %331 = and i1 %328, %330, !dbg !44 + %332 = or i1 %326, %331, !dbg !45 + %333 = and i1 %329, %328, !dbg !52 + %334 = or i1 %327, %333, !dbg !53 + %335 = icmp slt i32 %321, %325, !dbg !46 + %336 = and i1 %335, %334, !dbg !47 + %337 = or i1 %332, %336, !dbg !48 + %338 = select i1 %337, i32 %321, i32 %325, !dbg !50 + %339 = icmp eq i32 %11, 0, !dbg !54 + %340 = select i1 %337, i32 %322, i32 %323, !dbg !49 + %341 = insertelement <1 x i32> poison, i32 %340, i64 0, !dbg !54 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %264, <1 x i32> %341, i1 %339) #4, !dbg !54 + %342 = insertelement <1 x i32> poison, i32 %338, i64 0, !dbg !54 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %267, <1 x i32> %342, i1 %339) #4, !dbg !54 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !54 + %343 = load i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 64), align 16, !dbg !54 + %344 = getelementptr i64, ptr addrspace(1) %1, i64 %13, !dbg !55 + %345 = sext i32 %343 to i64, !dbg !56 + %346 = icmp eq i32 %12, 0, !dbg !56 + %347 = and i1 %346, %10, !dbg !56 + tail call void asm sideeffect "@$2 st.global.b64 [ $1 + 0 ], { $0 };", "l,l,b"(i64 %345, ptr addrspace(1) %344, i1 
%347) #4, !dbg !56 + ret void, !dbg !57 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3 + +attributes #0 = { nounwind "nvvm.reqntid"="512" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { convergent nocallback nounwind } +attributes #4 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/au") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_red_fused_argmax_1", linkageName: "triton_red_fused_argmax_1", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 22, column: 28, scope: !4) +!8 = !DILocation(line: 24, column: 21, scope: !4) +!9 = !DILocation(line: 25, column: 37, scope: !4) +!10 = !DILocation(line: 27, column: 19, scope: !4) +!11 = !DILocation(line: 28, column: 19, scope: !4) 
+!12 = !DILocation(line: 38, column: 56, scope: !4) +!13 = !DILocation(line: 32, column: 40, scope: !4) +!14 = !DILocation(line: 43, column: 54, scope: !4) +!15 = !DILocation(line: 38, column: 71, scope: !4) +!16 = !DILocation(line: 33, column: 31, scope: !4) +!17 = !DILocation(line: 34, column: 29, scope: !4) +!18 = !DILocation(line: 38, column: 41, scope: !4) +!19 = !DILocation(line: 38, column: 34, scope: !4) +!20 = !DILocation(line: 38, column: 61, scope: !4) +!21 = !DILocation(line: 147, column: 29, scope: !22, inlinedAt: !24) +!22 = distinct !DILexicalBlockFile(scope: !4, file: !23, discriminator: 0) +!23 = !DIFile(filename: "triton_helpers.py", directory: "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime") +!24 = !DILocation(line: 41, column: 38, scope: !4) +!25 = !DILocation(line: 154, column: 31, scope: !22, inlinedAt: !24) +!26 = !DILocation(line: 144, column: 21, scope: !22, inlinedAt: !24) +!27 = !DILocation(line: 145, column: 23, scope: !22, inlinedAt: !24) +!28 = !DILocation(line: 148, column: 29, scope: !22, inlinedAt: !24) +!29 = !DILocation(line: 149, column: 31, scope: !22, inlinedAt: !24) +!30 = !DILocation(line: 149, column: 27, scope: !22, inlinedAt: !24) +!31 = !DILocation(line: 149, column: 16, scope: !22, inlinedAt: !24) +!32 = !DILocation(line: 151, column: 27, scope: !22, inlinedAt: !24) +!33 = !DILocation(line: 151, column: 17, scope: !22, inlinedAt: !24) +!34 = !DILocation(line: 154, column: 21, scope: !22, inlinedAt: !24) +!35 = !DILocation(line: 154, column: 12, scope: !22, inlinedAt: !24) +!36 = !DILocation(line: 155, column: 35, scope: !22, inlinedAt: !24) +!37 = !DILocation(line: 155, column: 69, scope: !22, inlinedAt: !24) +!38 = !DILocation(line: 44, column: 66, scope: !4) +!39 = !DILocation(line: 144, column: 21, scope: !22, inlinedAt: !40) +!40 = !DILocation(line: 45, column: 75, scope: !4) +!41 = !DILocation(line: 147, column: 29, scope: !22, inlinedAt: !40) +!42 = !DILocation(line: 148, column: 29, 
scope: !22, inlinedAt: !40) +!43 = !DILocation(line: 149, column: 31, scope: !22, inlinedAt: !40) +!44 = !DILocation(line: 149, column: 27, scope: !22, inlinedAt: !40) +!45 = !DILocation(line: 149, column: 16, scope: !22, inlinedAt: !40) +!46 = !DILocation(line: 154, column: 31, scope: !22, inlinedAt: !40) +!47 = !DILocation(line: 154, column: 21, scope: !22, inlinedAt: !40) +!48 = !DILocation(line: 154, column: 12, scope: !22, inlinedAt: !40) +!49 = !DILocation(line: 155, column: 35, scope: !22, inlinedAt: !40) +!50 = !DILocation(line: 155, column: 69, scope: !22, inlinedAt: !40) +!51 = !DILocation(line: 145, column: 23, scope: !22, inlinedAt: !40) +!52 = !DILocation(line: 151, column: 27, scope: !22, inlinedAt: !40) +!53 = !DILocation(line: 151, column: 17, scope: !22, inlinedAt: !40) +!54 = !DILocation(line: 165, column: 42, scope: !22, inlinedAt: !40) +!55 = !DILocation(line: 47, column: 25, scope: !4) +!56 = !DILocation(line: 47, column: 36, scope: !4) +!57 = !DILocation(line: 47, column: 4, scope: !4) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/VBVRCEQLKQI4X4GYXD4JC6UEYZT2F7LIKNA2UR4GNVIWAPM6GKFA/triton_red_fused_argmax_1.ptx b/SpecForge-ext/cache/compiled_kernels/triton/7/VBVRCEQLKQI4X4GYXD4JC6UEYZT2F7LIKNA2UR4GNVIWAPM6GKFA/triton_red_fused_argmax_1.ptx new file mode 100644 index 0000000000000000000000000000000000000000..808102353f650fd519c368230bafb51af069222a --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/VBVRCEQLKQI4X4GYXD4JC6UEYZT2F7LIKNA2UR4GNVIWAPM6GKFA/triton_red_fused_argmax_1.ptx @@ -0,0 +1,915 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_red_fused_argmax_1 // -- Begin function triton_red_fused_argmax_1 +.extern .shared .align 16 .b8 global_smem[]; + // @triton_red_fused_argmax_1 +.visible .entry triton_red_fused_argmax_1( + .param .u64 .ptr .global .align 1 triton_red_fused_argmax_1_param_0, + .param .u64 .ptr .global .align 1 
triton_red_fused_argmax_1_param_1, + .param .u64 triton_red_fused_argmax_1_param_2, + .param .u64 triton_red_fused_argmax_1_param_3, + .param .u32 triton_red_fused_argmax_1_param_4, + .param .u32 triton_red_fused_argmax_1_param_5, + .param .u64 .ptr .global .align 1 triton_red_fused_argmax_1_param_6, + .param .u64 .ptr .global .align 1 triton_red_fused_argmax_1_param_7 +) +.reqntid 512 +{ + .reg .pred %p<215>; + .reg .b32 %r<123>; + .reg .b64 %rd<70>; + .loc 1 18 0 // cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:18:0 + +// %bb.0: + ld.param.b32 %r10, [triton_red_fused_argmax_1_param_4]; + ld.param.b64 %rd20, [triton_red_fused_argmax_1_param_3]; + ld.param.b64 %rd18, [triton_red_fused_argmax_1_param_0]; +$L__tmp0: + .loc 1 22 28 // cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:22:28 + mov.u32 %r11, %ctaid.x; + ld.param.b64 %rd21, [triton_red_fused_argmax_1_param_2]; + .loc 1 25 37 // cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:25:37 + mov.u32 %r12, %tid.x; + and.b32 %r1, %r12, 511; + .loc 1 27 19 // cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:27:19 + cvt.u64.u32 %rd2, %r11; + .loc 1 28 19 // cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:28:19 + and.b64 %rd22, %rd21, -4294967296; + setp.ne.b64 %p5, %rd22, 0; + cvt.u32.u64 %r118, %rd2; + @%p5 bra $L__BB0_2; + bra.uni $L__BB0_1; +$L__BB0_2: + div.s64 %rd65, %rd2, %rd21; + bra.uni $L__BB0_3; +$L__BB0_1: + cvt.u32.u64 %r13, %rd21; + div.u32 %r15, %r118, %r13; + cvt.u64.u32 %rd65, %r15; +$L__BB0_3: + .loc 1 0 19 // cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:0:19 + ld.param.b64 %rd19, [triton_red_fused_argmax_1_param_1]; + cvt.u64.u32 %rd1, %r12; + .loc 1 24 21 // cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:24:21 + setp.lt.s32 %p3, %r118, %r10; + .loc 1 27 19 // cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:27:19 + mul.lo.s64 %rd25, %rd65, 
%rd21; + sub.s64 %rd26, %rd2, %rd25; + .loc 1 38 56 // cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:38:56 + mul.lo.s64 %rd27, %rd65, %rd20; + mad.lo.s64 %rd28, %rd26, 128000, %rd18; + .loc 1 32 40 // cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:32:40 + shl.b64 %rd29, %rd27, 2; + add.s64 %rd7, %rd28, %rd29; + cvt.u64.u32 %rd8, %r1; + shl.b64 %rd30, %rd20, 2; + mul.lo.s64 %rd31, %rd21, 128000; + sub.s64 %rd32, %rd30, %rd31; + mul.lo.s64 %rd33, %rd65, %rd32; + mad.lo.s64 %rd34, %rd2, 128000, %rd33; + mad.wide.u32 %rd35, %r1, 4, %rd34; + add.s64 %rd66, %rd18, %rd35; + mov.b32 %r20, 0fFF800000; + mov.b64 %rd68, {%r20, %r20}; + mov.b32 %r119, 2147483647; + mov.b64 %rd67, -2048; + mov.b32 %r120, %r119; + mov.b32 %r121, %r119; + mov.b32 %r122, %r119; + mov.b64 %rd69, %rd68; +$L__BB0_4: // =>This Inner Loop Header: Depth=1 + .loc 1 33 31 // cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:33:31 + add.s64 %rd48, %rd8, %rd67; + add.s64 %rd49, %rd48, 2048; + add.s64 %rd50, %rd1, %rd67; + cvt.u32.u64 %r30, %rd50; + add.s32 %r31, %r30, 2048; + or.b32 %r32, %r31, 512; + add.s64 %rd51, %rd48, 3072; + or.b32 %r33, %r31, 1536; + .loc 1 34 29 // cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:34:29 + setp.lt.u32 %p10, %r32, 32000; + setp.lt.u64 %p11, %rd51, 32000; + setp.lt.u32 %p12, %r33, 32000; + .loc 1 38 34 // cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:38:34 + mad.wide.u32 %rd40, %r32, 4, %rd7; + add.s64 %rd43, %rd66, 4096; + mad.wide.u32 %rd46, %r33, 4, %rd7; + .loc 1 38 71 // cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:38:71 + and.pred %p7, %p3, %p10; + .loc 1 38 61 // cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:38:61 + // begin inline asm + mov.u64 %rd36, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd36, 1.0; + // end inline asm + mov.b32 %r22, 0; + // begin inline asm + mov.u32 %r21, %r22; + @%p3 ld.global.L1::evict_first.L2::cache_hint.b32 { %r21 }, [ %rd66 + 0 ], %rd36; + // end inline asm + 
// begin inline asm + mov.u64 %rd39, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd39, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r23, %r22; + @%p7 ld.global.L1::evict_first.L2::cache_hint.b32 { %r23 }, [ %rd40 + 0 ], %rd39; + // end inline asm + // begin inline asm + mov.u64 %rd42, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd42, 1.0; + // end inline asm +$L__tmp1: + .loc 2 147 29 // triton_helpers.py:147:29 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:41:38 ] + mov.b64 {%r34, %r35}, %rd68; + setp.nan.f32 %p13, %r34, %r34; + setp.nan.f32 %p14, %r35, %r35; + mov.b64 {%r36, %r37}, %rd69; + setp.nan.f32 %p15, %r36, %r36; + setp.nan.f32 %p16, %r37, %r37; + .loc 2 154 31 // triton_helpers.py:154:31 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:41:38 ] + cvt.s64.s32 %rd52, %r119; + setp.gt.s64 %p17, %rd49, %rd52; + setp.lt.s32 %p18, %r120, %r32; + cvt.s64.s32 %rd53, %r121; + setp.gt.s64 %p19, %rd51, %rd53; + setp.lt.s32 %p20, %r122, %r33; +$L__tmp2: + .loc 1 38 61 // cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:38:61 + cvt.u64.u32 %rd54, %r23; + shl.b64 %rd55, %rd54, 32; + cvt.u64.u32 %rd56, %r21; + or.b64 %rd57, %rd56, %rd55; +$L__tmp3: + .loc 2 148 29 // triton_helpers.py:148:29 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:41:38 ] + mov.b64 {%r38, %r39}, %rd57; + .loc 2 144 21 // triton_helpers.py:144:21 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:41:38 ] + setp.gt.f32 %p21, %r35, %r39; + setp.gt.f32 %p22, %r34, %r38; + .loc 2 145 23 // triton_helpers.py:145:23 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:41:38 ] + setp.eq.f32 %p23, %r34, %r38; + setp.eq.f32 %p24, %r35, %r39; + .loc 2 148 29 // triton_helpers.py:148:29 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:41:38 ] + setp.nan.f32 %p25, %r39, %r39; + setp.nan.f32 %p26, %r38, %r38; + setp.num.f32 %p27, %r38, %r38; + setp.num.f32 %p28, %r39, %r39; + .loc 2 149 27 // triton_helpers.py:149:27 
@[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:41:38 ] + and.pred %p29, %p14, %p28; + and.pred %p30, %p13, %p27; + .loc 2 149 16 // triton_helpers.py:149:16 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:41:38 ] + or.pred %p31, %p22, %p30; + or.pred %p32, %p21, %p29; + .loc 2 151 27 // triton_helpers.py:151:27 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:41:38 ] + and.pred %p33, %p13, %p26; + and.pred %p34, %p14, %p25; + .loc 2 151 17 // triton_helpers.py:151:17 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:41:38 ] + or.pred %p35, %p24, %p34; + or.pred %p36, %p23, %p33; + .loc 2 154 21 // triton_helpers.py:154:21 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:41:38 ] + and.pred %p37, %p17, %p36; + and.pred %p38, %p18, %p35; + .loc 2 154 12 // triton_helpers.py:154:12 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:41:38 ] + or.pred %p39, %p32, %p38; + or.pred %p40, %p31, %p37; + .loc 2 155 35 // triton_helpers.py:155:35 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:41:38 ] + selp.f32 %r40, %r34, %r38, %p40; + selp.f32 %r41, %r35, %r39, %p39; + cvt.u32.u64 %r42, %rd49; + .loc 2 155 69 // triton_helpers.py:155:69 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:41:38 ] + selp.b32 %r43, %r119, %r42, %p40; + selp.b32 %r44, %r120, %r32, %p39; +$L__tmp4: + .loc 1 38 71 // cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:38:71 + and.pred %p9, %p3, %p12; + and.pred %p8, %p3, %p11; + .loc 1 38 61 // cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:38:61 + // begin inline asm + mov.u32 %r25, %r22; + @%p8 ld.global.L1::evict_first.L2::cache_hint.b32 { %r25 }, [ %rd43 + 0 ], %rd42; + // end inline asm + // begin inline asm + mov.u64 %rd45, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd45, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r27, %r22; + @%p9 ld.global.L1::evict_first.L2::cache_hint.b32 { %r27 }, [ %rd46 + 0 ], %rd45; + // end inline asm + 
cvt.u64.u32 %rd58, %r27; + shl.b64 %rd59, %rd58, 32; + cvt.u64.u32 %rd60, %r25; + or.b64 %rd61, %rd60, %rd59; +$L__tmp5: + .loc 2 148 29 // triton_helpers.py:148:29 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:41:38 ] + mov.b64 {%r45, %r46}, %rd61; + .loc 2 144 21 // triton_helpers.py:144:21 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:41:38 ] + setp.gt.f32 %p41, %r37, %r46; + setp.gt.f32 %p42, %r36, %r45; + .loc 2 145 23 // triton_helpers.py:145:23 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:41:38 ] + setp.eq.f32 %p43, %r36, %r45; + setp.eq.f32 %p44, %r37, %r46; + .loc 2 148 29 // triton_helpers.py:148:29 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:41:38 ] + setp.nan.f32 %p45, %r46, %r46; + setp.nan.f32 %p46, %r45, %r45; + setp.num.f32 %p47, %r45, %r45; + setp.num.f32 %p48, %r46, %r46; + .loc 2 149 27 // triton_helpers.py:149:27 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:41:38 ] + and.pred %p49, %p16, %p48; + and.pred %p50, %p15, %p47; + .loc 2 149 16 // triton_helpers.py:149:16 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:41:38 ] + or.pred %p51, %p42, %p50; + or.pred %p52, %p41, %p49; + .loc 2 151 27 // triton_helpers.py:151:27 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:41:38 ] + and.pred %p53, %p15, %p46; + and.pred %p54, %p16, %p45; + .loc 2 151 17 // triton_helpers.py:151:17 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:41:38 ] + or.pred %p55, %p44, %p54; + or.pred %p56, %p43, %p53; + .loc 2 154 21 // triton_helpers.py:154:21 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:41:38 ] + and.pred %p57, %p19, %p56; + and.pred %p58, %p20, %p55; + .loc 2 154 12 // triton_helpers.py:154:12 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:41:38 ] + or.pred %p59, %p52, %p58; + or.pred %p60, %p51, %p57; + .loc 2 155 35 // triton_helpers.py:155:35 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:41:38 ] + selp.f32 %r47, %r36, 
%r45, %p60; + selp.f32 %r48, %r37, %r46, %p59; + cvt.u32.u64 %r49, %rd51; + .loc 2 155 69 // triton_helpers.py:155:69 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:41:38 ] + selp.b32 %r50, %r121, %r49, %p60; + selp.b32 %r51, %r122, %r33, %p59; +$L__tmp6: + .loc 1 43 54 // cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:43:54 + selp.f32 %r52, %r41, %r35, %p7; + selp.f32 %r53, %r40, %r34, %p3; + mov.b64 %rd68, {%r53, %r52}; + selp.f32 %r54, %r48, %r37, %p9; + selp.f32 %r55, %r47, %r36, %p8; + mov.b64 %rd69, {%r55, %r54}; + .loc 1 44 66 // cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:44:66 + selp.b32 %r119, %r43, %r119, %p3; + selp.b32 %r120, %r44, %r120, %p7; + selp.b32 %r122, %r51, %r122, %p9; + selp.b32 %r121, %r50, %r121, %p8; + .loc 1 32 40 // cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:32:40 + add.s64 %rd67, %rd67, 2048; + add.s64 %rd66, %rd66, 8192; + setp.lt.u64 %p61, %rd67, 29952; + @%p61 bra $L__BB0_4; +// %bb.5: + .loc 1 0 40 // cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:0:40 + cvt.u32.u64 %r68, %rd1; + .loc 1 25 37 // cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:25:37 + and.b32 %r70, %r68, 31; +$L__tmp7: + .loc 2 144 21 // triton_helpers.py:144:21 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + mov.b64 {%r71, %r72}, %rd68; + setp.gt.f32 %p70, %r71, %r72; + setp.eq.f32 %p71, %r72, %r71; + .loc 2 147 29 // triton_helpers.py:147:29 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + setp.nan.f32 %p72, %r71, %r71; + .loc 2 148 29 // triton_helpers.py:148:29 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + setp.num.f32 %p73, %r72, %r72; + setp.nan.f32 %p74, %r72, %r72; + .loc 2 149 27 // triton_helpers.py:149:27 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + and.pred %p75, %p72, %p74; + and.pred %p76, %p72, %p73; + .loc 2 149 16 // triton_helpers.py:149:16 @[ 
cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + or.pred %p77, %p70, %p76; + or.pred %p78, %p71, %p75; + .loc 2 154 31 // triton_helpers.py:154:31 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + setp.lt.s32 %p79, %r119, %r120; + .loc 2 154 21 // triton_helpers.py:154:21 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + and.pred %p80, %p79, %p78; + .loc 2 154 12 // triton_helpers.py:154:12 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + or.pred %p81, %p77, %p80; + .loc 2 155 35 // triton_helpers.py:155:35 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + selp.f32 %r73, %r71, %r72, %p81; + .loc 2 155 69 // triton_helpers.py:155:69 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + selp.b32 %r74, %r119, %r120, %p81; + .loc 2 144 21 // triton_helpers.py:144:21 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + mov.b64 {%r75, %r76}, %rd69; + setp.gt.f32 %p82, %r73, %r75; + .loc 2 145 23 // triton_helpers.py:145:23 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + setp.eq.f32 %p83, %r73, %r75; + .loc 2 147 29 // triton_helpers.py:147:29 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + setp.nan.f32 %p84, %r73, %r73; + .loc 2 148 29 // triton_helpers.py:148:29 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + setp.nan.f32 %p85, %r75, %r75; + setp.num.f32 %p86, %r75, %r75; + .loc 2 149 27 // triton_helpers.py:149:27 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + and.pred %p87, %p84, %p86; + .loc 2 149 16 // triton_helpers.py:149:16 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + or.pred %p88, %p82, %p87; + .loc 2 151 27 // triton_helpers.py:151:27 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + and.pred %p89, %p85, %p84; + .loc 2 151 17 // triton_helpers.py:151:17 @[ 
cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + or.pred %p90, %p83, %p89; + .loc 2 154 31 // triton_helpers.py:154:31 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + setp.lt.s32 %p91, %r74, %r121; + .loc 2 154 21 // triton_helpers.py:154:21 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + and.pred %p92, %p91, %p90; + .loc 2 154 12 // triton_helpers.py:154:12 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + or.pred %p93, %p88, %p92; + .loc 2 155 35 // triton_helpers.py:155:35 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + selp.f32 %r77, %r73, %r75, %p93; + .loc 2 155 69 // triton_helpers.py:155:69 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + selp.b32 %r78, %r74, %r121, %p93; + .loc 2 144 21 // triton_helpers.py:144:21 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + setp.gt.f32 %p94, %r77, %r76; + .loc 2 145 23 // triton_helpers.py:145:23 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + setp.eq.f32 %p95, %r77, %r76; + .loc 2 147 29 // triton_helpers.py:147:29 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + setp.nan.f32 %p96, %r77, %r77; + .loc 2 148 29 // triton_helpers.py:148:29 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + setp.nan.f32 %p97, %r76, %r76; + setp.num.f32 %p98, %r76, %r76; + .loc 2 149 27 // triton_helpers.py:149:27 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + and.pred %p99, %p96, %p98; + .loc 2 149 16 // triton_helpers.py:149:16 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + or.pred %p100, %p94, %p99; + .loc 2 151 27 // triton_helpers.py:151:27 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + and.pred %p101, %p97, %p96; + .loc 2 151 17 // triton_helpers.py:151:17 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + or.pred %p102, %p95, %p101; + .loc 2 154 31 // 
triton_helpers.py:154:31 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + setp.lt.s32 %p103, %r78, %r122; + .loc 2 154 21 // triton_helpers.py:154:21 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + and.pred %p104, %p103, %p102; + .loc 2 154 12 // triton_helpers.py:154:12 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + or.pred %p105, %p100, %p104; + .loc 2 155 35 // triton_helpers.py:155:35 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + selp.f32 %r79, %r77, %r76, %p105; + .loc 2 155 69 // triton_helpers.py:155:69 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + selp.b32 %r80, %r78, %r122, %p105; + .loc 2 165 42 // triton_helpers.py:165:42 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + shfl.sync.bfly.b32 %r81, %r79, 16, 31, -1; + shfl.sync.bfly.b32 %r82, %r80, 16, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + setp.gt.f32 %p106, %r79, %r81; + .loc 2 145 23 // triton_helpers.py:145:23 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + setp.eq.f32 %p107, %r79, %r81; + .loc 2 147 29 // triton_helpers.py:147:29 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + setp.nan.f32 %p108, %r79, %r79; + .loc 2 148 29 // triton_helpers.py:148:29 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + setp.nan.f32 %p109, %r81, %r81; + setp.num.f32 %p110, %r81, %r81; + .loc 2 149 27 // triton_helpers.py:149:27 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + and.pred %p111, %p108, %p110; + .loc 2 149 16 // triton_helpers.py:149:16 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + or.pred %p112, %p106, %p111; + .loc 2 151 27 // triton_helpers.py:151:27 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + and.pred %p113, %p108, %p109; + .loc 2 151 17 // triton_helpers.py:151:17 @[ 
cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + or.pred %p114, %p107, %p113; + .loc 2 154 31 // triton_helpers.py:154:31 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + setp.lt.s32 %p115, %r80, %r82; + .loc 2 154 21 // triton_helpers.py:154:21 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + and.pred %p116, %p115, %p114; + .loc 2 154 12 // triton_helpers.py:154:12 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + or.pred %p117, %p112, %p116; + .loc 2 155 35 // triton_helpers.py:155:35 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + selp.f32 %r83, %r79, %r81, %p117; + .loc 2 155 69 // triton_helpers.py:155:69 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + selp.b32 %r84, %r80, %r82, %p117; + .loc 2 165 42 // triton_helpers.py:165:42 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + shfl.sync.bfly.b32 %r85, %r83, 8, 31, -1; + shfl.sync.bfly.b32 %r86, %r84, 8, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + setp.gt.f32 %p118, %r83, %r85; + .loc 2 145 23 // triton_helpers.py:145:23 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + setp.eq.f32 %p119, %r83, %r85; + .loc 2 147 29 // triton_helpers.py:147:29 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + setp.nan.f32 %p120, %r83, %r83; + .loc 2 148 29 // triton_helpers.py:148:29 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + setp.nan.f32 %p121, %r85, %r85; + setp.num.f32 %p122, %r85, %r85; + .loc 2 149 27 // triton_helpers.py:149:27 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + and.pred %p123, %p120, %p122; + .loc 2 149 16 // triton_helpers.py:149:16 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + or.pred %p124, %p118, %p123; + .loc 2 151 27 // triton_helpers.py:151:27 @[ 
cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + and.pred %p125, %p121, %p120; + .loc 2 151 17 // triton_helpers.py:151:17 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + or.pred %p126, %p119, %p125; + .loc 2 154 31 // triton_helpers.py:154:31 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + setp.lt.s32 %p127, %r84, %r86; + .loc 2 154 21 // triton_helpers.py:154:21 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + and.pred %p128, %p127, %p126; + .loc 2 154 12 // triton_helpers.py:154:12 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + or.pred %p129, %p124, %p128; + .loc 2 155 35 // triton_helpers.py:155:35 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + selp.f32 %r87, %r83, %r85, %p129; + .loc 2 155 69 // triton_helpers.py:155:69 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + selp.b32 %r88, %r84, %r86, %p129; + .loc 2 165 42 // triton_helpers.py:165:42 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + shfl.sync.bfly.b32 %r89, %r87, 4, 31, -1; + shfl.sync.bfly.b32 %r90, %r88, 4, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + setp.gt.f32 %p130, %r87, %r89; + .loc 2 145 23 // triton_helpers.py:145:23 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + setp.eq.f32 %p131, %r87, %r89; + .loc 2 147 29 // triton_helpers.py:147:29 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + setp.nan.f32 %p132, %r87, %r87; + .loc 2 148 29 // triton_helpers.py:148:29 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + setp.nan.f32 %p133, %r89, %r89; + setp.num.f32 %p134, %r89, %r89; + .loc 2 149 27 // triton_helpers.py:149:27 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + and.pred %p135, %p132, %p134; + .loc 2 149 16 // triton_helpers.py:149:16 @[ 
cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + or.pred %p136, %p130, %p135; + .loc 2 151 27 // triton_helpers.py:151:27 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + and.pred %p137, %p133, %p132; + .loc 2 151 17 // triton_helpers.py:151:17 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + or.pred %p138, %p131, %p137; + .loc 2 154 31 // triton_helpers.py:154:31 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + setp.lt.s32 %p139, %r88, %r90; + .loc 2 154 21 // triton_helpers.py:154:21 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + and.pred %p140, %p139, %p138; + .loc 2 154 12 // triton_helpers.py:154:12 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + or.pred %p141, %p136, %p140; + .loc 2 155 35 // triton_helpers.py:155:35 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + selp.f32 %r91, %r87, %r89, %p141; + .loc 2 155 69 // triton_helpers.py:155:69 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + selp.b32 %r92, %r88, %r90, %p141; + .loc 2 165 42 // triton_helpers.py:165:42 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + shfl.sync.bfly.b32 %r93, %r91, 2, 31, -1; + shfl.sync.bfly.b32 %r94, %r92, 2, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + setp.gt.f32 %p142, %r91, %r93; + .loc 2 145 23 // triton_helpers.py:145:23 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + setp.eq.f32 %p143, %r91, %r93; + .loc 2 147 29 // triton_helpers.py:147:29 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + setp.nan.f32 %p144, %r91, %r91; + .loc 2 148 29 // triton_helpers.py:148:29 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + setp.nan.f32 %p145, %r93, %r93; + setp.num.f32 %p146, %r93, %r93; + .loc 2 149 27 // triton_helpers.py:149:27 @[ 
cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + and.pred %p147, %p144, %p146; + .loc 2 149 16 // triton_helpers.py:149:16 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + or.pred %p148, %p142, %p147; + .loc 2 151 27 // triton_helpers.py:151:27 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + and.pred %p149, %p145, %p144; + .loc 2 151 17 // triton_helpers.py:151:17 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + or.pred %p150, %p143, %p149; + .loc 2 154 31 // triton_helpers.py:154:31 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + setp.lt.s32 %p151, %r92, %r94; + .loc 2 154 21 // triton_helpers.py:154:21 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + and.pred %p152, %p151, %p150; + .loc 2 154 12 // triton_helpers.py:154:12 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + or.pred %p153, %p148, %p152; + .loc 2 155 35 // triton_helpers.py:155:35 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + selp.f32 %r95, %r91, %r93, %p153; + .loc 2 155 69 // triton_helpers.py:155:69 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + selp.b32 %r96, %r92, %r94, %p153; + .loc 2 165 42 // triton_helpers.py:165:42 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + shfl.sync.bfly.b32 %r97, %r95, 1, 31, -1; + shfl.sync.bfly.b32 %r98, %r96, 1, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + setp.gt.f32 %p154, %r95, %r97; + .loc 2 145 23 // triton_helpers.py:145:23 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + setp.eq.f32 %p155, %r95, %r97; + .loc 2 147 29 // triton_helpers.py:147:29 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + setp.nan.f32 %p156, %r95, %r95; + .loc 2 148 29 // triton_helpers.py:148:29 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + 
setp.nan.f32 %p157, %r97, %r97; + setp.num.f32 %p158, %r97, %r97; + .loc 2 149 27 // triton_helpers.py:149:27 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + and.pred %p159, %p156, %p158; + .loc 2 149 16 // triton_helpers.py:149:16 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + or.pred %p160, %p154, %p159; + .loc 2 151 27 // triton_helpers.py:151:27 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + and.pred %p161, %p157, %p156; + .loc 2 151 17 // triton_helpers.py:151:17 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + or.pred %p162, %p155, %p161; + .loc 2 154 31 // triton_helpers.py:154:31 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + setp.lt.s32 %p163, %r96, %r98; + .loc 2 154 21 // triton_helpers.py:154:21 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + and.pred %p164, %p163, %p162; + .loc 2 154 12 // triton_helpers.py:154:12 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + or.pred %p165, %p160, %p164; + .loc 2 155 69 // triton_helpers.py:155:69 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + selp.b32 %r59, %r96, %r98, %p165; + .loc 2 165 42 // triton_helpers.py:165:42 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + setp.eq.b32 %p62, %r70, 0; + shr.u32 %r99, %r68, 3; + and.b32 %r100, %r99, 60; + mov.b32 %r101, global_smem; + add.s32 %r56, %r101, %r100; + .loc 2 155 35 // triton_helpers.py:155:35 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + selp.b32 %r57, %r95, %r97, %p165; + .loc 2 165 42 // triton_helpers.py:165:42 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + // begin inline asm + @%p62 st.shared.b32 [ %r56 + 0 ], %r57; + // end inline asm + add.s32 %r102, %r101, 64; + add.s32 %r58, %r102, %r100; + // begin inline asm + @%p62 st.shared.b32 [ %r58 + 0 ], %r59; + // end inline asm + bar.sync 0; + setp.lt.u32 %p64, %r68, 16; + 
shl.b32 %r103, %r68, 2; + add.s32 %r61, %r101, %r103; + // begin inline asm + @%p64 ld.shared.b32 %r60, [ %r61 + 0 ]; + // end inline asm + add.s32 %r63, %r102, %r103; + // begin inline asm + @%p64 ld.shared.b32 %r62, [ %r63 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r104, %r60, 8, 31, -1; + shfl.sync.bfly.b32 %r105, %r62, 8, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + setp.gt.f32 %p166, %r60, %r104; + .loc 2 145 23 // triton_helpers.py:145:23 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + setp.eq.f32 %p167, %r60, %r104; + .loc 2 147 29 // triton_helpers.py:147:29 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + setp.nan.f32 %p168, %r60, %r60; + .loc 2 148 29 // triton_helpers.py:148:29 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + setp.nan.f32 %p169, %r104, %r104; + setp.num.f32 %p170, %r104, %r104; + .loc 2 149 27 // triton_helpers.py:149:27 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + and.pred %p171, %p168, %p170; + .loc 2 149 16 // triton_helpers.py:149:16 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + or.pred %p172, %p166, %p171; + .loc 2 151 27 // triton_helpers.py:151:27 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + and.pred %p173, %p168, %p169; + .loc 2 151 17 // triton_helpers.py:151:17 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + or.pred %p174, %p167, %p173; + .loc 2 154 31 // triton_helpers.py:154:31 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + setp.lt.s32 %p175, %r62, %r105; + .loc 2 154 21 // triton_helpers.py:154:21 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + and.pred %p176, %p175, %p174; + .loc 2 154 12 // triton_helpers.py:154:12 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + or.pred %p177, %p172, %p176; + .loc 2 155 35 // 
triton_helpers.py:155:35 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + selp.f32 %r106, %r60, %r104, %p177; + .loc 2 155 69 // triton_helpers.py:155:69 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + selp.b32 %r107, %r62, %r105, %p177; + .loc 2 165 42 // triton_helpers.py:165:42 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + shfl.sync.bfly.b32 %r108, %r106, 4, 31, -1; + shfl.sync.bfly.b32 %r109, %r107, 4, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + setp.gt.f32 %p178, %r106, %r108; + .loc 2 145 23 // triton_helpers.py:145:23 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + setp.eq.f32 %p179, %r106, %r108; + .loc 2 147 29 // triton_helpers.py:147:29 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + setp.nan.f32 %p180, %r106, %r106; + .loc 2 148 29 // triton_helpers.py:148:29 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + setp.nan.f32 %p181, %r108, %r108; + setp.num.f32 %p182, %r108, %r108; + .loc 2 149 27 // triton_helpers.py:149:27 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + and.pred %p183, %p180, %p182; + .loc 2 149 16 // triton_helpers.py:149:16 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + or.pred %p184, %p178, %p183; + .loc 2 151 27 // triton_helpers.py:151:27 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + and.pred %p185, %p181, %p180; + .loc 2 151 17 // triton_helpers.py:151:17 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + or.pred %p186, %p179, %p185; + .loc 2 154 31 // triton_helpers.py:154:31 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + setp.lt.s32 %p187, %r107, %r109; + .loc 2 154 21 // triton_helpers.py:154:21 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + and.pred %p188, %p187, %p186; + .loc 2 154 12 // 
triton_helpers.py:154:12 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + or.pred %p189, %p184, %p188; + .loc 2 155 35 // triton_helpers.py:155:35 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + selp.f32 %r110, %r106, %r108, %p189; + .loc 2 155 69 // triton_helpers.py:155:69 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + selp.b32 %r111, %r107, %r109, %p189; + .loc 2 165 42 // triton_helpers.py:165:42 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + shfl.sync.bfly.b32 %r112, %r110, 2, 31, -1; + shfl.sync.bfly.b32 %r113, %r111, 2, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + setp.gt.f32 %p190, %r110, %r112; + .loc 2 145 23 // triton_helpers.py:145:23 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + setp.eq.f32 %p191, %r110, %r112; + .loc 2 147 29 // triton_helpers.py:147:29 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + setp.nan.f32 %p192, %r110, %r110; + .loc 2 148 29 // triton_helpers.py:148:29 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + setp.nan.f32 %p193, %r112, %r112; + setp.num.f32 %p194, %r112, %r112; + .loc 2 149 27 // triton_helpers.py:149:27 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + and.pred %p195, %p192, %p194; + .loc 2 149 16 // triton_helpers.py:149:16 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + or.pred %p196, %p190, %p195; + .loc 2 151 27 // triton_helpers.py:151:27 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + and.pred %p197, %p193, %p192; + .loc 2 151 17 // triton_helpers.py:151:17 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + or.pred %p198, %p191, %p197; + .loc 2 154 31 // triton_helpers.py:154:31 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + setp.lt.s32 %p199, %r111, %r113; + .loc 2 154 21 // 
triton_helpers.py:154:21 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + and.pred %p200, %p199, %p198; + .loc 2 154 12 // triton_helpers.py:154:12 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + or.pred %p201, %p196, %p200; + .loc 2 155 35 // triton_helpers.py:155:35 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + selp.f32 %r114, %r110, %r112, %p201; + .loc 2 155 69 // triton_helpers.py:155:69 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + selp.b32 %r115, %r111, %r113, %p201; + .loc 2 165 42 // triton_helpers.py:165:42 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + shfl.sync.bfly.b32 %r116, %r114, 1, 31, -1; + shfl.sync.bfly.b32 %r117, %r115, 1, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + setp.gt.f32 %p202, %r114, %r116; + .loc 2 145 23 // triton_helpers.py:145:23 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + setp.eq.f32 %p203, %r114, %r116; + .loc 2 147 29 // triton_helpers.py:147:29 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + setp.nan.f32 %p204, %r114, %r114; + .loc 2 148 29 // triton_helpers.py:148:29 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + setp.nan.f32 %p205, %r116, %r116; + setp.num.f32 %p206, %r116, %r116; + .loc 2 149 27 // triton_helpers.py:149:27 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + and.pred %p207, %p204, %p206; + .loc 2 149 16 // triton_helpers.py:149:16 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + or.pred %p208, %p202, %p207; + .loc 2 151 27 // triton_helpers.py:151:27 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + and.pred %p209, %p205, %p204; + .loc 2 151 17 // triton_helpers.py:151:17 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + or.pred %p210, %p203, %p209; + .loc 2 154 31 // 
triton_helpers.py:154:31 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + setp.lt.s32 %p211, %r115, %r117; + .loc 2 154 21 // triton_helpers.py:154:21 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + and.pred %p212, %p211, %p210; + .loc 2 154 12 // triton_helpers.py:154:12 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + or.pred %p213, %p208, %p212; + .loc 2 155 69 // triton_helpers.py:155:69 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + selp.b32 %r67, %r115, %r117, %p213; + .loc 2 165 42 // triton_helpers.py:165:42 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + setp.eq.b32 %p66, %r68, 0; + .loc 2 155 35 // triton_helpers.py:155:35 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + selp.b32 %r65, %r114, %r116, %p213; + .loc 2 165 42 // triton_helpers.py:165:42 @[ cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:45:75 ] + // begin inline asm + @%p66 st.shared.b32 [ %r61 + 0 ], %r65; + // end inline asm + // begin inline asm + @%p66 st.shared.b32 [ %r63 + 0 ], %r67; + // end inline asm + bar.sync 0; + ld.shared.s32 %rd62, [global_smem+64]; +$L__tmp8: + .loc 1 47 25 // cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:47:25 + shl.b64 %rd64, %rd2, 3; + add.s64 %rd63, %rd19, %rd64; + .loc 1 47 36 // cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:47:36 + setp.eq.b32 %p214, %r1, 0; + and.pred %p68, %p214, %p3; + // begin inline asm + @%p68 st.global.b64 [ %rd63 + 0 ], { %rd62 }; + // end inline asm + .loc 1 47 4 // cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py:47:4 + ret; +$L__tmp9: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/au/cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // 
DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 234 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. 
Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0xe3 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 97 +.b8 117 +.b8 100 +.b8 100 +.b8 104 +.b8 50 +.b8 105 +.b8 115 +.b8 117 +.b8 54 +.b8 52 +.b8 113 +.b8 120 +.b8 108 +.b8 119 +.b8 99 +.b8 106 +.b8 106 +.b8 118 +.b8 114 +.b8 115 +.b8 118 +.b8 102 +.b8 101 +.b8 105 +.b8 55 +.b8 107 +.b8 109 +.b8 100 +.b8 98 +.b8 110 +.b8 110 +.b8 116 +.b8 112 +.b8 113 +.b8 122 +.b8 116 +.b8 103 +.b8 55 +.b8 112 +.b8 121 +.b8 110 +.b8 105 +.b8 100 +.b8 110 +.b8 98 +.b8 107 +.b8 118 +.b8 117 +.b8 100 +.b8 113 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 97 +.b8 117 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x1c DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 97 +.b8 114 +.b8 103 +.b8 109 +.b8 97 +.b8 120 +.b8 95 +.b8 49 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0xa7:0x46 DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0xbc:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp6 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 41 // DW_AT_call_line +.b8 38 // DW_AT_call_column +.b8 4 // Abbrev [4] 
0xd4:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp7 // DW_AT_low_pc +.b64 $L__tmp8 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 45 // DW_AT_call_line +.b8 75 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/VBVRCEQLKQI4X4GYXD4JC6UEYZT2F7LIKNA2UR4GNVIWAPM6GKFA/triton_red_fused_argmax_1.source b/SpecForge-ext/cache/compiled_kernels/triton/7/VBVRCEQLKQI4X4GYXD4JC6UEYZT2F7LIKNA2UR4GNVIWAPM6GKFA/triton_red_fused_argmax_1.source new file mode 100644 index 0000000000000000000000000000000000000000..9bded20bc48723a378ad9297f32516a4aba98e9e --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/VBVRCEQLKQI4X4GYXD4JC6UEYZT2F7LIKNA2UR4GNVIWAPM6GKFA/triton_red_fused_argmax_1.source @@ -0,0 +1,317 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/au/cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py":18:0) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":143:0) +#loc47 = loc(unknown) +#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":86:0) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":63:0) +#loc68 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":164:0) +#loc72 = loc("in_ptr0"(#loc)) +#loc73 = loc("out_ptr0"(#loc)) +#loc74 = loc("ks0"(#loc)) +#loc75 = loc("ks1"(#loc)) +#loc76 = loc("xnumel"(#loc)) +#loc77 = loc("r0_numel"(#loc)) +#loc106 = loc("a_value"(#loc35)) +#loc107 = loc("a_index"(#loc35)) +#loc108 = loc("b_value"(#loc35)) +#loc109 = loc("b_index"(#loc35)) +#loc122 = loc("x"(#loc55)) +#loc123 = loc("x"(#loc59)) +#loc124 = loc("value"(#loc68)) +#loc125 = loc("index"(#loc68)) +module { + tt.func public @triton_red_fused_argmax_1(%in_ptr0: !tt.ptr 
{tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %r0_numel_0 = arith.constant 32000 : i32 loc(#loc78) + %xoffset = tt.get_program_id x : i32 loc(#loc79) + %xoffset_1 = arith.constant 1 : i32 loc(#loc80) + %xoffset_2 = arith.constant 1 : i32 loc(#loc80) + %xoffset_3 = arith.muli %xoffset, %xoffset_2 : i32 loc(#loc80) + %xindex = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc81) + %xindex_4 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc82) + %xindex_5 = tt.splat %xoffset_3 : i32 -> tensor<1x1xi32> loc(#loc83) + %xindex_6 = arith.addi %xindex_5, %xindex_4 : tensor<1x1xi32> loc(#loc83) + %xmask = tt.splat %xnumel : i32 -> tensor<1x1xi32> loc(#loc84) + %xmask_7 = arith.cmpi slt, %xindex_6, %xmask : tensor<1x1xi32> loc(#loc84) + %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32> loc(#loc85) + %r0_base_8 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32> -> tensor<1x2048xi32> loc(#loc86) + %x0 = arith.extsi %xindex_6 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc87) + %x0_9 = tt.splat %ks0 : i64 -> tensor<1x1xi64> loc(#loc87) + %x0_10 = arith.remsi %x0, %x0_9 : tensor<1x1xi64> loc(#loc87) + %x1 = arith.extsi %xindex_6 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc88) + %x1_11 = tt.splat %ks0 : i64 -> tensor<1x1xi64> loc(#loc88) + %x1_12 = arith.divsi %x1, %x1_11 : tensor<1x1xi64> loc(#loc88) + %_tmp2 = arith.constant 0xFF800000 : f32 loc(#loc89) + %_tmp2_13 = arith.constant dense<0xFF800000> : tensor<1x2048xf32> loc(#loc89) + %_tmp2_index = arith.constant 2147483647 : i32 loc(#loc90) + %_tmp2_index_14 = arith.constant dense<2147483647> : tensor<1x2048xi32> loc(#loc90) + %c0_i32 = arith.constant 0 : i32 
loc(#loc14) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc14) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc14) + %1 = arith.bitcast %r0_numel_0 : i32 to i32 loc(#loc14) + %2 = arith.bitcast %c2048_i32 : i32 to i32 loc(#loc14) + %3 = ub.poison : i32 loc(#loc14) + %_tmp2_index_15:2 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp2_16 = %_tmp2_13, %_tmp2_index_17 = %_tmp2_index_14) -> (tensor<1x2048xf32>, tensor<1x2048xi32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc92) + %r0_index_18 = arith.addi %r0_index, %r0_base_8 : tensor<1x2048xi32> loc(#loc92) + %r0_mask = arith.constant dense<32000> : tensor<1x2048xi32> loc(#loc93) + %r0_mask_19 = arith.cmpi slt, %r0_index_18, %r0_mask : tensor<1x2048xi32> loc(#loc93) + %tmp0 = arith.constant 32000 : i32 loc(#loc94) + %tmp0_20 = arith.constant 32000 : i64 loc(#loc94) + %tmp0_21 = arith.constant dense<32000> : tensor<1x1xi64> loc(#loc94) + %tmp0_22 = arith.muli %tmp0_21, %x0_10 : tensor<1x1xi64> loc(#loc94) + %tmp0_23 = arith.extsi %r0_index_18 : tensor<1x2048xi32> to tensor<1x2048xi64> loc(#loc95) + %tmp0_24 = tt.broadcast %tmp0_22 : tensor<1x1xi64> -> tensor<1x2048xi64> loc(#loc95) + %tmp0_25 = arith.addi %tmp0_23, %tmp0_24 : tensor<1x2048xi64> loc(#loc95) + %tmp0_26 = tt.splat %ks1 : i64 -> tensor<1x1xi64> loc(#loc96) + %tmp0_27 = arith.muli %tmp0_26, %x1_12 : tensor<1x1xi64> loc(#loc96) + %tmp0_28 = tt.broadcast %tmp0_27 : tensor<1x1xi64> -> tensor<1x2048xi64> loc(#loc97) + %tmp0_29 = arith.addi %tmp0_25, %tmp0_28 : tensor<1x2048xi64> loc(#loc97) + %tmp0_30 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc98) + %tmp0_31 = tt.addptr %tmp0_30, %tmp0_29 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi64> loc(#loc98) + %tmp0_32 = tt.broadcast %xmask_7 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc99) + %tmp0_33 = arith.andi %r0_mask_19, %tmp0_32 : tensor<1x2048xi1> loc(#loc99) + %tmp0_34 = arith.constant 0.000000e+00 : f32 loc(#loc100) + %tmp0_35 = arith.constant 
dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc100) + %tmp0_36 = tt.load %tmp0_31, %tmp0_33, %tmp0_35 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr> loc(#loc100) + %8:2 = tt.call @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32S1_2048S_i32S1_2048S_fp32S1_2048S_i32S1_2048S__(%_tmp2_16, %_tmp2_index_17, %tmp0_36, %r0_index_18) : (tensor<1x2048xf32>, tensor<1x2048xi32>, tensor<1x2048xf32>, tensor<1x2048xi32>) -> (tensor<1x2048xf32>, tensor<1x2048xi32>) loc(#loc24) + %_tmp2_37 = tt.broadcast %xmask_7 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc101) + %_tmp2_38 = arith.andi %r0_mask_19, %_tmp2_37 : tensor<1x2048xi1> loc(#loc101) + %_tmp2_39 = arith.select %_tmp2_38, %8#0, %_tmp2_16 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc102) + %_tmp2_index_40 = tt.broadcast %xmask_7 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc103) + %_tmp2_index_41 = arith.andi %r0_mask_19, %_tmp2_index_40 : tensor<1x2048xi1> loc(#loc103) + %_tmp2_index_42 = arith.select %_tmp2_index_41, %8#1, %_tmp2_index_17 : tensor<1x2048xi1>, tensor<1x2048xi32> loc(#loc104) + scf.yield %_tmp2_39, %_tmp2_index_42 : tensor<1x2048xf32>, tensor<1x2048xi32> loc(#loc29) + } loc(#loc126) + %4:2 = tt.call @"torch._inductor.runtime.triton_helpers.max_with_index__fp32S1_2048S_i32S1_2048S__(2,)cconstexpr_1_"(%_tmp2_index_15#0, %_tmp2_index_15#1) : (tensor<1x2048xf32>, tensor<1x2048xi32>) -> (tensor<1xf32>, tensor<1xi32>) loc(#loc30) + %tmp2 = tt.expand_dims %4#1 {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc105) + %5 = tt.splat %out_ptr0 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc32) + %6 = tt.addptr %5, %xindex_6 : tensor<1x1x!tt.ptr>, tensor<1x1xi32> loc(#loc32) + %7 = arith.extsi %tmp2 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc33) + tt.store %6, %7, %xmask_7 : tensor<1x1x!tt.ptr> loc(#loc33) + tt.return loc(#loc34) + } loc(#loc) + tt.func private @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32S1_2048S_i32S1_2048S_fp32S1_2048S_i32S1_2048S__(%a_value: 
tensor<1x2048xf32> loc("a_value"(#loc35)), %a_index: tensor<1x2048xi32> loc("a_index"(#loc35)), %b_value: tensor<1x2048xf32> loc("b_value"(#loc35)), %b_index: tensor<1x2048xi32> loc("b_index"(#loc35))) -> (tensor<1x2048xf32>, tensor<1x2048xi32>) attributes {noinline = false} { + %mask = arith.cmpf ogt, %a_value, %b_value : tensor<1x2048xf32> loc(#loc127) + %equal = arith.cmpf oeq, %a_value, %b_value : tensor<1x2048xf32> loc(#loc128) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__fp32S1_2048S__(%a_value) : (tensor<1x2048xf32>) -> i1 loc(#loc38) + %1:2 = scf.if %0 -> (tensor<1x2048xi1>, tensor<1x2048xi1>) { + %a_isnan = arith.cmpf une, %a_value, %a_value : tensor<1x2048xf32> loc(#loc112) + %b_isnan = arith.cmpf une, %b_value, %b_value : tensor<1x2048xf32> loc(#loc113) + %mask_3 = arith.constant true loc(#loc114) + %mask_4 = arith.constant dense : tensor<1x2048xi1> loc(#loc114) + %mask_5 = arith.xori %b_isnan, %mask_4 : tensor<1x2048xi1> loc(#loc114) + %mask_6 = arith.andi %a_isnan, %mask_5 : tensor<1x2048xi1> loc(#loc115) + %mask_7 = arith.ori %mask, %mask_6 : tensor<1x2048xi1> loc(#loc129) + %equal_8 = arith.andi %a_isnan, %b_isnan : tensor<1x2048xi1> loc(#loc117) + %equal_9 = arith.ori %equal, %equal_8 : tensor<1x2048xi1> loc(#loc130) + scf.yield %mask_7, %equal_9 : tensor<1x2048xi1>, tensor<1x2048xi1> loc(#loc130) + } else { + scf.yield %mask, %equal : tensor<1x2048xi1>, tensor<1x2048xi1> loc(#loc47) + } loc(#loc39) + %mask_0 = arith.cmpi slt, %a_index, %b_index : tensor<1x2048xi32> loc(#loc119) + %mask_1 = arith.andi %1#1, %mask_0 : tensor<1x2048xi1> loc(#loc120) + %mask_2 = arith.ori %1#0, %mask_1 : tensor<1x2048xi1> loc(#loc121) + %2 = arith.select %mask_2, %a_value, %b_value : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc51) + %3 = arith.select %mask_2, %a_index, %b_index : tensor<1x2048xi1>, tensor<1x2048xi32> loc(#loc52) + tt.return %2, %3 : tensor<1x2048xf32>, tensor<1x2048xi32> loc(#loc53) + ^bb1: // no predecessors + %4 = ub.poison : 
tensor<1x2048xf32> loc(#loc54) + %5 = ub.poison : tensor<1x2048xi32> loc(#loc54) + tt.return %4, %5 : tensor<1x2048xf32>, tensor<1x2048xi32> loc(#loc54) + } loc(#loc35) + tt.func private @torch._inductor.runtime.triton_helpers.is_floating__fp32S1_2048S__(%x: tensor<1x2048xf32> loc("x"(#loc55))) -> i1 attributes {noinline = false} { + %0 = tt.call @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32S1_2048S__(%x) : (tensor<1x2048xf32>) -> tensor<1x2048xf32> loc(#loc56) + %true = arith.constant true loc(#loc57) + tt.return %true : i1 loc(#loc57) + ^bb1: // no predecessors + %1 = ub.poison : i1 loc(#loc58) + tt.return %1 : i1 loc(#loc58) + } loc(#loc55) + tt.func private @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32S1_2048S__(%x: tensor<1x2048xf32> loc("x"(#loc59))) -> tensor<1x2048xf32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc60) + %1 = arith.uitofp %0 : tensor<1xi1> to tensor<1xf32> loc(#loc61) + %2 = tt.expand_dims %1 {axis = 0 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc61) + %3 = tt.broadcast %2 : tensor<1x1xf32> -> tensor<1x2048xf32> loc(#loc61) + %4 = arith.addf %x, %3 : tensor<1x2048xf32> loc(#loc61) + tt.return %4 : tensor<1x2048xf32> loc(#loc62) + ^bb1: // no predecessors + %5 = ub.poison : tensor<1x2048xf32> loc(#loc63) + tt.return %5 : tensor<1x2048xf32> loc(#loc63) + } loc(#loc59) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() -> tensor<1xi1> attributes {noinline = false} { + %false = arith.constant false loc(#loc65) + %cst = arith.constant dense : tensor<1xi1> loc(#loc65) + tt.return %cst : tensor<1xi1> loc(#loc66) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1xi1> loc(#loc67) + tt.return %0 : tensor<1xi1> loc(#loc67) + } loc(#loc64) + tt.func private 
@"torch._inductor.runtime.triton_helpers.max_with_index__fp32S1_2048S_i32S1_2048S__(2,)cconstexpr_1_"(%value: tensor<1x2048xf32> loc("value"(#loc68)), %index: tensor<1x2048xi32> loc("index"(#loc68))) -> (tensor<1xf32>, tensor<1xi32>) attributes {noinline = false} { + %0:2 = "tt.reduce"(%value, %index) <{axis = 1 : i32}> ({ + ^bb0(%arg2: f32 loc(unknown), %arg3: i32 loc(unknown), %arg4: f32 loc(unknown), %arg5: i32 loc(unknown)): + %3:2 = tt.call @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32_i32_fp32_i32__(%arg2, %arg3, %arg4, %arg5) : (f32, i32, f32, i32) -> (f32, i32) loc(#loc69) + tt.reduce.return %3#0, %3#1 : f32, i32 loc(#loc69) + }) : (tensor<1x2048xf32>, tensor<1x2048xi32>) -> (tensor<1xf32>, tensor<1xi32>) loc(#loc69) + tt.return %0#0, %0#1 : tensor<1xf32>, tensor<1xi32> loc(#loc70) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1xf32> loc(#loc71) + %2 = ub.poison : tensor<1xi32> loc(#loc71) + tt.return %1, %2 : tensor<1xf32>, tensor<1xi32> loc(#loc71) + } loc(#loc68) + tt.func private @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32_i32_fp32_i32__(%a_value: f32 loc("a_value"(#loc35)), %a_index: i32 loc("a_index"(#loc35)), %b_value: f32 loc("b_value"(#loc35)), %b_index: i32 loc("b_index"(#loc35))) -> (f32, i32) attributes {noinline = false} { + %mask = arith.cmpf ogt, %a_value, %b_value : f32 loc(#loc127) + %equal = arith.cmpf oeq, %a_value, %b_value : f32 loc(#loc128) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__fp32__(%a_value) : (f32) -> i1 loc(#loc38) + %1:2 = scf.if %0 -> (i1, i1) { + %a_isnan = arith.cmpf une, %a_value, %a_value : f32 loc(#loc112) + %b_isnan = arith.cmpf une, %b_value, %b_value : f32 loc(#loc113) + %mask_3 = arith.constant true loc(#loc114) + %mask_4 = arith.xori %b_isnan, %mask_3 : i1 loc(#loc114) + %mask_5 = arith.andi %a_isnan, %mask_4 : i1 loc(#loc115) + %mask_6 = arith.ori %mask, %mask_5 : i1 loc(#loc129) + %equal_7 = arith.andi %a_isnan, %b_isnan : i1 loc(#loc117) + 
%equal_8 = arith.ori %equal, %equal_7 : i1 loc(#loc130) + scf.yield %mask_6, %equal_8 : i1, i1 loc(#loc130) + } else { + scf.yield %mask, %equal : i1, i1 loc(#loc47) + } loc(#loc39) + %mask_0 = arith.cmpi slt, %a_index, %b_index : i32 loc(#loc119) + %mask_1 = arith.andi %1#1, %mask_0 : i1 loc(#loc120) + %mask_2 = arith.ori %1#0, %mask_1 : i1 loc(#loc121) + %2 = arith.select %mask_2, %a_value, %b_value : f32 loc(#loc51) + %3 = arith.select %mask_2, %a_index, %b_index : i32 loc(#loc52) + tt.return %2, %3 : f32, i32 loc(#loc53) + ^bb1: // no predecessors + %4 = ub.poison : f32 loc(#loc54) + %5 = ub.poison : i32 loc(#loc54) + tt.return %4, %5 : f32, i32 loc(#loc54) + } loc(#loc35) + tt.func private @torch._inductor.runtime.triton_helpers.is_floating__fp32__(%x: f32 loc("x"(#loc55))) -> i1 attributes {noinline = false} { + %0 = tt.call @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32__(%x) : (f32) -> tensor<1xf32> loc(#loc56) + %true = arith.constant true loc(#loc57) + tt.return %true : i1 loc(#loc57) + ^bb1: // no predecessors + %1 = ub.poison : i1 loc(#loc58) + tt.return %1 : i1 loc(#loc58) + } loc(#loc55) + tt.func private @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32__(%x: f32 loc("x"(#loc59))) -> tensor<1xf32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc60) + %1 = arith.uitofp %0 : tensor<1xi1> to tensor<1xf32> loc(#loc61) + %2 = tt.splat %x : f32 -> tensor<1xf32> loc(#loc61) + %3 = arith.addf %2, %1 : tensor<1xf32> loc(#loc61) + tt.return %3 : tensor<1xf32> loc(#loc62) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1xf32> loc(#loc63) + tt.return %4 : tensor<1xf32> loc(#loc63) + } loc(#loc59) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/au/cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py":19:15) +#loc2 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/au/cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py":22:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/au/cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py":22:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/au/cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py":23:36) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/au/cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py":23:44) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/au/cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py":23:23) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/au/cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py":24:21) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/au/cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py":25:27) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/au/cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py":25:37) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/au/cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py":27:19) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/au/cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py":28:19) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/au/cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py":29:55) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/au/cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py":30:58) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/au/cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py":32:40) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/au/cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py":33:31) +#loc16 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/au/cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py":34:29) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/au/cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py":38:47) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/au/cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py":38:41) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/au/cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py":38:56) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/au/cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py":38:52) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/au/cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py":38:34) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/au/cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py":38:71) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/au/cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py":38:61) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/au/cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py":41:38) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/au/cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py":43:35) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/au/cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py":43:54) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/au/cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py":44:41) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/au/cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py":44:66) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/au/cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py":44:8) +#loc30 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/au/cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py":45:75) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/au/cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py":46:20) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/au/cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py":47:25) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/au/cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py":47:36) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/au/cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py":47:4) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":144:21) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":145:23) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":146:19) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":146:7) +#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":147:29) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":148:29) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:31) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:27) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:16) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:27) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:17) +#loc48 = 
loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:31) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:21) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:12) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:35) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:69) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:11) +#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:4) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:29) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:11) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:4) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:30) +#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:15) +#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:11) +#loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:4) +#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":118:0) +#loc65 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:31) +#loc66 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:11) +#loc67 = 
loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:4) +#loc69 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:42) +#loc70 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:11) +#loc71 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:4) +#loc78 = loc("r0_numel"(#loc1)) +#loc79 = loc("xoffset"(#loc2)) +#loc80 = loc("xoffset"(#loc3)) +#loc81 = loc("xindex"(#loc4)) +#loc82 = loc("xindex"(#loc5)) +#loc83 = loc("xindex"(#loc6)) +#loc84 = loc("xmask"(#loc7)) +#loc85 = loc("r0_base"(#loc8)) +#loc86 = loc("r0_base"(#loc9)) +#loc87 = loc("x0"(#loc10)) +#loc88 = loc("x1"(#loc11)) +#loc89 = loc("_tmp2"(#loc12)) +#loc90 = loc("_tmp2_index"(#loc13)) +#loc91 = loc("_tmp2"(#loc14)) +#loc92 = loc("r0_index"(#loc15)) +#loc93 = loc("r0_mask"(#loc16)) +#loc94 = loc("tmp0"(#loc17)) +#loc95 = loc("tmp0"(#loc18)) +#loc96 = loc("tmp0"(#loc19)) +#loc97 = loc("tmp0"(#loc20)) +#loc98 = loc("tmp0"(#loc21)) +#loc99 = loc("tmp0"(#loc22)) +#loc100 = loc("tmp0"(#loc23)) +#loc101 = loc("_tmp2"(#loc25)) +#loc102 = loc("_tmp2"(#loc26)) +#loc103 = loc("_tmp2_index"(#loc27)) +#loc104 = loc("_tmp2_index"(#loc28)) +#loc105 = loc("tmp2"(#loc31)) +#loc110 = loc("mask"(#loc36)) +#loc111 = loc("equal"(#loc37)) +#loc112 = loc("a_isnan"(#loc40)) +#loc113 = loc("b_isnan"(#loc41)) +#loc114 = loc("mask"(#loc42)) +#loc115 = loc("mask"(#loc43)) +#loc116 = loc("mask"(#loc44)) +#loc117 = loc("equal"(#loc45)) +#loc118 = loc("equal"(#loc46)) +#loc119 = loc("mask"(#loc48)) +#loc120 = loc("mask"(#loc49)) +#loc121 = loc("mask"(#loc50)) +#loc126 = loc("_tmp2_index"(#loc91)) +#loc127 = loc("mask"(#loc110)) +#loc128 = loc("equal"(#loc111)) +#loc129 = loc("mask"(#loc116)) +#loc130 = loc("equal"(#loc118)) diff --git 
a/SpecForge-ext/cache/compiled_kernels/triton/7/VBVRCEQLKQI4X4GYXD4JC6UEYZT2F7LIKNA2UR4GNVIWAPM6GKFA/triton_red_fused_argmax_1.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/7/VBVRCEQLKQI4X4GYXD4JC6UEYZT2F7LIKNA2UR4GNVIWAPM6GKFA/triton_red_fused_argmax_1.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..a7f5eff988d7ec73437d9e83e1ca416a1637e9d8 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/VBVRCEQLKQI4X4GYXD4JC6UEYZT2F7LIKNA2UR4GNVIWAPM6GKFA/triton_red_fused_argmax_1.ttgir @@ -0,0 +1,198 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 16], order = [1, 0]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 16], order = [0, 1]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/au/cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py":18:0) +#loc1 = loc(unknown) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/au/cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py":45:75) +#loc41 = loc("in_ptr0"(#loc)) +#loc42 = loc("out_ptr0"(#loc)) +#loc43 = loc("ks0"(#loc)) +#loc44 = loc("ks1"(#loc)) +#loc45 = loc("xnumel"(#loc)) +#loc46 = loc("r0_numel"(#loc)) +#loc79 = loc(callsite(#loc1 at #loc36)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 16 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused_argmax_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0xFF800000> : tensor<1x2048xf32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<2147483647> : tensor<1x2048xi32, #blocked> loc(#loc1) + %cst_1 = arith.constant 
dense<32000> : tensor<1x2048xi32, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32, #blocked> loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %c32000_i32 = arith.constant 32000 : i32 loc(#loc1) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc1) + %cst_3 = arith.constant dense : tensor<1x2048xi1, #blocked> loc(#loc1) + %true = arith.constant true loc(#loc1) + %c32000_i64 = arith.constant 32000 : i64 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc47) + %xmask = arith.cmpi slt, %xoffset, %xnumel : i32 loc(#loc48) + %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc49) + %r0_base_4 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x2048xi32, #blocked> loc(#loc49) + %x0 = arith.extsi %xoffset : i32 to i64 loc(#loc50) + %x0_5 = arith.remsi %x0, %ks0 : i64 loc(#loc50) + %x1 = arith.divsi %x0, %ks0 : i64 loc(#loc51) + %tmp0 = arith.muli %x0_5, %c32000_i64 : i64 loc(#loc52) + %tmp0_6 = tt.splat %tmp0 : i64 -> tensor<1x2048xi64, #blocked> loc(#loc81) + %tmp0_7 = arith.muli %ks1, %x1 : i64 loc(#loc54) + %tmp0_8 = tt.splat %tmp0_7 : i64 -> tensor<1x2048xi64, #blocked> loc(#loc82) + %tmp0_9 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr, #blocked> loc(#loc56) + %tmp0_10 = tt.splat %xmask : i1 -> tensor<1x2048xi1, #blocked> loc(#loc83) + %_tmp2_index:2 = scf.for %r0_offset = %c0_i32 to %c32000_i32 step %c2048_i32 iter_args(%_tmp2 = %cst, %_tmp2_index_11 = %cst_0) -> (tensor<1x2048xf32, #blocked>, tensor<1x2048xi32, #blocked>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32, #blocked> loc(#loc59) + %r0_index_12 = arith.addi %r0_index, %r0_base_4 : tensor<1x2048xi32, #blocked> loc(#loc59) + %r0_mask = arith.cmpi slt, %r0_index_12, %cst_1 : tensor<1x2048xi32, #blocked> loc(#loc60) + %tmp0_13 = arith.extsi %r0_index_12 : tensor<1x2048xi32, #blocked> to 
tensor<1x2048xi64, #blocked> loc(#loc53) + %tmp0_14 = arith.addi %tmp0_13, %tmp0_6 : tensor<1x2048xi64, #blocked> loc(#loc53) + %tmp0_15 = arith.addi %tmp0_14, %tmp0_8 : tensor<1x2048xi64, #blocked> loc(#loc55) + %tmp0_16 = tt.addptr %tmp0_9, %tmp0_15 : tensor<1x2048x!tt.ptr, #blocked>, tensor<1x2048xi64, #blocked> loc(#loc56) + %tmp0_17 = arith.andi %r0_mask, %tmp0_10 : tensor<1x2048xi1, #blocked> loc(#loc57) + %tmp0_18 = tt.load %tmp0_16, %tmp0_17, %cst_2 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr, #blocked> loc(#loc61) + %mask = arith.cmpf ogt, %_tmp2, %tmp0_18 : tensor<1x2048xf32, #blocked> loc(#loc107) + %equal = arith.cmpf oeq, %_tmp2, %tmp0_18 : tensor<1x2048xf32, #blocked> loc(#loc108) + %a_isnan = arith.cmpf une, %_tmp2, %_tmp2 : tensor<1x2048xf32, #blocked> loc(#loc87) + %b_isnan = arith.cmpf une, %tmp0_18, %tmp0_18 : tensor<1x2048xf32, #blocked> loc(#loc88) + %mask_19 = arith.xori %b_isnan, %cst_3 : tensor<1x2048xi1, #blocked> loc(#loc89) + %mask_20 = arith.andi %a_isnan, %mask_19 : tensor<1x2048xi1, #blocked> loc(#loc90) + %mask_21 = arith.ori %mask, %mask_20 : tensor<1x2048xi1, #blocked> loc(#loc109) + %equal_22 = arith.andi %a_isnan, %b_isnan : tensor<1x2048xi1, #blocked> loc(#loc92) + %equal_23 = arith.ori %equal, %equal_22 : tensor<1x2048xi1, #blocked> loc(#loc110) + %mask_24 = arith.cmpi slt, %_tmp2_index_11, %r0_index_12 : tensor<1x2048xi32, #blocked> loc(#loc94) + %mask_25 = arith.andi %equal_23, %mask_24 : tensor<1x2048xi1, #blocked> loc(#loc95) + %mask_26 = arith.ori %mask_21, %mask_25 : tensor<1x2048xi1, #blocked> loc(#loc96) + %6 = arith.select %mask_26, %_tmp2, %tmp0_18 : tensor<1x2048xi1, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc74) + %7 = arith.select %mask_26, %_tmp2_index_11, %r0_index_12 : tensor<1x2048xi1, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc75) + %_tmp2_27 = arith.select %tmp0_17, %6, %_tmp2 : tensor<1x2048xi1, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc76) + %_tmp2_index_28 = arith.select %tmp0_17, 
%7, %_tmp2_index_11 : tensor<1x2048xi1, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc77) + scf.yield %_tmp2_27, %_tmp2_index_28 : tensor<1x2048xf32, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc34) + } loc(#loc84) + %0:2 = "tt.reduce"(%_tmp2_index#0, %_tmp2_index#1) <{axis = 1 : i32}> ({ + ^bb0(%arg6: f32 loc(callsite(#loc1 at #loc36)), %arg7: i32 loc(callsite(#loc1 at #loc36)), %arg8: f32 loc(callsite(#loc1 at #loc36)), %arg9: i32 loc(callsite(#loc1 at #loc36))): + %mask = arith.cmpf ogt, %arg6, %arg8 : f32 loc(#loc111) + %equal = arith.cmpf oeq, %arg6, %arg8 : f32 loc(#loc112) + %a_isnan = arith.cmpf une, %arg6, %arg6 : f32 loc(#loc97) + %b_isnan = arith.cmpf une, %arg8, %arg8 : f32 loc(#loc98) + %mask_11 = arith.xori %b_isnan, %true : i1 loc(#loc99) + %mask_12 = arith.andi %a_isnan, %mask_11 : i1 loc(#loc100) + %mask_13 = arith.ori %mask, %mask_12 : i1 loc(#loc113) + %equal_14 = arith.andi %a_isnan, %b_isnan : i1 loc(#loc101) + %equal_15 = arith.ori %equal, %equal_14 : i1 loc(#loc114) + %mask_16 = arith.cmpi slt, %arg7, %arg9 : i32 loc(#loc102) + %mask_17 = arith.andi %equal_15, %mask_16 : i1 loc(#loc103) + %mask_18 = arith.ori %mask_13, %mask_17 : i1 loc(#loc104) + %6 = arith.select %mask_18, %arg6, %arg8 : f32 loc(#loc105) + %7 = arith.select %mask_18, %arg7, %arg9 : i32 loc(#loc106) + tt.reduce.return %6, %7 : f32, i32 loc(#loc78) + }) : (tensor<1x2048xf32, #blocked>, tensor<1x2048xi32, #blocked>) -> (tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>>, tensor<1xi32, #ttg.slice<{dim = 1, parent = #blocked}>>) loc(#loc78) + %tmp2 = tt.expand_dims %0#1 {axis = 1 : i32} : tensor<1xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1x1xi32, #blocked> loc(#loc80) + %1 = tt.addptr %out_ptr0, %xoffset : !tt.ptr, i32 loc(#loc38) + %2 = ttg.convert_layout %tmp2 : tensor<1x1xi32, #blocked> -> tensor<1x1xi32, #blocked1> loc(#loc39) + %3 = arith.extsi %2 : tensor<1x1xi32, #blocked1> to tensor<1x1xi64, #blocked1> loc(#loc39) + %4 = tt.splat %1 : !tt.ptr -> 
tensor<1x1x!tt.ptr, #blocked1> loc(#loc39) + %5 = tt.splat %xmask : i1 -> tensor<1x1xi1, #blocked1> loc(#loc39) + tt.store %4, %3, %5 : tensor<1x1x!tt.ptr, #blocked1> loc(#loc39) + tt.return loc(#loc40) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/au/cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py":22:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/au/cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py":24:21) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/au/cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py":25:37) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/au/cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py":27:19) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/au/cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py":28:19) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/au/cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py":38:47) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/au/cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py":38:41) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/au/cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py":38:56) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/au/cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py":38:52) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/au/cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py":38:34) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/au/cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py":38:71) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/au/cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py":32:40) +#loc14 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/au/cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py":33:31) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/au/cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py":34:29) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/au/cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py":38:61) +#loc17 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":144:21) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/au/cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py":41:38) +#loc19 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":145:23) +#loc20 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":147:29) +#loc21 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":148:29) +#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:31) +#loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:27) +#loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:16) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:27) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:17) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:31) +#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:21) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:12) +#loc30 = 
loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:35) +#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:69) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/au/cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py":43:54) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/au/cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py":44:66) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/au/cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py":44:8) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:42) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/au/cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py":46:20) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/au/cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py":47:25) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/au/cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py":47:36) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/au/cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py":47:4) +#loc47 = loc("xoffset"(#loc2)) +#loc48 = loc("xmask"(#loc3)) +#loc49 = loc("r0_base"(#loc4)) +#loc50 = loc("x0"(#loc5)) +#loc51 = loc("x1"(#loc6)) +#loc52 = loc("tmp0"(#loc7)) +#loc53 = loc("tmp0"(#loc8)) +#loc54 = loc("tmp0"(#loc9)) +#loc55 = loc("tmp0"(#loc10)) +#loc56 = loc("tmp0"(#loc11)) +#loc57 = loc("tmp0"(#loc12)) +#loc58 = loc("_tmp2"(#loc13)) +#loc59 = loc("r0_index"(#loc14)) +#loc60 = loc("r0_mask"(#loc15)) +#loc61 = loc("tmp0"(#loc16)) +#loc62 = loc("mask"(#loc17)) +#loc63 = loc("equal"(#loc19)) +#loc64 = loc("a_isnan"(#loc20)) +#loc65 = loc("b_isnan"(#loc21)) +#loc66 = loc("mask"(#loc22)) +#loc67 = loc("mask"(#loc23)) +#loc68 = loc("mask"(#loc24)) +#loc69 = loc("equal"(#loc25)) 
+#loc70 = loc("equal"(#loc26)) +#loc71 = loc("mask"(#loc27)) +#loc72 = loc("mask"(#loc28)) +#loc73 = loc("mask"(#loc29)) +#loc74 = loc(callsite(#loc30 at #loc18)) +#loc75 = loc(callsite(#loc31 at #loc18)) +#loc76 = loc("_tmp2"(#loc32)) +#loc77 = loc("_tmp2_index"(#loc33)) +#loc78 = loc(callsite(#loc35 at #loc36)) +#loc80 = loc("tmp2"(#loc37)) +#loc81 = loc(fused[#loc53, #loc52]) +#loc82 = loc(fused[#loc55, #loc54]) +#loc83 = loc(fused[#loc57, #loc48]) +#loc84 = loc("_tmp2_index"(#loc58)) +#loc85 = loc("mask"(#loc62)) +#loc86 = loc("equal"(#loc63)) +#loc87 = loc(callsite(#loc64 at #loc18)) +#loc88 = loc(callsite(#loc65 at #loc18)) +#loc89 = loc(callsite(#loc66 at #loc18)) +#loc90 = loc(callsite(#loc67 at #loc18)) +#loc91 = loc("mask"(#loc68)) +#loc92 = loc(callsite(#loc69 at #loc18)) +#loc93 = loc("equal"(#loc70)) +#loc94 = loc(callsite(#loc71 at #loc18)) +#loc95 = loc(callsite(#loc72 at #loc18)) +#loc96 = loc(callsite(#loc73 at #loc18)) +#loc97 = loc(callsite(#loc64 at #loc78)) +#loc98 = loc(callsite(#loc65 at #loc78)) +#loc99 = loc(callsite(#loc66 at #loc78)) +#loc100 = loc(callsite(#loc67 at #loc78)) +#loc101 = loc(callsite(#loc69 at #loc78)) +#loc102 = loc(callsite(#loc71 at #loc78)) +#loc103 = loc(callsite(#loc72 at #loc78)) +#loc104 = loc(callsite(#loc73 at #loc78)) +#loc105 = loc(callsite(#loc30 at #loc78)) +#loc106 = loc(callsite(#loc31 at #loc78)) +#loc107 = loc(callsite(#loc85 at #loc18)) +#loc108 = loc(callsite(#loc86 at #loc18)) +#loc109 = loc(callsite(#loc91 at #loc18)) +#loc110 = loc(callsite(#loc93 at #loc18)) +#loc111 = loc(callsite(#loc85 at #loc78)) +#loc112 = loc(callsite(#loc86 at #loc78)) +#loc113 = loc(callsite(#loc91 at #loc78)) +#loc114 = loc(callsite(#loc93 at #loc78)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/VBVRCEQLKQI4X4GYXD4JC6UEYZT2F7LIKNA2UR4GNVIWAPM6GKFA/triton_red_fused_argmax_1.ttir b/SpecForge-ext/cache/compiled_kernels/triton/7/VBVRCEQLKQI4X4GYXD4JC6UEYZT2F7LIKNA2UR4GNVIWAPM6GKFA/triton_red_fused_argmax_1.ttir 
new file mode 100644 index 0000000000000000000000000000000000000000..c4aa3db00df2af79c1b204746509439f38a584c9 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/VBVRCEQLKQI4X4GYXD4JC6UEYZT2F7LIKNA2UR4GNVIWAPM6GKFA/triton_red_fused_argmax_1.ttir @@ -0,0 +1,201 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/au/cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py":18:0) +#loc1 = loc(unknown) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/au/cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py":45:75) +#loc44 = loc("in_ptr0"(#loc)) +#loc45 = loc("out_ptr0"(#loc)) +#loc46 = loc("ks0"(#loc)) +#loc47 = loc("ks1"(#loc)) +#loc48 = loc("xnumel"(#loc)) +#loc49 = loc("r0_numel"(#loc)) +#loc50 = loc(callsite(#loc1 at #loc2)) +module { + tt.func public @triton_red_fused_argmax_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %c32000_i64 = arith.constant 32000 : i64 loc(#loc1) + %true = arith.constant true loc(#loc50) + %cst = arith.constant dense : tensor<1x2048xi1> loc(#loc1) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc3) + %c32000_i32 = arith.constant 32000 : i32 loc(#loc3) + %c0_i32 = arith.constant 0 : i32 loc(#loc3) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc1) + %cst_1 = arith.constant dense<32000> : tensor<1x2048xi32> loc(#loc1) + %_tmp2_index = arith.constant dense<2147483647> : tensor<1x2048xi32> loc(#loc51) + %_tmp2 = arith.constant dense<0xFF800000> : tensor<1x2048xf32> loc(#loc52) + %xoffset = tt.get_program_id x : i32 loc(#loc53) + %xmask = arith.cmpi slt, %xoffset, %xnumel : i32 loc(#loc54) + %xmask_2 = tt.splat %xmask : i1 -> tensor<1x1xi1> loc(#loc54) + %r0_base = tt.make_range {end 
= 2048 : i32, start = 0 : i32} : tensor<2048xi32> loc(#loc55) + %r0_base_3 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32> -> tensor<1x2048xi32> loc(#loc56) + %x0 = arith.extsi %xoffset : i32 to i64 loc(#loc57) + %x0_4 = arith.remsi %x0, %ks0 : i64 loc(#loc57) + %x1 = arith.divsi %x0, %ks0 : i64 loc(#loc58) + %_tmp2_index_5:2 = scf.for %r0_offset = %c0_i32 to %c32000_i32 step %c2048_i32 iter_args(%_tmp2_6 = %_tmp2, %_tmp2_index_7 = %_tmp2_index) -> (tensor<1x2048xf32>, tensor<1x2048xi32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc60) + %r0_index_8 = arith.addi %r0_index, %r0_base_3 : tensor<1x2048xi32> loc(#loc60) + %r0_mask = arith.cmpi slt, %r0_index_8, %cst_1 : tensor<1x2048xi32> loc(#loc61) + %tmp0 = arith.muli %x0_4, %c32000_i64 : i64 loc(#loc62) + %tmp0_9 = arith.extsi %r0_index_8 : tensor<1x2048xi32> to tensor<1x2048xi64> loc(#loc63) + %tmp0_10 = tt.splat %tmp0 : i64 -> tensor<1x2048xi64> loc(#loc88) + %tmp0_11 = arith.addi %tmp0_9, %tmp0_10 : tensor<1x2048xi64> loc(#loc63) + %tmp0_12 = arith.muli %ks1, %x1 : i64 loc(#loc64) + %tmp0_13 = tt.splat %tmp0_12 : i64 -> tensor<1x2048xi64> loc(#loc89) + %tmp0_14 = arith.addi %tmp0_11, %tmp0_13 : tensor<1x2048xi64> loc(#loc65) + %tmp0_15 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc66) + %tmp0_16 = tt.addptr %tmp0_15, %tmp0_14 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi64> loc(#loc66) + %tmp0_17 = tt.splat %xmask : i1 -> tensor<1x2048xi1> loc(#loc90) + %tmp0_18 = arith.andi %r0_mask, %tmp0_17 : tensor<1x2048xi1> loc(#loc67) + %tmp0_19 = tt.load %tmp0_16, %tmp0_18, %cst_0 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr> loc(#loc68) + %mask = arith.cmpf ogt, %_tmp2_6, %tmp0_19 : tensor<1x2048xf32> loc(#loc113) + %equal = arith.cmpf oeq, %_tmp2_6, %tmp0_19 : tensor<1x2048xf32> loc(#loc114) + %a_isnan = arith.cmpf une, %_tmp2_6, %_tmp2_6 : tensor<1x2048xf32> loc(#loc93) + %b_isnan = arith.cmpf une, %tmp0_19, %tmp0_19 : tensor<1x2048xf32> loc(#loc94) 
+ %mask_20 = arith.xori %b_isnan, %cst : tensor<1x2048xi1> loc(#loc95) + %mask_21 = arith.andi %a_isnan, %mask_20 : tensor<1x2048xi1> loc(#loc96) + %mask_22 = arith.ori %mask, %mask_21 : tensor<1x2048xi1> loc(#loc115) + %equal_23 = arith.andi %a_isnan, %b_isnan : tensor<1x2048xi1> loc(#loc98) + %equal_24 = arith.ori %equal, %equal_23 : tensor<1x2048xi1> loc(#loc116) + %mask_25 = arith.cmpi slt, %_tmp2_index_7, %r0_index_8 : tensor<1x2048xi32> loc(#loc100) + %mask_26 = arith.andi %equal_24, %mask_25 : tensor<1x2048xi1> loc(#loc101) + %mask_27 = arith.ori %mask_22, %mask_26 : tensor<1x2048xi1> loc(#loc102) + %4 = arith.select %mask_27, %_tmp2_6, %tmp0_19 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc81) + %5 = arith.select %mask_27, %_tmp2_index_7, %r0_index_8 : tensor<1x2048xi1>, tensor<1x2048xi32> loc(#loc82) + %_tmp2_28 = arith.select %tmp0_18, %4, %_tmp2_6 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc83) + %_tmp2_index_29 = arith.select %tmp0_18, %5, %_tmp2_index_7 : tensor<1x2048xi1>, tensor<1x2048xi32> loc(#loc84) + scf.yield %_tmp2_28, %_tmp2_index_29 : tensor<1x2048xf32>, tensor<1x2048xi32> loc(#loc38) + } loc(#loc87) + %0:2 = "tt.reduce"(%_tmp2_index_5#0, %_tmp2_index_5#1) <{axis = 1 : i32}> ({ + ^bb0(%arg6: f32 loc(callsite(#loc1 at #loc2)), %arg7: i32 loc(callsite(#loc1 at #loc2)), %arg8: f32 loc(callsite(#loc1 at #loc2)), %arg9: i32 loc(callsite(#loc1 at #loc2))): + %mask = arith.cmpf ogt, %arg6, %arg8 : f32 loc(#loc117) + %equal = arith.cmpf oeq, %arg6, %arg8 : f32 loc(#loc118) + %a_isnan = arith.cmpf une, %arg6, %arg6 : f32 loc(#loc103) + %b_isnan = arith.cmpf une, %arg8, %arg8 : f32 loc(#loc104) + %mask_6 = arith.xori %b_isnan, %true : i1 loc(#loc105) + %mask_7 = arith.andi %a_isnan, %mask_6 : i1 loc(#loc106) + %mask_8 = arith.ori %mask, %mask_7 : i1 loc(#loc119) + %equal_9 = arith.andi %a_isnan, %b_isnan : i1 loc(#loc107) + %equal_10 = arith.ori %equal, %equal_9 : i1 loc(#loc120) + %mask_11 = arith.cmpi slt, %arg7, %arg9 : i32 loc(#loc108) + 
%mask_12 = arith.andi %equal_10, %mask_11 : i1 loc(#loc109) + %mask_13 = arith.ori %mask_8, %mask_12 : i1 loc(#loc110) + %4 = arith.select %mask_13, %arg6, %arg8 : f32 loc(#loc111) + %5 = arith.select %mask_13, %arg7, %arg9 : i32 loc(#loc112) + tt.reduce.return %4, %5 : f32, i32 loc(#loc85) + }) : (tensor<1x2048xf32>, tensor<1x2048xi32>) -> (tensor<1xf32>, tensor<1xi32>) loc(#loc85) + %tmp2 = tt.expand_dims %0#1 {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc86) + %1 = tt.addptr %out_ptr0, %xoffset : !tt.ptr, i32 loc(#loc41) + %2 = tt.splat %1 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc41) + %3 = arith.extsi %tmp2 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc42) + tt.store %2, %3, %xmask_2 : tensor<1x1x!tt.ptr> loc(#loc42) + tt.return loc(#loc43) + } loc(#loc) +} loc(#loc) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/au/cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py":32:40) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/au/cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py":30:58) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/au/cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py":29:55) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/au/cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py":22:28) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/au/cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py":24:21) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/au/cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py":25:27) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/au/cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py":25:37) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/au/cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py":27:19) +#loc11 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/au/cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py":28:19) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/au/cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py":33:31) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/au/cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py":34:29) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/au/cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py":38:47) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/au/cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py":38:41) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/au/cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py":38:56) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/au/cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py":38:52) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/au/cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py":38:34) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/au/cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py":38:71) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/au/cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py":38:61) +#loc21 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":144:21) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/au/cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py":41:38) +#loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":145:23) +#loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":147:29) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":148:29) +#loc26 = 
loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:31) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:27) +#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:16) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:27) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:17) +#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:31) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:21) +#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:12) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:35) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:69) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/au/cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py":43:54) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/au/cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py":44:66) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/au/cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py":44:8) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:42) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/au/cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py":46:20) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/au/cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py":47:25) +#loc42 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/au/cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py":47:36) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/au/cauddh2isu64qxlwcjjvrsvfei7kmdbnntpqztg7pynidnbkvudq.py":47:4) +#loc51 = loc("_tmp2_index"(#loc4)) +#loc52 = loc("_tmp2"(#loc5)) +#loc53 = loc("xoffset"(#loc6)) +#loc54 = loc("xmask"(#loc7)) +#loc55 = loc("r0_base"(#loc8)) +#loc56 = loc("r0_base"(#loc9)) +#loc57 = loc("x0"(#loc10)) +#loc58 = loc("x1"(#loc11)) +#loc59 = loc("_tmp2"(#loc3)) +#loc60 = loc("r0_index"(#loc12)) +#loc61 = loc("r0_mask"(#loc13)) +#loc62 = loc("tmp0"(#loc14)) +#loc63 = loc("tmp0"(#loc15)) +#loc64 = loc("tmp0"(#loc16)) +#loc65 = loc("tmp0"(#loc17)) +#loc66 = loc("tmp0"(#loc18)) +#loc67 = loc("tmp0"(#loc19)) +#loc68 = loc("tmp0"(#loc20)) +#loc69 = loc("mask"(#loc21)) +#loc70 = loc("equal"(#loc23)) +#loc71 = loc("a_isnan"(#loc24)) +#loc72 = loc("b_isnan"(#loc25)) +#loc73 = loc("mask"(#loc26)) +#loc74 = loc("mask"(#loc27)) +#loc75 = loc("mask"(#loc28)) +#loc76 = loc("equal"(#loc29)) +#loc77 = loc("equal"(#loc30)) +#loc78 = loc("mask"(#loc31)) +#loc79 = loc("mask"(#loc32)) +#loc80 = loc("mask"(#loc33)) +#loc81 = loc(callsite(#loc34 at #loc22)) +#loc82 = loc(callsite(#loc35 at #loc22)) +#loc83 = loc("_tmp2"(#loc36)) +#loc84 = loc("_tmp2_index"(#loc37)) +#loc85 = loc(callsite(#loc39 at #loc2)) +#loc86 = loc("tmp2"(#loc40)) +#loc87 = loc("_tmp2_index"(#loc59)) +#loc88 = loc(fused[#loc63, #loc62]) +#loc89 = loc(fused[#loc65, #loc64]) +#loc90 = loc(fused[#loc67, #loc54]) +#loc91 = loc("mask"(#loc69)) +#loc92 = loc("equal"(#loc70)) +#loc93 = loc(callsite(#loc71 at #loc22)) +#loc94 = loc(callsite(#loc72 at #loc22)) +#loc95 = loc(callsite(#loc73 at #loc22)) +#loc96 = loc(callsite(#loc74 at #loc22)) +#loc97 = loc("mask"(#loc75)) +#loc98 = loc(callsite(#loc76 at #loc22)) +#loc99 = loc("equal"(#loc77)) +#loc100 = loc(callsite(#loc78 at #loc22)) +#loc101 = loc(callsite(#loc79 at #loc22)) +#loc102 = 
loc(callsite(#loc80 at #loc22)) +#loc103 = loc(callsite(#loc71 at #loc85)) +#loc104 = loc(callsite(#loc72 at #loc85)) +#loc105 = loc(callsite(#loc73 at #loc85)) +#loc106 = loc(callsite(#loc74 at #loc85)) +#loc107 = loc(callsite(#loc76 at #loc85)) +#loc108 = loc(callsite(#loc78 at #loc85)) +#loc109 = loc(callsite(#loc79 at #loc85)) +#loc110 = loc(callsite(#loc80 at #loc85)) +#loc111 = loc(callsite(#loc34 at #loc85)) +#loc112 = loc(callsite(#loc35 at #loc85)) +#loc113 = loc(callsite(#loc91 at #loc22)) +#loc114 = loc(callsite(#loc92 at #loc22)) +#loc115 = loc(callsite(#loc97 at #loc22)) +#loc116 = loc(callsite(#loc99 at #loc22)) +#loc117 = loc(callsite(#loc91 at #loc85)) +#loc118 = loc(callsite(#loc92 at #loc85)) +#loc119 = loc(callsite(#loc97 at #loc85)) +#loc120 = loc(callsite(#loc99 at #loc85)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/XRR2QXTZQK4DSBTDJUTNXO6FEFXI2IIRKSC5GYSBWLTL56SKI4WA/__grp__triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.json b/SpecForge-ext/cache/compiled_kernels/triton/7/XRR2QXTZQK4DSBTDJUTNXO6FEFXI2IIRKSC5GYSBWLTL56SKI4WA/__grp__triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.json new file mode 100644 index 0000000000000000000000000000000000000000..49a668e36dd3b3de0cfd789db4a56cdce55862ce --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/XRR2QXTZQK4DSBTDJUTNXO6FEFXI2IIRKSC5GYSBWLTL56SKI4WA/__grp__triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.json @@ -0,0 +1 @@ +{"child_paths": {"triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/XRR2QXTZQK4DSBTDJUTNXO6FEFXI2IIRKSC5GYSBWLTL56SKI4WA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.source", "triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.ttir": 
"/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/XRR2QXTZQK4DSBTDJUTNXO6FEFXI2IIRKSC5GYSBWLTL56SKI4WA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.ttir", "triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/XRR2QXTZQK4DSBTDJUTNXO6FEFXI2IIRKSC5GYSBWLTL56SKI4WA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.ttgir", "triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/XRR2QXTZQK4DSBTDJUTNXO6FEFXI2IIRKSC5GYSBWLTL56SKI4WA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.llir", "triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/XRR2QXTZQK4DSBTDJUTNXO6FEFXI2IIRKSC5GYSBWLTL56SKI4WA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.ptx", "triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/XRR2QXTZQK4DSBTDJUTNXO6FEFXI2IIRKSC5GYSBWLTL56SKI4WA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.cubin", "triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/XRR2QXTZQK4DSBTDJUTNXO6FEFXI2IIRKSC5GYSBWLTL56SKI4WA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/XRR2QXTZQK4DSBTDJUTNXO6FEFXI2IIRKSC5GYSBWLTL56SKI4WA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.cubin 
b/SpecForge-ext/cache/compiled_kernels/triton/7/XRR2QXTZQK4DSBTDJUTNXO6FEFXI2IIRKSC5GYSBWLTL56SKI4WA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.cubin new file mode 100644 index 0000000000000000000000000000000000000000..9c38d106c3ec54cdbd67e91413e6d4f072273c56 Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/7/XRR2QXTZQK4DSBTDJUTNXO6FEFXI2IIRKSC5GYSBWLTL56SKI4WA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/XRR2QXTZQK4DSBTDJUTNXO6FEFXI2IIRKSC5GYSBWLTL56SKI4WA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.json b/SpecForge-ext/cache/compiled_kernels/triton/7/XRR2QXTZQK4DSBTDJUTNXO6FEFXI2IIRKSC5GYSBWLTL56SKI4WA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.json new file mode 100644 index 0000000000000000000000000000000000000000..b8f23c2417ceaa38d431ce5344048336932539b5 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/XRR2QXTZQK4DSBTDJUTNXO6FEFXI2IIRKSC5GYSBWLTL56SKI4WA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.json @@ -0,0 +1 @@ +{"hash": "bc63a85e7982b83906634d26dbbbc5216e8d21115485d36241b2e6befa4a472c", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, 
"backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/XRR2QXTZQK4DSBTDJUTNXO6FEFXI2IIRKSC5GYSBWLTL56SKI4WA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.llir b/SpecForge-ext/cache/compiled_kernels/triton/7/XRR2QXTZQK4DSBTDJUTNXO6FEFXI2IIRKSC5GYSBWLTL56SKI4WA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.llir new file mode 100644 index 0000000000000000000000000000000000000000..54c0ac1d2d58b75d57d72d44919f5e042f0e201d --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/XRR2QXTZQK4DSBTDJUTNXO6FEFXI2IIRKSC5GYSBWLTL56SKI4WA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.llir @@ -0,0 +1,346 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@assertFunc_2 = internal constant [8 x i8] c"unknown\00" +@assertFile_2 = internal constant [114 x i8] c"/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py\00" +@assertMessage_2 = internal constant [38 x i8] c"index out of bounds: 0 <= tmp36 < ks3\00" +@assertFunc_1 = internal constant [8 x i8] c"unknown\00" +@assertFile_1 = internal constant [114 x i8] c"/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py\00" +@assertMessage_1 = internal constant [65 x i8] c"index out of bounds: 0 <= tl.broadcast_to(tmp23, [XBLOCK]) < ks2\00" +@assertFunc_0 = internal constant [8 x i8] 
c"unknown\00" +@assertFile_0 = internal constant [114 x i8] c"/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py\00" +@assertMessage_0 = internal constant [64 x i8] c"index out of bounds: 0 <= tl.broadcast_to(tmp8, [XBLOCK]) < ks2\00" + +; Function Attrs: noreturn +declare !dbg !5 void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr #0 + +define ptx_kernel void @triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, i64 %5, i64 %6, i64 %7, i64 %8, i32 %9, ptr addrspace(1) readnone captures(none) %10, ptr addrspace(1) readnone captures(none) %11) local_unnamed_addr #1 !dbg !9 { + %13 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !10 + %14 = shl i32 %13, 9, !dbg !11 + %15 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !12 + %16 = shl nuw nsw i32 %15, 1, !dbg !12 + %17 = and i32 %16, 510, !dbg !12 + %18 = or disjoint i32 %17, %14, !dbg !13 + %19 = or disjoint i32 %18, 1, !dbg !13 + %20 = icmp slt i32 %18, %9, !dbg !14 + %21 = icmp slt i32 %19, %9, !dbg !14 + %22 = sext i32 %18 to i64, !dbg !15 + %23 = sext i32 %19 to i64, !dbg !15 + %.frozen = freeze i64 %5, !dbg !16 + %24 = sdiv i64 %22, %.frozen, !dbg !16 + %25 = mul i64 %24, %.frozen, !dbg !15 + %.decomposed = sub i64 %22, %25, !dbg !15 + %.frozen25 = freeze i64 %5, !dbg !16 + %26 = sdiv i64 %23, %.frozen25, !dbg !16 + %27 = mul i64 %26, %.frozen25, !dbg !15 + %.decomposed26 = sub i64 %23, %27, !dbg !15 + %28 = srem i64 %24, %6, !dbg !17 + %29 = srem i64 %26, %6, !dbg !17 + %30 = getelementptr bfloat, ptr addrspace(1) %0, i64 %22, !dbg !18 + %31 = getelementptr bfloat, ptr addrspace(1) %0, i64 %23, !dbg !18 + %32 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !19 + %33 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 
ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %30, i64 %32, i1 %20) #4, !dbg !19 + %34 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !19 + %35 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %31, i64 %34, i1 %21) #4, !dbg !19 + %36 = getelementptr i64, ptr addrspace(1) %1, i64 %28, !dbg !20 + %37 = getelementptr i64, ptr addrspace(1) %1, i64 %29, !dbg !20 + %38 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !21 + %39 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %36, i64 %38, i1 %20) #4, !dbg !21 + %40 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !21 + %41 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %37, i64 %40, i1 %21) #4, !dbg !21 + %42 = sdiv i64 %5, 2, !dbg !22 + %43 = icmp sge i64 %.decomposed, %42, !dbg !23 + %44 = icmp sge i64 %.decomposed26, %42, !dbg !23 + %45 = sub nsw i64 %22, %42, !dbg !24 + %46 = sub nsw i64 %23, %42, !dbg !24 + %47 = getelementptr bfloat, ptr addrspace(1) %0, i64 %45, !dbg !25 + %48 = getelementptr bfloat, ptr addrspace(1) %0, i64 %46, !dbg !25 + %49 = and i1 %20, %43, !dbg !26 + %50 = and i1 %21, %44, !dbg !26 + %51 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !27 + %52 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %47, i64 %51, i1 
%49) #4, !dbg !27 + %53 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !27 + %54 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %48, i64 %53, i1 %50) #4, !dbg !27 + %55 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !28 + %56 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %36, i64 %55, i1 %49) #4, !dbg !28 + %57 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !28 + %58 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %37, i64 %57, i1 %50) #4, !dbg !28 + %59 = icmp slt i64 %56, 0, !dbg !29 + %60 = icmp slt i64 %58, 0, !dbg !29 + %61 = select i1 %59, i64 %7, i64 0, !dbg !30 + %62 = add i64 %61, %56, !dbg !30 + %63 = select i1 %60, i64 %7, i64 0, !dbg !30 + %64 = add i64 %63, %58, !dbg !30 + %65 = icmp slt i64 %62, 0, !dbg !31 + %66 = icmp slt i64 %64, 0, !dbg !31 + %67 = icmp sge i64 %62, %7, !dbg !32 + %68 = icmp sge i64 %64, %7, !dbg !32 + %.not4 = or i1 %65, %67, !dbg !33 + %.not8 = or i1 %66, %68, !dbg !34 + %.not1 = and i1 %49, %.not4, !dbg !35 + %.not5 = and i1 %50, %.not8, !dbg !36 + %69 = or i1 %.not1, %.not5, !dbg !36 + br i1 %69, label %70, label %71, !dbg !36 + +70: ; preds = %12 + tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 36, ptr nonnull @assertFunc_0, i64 1), !dbg !36 + unreachable, !dbg !36 + +71: ; preds = %12 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !36 + %72 = sub nsw i64 %.decomposed, %42, !dbg !37 + %73 = sub nsw i64 
%.decomposed26, %42, !dbg !37 + %74 = mul i64 %62, %5, !dbg !38 + %75 = mul i64 %64, %5, !dbg !38 + %76 = getelementptr bfloat, ptr addrspace(1) %2, i64 %72, !dbg !39 + %77 = getelementptr bfloat, ptr addrspace(1) %76, i64 %74, !dbg !39 + %78 = getelementptr bfloat, ptr addrspace(1) %2, i64 %73, !dbg !39 + %79 = getelementptr bfloat, ptr addrspace(1) %78, i64 %75, !dbg !39 + %80 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !40 + %81 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %77, i64 %80, i1 %49) #4, !dbg !40 + %82 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !40 + %83 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %79, i64 %82, i1 %50) #4, !dbg !40 + %84 = icmp slt i64 %.decomposed, %42, !dbg !41 + %85 = icmp slt i64 %.decomposed26, %42, !dbg !41 + %86 = add i64 %5, %22, !dbg !42 + %87 = add i64 %5, %23, !dbg !42 + %88 = sub i64 %86, %42, !dbg !43 + %89 = sub i64 %87, %42, !dbg !43 + %90 = getelementptr bfloat, ptr addrspace(1) %0, i64 %88, !dbg !44 + %91 = getelementptr bfloat, ptr addrspace(1) %0, i64 %89, !dbg !44 + %92 = and i1 %20, %84, !dbg !45 + %93 = and i1 %21, %85, !dbg !45 + %94 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !46 + %95 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %90, i64 %94, i1 %92) #4, !dbg !46 + %96 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !46 + %97 = tail call i16 asm 
sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %91, i64 %96, i1 %93) #4, !dbg !46 + %98 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !47 + %99 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %36, i64 %98, i1 %92) #4, !dbg !47 + %100 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !47 + %101 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %37, i64 %100, i1 %93) #4, !dbg !47 + %102 = icmp slt i64 %99, 0, !dbg !48 + %103 = icmp slt i64 %101, 0, !dbg !48 + %104 = select i1 %102, i64 %7, i64 0, !dbg !49 + %105 = add i64 %104, %99, !dbg !49 + %106 = select i1 %103, i64 %7, i64 0, !dbg !49 + %107 = add i64 %106, %101, !dbg !49 + %108 = icmp slt i64 %105, 0, !dbg !50 + %109 = icmp slt i64 %107, 0, !dbg !50 + %110 = icmp sge i64 %105, %7, !dbg !51 + %111 = icmp sge i64 %107, %7, !dbg !51 + %.not12 = or i1 %108, %110, !dbg !52 + %.not16 = or i1 %109, %111, !dbg !53 + %.not9 = and i1 %92, %.not12, !dbg !54 + %.not13 = and i1 %93, %.not16, !dbg !55 + %112 = or i1 %.not9, %.not13, !dbg !55 + br i1 %112, label %113, label %114, !dbg !55 + +113: ; preds = %71 + tail call void @__assertfail(ptr nonnull @assertMessage_1, ptr nonnull @assertFile_1, i32 51, ptr nonnull @assertFunc_1, i64 1), !dbg !55 + unreachable, !dbg !55 + +114: ; preds = %71 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !55 + %115 = sub i64 %5, %42, !dbg !56 + %116 = mul i64 %105, %5, !dbg !57 + %117 = mul i64 %107, %5, !dbg !57 + %118 = getelementptr bfloat, ptr addrspace(1) %2, i64 %115, !dbg !58 + %119 = getelementptr bfloat, 
ptr addrspace(1) %118, i64 %.decomposed, !dbg !58 + %120 = getelementptr bfloat, ptr addrspace(1) %119, i64 %116, !dbg !58 + %121 = getelementptr bfloat, ptr addrspace(1) %118, i64 %.decomposed26, !dbg !58 + %122 = getelementptr bfloat, ptr addrspace(1) %121, i64 %117, !dbg !58 + %123 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !59 + %124 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %120, i64 %123, i1 %92) #4, !dbg !59 + %125 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !59 + %126 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %122, i64 %125, i1 %93) #4, !dbg !59 + %127 = icmp slt i64 %39, 0, !dbg !60 + %128 = icmp slt i64 %41, 0, !dbg !60 + %129 = select i1 %127, i64 %8, i64 0, !dbg !61 + %130 = add i64 %129, %39, !dbg !61 + %131 = select i1 %128, i64 %8, i64 0, !dbg !61 + %132 = add i64 %131, %41, !dbg !61 + %133 = icmp slt i64 %130, 0, !dbg !62 + %134 = icmp slt i64 %132, 0, !dbg !62 + %135 = icmp sge i64 %130, %8, !dbg !63 + %136 = icmp sge i64 %132, %8, !dbg !63 + %.not20 = or i1 %133, %135, !dbg !64 + %.not24 = or i1 %134, %136, !dbg !65 + %.not17 = and i1 %20, %.not20, !dbg !66 + %.not21 = and i1 %21, %.not24, !dbg !67 + %137 = or i1 %.not17, %.not21, !dbg !67 + br i1 %137, label %138, label %139, !dbg !67 + +138: ; preds = %114 + tail call void @__assertfail(ptr nonnull @assertMessage_2, ptr nonnull @assertFile_2, i32 62, ptr nonnull @assertFunc_2, i64 1), !dbg !67 + unreachable, !dbg !67 + +139: ; preds = %114 + %140 = bitcast i16 %54 to bfloat, !dbg !27 + %141 = fpext bfloat %140 to float, !dbg !68 + %142 = bitcast i16 %83 to bfloat, !dbg !40 + %143 = fpext bfloat 
%142 to float, !dbg !69 + %144 = fmul float %141, %143, !dbg !70 + %145 = fsub float 0.000000e+00, %144, !dbg !71 + %146 = select i1 %44, float %145, float 0.000000e+00, !dbg !72 + %147 = bitcast i16 %97 to bfloat, !dbg !46 + %148 = fpext bfloat %147 to float, !dbg !73 + %149 = bitcast i16 %126 to bfloat, !dbg !59 + %150 = fpext bfloat %149 to float, !dbg !74 + %151 = fmul float %148, %150, !dbg !75 + %152 = select i1 %85, float %151, float 0.000000e+00, !dbg !72 + %153 = fadd float %146, %152, !dbg !76 + %154 = bitcast i16 %52 to bfloat, !dbg !27 + %155 = fpext bfloat %154 to float, !dbg !68 + %156 = bitcast i16 %81 to bfloat, !dbg !40 + %157 = fpext bfloat %156 to float, !dbg !69 + %158 = fmul float %155, %157, !dbg !70 + %159 = fsub float 0.000000e+00, %158, !dbg !71 + %160 = select i1 %43, float %159, float 0.000000e+00, !dbg !72 + %161 = bitcast i16 %95 to bfloat, !dbg !46 + %162 = fpext bfloat %161 to float, !dbg !73 + %163 = bitcast i16 %124 to bfloat, !dbg !59 + %164 = fpext bfloat %163 to float, !dbg !74 + %165 = fmul float %162, %164, !dbg !75 + %166 = select i1 %84, float %165, float 0.000000e+00, !dbg !72 + %167 = fadd float %160, %166, !dbg !76 + %168 = bitcast i16 %35 to bfloat, !dbg !19 + %169 = fpext bfloat %168 to float, !dbg !77 + %170 = bitcast i16 %33 to bfloat, !dbg !19 + %171 = fpext bfloat %170 to float, !dbg !77 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !67 + %172 = mul i64 %130, %5, !dbg !78 + %173 = mul i64 %132, %5, !dbg !78 + %174 = getelementptr bfloat, ptr addrspace(1) %3, i64 %.decomposed, !dbg !79 + %175 = getelementptr bfloat, ptr addrspace(1) %174, i64 %172, !dbg !79 + %176 = getelementptr bfloat, ptr addrspace(1) %3, i64 %.decomposed26, !dbg !79 + %177 = getelementptr bfloat, ptr addrspace(1) %176, i64 %173, !dbg !79 + %178 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !80 + %179 = tail call i16 asm sideeffect "mov.u16 $0, 
0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %175, i64 %178, i1 %20) #4, !dbg !80 + %180 = bitcast i16 %179 to bfloat, !dbg !80 + %181 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !80 + %182 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %177, i64 %181, i1 %21) #4, !dbg !80 + %183 = bitcast i16 %182 to bfloat, !dbg !80 + %184 = fpext bfloat %180 to float, !dbg !81 + %185 = fpext bfloat %183 to float, !dbg !81 + %186 = fmul float %171, %184, !dbg !82 + %187 = fmul float %169, %185, !dbg !82 + %188 = fadd float %167, %186, !dbg !83 + %189 = fadd float %153, %187, !dbg !83 + %190 = getelementptr bfloat, ptr addrspace(1) %4, i64 %22, !dbg !84 + %191 = getelementptr bfloat, ptr addrspace(1) %4, i64 %23, !dbg !84 + %192 = fptrunc float %188 to bfloat, !dbg !85 + %193 = fptrunc float %189 to bfloat, !dbg !85 + %194 = bitcast bfloat %192 to i16, !dbg !85 + tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %194, ptr addrspace(1) %190, i1 %20) #4, !dbg !85 + %195 = bitcast bfloat %193 to i16, !dbg !85 + tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %195, ptr addrspace(1) %191, i1 %21) #4, !dbg !85 + ret void, !dbg !86 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #2 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3 + +attributes #0 = { noreturn } +attributes #1 = { 
"nvvm.reqntid"="256" } +attributes #2 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #3 = { convergent nocallback nounwind } +attributes #4 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} +!llvm.ident = !{!4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!5 = !DISubprogram(name: "__assertfail", linkageName: "__assertfail", scope: !6, file: !6, type: !7, spFlags: DISPFlagOptimized) +!6 = !DIFile(filename: "", directory: "") +!7 = !DISubroutineType(cc: DW_CC_normal, types: !8) +!8 = !{} +!9 = distinct !DISubprogram(name: "triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0", linkageName: "triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0", scope: !1, file: !1, line: 18, type: !7, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!10 = !DILocation(line: 19, column: 28, scope: !9) +!11 = !DILocation(line: 19, column: 33, scope: !9) +!12 = !DILocation(line: 20, column: 36, scope: !9) +!13 = !DILocation(line: 20, column: 23, scope: !9) +!14 = !DILocation(line: 21, column: 21, scope: !9) +!15 = !DILocation(line: 22, column: 19, scope: !9) +!16 = !DILocation(line: 24, column: 21, scope: !9) +!17 = !DILocation(line: 24, column: 28, scope: !9) +!18 = !DILocation(line: 25, column: 31, scope: !9) +!19 = !DILocation(line: 25, column: 36, scope: !9) +!20 = !DILocation(line: 26, column: 31, scope: !9) +!21 = !DILocation(line: 26, column: 36, scope: !9) +!22 = !DILocation(line: 28, column: 18, scope: !9) +!23 = !DILocation(line: 29, 
column: 19, scope: !9) +!24 = !DILocation(line: 30, column: 35, scope: !9) +!25 = !DILocation(line: 30, column: 30, scope: !9) +!26 = !DILocation(line: 30, column: 60, scope: !9) +!27 = !DILocation(line: 30, column: 53, scope: !9) +!28 = !DILocation(line: 31, column: 35, scope: !9) +!29 = !DILocation(line: 34, column: 18, scope: !9) +!30 = !DILocation(line: 35, column: 32, scope: !9) +!31 = !DILocation(line: 36, column: 28, scope: !9) +!32 = !DILocation(line: 36, column: 98, scope: !9) +!33 = !DILocation(line: 36, column: 64, scope: !9) +!34 = !DILocation(line: 36, column: 108, scope: !9) +!35 = !DILocation(line: 36, column: 106, scope: !9) +!36 = !DILocation(line: 36, column: 123, scope: !9) +!37 = !DILocation(line: 37, column: 36, scope: !9) +!38 = !DILocation(line: 37, column: 58, scope: !9) +!39 = !DILocation(line: 37, column: 31, scope: !9) +!40 = !DILocation(line: 37, column: 65, scope: !9) +!41 = !DILocation(line: 44, column: 19, scope: !9) +!42 = !DILocation(line: 45, column: 37, scope: !9) +!43 = !DILocation(line: 45, column: 42, scope: !9) +!44 = !DILocation(line: 45, column: 31, scope: !9) +!45 = !DILocation(line: 45, column: 68, scope: !9) +!46 = !DILocation(line: 45, column: 60, scope: !9) +!47 = !DILocation(line: 46, column: 36, scope: !9) +!48 = !DILocation(line: 49, column: 20, scope: !9) +!49 = !DILocation(line: 50, column: 35, scope: !9) +!50 = !DILocation(line: 51, column: 28, scope: !9) +!51 = !DILocation(line: 51, column: 100, scope: !9) +!52 = !DILocation(line: 51, column: 65, scope: !9) +!53 = !DILocation(line: 51, column: 110, scope: !9) +!54 = !DILocation(line: 51, column: 108, scope: !9) +!55 = !DILocation(line: 51, column: 126, scope: !9) +!56 = !DILocation(line: 52, column: 37, scope: !9) +!57 = !DILocation(line: 52, column: 64, scope: !9) +!58 = !DILocation(line: 52, column: 31, scope: !9) +!59 = !DILocation(line: 52, column: 72, scope: !9) +!60 = !DILocation(line: 60, column: 20, scope: !9) +!61 = !DILocation(line: 61, column: 35, 
scope: !9) +!62 = !DILocation(line: 62, column: 28, scope: !9) +!63 = !DILocation(line: 62, column: 46, scope: !9) +!64 = !DILocation(line: 62, column: 38, scope: !9) +!65 = !DILocation(line: 62, column: 56, scope: !9) +!66 = !DILocation(line: 62, column: 54, scope: !9) +!67 = !DILocation(line: 62, column: 64, scope: !9) +!68 = !DILocation(line: 30, column: 111, scope: !9) +!69 = !DILocation(line: 37, column: 123, scope: !9) +!70 = !DILocation(line: 38, column: 19, scope: !9) +!71 = !DILocation(line: 39, column: 13, scope: !9) +!72 = !DILocation(line: 0, scope: !9) +!73 = !DILocation(line: 45, column: 119, scope: !9) +!74 = !DILocation(line: 52, column: 131, scope: !9) +!75 = !DILocation(line: 53, column: 20, scope: !9) +!76 = !DILocation(line: 57, column: 20, scope: !9) +!77 = !DILocation(line: 25, column: 76, scope: !9) +!78 = !DILocation(line: 63, column: 40, scope: !9) +!79 = !DILocation(line: 63, column: 31, scope: !9) +!80 = !DILocation(line: 63, column: 48, scope: !9) +!81 = !DILocation(line: 63, column: 88, scope: !9) +!82 = !DILocation(line: 64, column: 20, scope: !9) +!83 = !DILocation(line: 65, column: 20, scope: !9) +!84 = !DILocation(line: 66, column: 25, scope: !9) +!85 = !DILocation(line: 66, column: 37, scope: !9) +!86 = !DILocation(line: 66, column: 4, scope: !9) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/XRR2QXTZQK4DSBTDJUTNXO6FEFXI2IIRKSC5GYSBWLTL56SKI4WA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.ptx b/SpecForge-ext/cache/compiled_kernels/triton/7/XRR2QXTZQK4DSBTDJUTNXO6FEFXI2IIRKSC5GYSBWLTL56SKI4WA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.ptx new file mode 100644 index 0000000000000000000000000000000000000000..ad480308f0272ddfe2517e7cccab014eeb388456 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/XRR2QXTZQK4DSBTDJUTNXO6FEFXI2IIRKSC5GYSBWLTL56SKI4WA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.ptx @@ -0,0 +1,759 
@@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0 // -- Begin function triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0 +.extern .func __assertfail +( + .param .b64 __assertfail_param_0, + .param .b64 __assertfail_param_1, + .param .b32 __assertfail_param_2, + .param .b64 __assertfail_param_3, + .param .b64 __assertfail_param_4 +) +.noreturn; +.global .align 1 .b8 assertFunc_2[8] = {117, 110, 107, 110, 111, 119, 110}; +.global .align 1 .b8 assertFile_2[114] = {47, 119, 111, 114, 107, 115, 112, 97, 99, 101, 47, 104, 97, 110, 114, 117, 105, 47, 83, 112, 101, 99, 70, 111, 114, 103, 101, 45, 101, 120, 116, 47, 99, 97, 99, 104, 101, 47, 99, 111, 109, 112, 105, 108, 101, 100, 95, 107, 101, 114, 110, 101, 108, 115, 47, 100, 102, 47, 99, 100, 102, 98, 54, 99, 103, 101, 110, 122, 115, 106, 117, 53, 99, 113, 118, 121, 52, 50, 52, 52, 120, 104, 52, 120, 105, 100, 110, 105, 121, 101, 122, 110, 118, 107, 117, 98, 118, 100, 103, 50, 109, 103, 54, 100, 53, 111, 99, 54, 120, 116, 46, 112, 121}; +.global .align 1 .b8 assertMessage_2[38] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 51, 54, 32, 60, 32, 107, 115, 51}; +.global .align 1 .b8 assertFunc_1[8] = {117, 110, 107, 110, 111, 119, 110}; +.global .align 1 .b8 assertFile_1[114] = {47, 119, 111, 114, 107, 115, 112, 97, 99, 101, 47, 104, 97, 110, 114, 117, 105, 47, 83, 112, 101, 99, 70, 111, 114, 103, 101, 45, 101, 120, 116, 47, 99, 97, 99, 104, 101, 47, 99, 111, 109, 112, 105, 108, 101, 100, 95, 107, 101, 114, 110, 101, 108, 115, 47, 100, 102, 47, 99, 100, 102, 98, 54, 99, 103, 101, 110, 122, 115, 106, 117, 53, 99, 113, 118, 121, 52, 50, 52, 52, 120, 104, 52, 120, 105, 100, 110, 105, 121, 101, 122, 110, 118, 107, 117, 98, 118, 100, 103, 50, 109, 103, 54, 100, 53, 111, 99, 54, 
120, 116, 46, 112, 121}; +.global .align 1 .b8 assertMessage_1[65] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 108, 46, 98, 114, 111, 97, 100, 99, 97, 115, 116, 95, 116, 111, 40, 116, 109, 112, 50, 51, 44, 32, 91, 88, 66, 76, 79, 67, 75, 93, 41, 32, 60, 32, 107, 115, 50}; +.global .align 1 .b8 assertFunc_0[8] = {117, 110, 107, 110, 111, 119, 110}; +.global .align 1 .b8 assertFile_0[114] = {47, 119, 111, 114, 107, 115, 112, 97, 99, 101, 47, 104, 97, 110, 114, 117, 105, 47, 83, 112, 101, 99, 70, 111, 114, 103, 101, 45, 101, 120, 116, 47, 99, 97, 99, 104, 101, 47, 99, 111, 109, 112, 105, 108, 101, 100, 95, 107, 101, 114, 110, 101, 108, 115, 47, 100, 102, 47, 99, 100, 102, 98, 54, 99, 103, 101, 110, 122, 115, 106, 117, 53, 99, 113, 118, 121, 52, 50, 52, 52, 120, 104, 52, 120, 105, 100, 110, 105, 121, 101, 122, 110, 118, 107, 117, 98, 118, 100, 103, 50, 109, 103, 54, 100, 53, 111, 99, 54, 120, 116, 46, 112, 121}; +.global .align 1 .b8 assertMessage_0[64] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 108, 46, 98, 114, 111, 97, 100, 99, 97, 115, 116, 95, 116, 111, 40, 116, 109, 112, 56, 44, 32, 91, 88, 66, 76, 79, 67, 75, 93, 41, 32, 60, 32, 107, 115, 50}; + // @triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0 +.visible .entry triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0( + .param .u64 .ptr .global .align 1 triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_0, + .param .u64 .ptr .global .align 1 triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_1, + .param .u64 .ptr .global .align 1 triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_2, + .param .u64 .ptr .global .align 1 triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_3, + 
.param .u64 .ptr .global .align 1 triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_4, + .param .u64 triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_5, + .param .u64 triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_6, + .param .u64 triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_7, + .param .u64 triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_8, + .param .u32 triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_9, + .param .u64 .ptr .global .align 1 triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_10, + .param .u64 .ptr .global .align 1 triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_11 +) +.reqntid 256 +{ + .reg .pred %p<71>; + .reg .b16 %rs<33>; + .reg .b32 %r<57>; + .reg .b64 %rd<189>; + .loc 1 18 0 // cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py:18:0 + +// %bb.0: + ld.param.b64 %rd40, [triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_5]; +$L__tmp0: + .loc 1 19 28 // cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py:19:28 + mov.u32 %r2, %ctaid.x; + .loc 1 19 33 // cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py:19:33 + shl.b32 %r3, %r2, 9; + .loc 1 20 36 // cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py:20:36 + mov.u32 %r4, %tid.x; + shl.b32 %r5, %r4, 1; + and.b32 %r6, %r5, 510; + .loc 1 20 23 // cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py:20:23 + or.b32 %r7, %r6, %r3; + or.b32 %r8, %r7, 1; + .loc 1 22 19 // cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py:22:19 + cvt.s64.s32 %rd1, %r7; + cvt.s64.s32 %rd2, %r8; + .loc 1 24 21 // cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py:24:21 + or.b64 %rd41, %rd1, %rd40; + 
and.b64 %rd42, %rd41, -4294967296; + setp.ne.b64 %p5, %rd42, 0; + cvt.u32.u64 %r56, %rd1; + @%p5 bra $L__BB0_2; + bra.uni $L__BB0_1; +$L__BB0_2: + div.s64 %rd185, %rd1, %rd40; + bra.uni $L__BB0_3; +$L__BB0_1: + cvt.u32.u64 %r9, %rd40; + div.u32 %r11, %r56, %r9; + cvt.u64.u32 %rd185, %r11; +$L__BB0_3: + .loc 1 0 21 // cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py:0:21 + ld.param.b64 %rd37, [triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_6]; + .loc 1 24 21 // cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py:24:21 + or.b64 %rd44, %rd2, %rd40; + and.b64 %rd45, %rd44, -4294967296; + setp.ne.b64 %p6, %rd45, 0; + cvt.u32.u64 %r55, %rd2; + @%p6 bra $L__BB0_5; + bra.uni $L__BB0_4; +$L__BB0_5: + div.s64 %rd186, %rd2, %rd40; + bra.uni $L__BB0_6; +$L__BB0_4: + cvt.u32.u64 %r12, %rd40; + div.u32 %r14, %r55, %r12; + cvt.u64.u32 %rd186, %r14; +$L__BB0_6: + .loc 1 22 19 // cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py:22:19 + mul.lo.s64 %rd43, %rd185, %rd40; + mul.lo.s64 %rd11, %rd186, %rd40; + .loc 1 24 28 // cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py:24:28 + or.b64 %rd46, %rd185, %rd37; + and.b64 %rd47, %rd46, -4294967296; + setp.ne.b64 %p7, %rd47, 0; + @%p7 bra $L__BB0_8; + bra.uni $L__BB0_7; +$L__BB0_8: + rem.s64 %rd187, %rd185, %rd37; + bra.uni $L__BB0_9; +$L__BB0_7: + cvt.u32.u64 %r15, %rd37; + cvt.u32.u64 %r16, %rd185; + rem.u32 %r17, %r16, %r15; + cvt.u64.u32 %rd187, %r17; +$L__BB0_9: + .loc 1 0 28 // cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py:0:28 + ld.param.b32 %r1, [triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_9]; + ld.param.b64 %rd38, [triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_7]; + ld.param.b64 %rd32, [triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_1]; + ld.param.b64 %rd31, [triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_0]; + sub.s64 
%rd7, %rd1, %rd43; + sub.s64 %rd12, %rd2, %rd11; + .loc 1 24 28 // cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py:24:28 + or.b64 %rd48, %rd186, %rd37; + and.b64 %rd49, %rd48, -4294967296; + setp.ne.b64 %p8, %rd49, 0; + @%p8 bra $L__BB0_11; + bra.uni $L__BB0_10; +$L__BB0_11: + rem.s64 %rd188, %rd186, %rd37; + bra.uni $L__BB0_12; +$L__BB0_10: + cvt.u32.u64 %r18, %rd37; + cvt.u32.u64 %r19, %rd186; + rem.u32 %r20, %r19, %r18; + cvt.u64.u32 %rd188, %r20; +$L__BB0_12: + .loc 1 21 21 // cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py:21:21 + setp.lt.s32 %p12, %r55, %r1; + setp.lt.s32 %p11, %r56, %r1; + .loc 1 25 31 // cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py:25:31 + shl.b64 %rd78, %rd1, 1; + add.s64 %rd51, %rd31, %rd78; + add.s64 %rd54, %rd51, 2; + .loc 1 25 36 // cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py:25:36 + // begin inline asm + mov.u64 %rd50, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd50, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs11, 0x0; + @%p11 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs11 }, [ %rd51 + 0 ], %rd50; + // end inline asm + // begin inline asm + mov.u64 %rd53, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd53, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs12, 0x0; + @%p12 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs12 }, [ %rd54 + 0 ], %rd53; + // end inline asm + .loc 1 26 31 // cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py:26:31 + shl.b64 %rd79, %rd187, 3; + add.s64 %rd72, %rd32, %rd79; + shl.b64 %rd80, %rd188, 3; + add.s64 %rd76, %rd32, %rd80; + .loc 1 26 36 // cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py:26:36 + // begin inline asm + mov.u64 %rd56, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd56, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd57, 0x0; + @%p11 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd57 }, [ %rd72 + 0 ], %rd56; + // end inline asm + // begin inline asm + mov.u64 %rd60, 0x0; + 
createpolicy.fractional.L2::evict_last.b64 %rd60, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd61, 0x0; + @%p12 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd61 }, [ %rd76 + 0 ], %rd60; + // end inline asm + .loc 1 28 18 // cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py:28:18 + shr.u64 %rd81, %rd40, 63; + add.s64 %rd82, %rd40, %rd81; + shr.s64 %rd23, %rd82, 1; + .loc 1 29 19 // cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py:29:19 + setp.ge.s64 %p17, %rd7, %rd23; + setp.ge.s64 %p18, %rd12, %rd23; + .loc 1 30 35 // cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py:30:35 + sub.s64 %rd83, %rd1, %rd23; + .loc 1 30 30 // cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py:30:30 + shl.b64 %rd84, %rd83, 1; + add.s64 %rd65, %rd31, %rd84; + add.s64 %rd68, %rd65, 2; + .loc 1 30 60 // cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py:30:60 + and.pred %p15, %p11, %p17; + and.pred %p16, %p12, %p18; + .loc 1 30 53 // cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py:30:53 + // begin inline asm + mov.u64 %rd64, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd64, 1.0; + // end inline asm + mov.b16 %rs16, 0; + // begin inline asm + mov.u16 %rs13, %rs16; + @%p15 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs13 }, [ %rd65 + 0 ], %rd64; + // end inline asm + // begin inline asm + mov.u64 %rd67, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd67, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs15, %rs16; + @%p16 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs15 }, [ %rd68 + 0 ], %rd67; + // end inline asm + .loc 1 31 35 // cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py:31:35 + // begin inline asm + mov.u64 %rd70, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd70, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd71, 0x0; + @%p15 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd71 }, [ %rd72 + 0 ], %rd70; + // end inline asm + // begin inline asm + mov.u64 %rd74, 0x0; + 
createpolicy.fractional.L2::evict_last.b64 %rd74, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd75, 0x0; + @%p16 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd75 }, [ %rd76 + 0 ], %rd74; + // end inline asm + .loc 1 35 32 // cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py:35:32 + shr.s64 %rd85, %rd71, 63; + and.b64 %rd86, %rd85, %rd38; + add.s64 %rd24, %rd86, %rd71; + shr.s64 %rd87, %rd75, 63; + and.b64 %rd88, %rd87, %rd38; + add.s64 %rd25, %rd88, %rd75; + .loc 1 36 28 // cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py:36:28 + setp.lt.s64 %p19, %rd24, 0; + setp.lt.s64 %p20, %rd25, 0; + .loc 1 36 98 // cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py:36:98 + setp.ge.s64 %p21, %rd24, %rd38; + setp.ge.s64 %p22, %rd25, %rd38; + .loc 1 36 64 // cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py:36:64 + or.pred %p23, %p19, %p21; + .loc 1 36 108 // cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py:36:108 + or.pred %p24, %p20, %p22; + .loc 1 36 106 // cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py:36:106 + and.pred %p25, %p15, %p23; + .loc 1 36 123 // cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py:36:123 + and.pred %p26, %p16, %p24; + or.pred %p27, %p25, %p26; + not.pred %p28, %p27; + @%p28 bra $L__BB0_14; + bra.uni $L__BB0_13; +$L__BB0_14: + .loc 1 0 123 // cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py:0:123 + ld.param.b64 %rd33, [triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_2]; + .loc 1 36 123 // cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py:36:123 + bar.sync 0; + .loc 1 37 36 // cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py:37:36 + sub.s64 %rd109, %rd7, %rd23; + .loc 1 37 58 // cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py:37:58 + mul.lo.s64 %rd110, %rd24, %rd40; + mul.lo.s64 %rd111, %rd25, %rd40; + .loc 1 37 31 // cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py:37:31 + shl.b64 %rd112, %rd109, 1; + add.s64 %rd113, 
%rd33, %rd112; + shl.b64 %rd114, %rd110, 1; + add.s64 %rd90, %rd113, %rd114; + sub.s64 %rd26, %rd1, %rd11; + sub.s64 %rd115, %rd26, %rd23; + shl.b64 %rd116, %rd115, 1; + add.s64 %rd117, %rd33, %rd116; + shl.b64 %rd118, %rd111, 1; + add.s64 %rd119, %rd117, %rd118; + add.s64 %rd93, %rd119, 2; + .loc 1 37 65 // cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py:37:65 + // begin inline asm + mov.u64 %rd89, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd89, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs17, %rs16; + @%p15 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs17 }, [ %rd90 + 0 ], %rd89; + // end inline asm + // begin inline asm + mov.u64 %rd92, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd92, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs19, %rs16; + @%p16 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs19 }, [ %rd93 + 0 ], %rd92; + // end inline asm + .loc 1 44 19 // cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py:44:19 + setp.lt.s64 %p37, %rd7, %rd23; + setp.lt.s64 %p38, %rd12, %rd23; + .loc 1 45 37 // cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py:45:37 + add.s64 %rd120, %rd40, %rd1; + .loc 1 45 42 // cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py:45:42 + sub.s64 %rd121, %rd120, %rd23; + .loc 1 45 31 // cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py:45:31 + shl.b64 %rd122, %rd121, 1; + add.s64 %rd96, %rd31, %rd122; + add.s64 %rd99, %rd96, 2; + .loc 1 45 68 // cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py:45:68 + and.pred %p33, %p11, %p37; + and.pred %p34, %p12, %p38; + .loc 1 45 60 // cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py:45:60 + // begin inline asm + mov.u64 %rd95, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd95, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs21, %rs16; + @%p33 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs21 }, [ %rd96 + 0 ], %rd95; + // end inline asm + // begin inline asm + mov.u64 %rd98, 0x0; + 
createpolicy.fractional.L2::evict_last.b64 %rd98, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs23, %rs16; + @%p34 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs23 }, [ %rd99 + 0 ], %rd98; + // end inline asm + .loc 1 46 36 // cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py:46:36 + // begin inline asm + mov.u64 %rd101, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd101, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd102, 0x0; + @%p33 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd102 }, [ %rd72 + 0 ], %rd101; + // end inline asm + // begin inline asm + mov.u64 %rd105, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd105, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd106, 0x0; + @%p34 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd106 }, [ %rd76 + 0 ], %rd105; + // end inline asm + .loc 1 50 35 // cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py:50:35 + shr.s64 %rd123, %rd102, 63; + and.b64 %rd124, %rd123, %rd38; + add.s64 %rd27, %rd124, %rd102; + shr.s64 %rd125, %rd106, 63; + and.b64 %rd126, %rd125, %rd38; + add.s64 %rd28, %rd126, %rd106; + .loc 1 51 28 // cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py:51:28 + setp.lt.s64 %p39, %rd27, 0; + setp.lt.s64 %p40, %rd28, 0; + .loc 1 51 100 // cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py:51:100 + setp.ge.s64 %p41, %rd27, %rd38; + setp.ge.s64 %p42, %rd28, %rd38; + .loc 1 51 65 // cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py:51:65 + or.pred %p43, %p39, %p41; + .loc 1 51 110 // cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py:51:110 + or.pred %p44, %p40, %p42; + .loc 1 51 108 // cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py:51:108 + and.pred %p45, %p33, %p43; + .loc 1 51 126 // cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py:51:126 + and.pred %p46, %p34, %p44; + or.pred %p47, %p45, %p46; + not.pred %p48, %p47; + @%p48 bra $L__BB0_16; + bra.uni $L__BB0_15; +$L__BB0_16: + .loc 1 0 126 // 
cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py:0:126 + ld.param.b64 %rd39, [triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_8]; + .loc 1 51 126 // cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py:51:126 + bar.sync 0; + .loc 1 52 37 // cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py:52:37 + sub.s64 %rd133, %rd40, %rd23; + .loc 1 52 64 // cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py:52:64 + mul.lo.s64 %rd134, %rd27, %rd40; + mul.lo.s64 %rd135, %rd28, %rd40; + .loc 1 52 31 // cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py:52:31 + shl.b64 %rd136, %rd133, 1; + add.s64 %rd137, %rd33, %rd136; + shl.b64 %rd138, %rd7, 1; + add.s64 %rd139, %rd137, %rd138; + shl.b64 %rd140, %rd134, 1; + add.s64 %rd128, %rd139, %rd140; + shl.b64 %rd141, %rd26, 1; + add.s64 %rd142, %rd137, %rd141; + shl.b64 %rd143, %rd135, 1; + add.s64 %rd144, %rd142, %rd143; + add.s64 %rd131, %rd144, 2; + .loc 1 52 72 // cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py:52:72 + // begin inline asm + mov.u64 %rd127, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd127, 1.0; + // end inline asm + mov.b16 %rs26, 0; + // begin inline asm + mov.u16 %rs25, %rs26; + @%p33 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs25 }, [ %rd128 + 0 ], %rd127; + // end inline asm + // begin inline asm + mov.u64 %rd130, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd130, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs27, %rs26; + @%p34 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs27 }, [ %rd131 + 0 ], %rd130; + // end inline asm + .loc 1 61 35 // cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py:61:35 + shr.s64 %rd145, %rd57, 63; + and.b64 %rd146, %rd145, %rd39; + add.s64 %rd29, %rd146, %rd57; + shr.s64 %rd147, %rd61, 63; + and.b64 %rd148, %rd147, %rd39; + add.s64 %rd30, %rd148, %rd61; + .loc 1 62 28 // cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py:62:28 + setp.lt.s64 %p53, %rd29, 0; + setp.lt.s64 %p54, 
%rd30, 0; + .loc 1 62 46 // cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py:62:46 + setp.ge.s64 %p55, %rd29, %rd39; + setp.ge.s64 %p56, %rd30, %rd39; + .loc 1 62 38 // cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py:62:38 + or.pred %p57, %p53, %p55; + .loc 1 62 56 // cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py:62:56 + or.pred %p58, %p54, %p56; + .loc 1 62 54 // cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py:62:54 + and.pred %p59, %p11, %p57; + .loc 1 62 64 // cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py:62:64 + and.pred %p60, %p12, %p58; + or.pred %p61, %p59, %p60; + not.pred %p62, %p61; + @%p62 bra $L__BB0_18; + bra.uni $L__BB0_17; +$L__BB0_18: + .loc 1 0 64 // cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py:0:64 + ld.param.b64 %rd35, [triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_4]; + ld.param.b64 %rd34, [triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_3]; + .loc 1 30 111 // cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py:30:111 + cvt.f32.bf16 %r29, %rs15; + .loc 1 37 123 // cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py:37:123 + cvt.f32.bf16 %r30, %rs19; + .loc 1 39 13 // cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py:39:13 + neg.f32 %r31, %r29; + fma.rn.f32 %r32, %r31, %r30, 0f00000000; + .loc 1 0 0 // cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py:0 + selp.f32 %r33, %r32, 0f00000000, %p18; + .loc 1 45 119 // cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py:45:119 + cvt.f32.bf16 %r34, %rs23; + .loc 1 52 131 // cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py:52:131 + cvt.f32.bf16 %r35, %rs27; + .loc 1 53 20 // cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py:53:20 + mul.f32 %r36, %r34, %r35; + .loc 1 0 0 // cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py:0 + selp.f32 %r37, %r36, 0f00000000, %p38; + .loc 1 57 20 // 
cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py:57:20 + add.f32 %r38, %r33, %r37; + .loc 1 30 111 // cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py:30:111 + cvt.f32.bf16 %r39, %rs13; + .loc 1 37 123 // cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py:37:123 + cvt.f32.bf16 %r40, %rs17; + .loc 1 39 13 // cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py:39:13 + neg.f32 %r41, %r39; + fma.rn.f32 %r42, %r41, %r40, 0f00000000; + .loc 1 0 0 // cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py:0 + selp.f32 %r43, %r42, 0f00000000, %p17; + .loc 1 45 119 // cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py:45:119 + cvt.f32.bf16 %r44, %rs21; + .loc 1 52 131 // cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py:52:131 + cvt.f32.bf16 %r45, %rs25; + .loc 1 53 20 // cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py:53:20 + mul.f32 %r46, %r44, %r45; + .loc 1 0 0 // cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py:0 + selp.f32 %r47, %r46, 0f00000000, %p37; + .loc 1 57 20 // cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py:57:20 + add.f32 %r48, %r43, %r47; + .loc 1 25 76 // cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py:25:76 + cvt.f32.bf16 %r49, %rs12; + cvt.f32.bf16 %r50, %rs11; + .loc 1 62 64 // cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py:62:64 + bar.sync 0; + .loc 1 63 40 // cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py:63:40 + mul.lo.s64 %rd157, %rd29, %rd40; + mul.lo.s64 %rd158, %rd30, %rd40; + .loc 1 63 31 // cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py:63:31 + add.s64 %rd160, %rd34, %rd138; + shl.b64 %rd161, %rd157, 1; + add.s64 %rd150, %rd160, %rd161; + add.s64 %rd163, %rd34, %rd141; + shl.b64 %rd164, %rd158, 1; + add.s64 %rd165, %rd163, %rd164; + add.s64 %rd153, %rd165, 2; + .loc 1 63 48 // cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py:63:48 + // begin inline asm + mov.u64 %rd151, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd151, 1.0; + // end 
inline asm + // begin inline asm + mov.u16 %rs29, 0x0; + @%p11 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs29 }, [ %rd150 + 0 ], %rd151; + // end inline asm + // begin inline asm + mov.u64 %rd154, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd154, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs30, 0x0; + @%p12 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs30 }, [ %rd153 + 0 ], %rd154; + // end inline asm + .loc 1 63 88 // cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py:63:88 + cvt.f32.bf16 %r51, %rs29; + cvt.f32.bf16 %r52, %rs30; + .loc 1 65 20 // cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py:65:20 + fma.rn.f32 %r53, %r50, %r51, %r48; + fma.rn.f32 %r54, %r49, %r52, %r38; + .loc 1 66 25 // cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py:66:25 + add.s64 %rd155, %rd35, %rd78; + add.s64 %rd156, %rd155, 2; + .loc 1 66 37 // cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py:66:37 + cvt.rn.bf16.f32 %rs31, %r53; + cvt.rn.bf16.f32 %rs32, %r54; + // begin inline asm + @%p11 st.global.b16 [ %rd155 + 0 ], { %rs31 }; + // end inline asm + // begin inline asm + @%p12 st.global.b16 [ %rd156 + 0 ], { %rs32 }; + // end inline asm + .loc 1 66 4 // cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py:66:4 + ret; +$L__BB0_13: + .loc 1 36 123 // cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py:36:123 + { // callseq 2, 0 + .param .b64 param0; + .param .b64 param1; + .param .b32 param2; + .param .b64 param3; + .param .b64 param4; + mov.b64 %rd179, assertFunc_0; + cvta.global.u64 %rd180, %rd179; + st.param.b64 [param3], %rd180; + mov.b64 %rd181, assertFile_0; + cvta.global.u64 %rd182, %rd181; + st.param.b64 [param1], %rd182; + mov.b64 %rd183, assertMessage_0; + cvta.global.u64 %rd184, %rd183; + st.param.b64 [param0], %rd184; + st.param.b64 [param4], 1; + st.param.b32 [param2], 36; + call.uni __assertfail, (param0, param1, param2, param3, param4); + } // callseq 2 + trap; +$L__BB0_15: + .loc 1 51 126 // 
cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py:51:126 + { // callseq 1, 0 + .param .b64 param0; + .param .b64 param1; + .param .b32 param2; + .param .b64 param3; + .param .b64 param4; + mov.b64 %rd173, assertFunc_1; + cvta.global.u64 %rd174, %rd173; + st.param.b64 [param3], %rd174; + mov.b64 %rd175, assertFile_1; + cvta.global.u64 %rd176, %rd175; + st.param.b64 [param1], %rd176; + mov.b64 %rd177, assertMessage_1; + cvta.global.u64 %rd178, %rd177; + st.param.b64 [param0], %rd178; + st.param.b64 [param4], 1; + st.param.b32 [param2], 51; + call.uni __assertfail, (param0, param1, param2, param3, param4); + } // callseq 1 + trap; +$L__BB0_17: + .loc 1 62 64 // cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py:62:64 + { // callseq 0, 0 + .param .b64 param0; + .param .b64 param1; + .param .b32 param2; + .param .b64 param3; + .param .b64 param4; + mov.b64 %rd167, assertFunc_2; + cvta.global.u64 %rd168, %rd167; + st.param.b64 [param3], %rd168; + mov.b64 %rd169, assertFile_2; + cvta.global.u64 %rd170, %rd169; + st.param.b64 [param1], %rd170; + mov.b64 %rd171, assertMessage_2; + cvta.global.u64 %rd172, %rd171; + st.param.b64 [param0], %rd172; + st.param.b64 [param4], 1; + st.param.b32 [param2], 62; + call.uni __assertfail, (param0, param1, param2, param3, param4); + } // callseq 0 + trap; +$L__tmp1: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 0 // DW_CHILDREN_no +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 135 // Length of Unit +.b8 2 // DWARF version number +.b8 
0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x80 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 100 +.b8 102 +.b8 98 +.b8 54 +.b8 99 +.b8 103 +.b8 101 +.b8 110 +.b8 122 +.b8 115 +.b8 106 +.b8 117 +.b8 53 +.b8 99 +.b8 113 +.b8 118 +.b8 121 +.b8 52 +.b8 50 +.b8 52 +.b8 52 +.b8 120 +.b8 104 +.b8 52 +.b8 120 +.b8 105 +.b8 100 +.b8 110 +.b8 105 +.b8 121 +.b8 101 +.b8 122 +.b8 110 +.b8 118 +.b8 107 +.b8 117 +.b8 98 +.b8 118 +.b8 100 +.b8 103 +.b8 50 +.b8 109 +.b8 103 +.b8 54 +.b8 100 +.b8 53 +.b8 111 +.b8 99 +.b8 54 +.b8 120 +.b8 116 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 100 +.b8 102 +.b8 0 + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/XRR2QXTZQK4DSBTDJUTNXO6FEFXI2IIRKSC5GYSBWLTL56SKI4WA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.source b/SpecForge-ext/cache/compiled_kernels/triton/7/XRR2QXTZQK4DSBTDJUTNXO6FEFXI2IIRKSC5GYSBWLTL56SKI4WA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.source new file mode 100644 index 0000000000000000000000000000000000000000..76fd347883c549a9a056a4405063a3c8a9c18640 --- /dev/null +++ 
b/SpecForge-ext/cache/compiled_kernels/triton/7/XRR2QXTZQK4DSBTDJUTNXO6FEFXI2IIRKSC5GYSBWLTL56SKI4WA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.source @@ -0,0 +1,419 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":18:0) +#loc109 = loc("in_ptr0"(#loc)) +#loc110 = loc("in_ptr1"(#loc)) +#loc111 = loc("in_ptr2"(#loc)) +#loc112 = loc("in_ptr3"(#loc)) +#loc113 = loc("out_ptr0"(#loc)) +#loc114 = loc("ks0"(#loc)) +#loc115 = loc("ks1"(#loc)) +#loc116 = loc("ks2"(#loc)) +#loc117 = loc("ks3"(#loc)) +#loc118 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %ks2: i64 loc("ks2"(#loc)), %ks3: i64 loc("ks3"(#loc)), %xnumel: i32 loc("xnumel"(#loc))) attributes {noinline = false} { + %xoffset = tt.get_program_id x : i32 loc(#loc119) + %xoffset_0 = arith.constant 512 : i32 loc(#loc120) + %xoffset_1 = arith.constant 512 : i32 loc(#loc120) + %xoffset_2 = arith.muli %xoffset, %xoffset_1 : i32 loc(#loc120) + %xindex = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32> loc(#loc121) + %xindex_3 = tt.splat %xoffset_2 : i32 -> tensor<512xi32> loc(#loc122) + %xindex_4 = arith.addi %xindex_3, %xindex : tensor<512xi32> loc(#loc122) + %xmask = tt.splat %xnumel : i32 -> tensor<512xi32> loc(#loc123) + %xmask_5 = arith.cmpi slt, %xindex_4, %xmask : tensor<512xi32> loc(#loc123) + %x0 = arith.extsi %xindex_4 : tensor<512xi32> to tensor<512xi64> loc(#loc124) + %x0_6 = tt.splat %ks0 : i64 -> 
tensor<512xi64> loc(#loc124) + %x0_7 = arith.remsi %x0, %x0_6 : tensor<512xi64> loc(#loc124) + %x1 = arith.extsi %xindex_4 : tensor<512xi32> to tensor<512xi64> loc(#loc125) + %x1_8 = tt.splat %ks0 : i64 -> tensor<512xi64> loc(#loc125) + %x1_9 = arith.divsi %x1, %x1_8 : tensor<512xi64> loc(#loc125) + %x1_10 = tt.splat %ks1 : i64 -> tensor<512xi64> loc(#loc126) + %x1_11 = arith.remsi %x1_9, %x1_10 : tensor<512xi64> loc(#loc126) + %tmp31 = tt.splat %in_ptr0 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc127) + %tmp31_12 = tt.addptr %tmp31, %xindex_4 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc127) + %tmp31_13 = tt.load %tmp31_12, %xmask_5 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc128) + %tmp31_14 = arith.extf %tmp31_13 : tensor<512xbf16> to tensor<512xf32> loc(#loc129) + %tmp32 = tt.splat %in_ptr1 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc130) + %tmp32_15 = tt.addptr %tmp32, %x1_11 : tensor<512x!tt.ptr>, tensor<512xi64> loc(#loc130) + %tmp32_16 = tt.load %tmp32_15, %xmask_5 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc131) + %tmp1 = arith.constant 2 : i32 loc(#loc132) + %tmp1_17 = arith.constant 2 : i64 loc(#loc132) + %tmp1_18 = arith.divsi %ks0, %tmp1_17 : i64 loc(#loc132) + %tmp2 = tt.splat %tmp1_18 : i64 -> tensor<512xi64> loc(#loc133) + %tmp2_19 = arith.cmpi sge, %x0_7, %tmp2 : tensor<512xi64> loc(#loc133) + %tmp3 = arith.constant 2 : i32 loc(#loc134) + %tmp3_20 = arith.constant 2 : i64 loc(#loc134) + %tmp3_21 = arith.divsi %ks0, %tmp3_20 : i64 loc(#loc134) + %tmp3_22 = arith.constant -1 : i32 loc(#loc135) + %tmp3_23 = arith.constant -1 : i64 loc(#loc135) + %tmp3_24 = arith.muli %tmp3_23, %tmp3_21 : i64 loc(#loc135) + %tmp3_25 = arith.extsi %xindex_4 : tensor<512xi32> to tensor<512xi64> loc(#loc136) + %tmp3_26 = tt.splat %tmp3_24 : i64 -> tensor<512xi64> loc(#loc136) + %tmp3_27 = arith.addi %tmp3_25, %tmp3_26 : tensor<512xi64> loc(#loc136) + %tmp3_28 = tt.splat %in_ptr0 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc137) + %tmp3_29 = tt.addptr 
%tmp3_28, %tmp3_27 : tensor<512x!tt.ptr>, tensor<512xi64> loc(#loc137) + %tmp3_30 = arith.andi %tmp2_19, %xmask_5 : tensor<512xi1> loc(#loc138) + %tmp3_31 = arith.constant 0.000000e+00 : f32 loc(#loc139) + %tmp3_32 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc139) + %tmp3_33 = arith.truncf %tmp3_32 : tensor<512xf32> to tensor<512xbf16> loc(#loc139) + %tmp3_34 = tt.load %tmp3_29, %tmp3_30, %tmp3_33 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc139) + %tmp3_35 = arith.extf %tmp3_34 : tensor<512xbf16> to tensor<512xf32> loc(#loc140) + %tmp4 = tt.splat %in_ptr1 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc141) + %tmp4_36 = tt.addptr %tmp4, %x1_11 : tensor<512x!tt.ptr>, tensor<512xi64> loc(#loc141) + %tmp4_37 = arith.andi %tmp2_19, %xmask_5 : tensor<512xi1> loc(#loc142) + %tmp4_38 = arith.constant 0.000000e+00 : f32 loc(#loc143) + %tmp4_39 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc143) + %tmp4_40 = arith.fptosi %tmp4_39 : tensor<512xf32> to tensor<512xi64> loc(#loc143) + %tmp4_41 = tt.load %tmp4_36, %tmp4_37, %tmp4_40 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc143) + %tmp5 = tt.splat %ks2 : i64 -> tensor<512xi64> loc(#loc144) + %tmp6 = arith.addi %tmp4_41, %tmp5 : tensor<512xi64> loc(#loc145) + %tmp7 = arith.constant 0 : i32 loc(#loc146) + %tmp7_42 = arith.extsi %tmp7 : i32 to i64 loc(#loc146) + %tmp7_43 = tt.splat %tmp7_42 : i64 -> tensor<512xi64> loc(#loc146) + %tmp7_44 = arith.cmpi slt, %tmp4_41, %tmp7_43 : tensor<512xi64> loc(#loc146) + %tmp8 = arith.select %tmp7_44, %tmp6, %tmp4_41 : tensor<512xi1>, tensor<512xi64> loc(#loc147) + %c0_i32 = arith.constant 0 : i32 loc(#loc30) + %0 = arith.extsi %c0_i32 : i32 to i64 loc(#loc30) + %1 = tt.splat %0 : i64 -> tensor<512xi64> loc(#loc30) + %2 = arith.cmpi sle, %1, %tmp8 : tensor<512xi64> loc(#loc30) + %3 = tt.splat %ks2 : i64 -> tensor<512xi64> loc(#loc31) + %4 = arith.cmpi slt, %tmp8, %3 : tensor<512xi64> loc(#loc31) + %5 = arith.andi %2, %4 : tensor<512xi1> 
loc(#loc32) + %6 = arith.andi %tmp2_19, %xmask_5 : tensor<512xi1> loc(#loc33) + %true = arith.constant true loc(#loc34) + %cst = arith.constant dense : tensor<512xi1> loc(#loc34) + %7 = arith.xori %6, %cst : tensor<512xi1> loc(#loc34) + %8 = arith.ori %5, %7 : tensor<512xi1> loc(#loc35) + tt.assert %8, "index out of bounds: 0 <= tl.broadcast_to(tmp8, [XBLOCK]) < ks2" : tensor<512xi1> loc(#loc36) + %tmp10 = arith.constant 2 : i32 loc(#loc148) + %tmp10_45 = arith.constant 2 : i64 loc(#loc148) + %tmp10_46 = arith.divsi %ks0, %tmp10_45 : i64 loc(#loc148) + %tmp10_47 = arith.constant -1 : i32 loc(#loc149) + %tmp10_48 = arith.constant -1 : i64 loc(#loc149) + %tmp10_49 = arith.muli %tmp10_48, %tmp10_46 : i64 loc(#loc149) + %tmp10_50 = tt.splat %tmp10_49 : i64 -> tensor<512xi64> loc(#loc150) + %tmp10_51 = arith.addi %x0_7, %tmp10_50 : tensor<512xi64> loc(#loc150) + %tmp10_52 = tt.splat %ks0 : i64 -> tensor<512xi64> loc(#loc151) + %tmp10_53 = arith.muli %tmp10_52, %tmp8 : tensor<512xi64> loc(#loc151) + %tmp10_54 = arith.addi %tmp10_51, %tmp10_53 : tensor<512xi64> loc(#loc152) + %tmp10_55 = tt.splat %in_ptr2 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc153) + %tmp10_56 = tt.addptr %tmp10_55, %tmp10_54 : tensor<512x!tt.ptr>, tensor<512xi64> loc(#loc153) + %tmp10_57 = arith.andi %tmp2_19, %xmask_5 : tensor<512xi1> loc(#loc154) + %tmp10_58 = arith.constant 0.000000e+00 : f32 loc(#loc155) + %tmp10_59 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc155) + %tmp10_60 = arith.truncf %tmp10_59 : tensor<512xf32> to tensor<512xbf16> loc(#loc155) + %tmp10_61 = tt.load %tmp10_56, %tmp10_57, %tmp10_60 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc155) + %tmp10_62 = arith.extf %tmp10_61 : tensor<512xbf16> to tensor<512xf32> loc(#loc156) + %tmp11 = arith.mulf %tmp3_35, %tmp10_62 : tensor<512xf32> loc(#loc157) + %tmp12 = arith.constant 0.000000e+00 : f32 loc(#loc158) + %tmp12_63 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc158) + %tmp12_64 = 
arith.subf %tmp12_63, %tmp11 : tensor<512xf32> loc(#loc158) + %tmp13 = arith.constant 0.000000e+00 : f32 loc(#loc159) + %tmp13_65 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc159) + %tmp14 = arith.select %tmp2_19, %tmp12_64, %tmp13_65 : tensor<512xi1>, tensor<512xf32> loc(#loc160) + %tmp15 = arith.constant 0.000000e+00 : f32 loc(#loc161) + %tmp16 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc162) + %tmp16_66 = arith.select %tmp2_19, %tmp14, %tmp16 : tensor<512xi1>, tensor<512xf32> loc(#loc162) + %tmp17 = tt.splat %tmp1_18 : i64 -> tensor<512xi64> loc(#loc163) + %tmp17_67 = arith.cmpi slt, %x0_7, %tmp17 : tensor<512xi64> loc(#loc163) + %tmp18 = arith.extsi %xindex_4 : tensor<512xi32> to tensor<512xi64> loc(#loc164) + %tmp18_68 = tt.splat %ks0 : i64 -> tensor<512xi64> loc(#loc164) + %tmp18_69 = arith.addi %tmp18_68, %tmp18 : tensor<512xi64> loc(#loc164) + %tmp18_70 = arith.constant 2 : i32 loc(#loc165) + %tmp18_71 = arith.constant 2 : i64 loc(#loc165) + %tmp18_72 = arith.divsi %ks0, %tmp18_71 : i64 loc(#loc165) + %tmp18_73 = arith.constant -1 : i32 loc(#loc166) + %tmp18_74 = arith.constant -1 : i64 loc(#loc166) + %tmp18_75 = arith.muli %tmp18_74, %tmp18_72 : i64 loc(#loc166) + %tmp18_76 = tt.splat %tmp18_75 : i64 -> tensor<512xi64> loc(#loc167) + %tmp18_77 = arith.addi %tmp18_69, %tmp18_76 : tensor<512xi64> loc(#loc167) + %tmp18_78 = tt.splat %in_ptr0 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc168) + %tmp18_79 = tt.addptr %tmp18_78, %tmp18_77 : tensor<512x!tt.ptr>, tensor<512xi64> loc(#loc168) + %tmp18_80 = arith.andi %tmp17_67, %xmask_5 : tensor<512xi1> loc(#loc169) + %tmp18_81 = arith.constant 0.000000e+00 : f32 loc(#loc170) + %tmp18_82 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc170) + %tmp18_83 = arith.truncf %tmp18_82 : tensor<512xf32> to tensor<512xbf16> loc(#loc170) + %tmp18_84 = tt.load %tmp18_79, %tmp18_80, %tmp18_83 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc170) + %tmp18_85 = arith.extf 
%tmp18_84 : tensor<512xbf16> to tensor<512xf32> loc(#loc171) + %tmp19 = tt.splat %in_ptr1 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc172) + %tmp19_86 = tt.addptr %tmp19, %x1_11 : tensor<512x!tt.ptr>, tensor<512xi64> loc(#loc172) + %tmp19_87 = arith.andi %tmp17_67, %xmask_5 : tensor<512xi1> loc(#loc173) + %tmp19_88 = arith.constant 0.000000e+00 : f32 loc(#loc174) + %tmp19_89 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc174) + %tmp19_90 = arith.fptosi %tmp19_89 : tensor<512xf32> to tensor<512xi64> loc(#loc174) + %tmp19_91 = tt.load %tmp19_86, %tmp19_87, %tmp19_90 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc174) + %tmp20 = tt.splat %ks2 : i64 -> tensor<512xi64> loc(#loc175) + %tmp21 = arith.addi %tmp19_91, %tmp20 : tensor<512xi64> loc(#loc176) + %tmp22 = arith.constant 0 : i32 loc(#loc177) + %tmp22_92 = arith.extsi %tmp22 : i32 to i64 loc(#loc177) + %tmp22_93 = tt.splat %tmp22_92 : i64 -> tensor<512xi64> loc(#loc177) + %tmp22_94 = arith.cmpi slt, %tmp19_91, %tmp22_93 : tensor<512xi64> loc(#loc177) + %tmp23 = arith.select %tmp22_94, %tmp21, %tmp19_91 : tensor<512xi1>, tensor<512xi64> loc(#loc178) + %c0_i32_95 = arith.constant 0 : i32 loc(#loc68) + %9 = arith.extsi %c0_i32_95 : i32 to i64 loc(#loc68) + %10 = tt.splat %9 : i64 -> tensor<512xi64> loc(#loc68) + %11 = arith.cmpi sle, %10, %tmp23 : tensor<512xi64> loc(#loc68) + %12 = tt.splat %ks2 : i64 -> tensor<512xi64> loc(#loc69) + %13 = arith.cmpi slt, %tmp23, %12 : tensor<512xi64> loc(#loc69) + %14 = arith.andi %11, %13 : tensor<512xi1> loc(#loc70) + %15 = arith.andi %tmp17_67, %xmask_5 : tensor<512xi1> loc(#loc71) + %true_96 = arith.constant true loc(#loc72) + %cst_97 = arith.constant dense : tensor<512xi1> loc(#loc72) + %16 = arith.xori %15, %cst_97 : tensor<512xi1> loc(#loc72) + %17 = arith.ori %14, %16 : tensor<512xi1> loc(#loc73) + tt.assert %17, "index out of bounds: 0 <= tl.broadcast_to(tmp23, [XBLOCK]) < ks2" : tensor<512xi1> loc(#loc74) + %tmp25 = tt.splat %ks0 : i64 -> 
tensor<512xi64> loc(#loc179) + %tmp25_98 = arith.addi %tmp25, %x0_7 : tensor<512xi64> loc(#loc179) + %tmp25_99 = arith.constant 2 : i32 loc(#loc180) + %tmp25_100 = arith.constant 2 : i64 loc(#loc180) + %tmp25_101 = arith.divsi %ks0, %tmp25_100 : i64 loc(#loc180) + %tmp25_102 = arith.constant -1 : i32 loc(#loc181) + %tmp25_103 = arith.constant -1 : i64 loc(#loc181) + %tmp25_104 = arith.muli %tmp25_103, %tmp25_101 : i64 loc(#loc181) + %tmp25_105 = tt.splat %tmp25_104 : i64 -> tensor<512xi64> loc(#loc182) + %tmp25_106 = arith.addi %tmp25_98, %tmp25_105 : tensor<512xi64> loc(#loc182) + %tmp25_107 = tt.splat %ks0 : i64 -> tensor<512xi64> loc(#loc183) + %tmp25_108 = arith.muli %tmp25_107, %tmp23 : tensor<512xi64> loc(#loc183) + %tmp25_109 = arith.addi %tmp25_106, %tmp25_108 : tensor<512xi64> loc(#loc184) + %tmp25_110 = tt.splat %in_ptr2 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc185) + %tmp25_111 = tt.addptr %tmp25_110, %tmp25_109 : tensor<512x!tt.ptr>, tensor<512xi64> loc(#loc185) + %tmp25_112 = arith.andi %tmp17_67, %xmask_5 : tensor<512xi1> loc(#loc186) + %tmp25_113 = arith.constant 0.000000e+00 : f32 loc(#loc187) + %tmp25_114 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc187) + %tmp25_115 = arith.truncf %tmp25_114 : tensor<512xf32> to tensor<512xbf16> loc(#loc187) + %tmp25_116 = tt.load %tmp25_111, %tmp25_112, %tmp25_115 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc187) + %tmp25_117 = arith.extf %tmp25_116 : tensor<512xbf16> to tensor<512xf32> loc(#loc188) + %tmp26 = arith.mulf %tmp18_85, %tmp25_117 : tensor<512xf32> loc(#loc189) + %tmp27 = arith.constant 0.000000e+00 : f32 loc(#loc190) + %tmp27_118 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc190) + %tmp28 = arith.select %tmp17_67, %tmp26, %tmp27_118 : tensor<512xi1>, tensor<512xf32> loc(#loc191) + %tmp29 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc192) + %tmp29_119 = arith.select %tmp17_67, %tmp28, %tmp29 : tensor<512xi1>, tensor<512xf32> loc(#loc192) 
+ %tmp30 = arith.addf %tmp16_66, %tmp29_119 : tensor<512xf32> loc(#loc193) + %tmp34 = tt.splat %ks3 : i64 -> tensor<512xi64> loc(#loc194) + %tmp34_120 = arith.addi %tmp32_16, %tmp34 : tensor<512xi64> loc(#loc194) + %tmp35 = arith.constant 0 : i32 loc(#loc195) + %tmp35_121 = arith.extsi %tmp35 : i32 to i64 loc(#loc195) + %tmp35_122 = tt.splat %tmp35_121 : i64 -> tensor<512xi64> loc(#loc195) + %tmp35_123 = arith.cmpi slt, %tmp32_16, %tmp35_122 : tensor<512xi64> loc(#loc195) + %tmp36 = arith.select %tmp35_123, %tmp34_120, %tmp32_16 : tensor<512xi1>, tensor<512xi64> loc(#loc196) + %c0_i32_124 = arith.constant 0 : i32 loc(#loc93) + %18 = arith.extsi %c0_i32_124 : i32 to i64 loc(#loc93) + %19 = tt.splat %18 : i64 -> tensor<512xi64> loc(#loc93) + %20 = arith.cmpi sle, %19, %tmp36 : tensor<512xi64> loc(#loc93) + %21 = tt.splat %ks3 : i64 -> tensor<512xi64> loc(#loc94) + %22 = arith.cmpi slt, %tmp36, %21 : tensor<512xi64> loc(#loc94) + %23 = arith.andi %20, %22 : tensor<512xi1> loc(#loc95) + %true_125 = arith.constant true loc(#loc96) + %cst_126 = arith.constant dense : tensor<512xi1> loc(#loc96) + %24 = arith.xori %xmask_5, %cst_126 : tensor<512xi1> loc(#loc96) + %25 = arith.ori %23, %24 : tensor<512xi1> loc(#loc97) + tt.assert %25, "index out of bounds: 0 <= tmp36 < ks3" : tensor<512xi1> loc(#loc98) + %tmp38 = tt.splat %ks0 : i64 -> tensor<512xi64> loc(#loc197) + %tmp38_127 = arith.muli %tmp38, %tmp36 : tensor<512xi64> loc(#loc197) + %tmp38_128 = arith.addi %x0_7, %tmp38_127 : tensor<512xi64> loc(#loc198) + %tmp38_129 = tt.splat %in_ptr3 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc199) + %tmp38_130 = tt.addptr %tmp38_129, %tmp38_128 : tensor<512x!tt.ptr>, tensor<512xi64> loc(#loc199) + %tmp38_131 = tt.load %tmp38_130, %xmask_5 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc200) + %tmp38_132 = arith.extf %tmp38_131 : tensor<512xbf16> to tensor<512xf32> loc(#loc201) + %tmp39 = arith.mulf %tmp31_14, %tmp38_132 : tensor<512xf32> loc(#loc202) + %tmp40 = arith.addf 
%tmp30, %tmp39 : tensor<512xf32> loc(#loc203) + %26 = tt.splat %out_ptr0 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc106) + %27 = tt.addptr %26, %xindex_4 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc106) + %28 = arith.truncf %tmp40 : tensor<512xf32> to tensor<512xbf16> loc(#loc107) + tt.store %27, %28, %xmask_5 : tensor<512x!tt.ptr> loc(#loc107) + tt.return loc(#loc108) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":19:28) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":19:33) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":20:36) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":20:23) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":21:21) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":22:19) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":24:21) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":24:28) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":25:31) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":25:36) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":25:76) +#loc12 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":26:31) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":26:36) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":28:18) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":29:19) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":30:48) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":30:41) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":30:35) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":30:30) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":30:60) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":30:53) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":30:111) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":31:30) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":31:42) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":31:35) +#loc26 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":32:32) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":33:18) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":34:18) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":35:32) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":36:28) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":36:98) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":36:64) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":36:115) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":36:108) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":36:106) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":36:123) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":37:49) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":37:42) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":37:36) +#loc40 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":37:58) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":37:54) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":37:31) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":37:72) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":37:65) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":37:123) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":38:19) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":39:13) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":40:38) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":41:34) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":42:12) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":43:34) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":44:19) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":45:37) +#loc54 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":45:55) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":45:48) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":45:42) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":45:31) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":45:68) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":45:60) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":45:119) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":46:31) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":46:44) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":46:36) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":47:33) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":48:20) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":49:20) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":50:35) +#loc68 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":51:28) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":51:100) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":51:65) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":51:118) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":51:110) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":51:108) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":51:126) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":52:37) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":52:55) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":52:48) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":52:42) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":52:64) +#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":52:60) +#loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":52:31) +#loc82 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":52:80) +#loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":52:72) +#loc84 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":52:131) +#loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":53:20) +#loc86 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":54:38) +#loc87 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":55:35) +#loc88 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":56:35) +#loc89 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":57:20) +#loc90 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":59:20) +#loc91 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":60:20) +#loc92 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":61:35) +#loc93 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":62:28) +#loc94 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":62:46) +#loc95 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":62:38) +#loc96 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":62:56) +#loc97 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":62:54) +#loc98 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":62:64) +#loc99 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":63:40) +#loc100 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":63:36) +#loc101 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":63:31) +#loc102 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":63:48) +#loc103 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":63:88) +#loc104 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":64:20) +#loc105 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":65:20) +#loc106 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":66:25) +#loc107 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":66:37) +#loc108 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":66:4) +#loc119 = loc("xoffset"(#loc1)) +#loc120 = loc("xoffset"(#loc2)) +#loc121 = loc("xindex"(#loc3)) +#loc122 = loc("xindex"(#loc4)) +#loc123 = loc("xmask"(#loc5)) +#loc124 = loc("x0"(#loc6)) +#loc125 = loc("x1"(#loc7)) 
+#loc126 = loc("x1"(#loc8)) +#loc127 = loc("tmp31"(#loc9)) +#loc128 = loc("tmp31"(#loc10)) +#loc129 = loc("tmp31"(#loc11)) +#loc130 = loc("tmp32"(#loc12)) +#loc131 = loc("tmp32"(#loc13)) +#loc132 = loc("tmp1"(#loc14)) +#loc133 = loc("tmp2"(#loc15)) +#loc134 = loc("tmp3"(#loc16)) +#loc135 = loc("tmp3"(#loc17)) +#loc136 = loc("tmp3"(#loc18)) +#loc137 = loc("tmp3"(#loc19)) +#loc138 = loc("tmp3"(#loc20)) +#loc139 = loc("tmp3"(#loc21)) +#loc140 = loc("tmp3"(#loc22)) +#loc141 = loc("tmp4"(#loc23)) +#loc142 = loc("tmp4"(#loc24)) +#loc143 = loc("tmp4"(#loc25)) +#loc144 = loc("tmp5"(#loc26)) +#loc145 = loc("tmp6"(#loc27)) +#loc146 = loc("tmp7"(#loc28)) +#loc147 = loc("tmp8"(#loc29)) +#loc148 = loc("tmp10"(#loc37)) +#loc149 = loc("tmp10"(#loc38)) +#loc150 = loc("tmp10"(#loc39)) +#loc151 = loc("tmp10"(#loc40)) +#loc152 = loc("tmp10"(#loc41)) +#loc153 = loc("tmp10"(#loc42)) +#loc154 = loc("tmp10"(#loc43)) +#loc155 = loc("tmp10"(#loc44)) +#loc156 = loc("tmp10"(#loc45)) +#loc157 = loc("tmp11"(#loc46)) +#loc158 = loc("tmp12"(#loc47)) +#loc159 = loc("tmp13"(#loc48)) +#loc160 = loc("tmp14"(#loc49)) +#loc161 = loc("tmp15"(#loc50)) +#loc162 = loc("tmp16"(#loc51)) +#loc163 = loc("tmp17"(#loc52)) +#loc164 = loc("tmp18"(#loc53)) +#loc165 = loc("tmp18"(#loc54)) +#loc166 = loc("tmp18"(#loc55)) +#loc167 = loc("tmp18"(#loc56)) +#loc168 = loc("tmp18"(#loc57)) +#loc169 = loc("tmp18"(#loc58)) +#loc170 = loc("tmp18"(#loc59)) +#loc171 = loc("tmp18"(#loc60)) +#loc172 = loc("tmp19"(#loc61)) +#loc173 = loc("tmp19"(#loc62)) +#loc174 = loc("tmp19"(#loc63)) +#loc175 = loc("tmp20"(#loc64)) +#loc176 = loc("tmp21"(#loc65)) +#loc177 = loc("tmp22"(#loc66)) +#loc178 = loc("tmp23"(#loc67)) +#loc179 = loc("tmp25"(#loc75)) +#loc180 = loc("tmp25"(#loc76)) +#loc181 = loc("tmp25"(#loc77)) +#loc182 = loc("tmp25"(#loc78)) +#loc183 = loc("tmp25"(#loc79)) +#loc184 = loc("tmp25"(#loc80)) +#loc185 = loc("tmp25"(#loc81)) +#loc186 = loc("tmp25"(#loc82)) +#loc187 = loc("tmp25"(#loc83)) +#loc188 = loc("tmp25"(#loc84)) 
+#loc189 = loc("tmp26"(#loc85)) +#loc190 = loc("tmp27"(#loc86)) +#loc191 = loc("tmp28"(#loc87)) +#loc192 = loc("tmp29"(#loc88)) +#loc193 = loc("tmp30"(#loc89)) +#loc194 = loc("tmp34"(#loc90)) +#loc195 = loc("tmp35"(#loc91)) +#loc196 = loc("tmp36"(#loc92)) +#loc197 = loc("tmp38"(#loc99)) +#loc198 = loc("tmp38"(#loc100)) +#loc199 = loc("tmp38"(#loc101)) +#loc200 = loc("tmp38"(#loc102)) +#loc201 = loc("tmp38"(#loc103)) +#loc202 = loc("tmp39"(#loc104)) +#loc203 = loc("tmp40"(#loc105)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/XRR2QXTZQK4DSBTDJUTNXO6FEFXI2IIRKSC5GYSBWLTL56SKI4WA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/7/XRR2QXTZQK4DSBTDJUTNXO6FEFXI2IIRKSC5GYSBWLTL56SKI4WA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..914fcf69401fb42ea04f6a1adf49c0df205ab02c --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/XRR2QXTZQK4DSBTDJUTNXO6FEFXI2IIRKSC5GYSBWLTL56SKI4WA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.ttgir @@ -0,0 +1,284 @@ +#blocked = #ttg.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [8], order = [0]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":18:0) +#loc91 = loc("in_ptr0"(#loc)) +#loc92 = loc("in_ptr1"(#loc)) +#loc93 = loc("in_ptr2"(#loc)) +#loc94 = loc("in_ptr3"(#loc)) +#loc95 = loc("out_ptr0"(#loc)) +#loc96 = loc("ks0"(#loc)) +#loc97 = loc("ks1"(#loc)) +#loc98 = loc("ks2"(#loc)) +#loc99 = loc("ks3"(#loc)) +#loc100 = loc("xnumel"(#loc)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} 
loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %ks2: i64 loc("ks2"(#loc)), %ks3: i64 loc("ks3"(#loc)), %xnumel: i32 loc("xnumel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense : tensor<512xi1, #blocked> loc(#loc1) + %c-1_i64 = arith.constant -1 : i64 loc(#loc1) + %c2_i64 = arith.constant 2 : i64 loc(#loc1) + %c512_i32 = arith.constant 512 : i32 loc(#loc1) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<512xbf16, #blocked> loc(#loc1) + %cst_1 = arith.constant dense<0> : tensor<512xi64, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<0.000000e+00> : tensor<512xf32, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc101) + %xoffset_3 = arith.muli %xoffset, %c512_i32 : i32 loc(#loc102) + %xindex = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32, #blocked> loc(#loc103) + %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<512xi32, #blocked> loc(#loc104) + %xindex_5 = arith.addi %xindex_4, %xindex : tensor<512xi32, #blocked> loc(#loc104) + %xmask = tt.splat %xnumel : i32 -> tensor<512xi32, #blocked> loc(#loc105) + %xmask_6 = arith.cmpi slt, %xindex_5, %xmask : tensor<512xi32, #blocked> loc(#loc105) + %x0 = arith.extsi %xindex_5 : tensor<512xi32, #blocked> to tensor<512xi64, #blocked> loc(#loc106) + %x0_7 = tt.splat %ks0 : i64 -> tensor<512xi64, #blocked> loc(#loc106) + %x0_8 = arith.remsi %x0, %x0_7 : tensor<512xi64, #blocked> loc(#loc106) + %x1 = arith.divsi %x0, %x0_7 : tensor<512xi64, #blocked> loc(#loc107) + %x1_9 = tt.splat %ks1 : i64 -> tensor<512xi64, #blocked> loc(#loc108) + %x1_10 = arith.remsi %x1, %x1_9 : tensor<512xi64, #blocked> loc(#loc108) + %tmp31 = tt.splat %in_ptr0 : !tt.ptr -> 
tensor<512x!tt.ptr, #blocked> loc(#loc109) + %tmp31_11 = tt.addptr %tmp31, %xindex_5 : tensor<512x!tt.ptr, #blocked>, tensor<512xi32, #blocked> loc(#loc109) + %tmp31_12 = tt.load %tmp31_11, %xmask_6 evictionPolicy = evict_last : tensor<512x!tt.ptr, #blocked> loc(#loc110) + %tmp31_13 = arith.extf %tmp31_12 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked> loc(#loc111) + %tmp32 = tt.splat %in_ptr1 : !tt.ptr -> tensor<512x!tt.ptr, #blocked> loc(#loc112) + %tmp32_14 = tt.addptr %tmp32, %x1_10 : tensor<512x!tt.ptr, #blocked>, tensor<512xi64, #blocked> loc(#loc112) + %tmp32_15 = tt.load %tmp32_14, %xmask_6 evictionPolicy = evict_last : tensor<512x!tt.ptr, #blocked> loc(#loc113) + %tmp1 = arith.divsi %ks0, %c2_i64 : i64 loc(#loc114) + %tmp2 = tt.splat %tmp1 : i64 -> tensor<512xi64, #blocked> loc(#loc115) + %tmp2_16 = arith.cmpi sge, %x0_8, %tmp2 : tensor<512xi64, #blocked> loc(#loc115) + %tmp3 = arith.muli %tmp1, %c-1_i64 : i64 loc(#loc116) + %tmp3_17 = tt.splat %tmp3 : i64 -> tensor<512xi64, #blocked> loc(#loc117) + %tmp3_18 = arith.addi %x0, %tmp3_17 : tensor<512xi64, #blocked> loc(#loc117) + %tmp3_19 = tt.addptr %tmp31, %tmp3_18 : tensor<512x!tt.ptr, #blocked>, tensor<512xi64, #blocked> loc(#loc118) + %tmp3_20 = arith.andi %tmp2_16, %xmask_6 : tensor<512xi1, #blocked> loc(#loc119) + %tmp3_21 = tt.load %tmp3_19, %tmp3_20, %cst_0 evictionPolicy = evict_last : tensor<512x!tt.ptr, #blocked> loc(#loc120) + %tmp3_22 = arith.extf %tmp3_21 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked> loc(#loc121) + %tmp4 = tt.load %tmp32_14, %tmp3_20, %cst_1 evictionPolicy = evict_last : tensor<512x!tt.ptr, #blocked> loc(#loc122) + %tmp5 = tt.splat %ks2 : i64 -> tensor<512xi64, #blocked> loc(#loc123) + %tmp6 = arith.addi %tmp4, %tmp5 : tensor<512xi64, #blocked> loc(#loc124) + %tmp7 = arith.cmpi slt, %tmp4, %cst_1 : tensor<512xi64, #blocked> loc(#loc125) + %tmp8 = arith.select %tmp7, %tmp6, %tmp4 : tensor<512xi1, #blocked>, tensor<512xi64, #blocked> loc(#loc126) + %0 = 
arith.cmpi sge, %tmp8, %cst_1 : tensor<512xi64, #blocked> loc(#loc28) + %1 = arith.cmpi slt, %tmp8, %tmp5 : tensor<512xi64, #blocked> loc(#loc29) + %2 = arith.andi %0, %1 : tensor<512xi1, #blocked> loc(#loc30) + %3 = arith.xori %tmp3_20, %cst : tensor<512xi1, #blocked> loc(#loc31) + %4 = arith.ori %2, %3 : tensor<512xi1, #blocked> loc(#loc32) + tt.assert %4, "index out of bounds: 0 <= tl.broadcast_to(tmp8, [XBLOCK]) < ks2" : tensor<512xi1, #blocked> loc(#loc33) + %tmp10 = arith.addi %x0_8, %tmp3_17 : tensor<512xi64, #blocked> loc(#loc127) + %tmp10_23 = arith.muli %x0_7, %tmp8 : tensor<512xi64, #blocked> loc(#loc128) + %tmp10_24 = arith.addi %tmp10, %tmp10_23 : tensor<512xi64, #blocked> loc(#loc129) + %tmp10_25 = tt.splat %in_ptr2 : !tt.ptr -> tensor<512x!tt.ptr, #blocked> loc(#loc130) + %tmp10_26 = tt.addptr %tmp10_25, %tmp10_24 : tensor<512x!tt.ptr, #blocked>, tensor<512xi64, #blocked> loc(#loc130) + %tmp10_27 = tt.load %tmp10_26, %tmp3_20, %cst_0 evictionPolicy = evict_last : tensor<512x!tt.ptr, #blocked> loc(#loc131) + %tmp10_28 = arith.extf %tmp10_27 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked> loc(#loc132) + %tmp11 = arith.mulf %tmp3_22, %tmp10_28 : tensor<512xf32, #blocked> loc(#loc133) + %tmp12 = arith.subf %cst_2, %tmp11 : tensor<512xf32, #blocked> loc(#loc134) + %tmp16 = arith.select %tmp2_16, %tmp12, %cst_2 : tensor<512xi1, #blocked>, tensor<512xf32, #blocked> loc(#loc169) + %tmp17 = arith.cmpi slt, %x0_8, %tmp2 : tensor<512xi64, #blocked> loc(#loc137) + %tmp18 = arith.addi %x0_7, %x0 : tensor<512xi64, #blocked> loc(#loc138) + %tmp18_29 = arith.addi %tmp18, %tmp3_17 : tensor<512xi64, #blocked> loc(#loc139) + %tmp18_30 = tt.addptr %tmp31, %tmp18_29 : tensor<512x!tt.ptr, #blocked>, tensor<512xi64, #blocked> loc(#loc140) + %tmp18_31 = arith.andi %tmp17, %xmask_6 : tensor<512xi1, #blocked> loc(#loc141) + %tmp18_32 = tt.load %tmp18_30, %tmp18_31, %cst_0 evictionPolicy = evict_last : tensor<512x!tt.ptr, #blocked> loc(#loc142) + %tmp18_33 = 
arith.extf %tmp18_32 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked> loc(#loc143) + %tmp19 = tt.load %tmp32_14, %tmp18_31, %cst_1 evictionPolicy = evict_last : tensor<512x!tt.ptr, #blocked> loc(#loc144) + %tmp21 = arith.addi %tmp19, %tmp5 : tensor<512xi64, #blocked> loc(#loc145) + %tmp22 = arith.cmpi slt, %tmp19, %cst_1 : tensor<512xi64, #blocked> loc(#loc146) + %tmp23 = arith.select %tmp22, %tmp21, %tmp19 : tensor<512xi1, #blocked>, tensor<512xi64, #blocked> loc(#loc147) + %5 = arith.cmpi sge, %tmp23, %cst_1 : tensor<512xi64, #blocked> loc(#loc55) + %6 = arith.cmpi slt, %tmp23, %tmp5 : tensor<512xi64, #blocked> loc(#loc56) + %7 = arith.andi %5, %6 : tensor<512xi1, #blocked> loc(#loc57) + %8 = arith.xori %tmp18_31, %cst : tensor<512xi1, #blocked> loc(#loc58) + %9 = arith.ori %7, %8 : tensor<512xi1, #blocked> loc(#loc59) + tt.assert %9, "index out of bounds: 0 <= tl.broadcast_to(tmp23, [XBLOCK]) < ks2" : tensor<512xi1, #blocked> loc(#loc60) + %tmp25 = arith.addi %x0_7, %x0_8 : tensor<512xi64, #blocked> loc(#loc148) + %tmp25_34 = arith.addi %tmp25, %tmp3_17 : tensor<512xi64, #blocked> loc(#loc149) + %tmp25_35 = arith.muli %x0_7, %tmp23 : tensor<512xi64, #blocked> loc(#loc150) + %tmp25_36 = arith.addi %tmp25_34, %tmp25_35 : tensor<512xi64, #blocked> loc(#loc151) + %tmp25_37 = tt.addptr %tmp10_25, %tmp25_36 : tensor<512x!tt.ptr, #blocked>, tensor<512xi64, #blocked> loc(#loc152) + %tmp25_38 = tt.load %tmp25_37, %tmp18_31, %cst_0 evictionPolicy = evict_last : tensor<512x!tt.ptr, #blocked> loc(#loc153) + %tmp25_39 = arith.extf %tmp25_38 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked> loc(#loc154) + %tmp26 = arith.mulf %tmp18_33, %tmp25_39 : tensor<512xf32, #blocked> loc(#loc155) + %tmp29 = arith.select %tmp17, %tmp26, %cst_2 : tensor<512xi1, #blocked>, tensor<512xf32, #blocked> loc(#loc170) + %tmp30 = arith.addf %tmp16, %tmp29 : tensor<512xf32, #blocked> loc(#loc158) + %tmp34 = tt.splat %ks3 : i64 -> tensor<512xi64, #blocked> loc(#loc159) + %tmp34_40 = 
arith.addi %tmp32_15, %tmp34 : tensor<512xi64, #blocked> loc(#loc159) + %tmp35 = arith.cmpi slt, %tmp32_15, %cst_1 : tensor<512xi64, #blocked> loc(#loc160) + %tmp36 = arith.select %tmp35, %tmp34_40, %tmp32_15 : tensor<512xi1, #blocked>, tensor<512xi64, #blocked> loc(#loc161) + %10 = arith.cmpi sge, %tmp36, %cst_1 : tensor<512xi64, #blocked> loc(#loc75) + %11 = arith.cmpi slt, %tmp36, %tmp34 : tensor<512xi64, #blocked> loc(#loc76) + %12 = arith.andi %10, %11 : tensor<512xi1, #blocked> loc(#loc77) + %13 = arith.xori %xmask_6, %cst : tensor<512xi1, #blocked> loc(#loc78) + %14 = arith.ori %12, %13 : tensor<512xi1, #blocked> loc(#loc79) + tt.assert %14, "index out of bounds: 0 <= tmp36 < ks3" : tensor<512xi1, #blocked> loc(#loc80) + %tmp38 = arith.muli %x0_7, %tmp36 : tensor<512xi64, #blocked> loc(#loc162) + %tmp38_41 = arith.addi %x0_8, %tmp38 : tensor<512xi64, #blocked> loc(#loc163) + %tmp38_42 = tt.splat %in_ptr3 : !tt.ptr -> tensor<512x!tt.ptr, #blocked> loc(#loc164) + %tmp38_43 = tt.addptr %tmp38_42, %tmp38_41 : tensor<512x!tt.ptr, #blocked>, tensor<512xi64, #blocked> loc(#loc164) + %tmp38_44 = tt.load %tmp38_43, %xmask_6 evictionPolicy = evict_last : tensor<512x!tt.ptr, #blocked> loc(#loc165) + %tmp38_45 = arith.extf %tmp38_44 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked> loc(#loc166) + %tmp39 = arith.mulf %tmp31_13, %tmp38_45 : tensor<512xf32, #blocked> loc(#loc167) + %tmp40 = arith.addf %tmp30, %tmp39 : tensor<512xf32, #blocked> loc(#loc168) + %15 = tt.splat %out_ptr0 : !tt.ptr -> tensor<512x!tt.ptr, #blocked> loc(#loc88) + %16 = tt.addptr %15, %xindex_5 : tensor<512x!tt.ptr, #blocked>, tensor<512xi32, #blocked> loc(#loc88) + %17 = arith.truncf %tmp40 : tensor<512xf32, #blocked> to tensor<512xbf16, #blocked> loc(#loc89) + tt.store %16, %17, %xmask_6 : tensor<512x!tt.ptr, #blocked> loc(#loc89) + tt.return loc(#loc90) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":19:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":19:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":20:36) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":20:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":21:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":22:19) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":24:21) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":24:28) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":25:31) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":25:36) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":25:76) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":26:31) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":26:36) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":28:18) +#loc16 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":29:19) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":30:41) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":30:35) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":30:30) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":30:60) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":30:53) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":30:111) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":31:35) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":32:32) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":33:18) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":34:18) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":35:32) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":36:28) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":36:98) +#loc30 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":36:64) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":36:108) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":36:106) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":36:123) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":37:36) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":37:58) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":37:54) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":37:31) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":37:65) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":37:123) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":38:19) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":39:13) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":43:34) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":41:34) +#loc44 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":44:19) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":45:37) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":45:42) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":45:31) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":45:68) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":45:60) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":45:119) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":46:36) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":48:20) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":49:20) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":50:35) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":51:28) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":51:100) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":51:65) +#loc58 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":51:110) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":51:108) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":51:126) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":52:37) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":52:42) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":52:64) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":52:60) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":52:31) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":52:72) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":52:131) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":53:20) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":56:35) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":55:35) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":57:20) +#loc72 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":59:20) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":60:20) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":61:35) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":62:28) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":62:46) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":62:38) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":62:56) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":62:54) +#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":62:64) +#loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":63:40) +#loc82 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":63:36) +#loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":63:31) +#loc84 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":63:48) +#loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":63:88) +#loc86 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":64:20) +#loc87 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":65:20) +#loc88 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":66:25) +#loc89 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":66:37) +#loc90 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":66:4) +#loc101 = loc("xoffset"(#loc2)) +#loc102 = loc("xoffset"(#loc3)) +#loc103 = loc("xindex"(#loc4)) +#loc104 = loc("xindex"(#loc5)) +#loc105 = loc("xmask"(#loc6)) +#loc106 = loc("x0"(#loc7)) +#loc107 = loc("x1"(#loc8)) +#loc108 = loc("x1"(#loc9)) +#loc109 = loc("tmp31"(#loc10)) +#loc110 = loc("tmp31"(#loc11)) +#loc111 = loc("tmp31"(#loc12)) +#loc112 = loc("tmp32"(#loc13)) +#loc113 = loc("tmp32"(#loc14)) +#loc114 = loc("tmp1"(#loc15)) +#loc115 = loc("tmp2"(#loc16)) +#loc116 = loc("tmp3"(#loc17)) +#loc117 = loc("tmp3"(#loc18)) +#loc118 = loc("tmp3"(#loc19)) +#loc119 = loc("tmp3"(#loc20)) +#loc120 = loc("tmp3"(#loc21)) +#loc121 = loc("tmp3"(#loc22)) +#loc122 = loc("tmp4"(#loc23)) +#loc123 = loc("tmp5"(#loc24)) +#loc124 = loc("tmp6"(#loc25)) +#loc125 = loc("tmp7"(#loc26)) +#loc126 = loc("tmp8"(#loc27)) +#loc127 = loc("tmp10"(#loc34)) +#loc128 = loc("tmp10"(#loc35)) +#loc129 = loc("tmp10"(#loc36)) +#loc130 = loc("tmp10"(#loc37)) +#loc131 = loc("tmp10"(#loc38)) +#loc132 = loc("tmp10"(#loc39)) +#loc133 = loc("tmp11"(#loc40)) +#loc134 = loc("tmp12"(#loc41)) +#loc135 = loc("tmp16"(#loc42)) +#loc136 = loc("tmp14"(#loc43)) +#loc137 = loc("tmp17"(#loc44)) +#loc138 = loc("tmp18"(#loc45)) +#loc139 = loc("tmp18"(#loc46)) +#loc140 = loc("tmp18"(#loc47)) +#loc141 = loc("tmp18"(#loc48)) +#loc142 = loc("tmp18"(#loc49)) 
+#loc143 = loc("tmp18"(#loc50)) +#loc144 = loc("tmp19"(#loc51)) +#loc145 = loc("tmp21"(#loc52)) +#loc146 = loc("tmp22"(#loc53)) +#loc147 = loc("tmp23"(#loc54)) +#loc148 = loc("tmp25"(#loc61)) +#loc149 = loc("tmp25"(#loc62)) +#loc150 = loc("tmp25"(#loc63)) +#loc151 = loc("tmp25"(#loc64)) +#loc152 = loc("tmp25"(#loc65)) +#loc153 = loc("tmp25"(#loc66)) +#loc154 = loc("tmp25"(#loc67)) +#loc155 = loc("tmp26"(#loc68)) +#loc156 = loc("tmp29"(#loc69)) +#loc157 = loc("tmp28"(#loc70)) +#loc158 = loc("tmp30"(#loc71)) +#loc159 = loc("tmp34"(#loc72)) +#loc160 = loc("tmp35"(#loc73)) +#loc161 = loc("tmp36"(#loc74)) +#loc162 = loc("tmp38"(#loc81)) +#loc163 = loc("tmp38"(#loc82)) +#loc164 = loc("tmp38"(#loc83)) +#loc165 = loc("tmp38"(#loc84)) +#loc166 = loc("tmp38"(#loc85)) +#loc167 = loc("tmp39"(#loc86)) +#loc168 = loc("tmp40"(#loc87)) +#loc169 = loc(fused[#loc135, #loc136]) +#loc170 = loc(fused[#loc156, #loc157]) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/XRR2QXTZQK4DSBTDJUTNXO6FEFXI2IIRKSC5GYSBWLTL56SKI4WA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.ttir b/SpecForge-ext/cache/compiled_kernels/triton/7/XRR2QXTZQK4DSBTDJUTNXO6FEFXI2IIRKSC5GYSBWLTL56SKI4WA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.ttir new file mode 100644 index 0000000000000000000000000000000000000000..fa6dd2268e4bbad0017d5addb353dabd811dcc49 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/XRR2QXTZQK4DSBTDJUTNXO6FEFXI2IIRKSC5GYSBWLTL56SKI4WA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.ttir @@ -0,0 +1,283 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":18:0) +#loc91 = loc("in_ptr0"(#loc)) +#loc92 = loc("in_ptr1"(#loc)) +#loc93 = loc("in_ptr2"(#loc)) +#loc94 = loc("in_ptr3"(#loc)) +#loc95 = loc("out_ptr0"(#loc)) +#loc96 = loc("ks0"(#loc)) +#loc97 = loc("ks1"(#loc)) +#loc98 = loc("ks2"(#loc)) +#loc99 = 
loc("ks3"(#loc)) +#loc100 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %ks2: i64 loc("ks2"(#loc)), %ks3: i64 loc("ks3"(#loc)), %xnumel: i32 loc("xnumel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0> : tensor<512xi64> loc(#loc1) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<512xbf16> loc(#loc1) + %cst_1 = arith.constant dense : tensor<512xi1> loc(#loc1) + %cst_2 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc1) + %c-1_i64 = arith.constant -1 : i64 loc(#loc1) + %c2_i64 = arith.constant 2 : i64 loc(#loc1) + %c512_i32 = arith.constant 512 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc101) + %xoffset_3 = arith.muli %xoffset, %c512_i32 : i32 loc(#loc102) + %xindex = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32> loc(#loc103) + %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<512xi32> loc(#loc104) + %xindex_5 = arith.addi %xindex_4, %xindex : tensor<512xi32> loc(#loc104) + %xmask = tt.splat %xnumel : i32 -> tensor<512xi32> loc(#loc105) + %xmask_6 = arith.cmpi slt, %xindex_5, %xmask : tensor<512xi32> loc(#loc105) + %x0 = arith.extsi %xindex_5 : tensor<512xi32> to tensor<512xi64> loc(#loc106) + %x0_7 = tt.splat %ks0 : i64 -> tensor<512xi64> loc(#loc106) + %x0_8 = arith.remsi %x0, %x0_7 : tensor<512xi64> loc(#loc106) + %x1 = arith.divsi %x0, %x0_7 : tensor<512xi64> loc(#loc107) + %x1_9 = tt.splat %ks1 : i64 -> tensor<512xi64> loc(#loc108) + %x1_10 = arith.remsi %x1, %x1_9 : tensor<512xi64> loc(#loc108) + %tmp31 = 
tt.splat %in_ptr0 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc109) + %tmp31_11 = tt.addptr %tmp31, %xindex_5 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc109) + %tmp31_12 = tt.load %tmp31_11, %xmask_6 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc110) + %tmp31_13 = arith.extf %tmp31_12 : tensor<512xbf16> to tensor<512xf32> loc(#loc111) + %tmp32 = tt.splat %in_ptr1 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc112) + %tmp32_14 = tt.addptr %tmp32, %x1_10 : tensor<512x!tt.ptr>, tensor<512xi64> loc(#loc112) + %tmp32_15 = tt.load %tmp32_14, %xmask_6 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc113) + %tmp1 = arith.divsi %ks0, %c2_i64 : i64 loc(#loc114) + %tmp2 = tt.splat %tmp1 : i64 -> tensor<512xi64> loc(#loc115) + %tmp2_16 = arith.cmpi sge, %x0_8, %tmp2 : tensor<512xi64> loc(#loc115) + %tmp3 = arith.muli %tmp1, %c-1_i64 : i64 loc(#loc116) + %tmp3_17 = tt.splat %tmp3 : i64 -> tensor<512xi64> loc(#loc117) + %tmp3_18 = arith.addi %x0, %tmp3_17 : tensor<512xi64> loc(#loc117) + %tmp3_19 = tt.addptr %tmp31, %tmp3_18 : tensor<512x!tt.ptr>, tensor<512xi64> loc(#loc118) + %tmp3_20 = arith.andi %tmp2_16, %xmask_6 : tensor<512xi1> loc(#loc119) + %tmp3_21 = tt.load %tmp3_19, %tmp3_20, %cst_0 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc120) + %tmp3_22 = arith.extf %tmp3_21 : tensor<512xbf16> to tensor<512xf32> loc(#loc121) + %tmp4 = tt.load %tmp32_14, %tmp3_20, %cst evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc122) + %tmp5 = tt.splat %ks2 : i64 -> tensor<512xi64> loc(#loc123) + %tmp6 = arith.addi %tmp4, %tmp5 : tensor<512xi64> loc(#loc124) + %tmp7 = arith.cmpi slt, %tmp4, %cst : tensor<512xi64> loc(#loc125) + %tmp8 = arith.select %tmp7, %tmp6, %tmp4 : tensor<512xi1>, tensor<512xi64> loc(#loc126) + %0 = arith.cmpi sge, %tmp8, %cst : tensor<512xi64> loc(#loc28) + %1 = arith.cmpi slt, %tmp8, %tmp5 : tensor<512xi64> loc(#loc29) + %2 = arith.andi %0, %1 : tensor<512xi1> loc(#loc30) + %3 = arith.xori %tmp3_20, %cst_1 : tensor<512xi1> 
loc(#loc31) + %4 = arith.ori %2, %3 : tensor<512xi1> loc(#loc32) + tt.assert %4, "index out of bounds: 0 <= tl.broadcast_to(tmp8, [XBLOCK]) < ks2" : tensor<512xi1> loc(#loc33) + %tmp10 = arith.addi %x0_8, %tmp3_17 : tensor<512xi64> loc(#loc127) + %tmp10_23 = arith.muli %x0_7, %tmp8 : tensor<512xi64> loc(#loc128) + %tmp10_24 = arith.addi %tmp10, %tmp10_23 : tensor<512xi64> loc(#loc129) + %tmp10_25 = tt.splat %in_ptr2 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc130) + %tmp10_26 = tt.addptr %tmp10_25, %tmp10_24 : tensor<512x!tt.ptr>, tensor<512xi64> loc(#loc130) + %tmp10_27 = tt.load %tmp10_26, %tmp3_20, %cst_0 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc131) + %tmp10_28 = arith.extf %tmp10_27 : tensor<512xbf16> to tensor<512xf32> loc(#loc132) + %tmp11 = arith.mulf %tmp3_22, %tmp10_28 : tensor<512xf32> loc(#loc133) + %tmp12 = arith.subf %cst_2, %tmp11 : tensor<512xf32> loc(#loc134) + %tmp16 = arith.select %tmp2_16, %tmp12, %cst_2 : tensor<512xi1>, tensor<512xf32> loc(#loc169) + %tmp17 = arith.cmpi slt, %x0_8, %tmp2 : tensor<512xi64> loc(#loc137) + %tmp18 = arith.addi %x0_7, %x0 : tensor<512xi64> loc(#loc138) + %tmp18_29 = arith.addi %tmp18, %tmp3_17 : tensor<512xi64> loc(#loc139) + %tmp18_30 = tt.addptr %tmp31, %tmp18_29 : tensor<512x!tt.ptr>, tensor<512xi64> loc(#loc140) + %tmp18_31 = arith.andi %tmp17, %xmask_6 : tensor<512xi1> loc(#loc141) + %tmp18_32 = tt.load %tmp18_30, %tmp18_31, %cst_0 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc142) + %tmp18_33 = arith.extf %tmp18_32 : tensor<512xbf16> to tensor<512xf32> loc(#loc143) + %tmp19 = tt.load %tmp32_14, %tmp18_31, %cst evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc144) + %tmp21 = arith.addi %tmp19, %tmp5 : tensor<512xi64> loc(#loc145) + %tmp22 = arith.cmpi slt, %tmp19, %cst : tensor<512xi64> loc(#loc146) + %tmp23 = arith.select %tmp22, %tmp21, %tmp19 : tensor<512xi1>, tensor<512xi64> loc(#loc147) + %5 = arith.cmpi sge, %tmp23, %cst : tensor<512xi64> loc(#loc55) + %6 = arith.cmpi 
slt, %tmp23, %tmp5 : tensor<512xi64> loc(#loc56) + %7 = arith.andi %5, %6 : tensor<512xi1> loc(#loc57) + %8 = arith.xori %tmp18_31, %cst_1 : tensor<512xi1> loc(#loc58) + %9 = arith.ori %7, %8 : tensor<512xi1> loc(#loc59) + tt.assert %9, "index out of bounds: 0 <= tl.broadcast_to(tmp23, [XBLOCK]) < ks2" : tensor<512xi1> loc(#loc60) + %tmp25 = arith.addi %x0_7, %x0_8 : tensor<512xi64> loc(#loc148) + %tmp25_34 = arith.addi %tmp25, %tmp3_17 : tensor<512xi64> loc(#loc149) + %tmp25_35 = arith.muli %x0_7, %tmp23 : tensor<512xi64> loc(#loc150) + %tmp25_36 = arith.addi %tmp25_34, %tmp25_35 : tensor<512xi64> loc(#loc151) + %tmp25_37 = tt.addptr %tmp10_25, %tmp25_36 : tensor<512x!tt.ptr>, tensor<512xi64> loc(#loc152) + %tmp25_38 = tt.load %tmp25_37, %tmp18_31, %cst_0 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc153) + %tmp25_39 = arith.extf %tmp25_38 : tensor<512xbf16> to tensor<512xf32> loc(#loc154) + %tmp26 = arith.mulf %tmp18_33, %tmp25_39 : tensor<512xf32> loc(#loc155) + %tmp29 = arith.select %tmp17, %tmp26, %cst_2 : tensor<512xi1>, tensor<512xf32> loc(#loc170) + %tmp30 = arith.addf %tmp16, %tmp29 : tensor<512xf32> loc(#loc158) + %tmp34 = tt.splat %ks3 : i64 -> tensor<512xi64> loc(#loc159) + %tmp34_40 = arith.addi %tmp32_15, %tmp34 : tensor<512xi64> loc(#loc159) + %tmp35 = arith.cmpi slt, %tmp32_15, %cst : tensor<512xi64> loc(#loc160) + %tmp36 = arith.select %tmp35, %tmp34_40, %tmp32_15 : tensor<512xi1>, tensor<512xi64> loc(#loc161) + %10 = arith.cmpi sge, %tmp36, %cst : tensor<512xi64> loc(#loc75) + %11 = arith.cmpi slt, %tmp36, %tmp34 : tensor<512xi64> loc(#loc76) + %12 = arith.andi %10, %11 : tensor<512xi1> loc(#loc77) + %13 = arith.xori %xmask_6, %cst_1 : tensor<512xi1> loc(#loc78) + %14 = arith.ori %12, %13 : tensor<512xi1> loc(#loc79) + tt.assert %14, "index out of bounds: 0 <= tmp36 < ks3" : tensor<512xi1> loc(#loc80) + %tmp38 = arith.muli %x0_7, %tmp36 : tensor<512xi64> loc(#loc162) + %tmp38_41 = arith.addi %x0_8, %tmp38 : tensor<512xi64> loc(#loc163) 
+ %tmp38_42 = tt.splat %in_ptr3 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc164) + %tmp38_43 = tt.addptr %tmp38_42, %tmp38_41 : tensor<512x!tt.ptr>, tensor<512xi64> loc(#loc164) + %tmp38_44 = tt.load %tmp38_43, %xmask_6 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc165) + %tmp38_45 = arith.extf %tmp38_44 : tensor<512xbf16> to tensor<512xf32> loc(#loc166) + %tmp39 = arith.mulf %tmp31_13, %tmp38_45 : tensor<512xf32> loc(#loc167) + %tmp40 = arith.addf %tmp30, %tmp39 : tensor<512xf32> loc(#loc168) + %15 = tt.splat %out_ptr0 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc88) + %16 = tt.addptr %15, %xindex_5 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc88) + %17 = arith.truncf %tmp40 : tensor<512xf32> to tensor<512xbf16> loc(#loc89) + tt.store %16, %17, %xmask_6 : tensor<512x!tt.ptr> loc(#loc89) + tt.return loc(#loc90) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":19:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":19:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":20:36) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":20:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":21:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":22:19) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":24:21) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":24:28) +#loc10 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":25:31) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":25:36) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":25:76) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":26:31) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":26:36) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":28:18) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":29:19) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":30:41) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":30:35) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":30:30) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":30:60) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":30:53) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":30:111) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":31:35) +#loc24 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":32:32) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":33:18) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":34:18) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":35:32) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":36:28) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":36:98) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":36:64) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":36:108) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":36:106) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":36:123) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":37:36) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":37:58) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":37:54) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":37:31) +#loc38 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":37:65) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":37:123) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":38:19) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":39:13) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":43:34) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":41:34) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":44:19) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":45:37) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":45:42) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":45:31) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":45:68) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":45:60) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":45:119) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":46:36) +#loc52 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":48:20) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":49:20) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":50:35) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":51:28) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":51:100) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":51:65) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":51:110) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":51:108) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":51:126) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":52:37) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":52:42) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":52:64) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":52:60) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":52:31) +#loc66 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":52:72) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":52:131) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":53:20) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":56:35) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":55:35) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":57:20) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":59:20) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":60:20) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":61:35) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":62:28) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":62:46) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":62:38) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":62:56) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":62:54) +#loc80 = 
loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":62:64) +#loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":63:40) +#loc82 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":63:36) +#loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":63:31) +#loc84 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":63:48) +#loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":63:88) +#loc86 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":64:20) +#loc87 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":65:20) +#loc88 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":66:25) +#loc89 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":66:37) +#loc90 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/df/cdfb6cgenzsju5cqvy4244xh4xidniyeznvkubvdg2mg6d5oc6xt.py":66:4) +#loc101 = loc("xoffset"(#loc2)) +#loc102 = loc("xoffset"(#loc3)) +#loc103 = loc("xindex"(#loc4)) +#loc104 = loc("xindex"(#loc5)) +#loc105 = loc("xmask"(#loc6)) +#loc106 = loc("x0"(#loc7)) +#loc107 = loc("x1"(#loc8)) +#loc108 = loc("x1"(#loc9)) +#loc109 = loc("tmp31"(#loc10)) +#loc110 = loc("tmp31"(#loc11)) +#loc111 = loc("tmp31"(#loc12)) +#loc112 = loc("tmp32"(#loc13)) +#loc113 = loc("tmp32"(#loc14)) +#loc114 = loc("tmp1"(#loc15)) +#loc115 = loc("tmp2"(#loc16)) +#loc116 = loc("tmp3"(#loc17)) 
+#loc117 = loc("tmp3"(#loc18)) +#loc118 = loc("tmp3"(#loc19)) +#loc119 = loc("tmp3"(#loc20)) +#loc120 = loc("tmp3"(#loc21)) +#loc121 = loc("tmp3"(#loc22)) +#loc122 = loc("tmp4"(#loc23)) +#loc123 = loc("tmp5"(#loc24)) +#loc124 = loc("tmp6"(#loc25)) +#loc125 = loc("tmp7"(#loc26)) +#loc126 = loc("tmp8"(#loc27)) +#loc127 = loc("tmp10"(#loc34)) +#loc128 = loc("tmp10"(#loc35)) +#loc129 = loc("tmp10"(#loc36)) +#loc130 = loc("tmp10"(#loc37)) +#loc131 = loc("tmp10"(#loc38)) +#loc132 = loc("tmp10"(#loc39)) +#loc133 = loc("tmp11"(#loc40)) +#loc134 = loc("tmp12"(#loc41)) +#loc135 = loc("tmp16"(#loc42)) +#loc136 = loc("tmp14"(#loc43)) +#loc137 = loc("tmp17"(#loc44)) +#loc138 = loc("tmp18"(#loc45)) +#loc139 = loc("tmp18"(#loc46)) +#loc140 = loc("tmp18"(#loc47)) +#loc141 = loc("tmp18"(#loc48)) +#loc142 = loc("tmp18"(#loc49)) +#loc143 = loc("tmp18"(#loc50)) +#loc144 = loc("tmp19"(#loc51)) +#loc145 = loc("tmp21"(#loc52)) +#loc146 = loc("tmp22"(#loc53)) +#loc147 = loc("tmp23"(#loc54)) +#loc148 = loc("tmp25"(#loc61)) +#loc149 = loc("tmp25"(#loc62)) +#loc150 = loc("tmp25"(#loc63)) +#loc151 = loc("tmp25"(#loc64)) +#loc152 = loc("tmp25"(#loc65)) +#loc153 = loc("tmp25"(#loc66)) +#loc154 = loc("tmp25"(#loc67)) +#loc155 = loc("tmp26"(#loc68)) +#loc156 = loc("tmp29"(#loc69)) +#loc157 = loc("tmp28"(#loc70)) +#loc158 = loc("tmp30"(#loc71)) +#loc159 = loc("tmp34"(#loc72)) +#loc160 = loc("tmp35"(#loc73)) +#loc161 = loc("tmp36"(#loc74)) +#loc162 = loc("tmp38"(#loc81)) +#loc163 = loc("tmp38"(#loc82)) +#loc164 = loc("tmp38"(#loc83)) +#loc165 = loc("tmp38"(#loc84)) +#loc166 = loc("tmp38"(#loc85)) +#loc167 = loc("tmp39"(#loc86)) +#loc168 = loc("tmp40"(#loc87)) +#loc169 = loc(fused[#loc135, #loc136]) +#loc170 = loc(fused[#loc156, #loc157]) diff --git a/SpecForge-ext/docs/_static/css/custom_log.css b/SpecForge-ext/docs/_static/css/custom_log.css new file mode 100644 index 0000000000000000000000000000000000000000..61f65d0199df9e97886560f7f97c6c9b026bd34e --- /dev/null +++ 
b/SpecForge-ext/docs/_static/css/custom_log.css @@ -0,0 +1,29 @@ +.output_area { + color: #615656; +} + +table.autosummary td { + width: 50% + } + + img.align-center { + display: block; + margin-left: auto; + margin-right: auto; +} + +.output_area.stderr { + color: #d3d3d3 !important; +} + +.output_area.stdout { + color: #d3d3d3 !important; +} + +div.output_area.stderr { + color: #d3d3d3 !important; +} + +div.output_area.stdout { + color: #d3d3d3 !important; +} diff --git a/SpecForge-ext/docs/_static/css/readthedocs.css b/SpecForge-ext/docs/_static/css/readthedocs.css new file mode 100644 index 0000000000000000000000000000000000000000..aca6649b436a35cf39b2c924ce2f74ed2cdc8b90 --- /dev/null +++ b/SpecForge-ext/docs/_static/css/readthedocs.css @@ -0,0 +1,9 @@ +table.autosummary td { + width: 50% +} + +img.align-center { + display: block; + margin-left: auto; + margin-right: auto; +} diff --git a/SpecForge-ext/docs/advanced_features/customization.md b/SpecForge-ext/docs/advanced_features/customization.md new file mode 100644 index 0000000000000000000000000000000000000000..47b624a9ce461b5f37aa6c159bcb306657c68ed4 --- /dev/null +++ b/SpecForge-ext/docs/advanced_features/customization.md @@ -0,0 +1,118 @@ +# 💡 Customize Your Own Training + +## 🔧 Customize Training Args + +```bash +torchrun \ + --standalone \ + --nproc_per_node 8 \ + ./scripts/train_eagle3.py \ + --target-model-path meta-llama/Llama-3.1-8B-Instruct \ + --draft-model-config ./configs/llama3-8B-eagle3.json \ + --train-data-path ./cache/dataset/sharegpt.jsonl \ + --output-dir ./outputs/llama3-8b-eagle3 \ + --num-epochs 10 \ + --batch-size 1 \ + --learning-rate 1e-4 \ + --max-length 2048 \ + --chat-template llama3 \ + --cache-dir ./cache +``` + +If you wish to understand what each argument does, you can run `python scripts/train_eagle3.py --help` to see the full list of arguments. Particularly, we will discuss some important arguments below. 
+- `--chat-template`: This should be the chat template to use for the model, so please make sure you set it to the correct value. +- `--cache-dir`: This directory contains the dataset cache including the `input_ids`, `loss_mask`, `attention_mask` and `vocab_mapping`. These caches can make your data loading much faster once a cache is generated. The cache file has a name which is obtained by hashing the dataset path to avoid cache collision. + +## 💬 Customize Chat Template + +You can register a new chat template for your model by adding a new entry to the `TEMPLATE_REGISTRY` in the `specforge.data.template.py` file. + +```python +TEMPLATE_REGISTRY.register( + name="your-template-name", + template=ChatTemplate( + assistant_header="xxx", + user_header="xxx", + system_prompt="xxx", + end_of_turn_token="xxx", + ), +) +``` + +## 🪅 Customize Model + +### Customize Target Model + +If you wish to train Eagle3 for other models, you need to modify the `--target-model-path` value. We support loading these models directly from HuggingFace. + +However, if your model is too large and requires tensor parallelism, you can implement its tensor parallel version on your own in the `specforge.modeling.target` directory. The CausalLM model should inherit the `DistributedTargetModel` class in the `specforge.modeling.target.base.py` file and apply `ColumnParallelLinear` and `RowParallelLinear` to its submodules. + +```python +from .base import DistributedTargetModel +from specforge.layers.linear import ColumnParallelLinear, RowParallelLinear + + +class MyModelForCausalLM(MyModelPreTrainedModel, GenerationMixin, DistributedTargetModel): + ... + + def load_weights(self, state_dict: Dict[str, torch.Tensor]): + ... +``` + +Afterwards, you need to register this model to the `AutoDistributedTargetModel` class in the `specforge.modeling.auto.py` file.
+ +```diff +class AutoDistributedTargetModel(AutoModelForCausalLMBase): + _model_mapping = { + Llama4TextConfig: [Llama4ForCausalLM], ++ MyModelConfig: [MyModelForCausalLM], + } +``` + +When `tp_size` is greater than 1, the script will automatically load the distributed version of the model for tensor parallelism. + +### Customize Draft Model + +If you want to change the draft model configuration, you can write your own configuration file and pass its path to the `--draft-model-config` argument. Or, if you do not provide the `--draft-model-config` argument, the script will automatically generate the draft model configuration based on the target model configuration. If you wish to serve your customized draft model with SGLang, make sure you implement the draft model in SGLang as well and that the architecture names match. To implement your own draft model, you can create a new class and inherit it from the `Eagle3DraftModel` class in the `specforge.modeling.draft.base.py` file. + + +```python +from .base import Eagle3DraftModel +from transformers import PretrainedConfig + + +class MyModelConfig(PretrainedConfig): + model_type = "mymodel" + + def __init__(self, **kwargs): + ... + + +class MyModelEagle3(Eagle3DraftModel): + + config_class = MyModelConfig + + def __init__(self, config, quant_config=None) -> None: + ... +``` + +You can then register these models to the `AutoEagle3DraftModel` and `AutoDraftModelConfig` classes in the `specforge.modeling.auto.py` file for automatic model loading.
+ +```diff +class AutoEagle3DraftModel(AutoModelForCausalLMBase): + # the model mapping is currently hardcoded, we should support lazy model mapping via registry + _model_mapping = { + LlamaConfig: [LlamaForCausalLMEagle3], ++ MyModelConfig: MyModelEagle3, + } + + +class AutoDraftModelConfig: + + _config_mapping = { + "LlamaForCausalLMEagle3": LlamaConfig, ++ "MyModelEagle3": MyModelConfig, + } +``` + +In this way, as long as your `config.json` specifies the correct architecture name, the script will automatically load the correct draft model for you. diff --git a/SpecForge-ext/docs/basic_usage/data_preparation.md b/SpecForge-ext/docs/basic_usage/data_preparation.md new file mode 100644 index 0000000000000000000000000000000000000000..e19938add815ada108d47a1d0c54d3545e63e7fc --- /dev/null +++ b/SpecForge-ext/docs/basic_usage/data_preparation.md @@ -0,0 +1,128 @@ +# 📝 Data Preparation + +## 📍 Overview + +Data is an important aspect of speculative decoding as the quality of the dataset directly affects the acceptance rate of the draft model. In this section, we will introduce how to prepare the dataset for both online and offline training. + +## ☁️ Pre-supported Datasets + +We have provided a script to prepare some sample datasets out of the box, these datasets include: +1. [ultrachat](https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k) (200k) +2. [sharegpt](https://huggingface.co/datasets/Aeala/ShareGPT_Vicuna_unfiltered) (120k) +3. [perfectblend](https://huggingface.co/datasets/mlabonne/open-perfectblend) (1.4M) +4. and others (we continuously add support for more datasets) + +You can run the script below to prepare the corresponding dataset. + +```bash +# ultrachat +python scripts/prepare_data.py --dataset ultrachat + +# sharegpt +python scripts/prepare_data.py --dataset sharegpt +``` + +You can view the full list of pre-supported datasets using `python scripts/prepare_data.py --help`. 
The datasets are processed and saved as `jsonl` files in the `cache/dataset/` directory of the project path by default. + + +## ↩️ Regenerate Datasets + +When training speculative decoding draft models for a specific target model, instead of using the original dataset, we can regenerate the assistant responses using the target model to better align the draft model with the target model's output distribution. This will improve the acceptance rate of the draft model and the overall performance of the speculative decoding. According to the [EAGLE1 paper](https://arxiv.org/pdf/2401.15077), the EAGLE method is not very sensitive to the dataset quality, which means the performance is still good even if you use the original dataset. However, if you are looking for optimal performance in the production environment, it is recommended to regenerate the dataset using the target model. + +We can follow the following steps to regenerate the dataset. In the example below, we will use `meta-llama/Llama-3.1-8B-Instruct` as an example, you can replace it with your own target model. + +1. Start the SGLang server for the target model. + +```shell +python3 -m sglang.launch_server \ + --model meta-llama/Llama-3.1-8B-Instruct \ + --cuda-graph-bs 1 2 4 8 16 32 64 128 \ + --dtype bfloat16 \ + --mem-frac=0.8 \ + --port 30000 +``` + +2. Regenerate the dataset using the `regenerate_train_data.py` script. + +```shell +python scripts/regenerate_train_data.py \ + --model meta-llama/Llama-3.1-8B-Instruct \ + --concurrency 128 \ + --max-tokens 98304 \ + --server-address localhost:30000 \ + --temperature 0.8 \ + --input-file-path ./cache/dataset/sharegpt_train.jsonl \ + --output-file-path ./cache/dataset/sharegpt_train_regen.jsonl +``` + +For maximum performance, we recommend to scale the number of GPUs to regenerate the dataset in data parallel mode. To do this, you can simply add more server addresses to the `--server-address` argument, e.g. 
`--server-address localhost:30000 localhost:30001 localhost:30002 localhost:30003`. + + +## 🤩 Prepare your own dataset + +Besides the provided datasets, you can also prepare your own dataset. We support two formats: + +#### Option 1: Conversation Format + +You should prepare the dataset in jsonl format and the schema should look like this: + +```json +{ + "id": "xxxx", + "conversations": [ + { + "role": "user | assistant", + "content": "The message content" + } + ], +} +``` + +#### Option 2: Pre-formatted Text Format + +If you already have conversations formatted with a specific chat template, you can use the pre-formatted text directly: + +```json +{ + "id": "xxxx", + "text": "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nHello<|im_end|>\n<|im_start|>assistant\nHi there!<|im_end|>\n" +} +``` + +This format is useful when you have pre-formatted prompts that were used during training of the target model and have raw generations from the target model. + +To use pre-formatted datasets, add the `--is-preformatted` flag to your training command. Note that the `--chat-template` parameter is still needed and should match the template used in your pre-formatted text, as it is used to identify user/assistant tokens to determine the assistant spans and generate the corresponding loss mask. + +```bash +# Online training with pre-formatted data +torchrun --standalone --nproc_per_node 8 \ + scripts/train_eagle3.py \ + --is-preformatted \ + --train-data-path ./your_preformatted_dataset.jsonl \ + # ... 
other arguments +``` + +For offline training, you can also use `--is-preformatted` when generating hidden states: + +```bash +# Generate hidden states from pre-formatted data +torchrun --nproc_per_node=8 \ + scripts/prepare_hidden_states.py \ + --target-model-path meta-llama/Llama-3.1-8B-Instruct \ + --data-path ./your_preformatted_dataset.jsonl \ + --output-path ./cache/hidden_states \ + --chat-template llama3 \ + --is-preformatted \ + --max-length 2048 +``` + +Once you have the `jsonl` file ready, you can proceed with online training or generate hidden states for offline training. See the Training guide for more details. + + +## ➕ Handling Multiple Datasets + +If you have multiple datasets, you can just merge them into a single jsonl file. For example, you can do something like this: + +```bash +cat dataset1.jsonl dataset2.jsonl > merged_dataset.jsonl +``` diff --git a/SpecForge-ext/docs/basic_usage/training.md b/SpecForge-ext/docs/basic_usage/training.md new file mode 100644 index 0000000000000000000000000000000000000000..a41b5a0dee1a9a12620f25ae26f613f4711d0b7c --- /dev/null +++ b/SpecForge-ext/docs/basic_usage/training.md @@ -0,0 +1,62 @@ +## 🚀 Training + +## 📍 Overview + +Existing speculative decoding methods such as EAGLE3 require training in the feature-space, which means the draft model relies on the hidden states generated from the target model for autoregressive prediction. In SpecForge, we provide two orthogonal paths to cater to the users' specific needs when training this kind of draft model. We call these two methods `Online` and `Offline`. By definition, it is easy to understand them: + +- **`Online`**: the hidden states are generated on the fly during training. +- **`Offline`**: the hidden states are generated beforehand, stored to the disk, and loaded back to GPU during training.
+ +Online training is suitable for users with limited disk space but sufficient GPUs while offline training is suitable for users with sufficient disk space but limited GPUs. + +| Method | Target Model | Disk Space Requirement | GPU Requirement | One-liner rationale | +| --- | --- | --- | --- | --- | +| Online | Used during training | Small | More GPUs are needed if your target model is large | Generating auxiliary hidden states on the fly | +| Offline | Only used during data preparation | Huge (e.g. ultrachat+sharegpt will need 12TB storage) | As low as 1 GPU, since only the draft model needs to be accommodated | Preparing auxiliary hidden states beforehand and only once | + +> **Why does disk matter?** +> During Eagle3 training, the frozen target model will first generate the hidden states for each token given the data sample. The hidden states are fed to the draft model for training. +> Offline mode stores these hidden states to the local disk, so a small disk can be filled up fast. +> Online mode only generates these hidden states on the fly without storing them to the disk, but needs to keep the target model resident in memory during training, trading GPU RAM for almost-zero disk footprint. + +## 🏎️ Online Training + +We have provided training scripts for the EAGLE3 models in the `examples` directory. These scripts cover a wide range of models, ranging from Llama to Qwen, small to large and dense to MoE. Online training is often conducted in two steps and we will use ShareGPT and Llama3-8B-Instruct as an example. + +**Step 1: Prepare the dataset** + +```bash +# prepare the dataset +python scripts/prepare_data.py --dataset sharegpt +``` + +**Step 2: Start the training** + +```bash +# train llama3-8B-instruct +bash ./examples/run_llama3.1_8b_eagle3_online.sh +``` + +## 💨 Offline Training + +The difference between online and offline training is that offline training needs to generate the hidden states before training. We also use ShareGPT and Llama3-8B-Instruct as an example.
+ +**Step 1: Prepare the dataset** + +Same as above. + +**Step 2: Generate the hidden states and train** + +```bash +# train llama3-8B-instruct in an offline manner +bash ./examples/run_llama3.1_8b_eagle3_offline.sh +``` + +It is important to note that the `run_llama3.1_8b_eagle3_offline.sh` script consists of two steps: + +1. Generate the hidden states using the `prepare_hidden_states.py` script. This script will generate the hidden states for the test and train datasets and save them to the disk. +2. Train the model: supply the `--train-hidden-states-path` argument to the script so that the script will load the hidden states from the disk during training. + +## 📈 Experiment Tracking + +This project supports logging training progress to Wandb, TensorBoard, and SwanLab. You can enable tracking by adding the `--report-to` argument to the command line in your shell script. diff --git a/SpecForge-ext/docs/benchmarks/benchmark.md b/SpecForge-ext/docs/benchmarks/benchmark.md new file mode 100644 index 0000000000000000000000000000000000000000..29a51b35d5d7639ebd666202aad3377063e4ee12 --- /dev/null +++ b/SpecForge-ext/docs/benchmarks/benchmark.md @@ -0,0 +1,67 @@ +# Benchmarking for Speculative Decoding + +## Overview + +We provide a unified script to test the performance of speculative decoding with the EAGLE3 algorithm on multiple datasets. You can follow the steps below to run the benchmarks. + +## Run Benchmarks + +### Launch SGLang and Benchmarker Concurrently + +`bench_eagle3.py` can help you launch an SGLang server process and a benchmarking process concurrently. In this way, you don't have to launch the SGLang server manually; the script will automatically handle the SGLang launch under different speculative decoding configurations. Some important arguments are: +- `--model-path`: the path to the target model. +- `--speculative-draft-model-path`: the path to the draft model. +- `--port`: the port to launch the SGLang server.
+- `--trust-remote-code`: trust the remote code. +- `--mem-fraction-static`: the memory fraction for the static memory. +- `--tp-size`: the tensor parallelism size. +- `--attention-backend`: the attention backend. +- `--config-list`: the list of speculative decoding configurations to test, the format is `<batch-size>,<num-steps>,<topk>,<num-draft-tokens>`. +- `--benchmark-list`: the list of benchmarks to test, the format is `<benchmark-name>:<num-prompts>:<subset>`. + +```shell +python3 bench_eagle3.py \ + --model-path meta-llama/Llama-3.1-8B-Instruct \ + --speculative-draft-model-path lmsys/sglang-EAGLE3-LLaMA3.1-Instruct-8B \ + --port 30000 \ + --trust-remote-code \ + --mem-fraction-static 0.8 \ + --tp-size 1 \ + --attention-backend fa3 \ + --config-list 1,0,0,0 1,3,1,4 \ + --benchmark-list mtbench gsm8k:5 ceval:5:accountant \ + --dtype bfloat16 +``` + +### Launch Benchmarker Independently + +If you want to launch the SGLang server independently, you can use the following command. + +```shell +# you can launch a server +python3 -m sglang.launch_server \ + --model meta-llama/Llama-3.1-8B-Instruct \ + --speculative-algorithm EAGLE3 \ + --speculative-draft-model-path lmsys/sglang-EAGLE3-LLaMA3.1-Instruct-8B \ + --speculative-num-steps 3 \ + --speculative-eagle-topk 1 \ + --speculative-num-draft-tokens 4 \ + --mem-fraction-static 0.75 \ + --cuda-graph-max-bs 1 \ + --tp 1 \ + --trust-remote-code \ + --host 0.0.0.0 \ + --port 30000 \ + --dtype bfloat16 +``` + +Then we can start benchmarking. Note that you should use the same host and port as the ones used by the SGLang server. Also note that `--skip-launch-server` is required to skip the launch of the SGLang server.
+ +```bash +python bench_eagle3.py \ + --model-path meta-llama/Llama-3.1-8B-Instruct \ + --port 30000 \ + --config-list 1,3,1,4 \ + --benchmark-list mtbench:5 ceval:5:accountant gsm8k:5 humaneval:5 math500:5 mtbench:5 aime:1 \ + --skip-launch-server +``` diff --git a/SpecForge-ext/docs/benchmarks/dashboard.md b/SpecForge-ext/docs/benchmarks/dashboard.md new file mode 100644 index 0000000000000000000000000000000000000000..32209b54ae412153db153fd5efd9bab47db424d6 --- /dev/null +++ b/SpecForge-ext/docs/benchmarks/dashboard.md @@ -0,0 +1,24 @@ +# Interactive Benchmark Dashboard + +View the interactive benchmark dashboard to explore SpecForge performance results: + +**[🚀 Spec Bundle](../dashboard/index.html)** + + +The dashboard displays the following key metrics: + +- **Acceptance Length**: Average number of tokens accepted per speculation step +- **Throughput**: Output tokens generated per second (tokens/s) +- **Speedup**: Performance improvement ratio over baseline + +## Benchmark Datasets + +View results across multiple benchmarks: +- MTBench +- HumanEval +- GSM8K +- Math500 + +--- + +If the dashboard doesn't load, please ensure JavaScript is enabled in your browser. diff --git a/SpecForge-ext/docs/community_resources/dashboard.md b/SpecForge-ext/docs/community_resources/dashboard.md new file mode 100644 index 0000000000000000000000000000000000000000..e7075db4bcb29e8ea3e40b16494fd7413544c3b1 --- /dev/null +++ b/SpecForge-ext/docs/community_resources/dashboard.md @@ -0,0 +1,19 @@ +# 📈 Performance Dashboard + +## Overview + +To better visualize the performance of the SpecBundle draft models, we have built a dashboard to offer interactive experiences to users to explore the evaluation results. We evaluate the performance of SpecBundle draft models under different speculative decoding configurations (i.e. 
steps, topk, num_draft_tokens) on various benchmarks, the benchmarks include: + +- Conversation + - MTBench +- General Knowledge + - GPQA + - FinanceQA +- Math + - GSM8K + - Math500 +- Coding + - HumanEval + - LiveCodeBench + +Check out the [Performance Dashboard](https://docs.sglang.io/SpecForge/SpecBundle/index.html) for more details. diff --git a/SpecForge-ext/docs/community_resources/specbundle.md b/SpecForge-ext/docs/community_resources/specbundle.md new file mode 100644 index 0000000000000000000000000000000000000000..5efb84e84e72a98be42ce445eff6c0a5e7d6bcda --- /dev/null +++ b/SpecForge-ext/docs/community_resources/specbundle.md @@ -0,0 +1,93 @@ +# 🔥 SpecBundle + +
+ specbundle logo +
+ + +## About SpecBundle + +Speculative decoding, especially EAGLE3, offers strong theoretical guarantees alongside consistent empirical improvements in token acceptance rate and end-to-end inference speed. However, despite these advances, adoption of speculative decoding—especially EAGLE3—remains limited in the open-source ecosystem, due primarily to three key factors. + +1. Lack of production-ready training infrastructure: Existing speculative decoding toolchains are largely research prototypes, offering limited system-level optimization and inadequate support for diverse architectures and large-scale models. +2. Scarcity of high-quality draft models: Effective speculative decoding depends on strong draft models, yet publicly available EAGLE3-compatible checkpoints are extremely limited, primarily originating from the original authors. +3. Insufficient training scale of existing drafts: Most available draft models are trained on small or curated datasets and fail to generalize to the large, diverse corpora used in modern LLM training, resulting in low token acceptance rates and diminished practical speedups. + +**SpecBundle** is a direct response to these limitations. Jointly driven by the open-source community and industry partners including **Ant Group**, **Meituan**, **Nex-AGI** and **EigenAI**, **SpecBundle** represents the **first open initiative** aimed at democratizing speculative decoding by providing high-performance, production-grade EAGLE3 draft model weights for mainstream open-source LLMs. This initiative also serves to verify the robustness of the **SpecForge** framework across multiple scales and architectures. + +We call on all open-source developers and industry partners to join this exciting initiative. + +## Performance Scores + +We evaluate the performance of SpecBundle draft models on various benchmarks. Please visit the [Performance Dashboard](https://docs.sglang.io/SpecForge/SpecBundle/index.html) for more details. 
+ +## Usage + +You can use the following command to launch the SGLang server with SpecBundle models. Please add the `--tp`, `--ep` and `--mem-fraction-static` arguments if you encounter memory issues. + +```bash +python3 -m sglang.launch_server \ + --model <target-model-path> \ + --speculative-algorithm EAGLE3 \ + --speculative-draft-model-path <draft-model-path> \ + --speculative-num-steps 3 \ + --speculative-eagle-topk 1 \ + --speculative-num-draft-tokens 4 +``` + +## Released Models + +We list the models released by the SpecForge team and several industry partners below. These models are released as part of the SpecBundle models, which are trained on large-scale multi-domain datasets and deliver exceptional performance on various benchmarks. + +> We also include some of the models previously trained by the SpecForge team but not technically part of the SpecBundle release. +> We mark models trained on ShareGPT+Ultrachat datasets with a **\*** mark and models trained on Perfect-Blend datasets but released before SpecBundle with a **+** mark. 
+ +### Llama Series + +| Target Model | EAGLE3 Draft Model | Regenerated Dataset | +|---------------|--------------------|--------------------| +| meta-llama/Llama-3.1-8B-Instruct | [🤗 Model](https://huggingface.co/lmsys/SGLang-EAGLE3-Llama-3.1-8B-Instruct-SpecForge) | [🤗 Dataset](https://huggingface.co/datasets/frankleeeee/PerfectBlend-Regenerated-Llama-3.1-8B-Instruct) | +| meta-llama/Llama-3.3-70B-Instruct | [🤗 Model](https://huggingface.co/lmsys/SGLang-EAGLE3-Llama-3.3-70B-Instruct-SpecForge) | [🤗 Dataset](https://huggingface.co/datasets/frankleeeee/PerfectBlend-Regenerated-Llama-3.3-70B-Instruct) | +| meta-llama/Llama-4-Scout-17B-16E-Instruct | [🤗 Model](https://huggingface.co/lmsys/SGLang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-SpecForge) | [🤗 Dataset](https://huggingface.co/datasets/frankleeeee/PerfectBlend-Regenerated-Llama-4-Scout-17B-16E-Instruct) | +| meta-llama/Llama-4-Maverick-17B-128E-Instruct | [🤗 Model *](https://huggingface.co/lmsys/sglang-EAGLE3-Llama-4-Maverick-17B-128E-Instruct-v1) | [🤗 Dataset](https://huggingface.co/datasets/frankleeeee/PerfectBlend-Regenerated-Llama-4-Maverick-17B-128E-Instruct) | + +### Qwen Series + +| Target Model | EAGLE3 Draft Model | Regenerated Dataset | +|---------------|--------------------|--------------------| +| Qwen/Qwen3-30B-A3B-Instruct-2507 | [🤗 Model](https://huggingface.co/lmsys/SGLang-EAGLE3-Qwen3-30B-A3B-Instruct-2507-SpecForge-Nex) | [🤗 Dataset](https://huggingface.co/datasets/lukeysong/qwen-30b-regen-blend) | +| Qwen/Qwen3-235B-A22B-Instruct-2507 | [🤗 Model](https://huggingface.co/lmsys/SGLang-EAGLE3-Qwen3-235B-A22B-Instruct-2507-SpecForge-Meituan) | [🤗 Dataset](https://huggingface.co/datasets/lukeysong/qwen3-235-regen-perfect_blend) | +| Qwen/Qwen3-Next-80B-A3B-Instruct-FP8 | [🤗 Model](https://huggingface.co/lmsys/SGLang-EAGLE3-Qwen3-Next-80B-A3B-Instruct-FP8-perfect-blend-regenerated) | [🤗 Dataset](https://huggingface.co/datasets/lukeysong/qwen3-80b-regen-prefectblend) | + +### Qwen Coder Series + +| 
Target Model | EAGLE3 Draft Model | Regenerated Dataset | +|---------------|--------------------|--------------------| +| Qwen/Qwen3-Coder-30B-A3B-Instruct | [🤗 Model](https://huggingface.co/lmsys/SGLang-EAGLE3-Qwen3-Coder-30B-A3B-Instruct-SpecForge) | [🤗 Dataset](https://huggingface.co/datasets/JinnP/opc_regen_Qwen3-Coder-30B-A3B-Instruct) | +| Qwen/Qwen3-Coder-480B-A35B-Instruct | [🤗 Model](https://huggingface.co/lmsys/SGLang-EAGLE3-Qwen3-Coder-480B-A35B-Instruct-SpecForge-EigenAI) | - | + +### Ling Series + +| Target Model | EAGLE3 Draft Model | Regenerated Dataset | +|---------------|--------------------|--------------------| +| inclusionAI/Ling-flash-2.0 | [🤗 Model](https://huggingface.co/AQ-MedAI/Ling-Flash-2.0-eagle3) | - | + +### Kimi Series + +| Target Model | EAGLE3 Draft Model | Regenerated Dataset | +|---------------|--------------------|--------------------| +| moonshotai/Kimi-K2-Instruct | [🤗 Model](https://huggingface.co/AQ-MedAI/Kimi-K2-Instruct-eagle3) | - | + +### GPT-OSS Series + +| Target Model | EAGLE3 Draft Model | Regenerated Dataset | +|---------------|--------------------|--------------------| +| openai/gpt-oss-20b | [🤗 Model +](https://huggingface.co/zhuyksir/EAGLE3-gpt-oss-20b-bf16) | [🤗 Dataset](https://huggingface.co/datasets/zhuyksir/perfect-blend-gptoss-20B-1M) | +| openai/gpt-oss-120b | [🤗 Model +](https://huggingface.co/lmsys/EAGLE3-gpt-oss-120b-bf16) | - | + +### Nex Series + +| Target Model | EAGLE3 Draft Model | Regenerated Dataset | +|---------------|--------------------|--------------------| +| nex-agi/Qwen3-30B-A3B-Nex-N1 | [🤗 Model](https://huggingface.co/nex-agi/SGLANG-EAGLE3-Qwen3-30B-A3B-Nex-N1) | - | +| nex-agi/Qwen3-32B-Nex-N1 | [🤗 Model](https://huggingface.co/nex-agi/SGLANG-EAGLE3-Qwen3-32B-Nex-N1) | - | diff --git a/SpecForge-ext/docs/concepts/EAGLE3.md b/SpecForge-ext/docs/concepts/EAGLE3.md new file mode 100644 index 0000000000000000000000000000000000000000..fff48dc0a8c49a5e2df53917ee723a6515fb7830 --- /dev/null +++ 
b/SpecForge-ext/docs/concepts/EAGLE3.md @@ -0,0 +1,19 @@ +# 🦅 EAGLE3 + +## 📍 Overview + +In earlier speculative decoding practice, we usually choose a small language model from the same family as the draft model. For example, we can use `Llama-3.1-8B-Instruct` as the draft model and `Llama-3.1-70B-Instruct` as the target model. However, this approach is not always feasible because the small language model may not always be available. Thus, researchers have proposed training a separate small model as the speculator. Models of this type usually use the target model's hidden states or KV cache as input to predict the next few tokens. + +Among models of this type, EAGLE3 is the state of the art and has been integrated into [SGLang](https://github.com/sgl-project/sglang). It relies on the hidden states of the target model and often consists of only one dense decoder layer. Before you read on, you can revisit the details of [speculative decoding](./speculative_decoding.md) first if you are not familiar with it. + +## 🔧 How it works? + +

+ EAGLE3
+ Source: Blog by NVIDIA +

+ +The workflow of EAGLE3 is shown in the animation above. It differs from other speculative decoding methods in several ways: +1. **`Feature-based Drafting`**: Unlike other speculative decoding methods that directly feed the tokens to the draft model to generate predictions, EAGLE3 operates in the feature space. It will extract the 3 hidden states from the target model at 3 layers at different depths and concatenate them together to form a single feature vector. This feature vector will be fed to the draft model to generate predictions. +2. **`Training-time Test`**: During training, EAGLE3 simulates the autoregressive generation process by autoregressively generating the next few tokens. It then computes the loss between the predicted output sequence and the ground truth sequence. This method improves the draft model performance because it reduces the generation errors accumulated from previous tokens, resulting in a higher acceptance rate. +3. **`Dynamic Draft Tree`**: EAGLE3 uses a dynamic draft tree to store the candidate tokens as proposed in [EAGLE2](https://arxiv.org/abs/2406.16858). In simple words, it will only store the candidate tokens that are most likely to be accepted by the target model to improve the acceptance rate. diff --git a/SpecForge-ext/docs/concepts/speculative_decoding.md b/SpecForge-ext/docs/concepts/speculative_decoding.md new file mode 100644 index 0000000000000000000000000000000000000000..283c6167332e958c923ec6b72da58013676089e1 --- /dev/null +++ b/SpecForge-ext/docs/concepts/speculative_decoding.md @@ -0,0 +1,30 @@ +# 💭 Speculative Decoding + +## 📍 Overview + +One existing challenge of LLM inference is the latency. As LLMs autoregressively generate the output token by token, the decoding process is largely bottlenecked by the memory bandwidth, i.e. the inference engine needs to load the whole model weights into memory for each token generation. 
The idea of speculative decoding stems from the thought that we can use a small model to predict the next few tokens in advance and let our main model verify these tokens in parallel. As the decoding process is memory-bound, the time taken to verify multiple tokens is comparable to the time taken to generate a single token. In this way, we can speed up the decoding process significantly by speculating the next few tokens in advance. + +## 🔧 How it works? + +In speculative decoding, we have two models: +1. **`Target Model`**: a large model that is intended to serve the users, e.g. the model you want to deploy for production. +2. **`Draft Model`**: a small model that is trained to predict the next few tokens in advance. It can take various forms, e.g. an n-gram model, a pretrained small language model (often from the same model family), or a separately trained small model (EAGLE). + +

+ Drafting
+ Source: Blog by NVIDIA +

+ +The role of the draft model is to predict the next few tokens in advance, and the role of the target model is to verify the tokens predicted by the draft model. As shown in the animation above, the workflow of speculative decoding can be decomposed into 3 stages: + +- **`prefill`**: the target model will first take the prompt as the input and run the prefill stage. +- **`drafting`**: Afterwards, we let the draft model iteratively predict the next N candidate tokens. Since the draft model is often much smaller than the target model, the drafting time is insignificant. +- **`verification`**: We then pass the N candidate tokens to the target model to verify in parallel. Since this stage is memory-bound, increasing the number of tokens does not increase the latency significantly. If a token is accepted by the target model, it will be added to the output sequence; otherwise, it will be discarded. The draft model will continue to predict the next tokens based on the last accepted token and this process will repeat until the end of the sequence is reached. + +One advantage of speculative decoding is that it guarantees the output distribution is the same as that of using the target model alone. This is because the target model will decide the acceptance of the candidate tokens using rejection sampling. The speculative decoding paper has provided a mathematical proof for its correctness in the [appendix section](https://arxiv.org/pdf/2211.17192#page=10.10). +In simple terms, it will only accept the candidate tokens that are most likely to be correct. Let's denote the probability of a token generated by the target model as $p(x)$ and the probability of a token generated by the draft model as $q(x)$. If $q(x) \le p(x)$, then the token will be accepted. If $q(x) > p(x)$, the target model will reject the token with probability $1 - p(x)/q(x)$ and sample a new token from the distribution $p'(x) = \text{norm}(\max(0, p(x) - q(x)))$. 
The animation below shows the verification process. + +

+ Verification
+ Source: Blog by NVIDIA +

diff --git a/SpecForge-ext/docs/examples/llama3-eagle3-offline.md b/SpecForge-ext/docs/examples/llama3-eagle3-offline.md new file mode 100644 index 0000000000000000000000000000000000000000..a8449cc612073c9ca76804891ca9339fad9c8ca1 --- /dev/null +++ b/SpecForge-ext/docs/examples/llama3-eagle3-offline.md @@ -0,0 +1,57 @@ +# Eagle3 for Llama3 - Offline + +## Introduction + +This document provides a step-by-step guide on how to train the EAGLE3 model for the Llama3.1-8B-Instruct model in an offline manner. In offline training, we generate the hidden states required by EAGLE3 draft model beforehand and store them to the disk. During training, we load them back to the GPU memory. As offline training requires a lot of disk space, we do not recommend running this on large datasets such as Perfect-Blend. + +## Training on ShareGPT dataset + +### **Step 1. Prepare ShareGPT dataset** + +First of all, we should download the dataset. + +```shell +python ./scripts/prepare_data.py --dataset sharegpt +``` + +### **Step 2. Prepare Hidden States** + +We need to prepare the hidden states for the training. + +```shell +torchrun \ + --standalone \ + --nproc_per_node 8 \ + scripts/prepare_hidden_states.py \ + --target-model-path meta-llama/Llama-3.1-8B-Instruct \ + --enable-aux-hidden-states \ + --data-path ./cache/dataset/sharegpt_train.jsonl \ + --output-path ./cache/hidden_states/sharegpt_train_Llama-3.1-8B-Instruct \ + --chat-template llama3 \ + --max-length 4096 \ + --tp-size 1 \ + --batch-size 32 +``` + +The hidden states will be saved to the disk in the `output-path` directory. + +### **Step 3. 
Start Training** + +```shell +torchrun \ + --standalone \ + --nproc_per_node 8 \ + ./scripts/train_eagle3.py \ + --target-model-path meta-llama/Llama-3.1-8B-Instruct \ + --draft-model-config ./configs/llama3-8B-eagle3.json \ + --train-data-path ./cache/dataset/sharegpt_train.jsonl \ + --train-hidden-states-path ./cache/hidden_states/sharegpt_train_Llama-3.1-8B-Instruct \ + --output-dir ./outputs/llama3-8b-eagle3-sharegpt-offline \ + --num-epochs 10 \ + --batch-size 1 \ + --tp-size 1 \ + --learning-rate 1e-4 \ + --max-length 4096 \ + --chat-template llama3 \ + --cache-dir ./cache +``` diff --git a/SpecForge-ext/docs/examples/llama3-eagle3-online.md b/SpecForge-ext/docs/examples/llama3-eagle3-online.md new file mode 100644 index 0000000000000000000000000000000000000000..13dd2fdd1c9ed9b9f06505a52b5db272c5a3bd49 --- /dev/null +++ b/SpecForge-ext/docs/examples/llama3-eagle3-online.md @@ -0,0 +1,75 @@ +# Eagle3 for Llama3 - Online + +## Introduction + +This document provides a step-by-step guide on how to train the EAGLE3 model for the Llama3.1-8B-Instruct model in an online manner. In online training, we generate the hidden states required by the EAGLE3 draft model on the fly during training. This example uses the `ShareGPT` dataset for training, so the performance is not optimal due to the size and limited coverage of the dataset. If you are looking for optimal performance, we recommend trying more diverse datasets such as [`Perfect-Blend`](https://huggingface.co/datasets/facebook/perfect-blend). We have also included a section on training on the `Perfect-Blend` dataset at the end of this document. + + +## Training on ShareGPT dataset + +### **Step 1. Prepare ShareGPT dataset** + +First of all, we should download the dataset. + +```shell +python ./scripts/prepare_data.py --dataset sharegpt +``` + +### **Step 2. 
Launch Online Training** + +```shell +torchrun \ + --standalone \ + --nproc_per_node 8 \ + scripts/train_eagle3.py \ + --target-model-path meta-llama/Llama-3.1-8B-Instruct \ + --draft-model-config configs/llama3-8B-eagle3.json \ + --train-data-path ./cache/dataset/sharegpt_train.jsonl \ + --output-dir ./outputs/llama3-8b-eagle3 \ + --num-epochs 2 \ + --batch-size 1 \ + --learning-rate 1e-4 \ + --max-length 4096 \ + --chat-template llama3 \ + --target-model-backend sglang +``` + +### **Step 3. Benchmark** + +For `Llama3.1-8B`, we add a system prompt to all training data, following the approach used in the official repository. Consequently, when benchmarking, we should also include this system prompt to obtain the full accept length. Please uncomment the corresponding line and add the system prompt. + +The four numbers in the config represent: `batch_size, num_steps, topk, num_verify_tokens`. You can adjust the values in the config list to experiment with different test cases. + +A pre-trained EAGLE model is available at [zhuyksir/EAGLE3-Llama-3.1-8B-Instruct](https://huggingface.co/zhuyksir/EAGLE3-Llama-3.1-8B-Instruct) for reference. + +```shell +cd benchmarks + +config_list=( + "4,3,1,4" + "4,7,10,60" +) +python3 bench_eagle3.py \ + --model-path meta-llama/Llama-3.1-8B-Instruct \ + --speculative-draft-model-path /YOUR/PATH/Llama-3.1-8B-Instruct/dev_outputs/epoch_0 \ + --port 30000 \ + --mem-fraction-static 0.8 \ + --tp-size 1 \ + --config-list "${config_list[@]}" \ + --benchmark-list mtbench gsm8k humaneval math500 +``` + + +## Training on Perfect-Blend dataset + +### **Step 1. Prepare Perfect-Blend dataset** + +First of all, we should download the dataset. + +```shell +python ./scripts/prepare_data.py --dataset perfectblend +``` + +### **Step 2. Launch Online Training** + +We just need to change `--train-data-path` to the path of the Perfect-Blend dataset (e.g. `./cache/dataset/perfectblend_train.jsonl`); then we can launch training in the same way as above. 
diff --git a/SpecForge-ext/docs/get_started/about.md b/SpecForge-ext/docs/get_started/about.md new file mode 100644 index 0000000000000000000000000000000000000000..e98794b56bc94d4f54cd08ce95405d4792dfb002 --- /dev/null +++ b/SpecForge-ext/docs/get_started/about.md @@ -0,0 +1,13 @@ +# ⚡️ About SpecForge + +## 💡 Motivation + +Speculative decoding is an important and powerful technique for speeding up inference without losing performance. Industry has used it extensively in production to better serve users with lower latency and higher throughput. We have seen some open-source projects for training speculative decoding models, but most of them are not well-maintained or not directly compatible with SGLang. We built this project so that the open-source community can enjoy a speculative decoding framework that is + +- regularly maintained by the SGLang team: the code is runnable out-of-the-box +- directly compatible with SGLang: no additional effort is needed to port models to SGLang +- performant in training: we provide online, offline, tensor-parallel, and FSDP training to suit your needs + +## ✅ SGLang-ready + +As SpecForge is built by the SGLang team, we ensure that the draft models trained with SpecForge are directly compatible with [SGLang](https://github.com/sgl-project/sglang). This means that no postprocessing or weights conversion is required, providing users with a seamless experience from training to serving. We export model weights in the Hugging Face format, so you can also load them into other serving frameworks that support the model. 
diff --git a/SpecForge-ext/docs/get_started/installation.md b/SpecForge-ext/docs/get_started/installation.md new file mode 100644 index 0000000000000000000000000000000000000000..e37169a11389c977667569e069436a348090b30f --- /dev/null +++ b/SpecForge-ext/docs/get_started/installation.md @@ -0,0 +1,26 @@ +# 🚀 Get Started + +## 📦 Installation + +To install this project, you can simply run the following command. + +- **Install from source (recommended)** + +```bash +# git clone the source code +git clone https://github.com/sgl-project/SpecForge.git +cd SpecForge + +# create a new virtual environment +uv venv -p 3.11 +source .venv/bin/activate + +# install specforge +uv pip install -v . --prerelease=allow +``` + +- **Install from PyPI** + +```bash +pip install specforge +``` diff --git a/SpecForge-ext/docs/spec_bundle/index.html b/SpecForge-ext/docs/spec_bundle/index.html new file mode 100644 index 0000000000000000000000000000000000000000..ad336a93a9fd136ac55768562e96a1d8f324d001 --- /dev/null +++ b/SpecForge-ext/docs/spec_bundle/index.html @@ -0,0 +1,21 @@ + + + + + + + + + + + SpecBundle + + + +
+ + + + diff --git a/SpecForge-ext/docs/spec_bundle/package-lock.json b/SpecForge-ext/docs/spec_bundle/package-lock.json new file mode 100644 index 0000000000000000000000000000000000000000..806b12c06b0402351fc4f8a900bc7303a86faa6a --- /dev/null +++ b/SpecForge-ext/docs/spec_bundle/package-lock.json @@ -0,0 +1,1438 @@ +{ + "name": "specforge-spec-bundle", + "version": "1.0.0", + "lockfileVersion": 3, + "requires": true, + "packages": { + "": { + "name": "specforge-spec-bundle", + "version": "1.0.0", + "license": "MIT", + "dependencies": { + "echarts": "^6.0.0", + "papaparse": "^5.5.3", + "vue": "^3.5.24", + "vue-echarts": "^8.0.1", + "xlsx": "^0.18.5" + }, + "devDependencies": { + "@vitejs/plugin-vue": "^6.0.1", + "vite": "^7.2.4" + } + }, + "node_modules/@babel/helper-string-parser": { + "version": "7.27.1", + "resolved": "https://registry.npmmirror.com/@babel/helper-string-parser/-/helper-string-parser-7.27.1.tgz", + "integrity": "sha512-qMlSxKbpRlAridDExk92nSobyDdpPijUq2DW6oDnUqd0iOGxmQjyqhMIihI9+zv4LPyZdRje2cavWPbCbWm3eA==", + "license": "MIT", + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/helper-validator-identifier": { + "version": "7.28.5", + "resolved": "https://registry.npmmirror.com/@babel/helper-validator-identifier/-/helper-validator-identifier-7.28.5.tgz", + "integrity": "sha512-qSs4ifwzKJSV39ucNjsvc6WVHs6b7S03sOh2OcHF9UHfVPqWWALUsNUVzhSBiItjRZoLHx7nIarVjqKVusUZ1Q==", + "license": "MIT", + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/parser": { + "version": "7.28.5", + "resolved": "https://registry.npmmirror.com/@babel/parser/-/parser-7.28.5.tgz", + "integrity": "sha512-KKBU1VGYR7ORr3At5HAtUQ+TV3SzRCXmA/8OdDZiLDBIZxVyzXuztPjfLd3BV1PRAQGCMWWSHYhL0F8d5uHBDQ==", + "license": "MIT", + "dependencies": { + "@babel/types": "^7.28.5" + }, + "bin": { + "parser": "bin/babel-parser.js" + }, + "engines": { + "node": ">=6.0.0" + } + }, + "node_modules/@babel/types": { + "version": "7.28.5", + "resolved": 
"https://registry.npmmirror.com/@babel/types/-/types-7.28.5.tgz", + "integrity": "sha512-qQ5m48eI/MFLQ5PxQj4PFaprjyCTLI37ElWMmNs0K8Lk3dVeOdNpB3ks8jc7yM5CDmVC73eMVk/trk3fgmrUpA==", + "license": "MIT", + "dependencies": { + "@babel/helper-string-parser": "^7.27.1", + "@babel/helper-validator-identifier": "^7.28.5" + }, + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@esbuild/aix-ppc64": { + "version": "0.25.12", + "resolved": "https://registry.npmmirror.com/@esbuild/aix-ppc64/-/aix-ppc64-0.25.12.tgz", + "integrity": "sha512-Hhmwd6CInZ3dwpuGTF8fJG6yoWmsToE+vYgD4nytZVxcu1ulHpUQRAB1UJ8+N1Am3Mz4+xOByoQoSZf4D+CpkA==", + "cpu": [ + "ppc64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "aix" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/android-arm": { + "version": "0.25.12", + "resolved": "https://registry.npmmirror.com/@esbuild/android-arm/-/android-arm-0.25.12.tgz", + "integrity": "sha512-VJ+sKvNA/GE7Ccacc9Cha7bpS8nyzVv0jdVgwNDaR4gDMC/2TTRc33Ip8qrNYUcpkOHUT5OZ0bUcNNVZQ9RLlg==", + "cpu": [ + "arm" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/android-arm64": { + "version": "0.25.12", + "resolved": "https://registry.npmmirror.com/@esbuild/android-arm64/-/android-arm64-0.25.12.tgz", + "integrity": "sha512-6AAmLG7zwD1Z159jCKPvAxZd4y/VTO0VkprYy+3N2FtJ8+BQWFXU+OxARIwA46c5tdD9SsKGZ/1ocqBS/gAKHg==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/android-x64": { + "version": "0.25.12", + "resolved": "https://registry.npmmirror.com/@esbuild/android-x64/-/android-x64-0.25.12.tgz", + "integrity": "sha512-5jbb+2hhDHx5phYR2By8GTWEzn6I9UqR11Kwf22iKbNpYrsmRB18aX/9ivc5cabcUiAT/wM+YIZ6SG9QO6a8kg==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + 
"android" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/darwin-arm64": { + "version": "0.25.12", + "resolved": "https://registry.npmmirror.com/@esbuild/darwin-arm64/-/darwin-arm64-0.25.12.tgz", + "integrity": "sha512-N3zl+lxHCifgIlcMUP5016ESkeQjLj/959RxxNYIthIg+CQHInujFuXeWbWMgnTo4cp5XVHqFPmpyu9J65C1Yg==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/darwin-x64": { + "version": "0.25.12", + "resolved": "https://registry.npmmirror.com/@esbuild/darwin-x64/-/darwin-x64-0.25.12.tgz", + "integrity": "sha512-HQ9ka4Kx21qHXwtlTUVbKJOAnmG1ipXhdWTmNXiPzPfWKpXqASVcWdnf2bnL73wgjNrFXAa3yYvBSd9pzfEIpA==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/freebsd-arm64": { + "version": "0.25.12", + "resolved": "https://registry.npmmirror.com/@esbuild/freebsd-arm64/-/freebsd-arm64-0.25.12.tgz", + "integrity": "sha512-gA0Bx759+7Jve03K1S0vkOu5Lg/85dou3EseOGUes8flVOGxbhDDh/iZaoek11Y8mtyKPGF3vP8XhnkDEAmzeg==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "freebsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/freebsd-x64": { + "version": "0.25.12", + "resolved": "https://registry.npmmirror.com/@esbuild/freebsd-x64/-/freebsd-x64-0.25.12.tgz", + "integrity": "sha512-TGbO26Yw2xsHzxtbVFGEXBFH0FRAP7gtcPE7P5yP7wGy7cXK2oO7RyOhL5NLiqTlBh47XhmIUXuGciXEqYFfBQ==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "freebsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-arm": { + "version": "0.25.12", + "resolved": "https://registry.npmmirror.com/@esbuild/linux-arm/-/linux-arm-0.25.12.tgz", + "integrity": 
"sha512-lPDGyC1JPDou8kGcywY0YILzWlhhnRjdof3UlcoqYmS9El818LLfJJc3PXXgZHrHCAKs/Z2SeZtDJr5MrkxtOw==", + "cpu": [ + "arm" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-arm64": { + "version": "0.25.12", + "resolved": "https://registry.npmmirror.com/@esbuild/linux-arm64/-/linux-arm64-0.25.12.tgz", + "integrity": "sha512-8bwX7a8FghIgrupcxb4aUmYDLp8pX06rGh5HqDT7bB+8Rdells6mHvrFHHW2JAOPZUbnjUpKTLg6ECyzvas2AQ==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-ia32": { + "version": "0.25.12", + "resolved": "https://registry.npmmirror.com/@esbuild/linux-ia32/-/linux-ia32-0.25.12.tgz", + "integrity": "sha512-0y9KrdVnbMM2/vG8KfU0byhUN+EFCny9+8g202gYqSSVMonbsCfLjUO+rCci7pM0WBEtz+oK/PIwHkzxkyharA==", + "cpu": [ + "ia32" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-loong64": { + "version": "0.25.12", + "resolved": "https://registry.npmmirror.com/@esbuild/linux-loong64/-/linux-loong64-0.25.12.tgz", + "integrity": "sha512-h///Lr5a9rib/v1GGqXVGzjL4TMvVTv+s1DPoxQdz7l/AYv6LDSxdIwzxkrPW438oUXiDtwM10o9PmwS/6Z0Ng==", + "cpu": [ + "loong64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-mips64el": { + "version": "0.25.12", + "resolved": "https://registry.npmmirror.com/@esbuild/linux-mips64el/-/linux-mips64el-0.25.12.tgz", + "integrity": "sha512-iyRrM1Pzy9GFMDLsXn1iHUm18nhKnNMWscjmp4+hpafcZjrr2WbT//d20xaGljXDBYHqRcl8HnxbX6uaA/eGVw==", + "cpu": [ + "mips64el" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-ppc64": { + "version": 
"0.25.12", + "resolved": "https://registry.npmmirror.com/@esbuild/linux-ppc64/-/linux-ppc64-0.25.12.tgz", + "integrity": "sha512-9meM/lRXxMi5PSUqEXRCtVjEZBGwB7P/D4yT8UG/mwIdze2aV4Vo6U5gD3+RsoHXKkHCfSxZKzmDssVlRj1QQA==", + "cpu": [ + "ppc64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-riscv64": { + "version": "0.25.12", + "resolved": "https://registry.npmmirror.com/@esbuild/linux-riscv64/-/linux-riscv64-0.25.12.tgz", + "integrity": "sha512-Zr7KR4hgKUpWAwb1f3o5ygT04MzqVrGEGXGLnj15YQDJErYu/BGg+wmFlIDOdJp0PmB0lLvxFIOXZgFRrdjR0w==", + "cpu": [ + "riscv64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-s390x": { + "version": "0.25.12", + "resolved": "https://registry.npmmirror.com/@esbuild/linux-s390x/-/linux-s390x-0.25.12.tgz", + "integrity": "sha512-MsKncOcgTNvdtiISc/jZs/Zf8d0cl/t3gYWX8J9ubBnVOwlk65UIEEvgBORTiljloIWnBzLs4qhzPkJcitIzIg==", + "cpu": [ + "s390x" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-x64": { + "version": "0.25.12", + "resolved": "https://registry.npmmirror.com/@esbuild/linux-x64/-/linux-x64-0.25.12.tgz", + "integrity": "sha512-uqZMTLr/zR/ed4jIGnwSLkaHmPjOjJvnm6TVVitAa08SLS9Z0VM8wIRx7gWbJB5/J54YuIMInDquWyYvQLZkgw==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/netbsd-arm64": { + "version": "0.25.12", + "resolved": "https://registry.npmmirror.com/@esbuild/netbsd-arm64/-/netbsd-arm64-0.25.12.tgz", + "integrity": "sha512-xXwcTq4GhRM7J9A8Gv5boanHhRa/Q9KLVmcyXHCTaM4wKfIpWkdXiMog/KsnxzJ0A1+nD+zoecuzqPmCRyBGjg==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + 
"netbsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/netbsd-x64": { + "version": "0.25.12", + "resolved": "https://registry.npmmirror.com/@esbuild/netbsd-x64/-/netbsd-x64-0.25.12.tgz", + "integrity": "sha512-Ld5pTlzPy3YwGec4OuHh1aCVCRvOXdH8DgRjfDy/oumVovmuSzWfnSJg+VtakB9Cm0gxNO9BzWkj6mtO1FMXkQ==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "netbsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/openbsd-arm64": { + "version": "0.25.12", + "resolved": "https://registry.npmmirror.com/@esbuild/openbsd-arm64/-/openbsd-arm64-0.25.12.tgz", + "integrity": "sha512-fF96T6KsBo/pkQI950FARU9apGNTSlZGsv1jZBAlcLL1MLjLNIWPBkj5NlSz8aAzYKg+eNqknrUJ24QBybeR5A==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "openbsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/openbsd-x64": { + "version": "0.25.12", + "resolved": "https://registry.npmmirror.com/@esbuild/openbsd-x64/-/openbsd-x64-0.25.12.tgz", + "integrity": "sha512-MZyXUkZHjQxUvzK7rN8DJ3SRmrVrke8ZyRusHlP+kuwqTcfWLyqMOE3sScPPyeIXN/mDJIfGXvcMqCgYKekoQw==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "openbsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/openharmony-arm64": { + "version": "0.25.12", + "resolved": "https://registry.npmmirror.com/@esbuild/openharmony-arm64/-/openharmony-arm64-0.25.12.tgz", + "integrity": "sha512-rm0YWsqUSRrjncSXGA7Zv78Nbnw4XL6/dzr20cyrQf7ZmRcsovpcRBdhD43Nuk3y7XIoW2OxMVvwuRvk9XdASg==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "openharmony" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/sunos-x64": { + "version": "0.25.12", + "resolved": "https://registry.npmmirror.com/@esbuild/sunos-x64/-/sunos-x64-0.25.12.tgz", + "integrity": 
"sha512-3wGSCDyuTHQUzt0nV7bocDy72r2lI33QL3gkDNGkod22EsYl04sMf0qLb8luNKTOmgF/eDEDP5BFNwoBKH441w==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "sunos" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/win32-arm64": { + "version": "0.25.12", + "resolved": "https://registry.npmmirror.com/@esbuild/win32-arm64/-/win32-arm64-0.25.12.tgz", + "integrity": "sha512-rMmLrur64A7+DKlnSuwqUdRKyd3UE7oPJZmnljqEptesKM8wx9J8gx5u0+9Pq0fQQW8vqeKebwNXdfOyP+8Bsg==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/win32-ia32": { + "version": "0.25.12", + "resolved": "https://registry.npmmirror.com/@esbuild/win32-ia32/-/win32-ia32-0.25.12.tgz", + "integrity": "sha512-HkqnmmBoCbCwxUKKNPBixiWDGCpQGVsrQfJoVGYLPT41XWF8lHuE5N6WhVia2n4o5QK5M4tYr21827fNhi4byQ==", + "cpu": [ + "ia32" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/win32-x64": { + "version": "0.25.12", + "resolved": "https://registry.npmmirror.com/@esbuild/win32-x64/-/win32-x64-0.25.12.tgz", + "integrity": "sha512-alJC0uCZpTFrSL0CCDjcgleBXPnCrEAhTBILpeAp7M/OFgoqtAetfBzX0xM00MUsVVPpVjlPuMbREqnZCXaTnA==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@jridgewell/sourcemap-codec": { + "version": "1.5.5", + "resolved": "https://registry.npmmirror.com/@jridgewell/sourcemap-codec/-/sourcemap-codec-1.5.5.tgz", + "integrity": "sha512-cYQ9310grqxueWbl+WuIUIaiUaDcj7WOq5fVhEljNVgRfOUhY9fy2zTvfoqWsnebh8Sl70VScFbICvJnLKB0Og==", + "license": "MIT" + }, + "node_modules/@rolldown/pluginutils": { + "version": "1.0.0-beta.50", + "resolved": "https://registry.npmmirror.com/@rolldown/pluginutils/-/pluginutils-1.0.0-beta.50.tgz", + "integrity": 
"sha512-5e76wQiQVeL1ICOZVUg4LSOVYg9jyhGCin+icYozhsUzM+fHE7kddi1bdiE0jwVqTfkjba3jUFbEkoC9WkdvyA==", + "dev": true, + "license": "MIT" + }, + "node_modules/@rollup/rollup-android-arm-eabi": { + "version": "4.53.3", + "resolved": "https://registry.npmmirror.com/@rollup/rollup-android-arm-eabi/-/rollup-android-arm-eabi-4.53.3.tgz", + "integrity": "sha512-mRSi+4cBjrRLoaal2PnqH82Wqyb+d3HsPUN/W+WslCXsZsyHa9ZeQQX/pQsZaVIWDkPcpV6jJ+3KLbTbgnwv8w==", + "cpu": [ + "arm" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "android" + ] + }, + "node_modules/@rollup/rollup-android-arm64": { + "version": "4.53.3", + "resolved": "https://registry.npmmirror.com/@rollup/rollup-android-arm64/-/rollup-android-arm64-4.53.3.tgz", + "integrity": "sha512-CbDGaMpdE9sh7sCmTrTUyllhrg65t6SwhjlMJsLr+J8YjFuPmCEjbBSx4Z/e4SmDyH3aB5hGaJUP2ltV/vcs4w==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "android" + ] + }, + "node_modules/@rollup/rollup-darwin-arm64": { + "version": "4.53.3", + "resolved": "https://registry.npmmirror.com/@rollup/rollup-darwin-arm64/-/rollup-darwin-arm64-4.53.3.tgz", + "integrity": "sha512-Nr7SlQeqIBpOV6BHHGZgYBuSdanCXuw09hon14MGOLGmXAFYjx1wNvquVPmpZnl0tLjg25dEdr4IQ6GgyToCUA==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ] + }, + "node_modules/@rollup/rollup-darwin-x64": { + "version": "4.53.3", + "resolved": "https://registry.npmmirror.com/@rollup/rollup-darwin-x64/-/rollup-darwin-x64-4.53.3.tgz", + "integrity": "sha512-DZ8N4CSNfl965CmPktJ8oBnfYr3F8dTTNBQkRlffnUarJ2ohudQD17sZBa097J8xhQ26AwhHJ5mvUyQW8ddTsQ==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ] + }, + "node_modules/@rollup/rollup-freebsd-arm64": { + "version": "4.53.3", + "resolved": "https://registry.npmmirror.com/@rollup/rollup-freebsd-arm64/-/rollup-freebsd-arm64-4.53.3.tgz", + "integrity": 
"sha512-yMTrCrK92aGyi7GuDNtGn2sNW+Gdb4vErx4t3Gv/Tr+1zRb8ax4z8GWVRfr3Jw8zJWvpGHNpss3vVlbF58DZ4w==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "freebsd" + ] + }, + "node_modules/@rollup/rollup-freebsd-x64": { + "version": "4.53.3", + "resolved": "https://registry.npmmirror.com/@rollup/rollup-freebsd-x64/-/rollup-freebsd-x64-4.53.3.tgz", + "integrity": "sha512-lMfF8X7QhdQzseM6XaX0vbno2m3hlyZFhwcndRMw8fbAGUGL3WFMBdK0hbUBIUYcEcMhVLr1SIamDeuLBnXS+Q==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "freebsd" + ] + }, + "node_modules/@rollup/rollup-linux-arm-gnueabihf": { + "version": "4.53.3", + "resolved": "https://registry.npmmirror.com/@rollup/rollup-linux-arm-gnueabihf/-/rollup-linux-arm-gnueabihf-4.53.3.tgz", + "integrity": "sha512-k9oD15soC/Ln6d2Wv/JOFPzZXIAIFLp6B+i14KhxAfnq76ajt0EhYc5YPeX6W1xJkAdItcVT+JhKl1QZh44/qw==", + "cpu": [ + "arm" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-arm-musleabihf": { + "version": "4.53.3", + "resolved": "https://registry.npmmirror.com/@rollup/rollup-linux-arm-musleabihf/-/rollup-linux-arm-musleabihf-4.53.3.tgz", + "integrity": "sha512-vTNlKq+N6CK/8UktsrFuc+/7NlEYVxgaEgRXVUVK258Z5ymho29skzW1sutgYjqNnquGwVUObAaxae8rZ6YMhg==", + "cpu": [ + "arm" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-arm64-gnu": { + "version": "4.53.3", + "resolved": "https://registry.npmmirror.com/@rollup/rollup-linux-arm64-gnu/-/rollup-linux-arm64-gnu-4.53.3.tgz", + "integrity": "sha512-RGrFLWgMhSxRs/EWJMIFM1O5Mzuz3Xy3/mnxJp/5cVhZ2XoCAxJnmNsEyeMJtpK+wu0FJFWz+QF4mjCA7AUQ3w==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-arm64-musl": { + "version": "4.53.3", + "resolved": 
"https://registry.npmmirror.com/@rollup/rollup-linux-arm64-musl/-/rollup-linux-arm64-musl-4.53.3.tgz", + "integrity": "sha512-kASyvfBEWYPEwe0Qv4nfu6pNkITLTb32p4yTgzFCocHnJLAHs+9LjUu9ONIhvfT/5lv4YS5muBHyuV84epBo/A==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-loong64-gnu": { + "version": "4.53.3", + "resolved": "https://registry.npmmirror.com/@rollup/rollup-linux-loong64-gnu/-/rollup-linux-loong64-gnu-4.53.3.tgz", + "integrity": "sha512-JiuKcp2teLJwQ7vkJ95EwESWkNRFJD7TQgYmCnrPtlu50b4XvT5MOmurWNrCj3IFdyjBQ5p9vnrX4JM6I8OE7g==", + "cpu": [ + "loong64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-ppc64-gnu": { + "version": "4.53.3", + "resolved": "https://registry.npmmirror.com/@rollup/rollup-linux-ppc64-gnu/-/rollup-linux-ppc64-gnu-4.53.3.tgz", + "integrity": "sha512-EoGSa8nd6d3T7zLuqdojxC20oBfNT8nexBbB/rkxgKj5T5vhpAQKKnD+h3UkoMuTyXkP5jTjK/ccNRmQrPNDuw==", + "cpu": [ + "ppc64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-riscv64-gnu": { + "version": "4.53.3", + "resolved": "https://registry.npmmirror.com/@rollup/rollup-linux-riscv64-gnu/-/rollup-linux-riscv64-gnu-4.53.3.tgz", + "integrity": "sha512-4s+Wped2IHXHPnAEbIB0YWBv7SDohqxobiiPA1FIWZpX+w9o2i4LezzH/NkFUl8LRci/8udci6cLq+jJQlh+0g==", + "cpu": [ + "riscv64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-riscv64-musl": { + "version": "4.53.3", + "resolved": "https://registry.npmmirror.com/@rollup/rollup-linux-riscv64-musl/-/rollup-linux-riscv64-musl-4.53.3.tgz", + "integrity": "sha512-68k2g7+0vs2u9CxDt5ktXTngsxOQkSEV/xBbwlqYcUrAVh6P9EgMZvFsnHy4SEiUl46Xf0IObWVbMvPrr2gw8A==", + "cpu": [ + "riscv64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + 
] + }, + "node_modules/@rollup/rollup-linux-s390x-gnu": { + "version": "4.53.3", + "resolved": "https://registry.npmmirror.com/@rollup/rollup-linux-s390x-gnu/-/rollup-linux-s390x-gnu-4.53.3.tgz", + "integrity": "sha512-VYsFMpULAz87ZW6BVYw3I6sWesGpsP9OPcyKe8ofdg9LHxSbRMd7zrVrr5xi/3kMZtpWL/wC+UIJWJYVX5uTKg==", + "cpu": [ + "s390x" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-x64-gnu": { + "version": "4.53.3", + "resolved": "https://registry.npmmirror.com/@rollup/rollup-linux-x64-gnu/-/rollup-linux-x64-gnu-4.53.3.tgz", + "integrity": "sha512-3EhFi1FU6YL8HTUJZ51imGJWEX//ajQPfqWLI3BQq4TlvHy4X0MOr5q3D2Zof/ka0d5FNdPwZXm3Yyib/UEd+w==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-x64-musl": { + "version": "4.53.3", + "resolved": "https://registry.npmmirror.com/@rollup/rollup-linux-x64-musl/-/rollup-linux-x64-musl-4.53.3.tgz", + "integrity": "sha512-eoROhjcc6HbZCJr+tvVT8X4fW3/5g/WkGvvmwz/88sDtSJzO7r/blvoBDgISDiCjDRZmHpwud7h+6Q9JxFwq1Q==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-openharmony-arm64": { + "version": "4.53.3", + "resolved": "https://registry.npmmirror.com/@rollup/rollup-openharmony-arm64/-/rollup-openharmony-arm64-4.53.3.tgz", + "integrity": "sha512-OueLAWgrNSPGAdUdIjSWXw+u/02BRTcnfw9PN41D2vq/JSEPnJnVuBgw18VkN8wcd4fjUs+jFHVM4t9+kBSNLw==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "openharmony" + ] + }, + "node_modules/@rollup/rollup-win32-arm64-msvc": { + "version": "4.53.3", + "resolved": "https://registry.npmmirror.com/@rollup/rollup-win32-arm64-msvc/-/rollup-win32-arm64-msvc-4.53.3.tgz", + "integrity": "sha512-GOFuKpsxR/whszbF/bzydebLiXIHSgsEUp6M0JI8dWvi+fFa1TD6YQa4aSZHtpmh2/uAlj/Dy+nmby3TJ3pkTw==", + "cpu": [ + "arm64" + ], + "dev": 
true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ] + }, + "node_modules/@rollup/rollup-win32-ia32-msvc": { + "version": "4.53.3", + "resolved": "https://registry.npmmirror.com/@rollup/rollup-win32-ia32-msvc/-/rollup-win32-ia32-msvc-4.53.3.tgz", + "integrity": "sha512-iah+THLcBJdpfZ1TstDFbKNznlzoxa8fmnFYK4V67HvmuNYkVdAywJSoteUszvBQ9/HqN2+9AZghbajMsFT+oA==", + "cpu": [ + "ia32" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ] + }, + "node_modules/@rollup/rollup-win32-x64-gnu": { + "version": "4.53.3", + "resolved": "https://registry.npmmirror.com/@rollup/rollup-win32-x64-gnu/-/rollup-win32-x64-gnu-4.53.3.tgz", + "integrity": "sha512-J9QDiOIZlZLdcot5NXEepDkstocktoVjkaKUtqzgzpt2yWjGlbYiKyp05rWwk4nypbYUNoFAztEgixoLaSETkg==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ] + }, + "node_modules/@rollup/rollup-win32-x64-msvc": { + "version": "4.53.3", + "resolved": "https://registry.npmmirror.com/@rollup/rollup-win32-x64-msvc/-/rollup-win32-x64-msvc-4.53.3.tgz", + "integrity": "sha512-UhTd8u31dXadv0MopwGgNOBpUVROFKWVQgAg5N1ESyCz8AuBcMqm4AuTjrwgQKGDfoFuz02EuMRHQIw/frmYKQ==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ] + }, + "node_modules/@types/estree": { + "version": "1.0.8", + "resolved": "https://registry.npmmirror.com/@types/estree/-/estree-1.0.8.tgz", + "integrity": "sha512-dWHzHa2WqEXI/O1E9OjrocMTKJl2mSrEolh1Iomrv6U+JuNwaHXsXx9bLu5gG7BUWFIN0skIQJQ/L1rIex4X6w==", + "dev": true, + "license": "MIT" + }, + "node_modules/@vitejs/plugin-vue": { + "version": "6.0.2", + "resolved": "https://registry.npmmirror.com/@vitejs/plugin-vue/-/plugin-vue-6.0.2.tgz", + "integrity": "sha512-iHmwV3QcVGGvSC1BG5bZ4z6iwa1SOpAPWmnjOErd4Ske+lZua5K9TtAVdx0gMBClJ28DViCbSmZitjWZsWO3LA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@rolldown/pluginutils": "1.0.0-beta.50" + }, + "engines": { + "node": "^20.19.0 || 
>=22.12.0" + }, + "peerDependencies": { + "vite": "^5.0.0 || ^6.0.0 || ^7.0.0", + "vue": "^3.2.25" + } + }, + "node_modules/@vue/compiler-core": { + "version": "3.5.25", + "resolved": "https://registry.npmmirror.com/@vue/compiler-core/-/compiler-core-3.5.25.tgz", + "integrity": "sha512-vay5/oQJdsNHmliWoZfHPoVZZRmnSWhug0BYT34njkYTPqClh3DNWLkZNJBVSjsNMrg0CCrBfoKkjZQPM/QVUw==", + "license": "MIT", + "dependencies": { + "@babel/parser": "^7.28.5", + "@vue/shared": "3.5.25", + "entities": "^4.5.0", + "estree-walker": "^2.0.2", + "source-map-js": "^1.2.1" + } + }, + "node_modules/@vue/compiler-dom": { + "version": "3.5.25", + "resolved": "https://registry.npmmirror.com/@vue/compiler-dom/-/compiler-dom-3.5.25.tgz", + "integrity": "sha512-4We0OAcMZsKgYoGlMjzYvaoErltdFI2/25wqanuTu+S4gismOTRTBPi4IASOjxWdzIwrYSjnqONfKvuqkXzE2Q==", + "license": "MIT", + "dependencies": { + "@vue/compiler-core": "3.5.25", + "@vue/shared": "3.5.25" + } + }, + "node_modules/@vue/compiler-sfc": { + "version": "3.5.25", + "resolved": "https://registry.npmmirror.com/@vue/compiler-sfc/-/compiler-sfc-3.5.25.tgz", + "integrity": "sha512-PUgKp2rn8fFsI++lF2sO7gwO2d9Yj57Utr5yEsDf3GNaQcowCLKL7sf+LvVFvtJDXUp/03+dC6f2+LCv5aK1ag==", + "license": "MIT", + "dependencies": { + "@babel/parser": "^7.28.5", + "@vue/compiler-core": "3.5.25", + "@vue/compiler-dom": "3.5.25", + "@vue/compiler-ssr": "3.5.25", + "@vue/shared": "3.5.25", + "estree-walker": "^2.0.2", + "magic-string": "^0.30.21", + "postcss": "^8.5.6", + "source-map-js": "^1.2.1" + } + }, + "node_modules/@vue/compiler-ssr": { + "version": "3.5.25", + "resolved": "https://registry.npmmirror.com/@vue/compiler-ssr/-/compiler-ssr-3.5.25.tgz", + "integrity": "sha512-ritPSKLBcParnsKYi+GNtbdbrIE1mtuFEJ4U1sWeuOMlIziK5GtOL85t5RhsNy4uWIXPgk+OUdpnXiTdzn8o3A==", + "license": "MIT", + "dependencies": { + "@vue/compiler-dom": "3.5.25", + "@vue/shared": "3.5.25" + } + }, + "node_modules/@vue/reactivity": { + "version": "3.5.25", + "resolved": 
"https://registry.npmmirror.com/@vue/reactivity/-/reactivity-3.5.25.tgz", + "integrity": "sha512-5xfAypCQepv4Jog1U4zn8cZIcbKKFka3AgWHEFQeK65OW+Ys4XybP6z2kKgws4YB43KGpqp5D/K3go2UPPunLA==", + "license": "MIT", + "dependencies": { + "@vue/shared": "3.5.25" + } + }, + "node_modules/@vue/runtime-core": { + "version": "3.5.25", + "resolved": "https://registry.npmmirror.com/@vue/runtime-core/-/runtime-core-3.5.25.tgz", + "integrity": "sha512-Z751v203YWwYzy460bzsYQISDfPjHTl+6Zzwo/a3CsAf+0ccEjQ8c+0CdX1WsumRTHeywvyUFtW6KvNukT/smA==", + "license": "MIT", + "dependencies": { + "@vue/reactivity": "3.5.25", + "@vue/shared": "3.5.25" + } + }, + "node_modules/@vue/runtime-dom": { + "version": "3.5.25", + "resolved": "https://registry.npmmirror.com/@vue/runtime-dom/-/runtime-dom-3.5.25.tgz", + "integrity": "sha512-a4WrkYFbb19i9pjkz38zJBg8wa/rboNERq3+hRRb0dHiJh13c+6kAbgqCPfMaJ2gg4weWD3APZswASOfmKwamA==", + "license": "MIT", + "dependencies": { + "@vue/reactivity": "3.5.25", + "@vue/runtime-core": "3.5.25", + "@vue/shared": "3.5.25", + "csstype": "^3.1.3" + } + }, + "node_modules/@vue/server-renderer": { + "version": "3.5.25", + "resolved": "https://registry.npmmirror.com/@vue/server-renderer/-/server-renderer-3.5.25.tgz", + "integrity": "sha512-UJaXR54vMG61i8XNIzTSf2Q7MOqZHpp8+x3XLGtE3+fL+nQd+k7O5+X3D/uWrnQXOdMw5VPih+Uremcw+u1woQ==", + "license": "MIT", + "dependencies": { + "@vue/compiler-ssr": "3.5.25", + "@vue/shared": "3.5.25" + }, + "peerDependencies": { + "vue": "3.5.25" + } + }, + "node_modules/@vue/shared": { + "version": "3.5.25", + "resolved": "https://registry.npmmirror.com/@vue/shared/-/shared-3.5.25.tgz", + "integrity": "sha512-AbOPdQQnAnzs58H2FrrDxYj/TJfmeS2jdfEEhgiKINy+bnOANmVizIEgq1r+C5zsbs6l1CCQxtcj71rwNQ4jWg==", + "license": "MIT" + }, + "node_modules/adler-32": { + "version": "1.3.1", + "resolved": "https://registry.npmmirror.com/adler-32/-/adler-32-1.3.1.tgz", + "integrity": 
"sha512-ynZ4w/nUUv5rrsR8UUGoe1VC9hZj6V5hU9Qw1HlMDJGEJw5S7TfTErWTjMys6M7vr0YWcPqs3qAr4ss0nDfP+A==", + "license": "Apache-2.0", + "engines": { + "node": ">=0.8" + } + }, + "node_modules/cfb": { + "version": "1.2.2", + "resolved": "https://registry.npmmirror.com/cfb/-/cfb-1.2.2.tgz", + "integrity": "sha512-KfdUZsSOw19/ObEWasvBP/Ac4reZvAGauZhs6S/gqNhXhI7cKwvlH7ulj+dOEYnca4bm4SGo8C1bTAQvnTjgQA==", + "license": "Apache-2.0", + "dependencies": { + "adler-32": "~1.3.0", + "crc-32": "~1.2.0" + }, + "engines": { + "node": ">=0.8" + } + }, + "node_modules/codepage": { + "version": "1.15.0", + "resolved": "https://registry.npmmirror.com/codepage/-/codepage-1.15.0.tgz", + "integrity": "sha512-3g6NUTPd/YtuuGrhMnOMRjFc+LJw/bnMp3+0r/Wcz3IXUuCosKRJvMphm5+Q+bvTVGcJJuRvVLuYba+WojaFaA==", + "license": "Apache-2.0", + "engines": { + "node": ">=0.8" + } + }, + "node_modules/crc-32": { + "version": "1.2.2", + "resolved": "https://registry.npmmirror.com/crc-32/-/crc-32-1.2.2.tgz", + "integrity": "sha512-ROmzCKrTnOwybPcJApAA6WBWij23HVfGVNKqqrZpuyZOHqK2CwHSvpGuyt/UNNvaIjEd8X5IFGp4Mh+Ie1IHJQ==", + "license": "Apache-2.0", + "bin": { + "crc32": "bin/crc32.njs" + }, + "engines": { + "node": ">=0.8" + } + }, + "node_modules/csstype": { + "version": "3.2.3", + "resolved": "https://registry.npmmirror.com/csstype/-/csstype-3.2.3.tgz", + "integrity": "sha512-z1HGKcYy2xA8AGQfwrn0PAy+PB7X/GSj3UVJW9qKyn43xWa+gl5nXmU4qqLMRzWVLFC8KusUX8T/0kCiOYpAIQ==", + "license": "MIT" + }, + "node_modules/echarts": { + "version": "6.0.0", + "resolved": "https://registry.npmmirror.com/echarts/-/echarts-6.0.0.tgz", + "integrity": "sha512-Tte/grDQRiETQP4xz3iZWSvoHrkCQtwqd6hs+mifXcjrCuo2iKWbajFObuLJVBlDIJlOzgQPd1hsaKt/3+OMkQ==", + "license": "Apache-2.0", + "dependencies": { + "tslib": "2.3.0", + "zrender": "6.0.0" + } + }, + "node_modules/entities": { + "version": "4.5.0", + "resolved": "https://registry.npmmirror.com/entities/-/entities-4.5.0.tgz", + "integrity": 
"sha512-V0hjH4dGPh9Ao5p0MoRY6BVqtwCjhz6vI5LT8AJ55H+4g9/4vbHx1I54fS0XuclLhDHArPQCiMjDxjaL8fPxhw==", + "license": "BSD-2-Clause", + "engines": { + "node": ">=0.12" + }, + "funding": { + "url": "https://github.com/fb55/entities?sponsor=1" + } + }, + "node_modules/esbuild": { + "version": "0.25.12", + "resolved": "https://registry.npmmirror.com/esbuild/-/esbuild-0.25.12.tgz", + "integrity": "sha512-bbPBYYrtZbkt6Os6FiTLCTFxvq4tt3JKall1vRwshA3fdVztsLAatFaZobhkBC8/BrPetoa0oksYoKXoG4ryJg==", + "dev": true, + "hasInstallScript": true, + "license": "MIT", + "bin": { + "esbuild": "bin/esbuild" + }, + "engines": { + "node": ">=18" + }, + "optionalDependencies": { + "@esbuild/aix-ppc64": "0.25.12", + "@esbuild/android-arm": "0.25.12", + "@esbuild/android-arm64": "0.25.12", + "@esbuild/android-x64": "0.25.12", + "@esbuild/darwin-arm64": "0.25.12", + "@esbuild/darwin-x64": "0.25.12", + "@esbuild/freebsd-arm64": "0.25.12", + "@esbuild/freebsd-x64": "0.25.12", + "@esbuild/linux-arm": "0.25.12", + "@esbuild/linux-arm64": "0.25.12", + "@esbuild/linux-ia32": "0.25.12", + "@esbuild/linux-loong64": "0.25.12", + "@esbuild/linux-mips64el": "0.25.12", + "@esbuild/linux-ppc64": "0.25.12", + "@esbuild/linux-riscv64": "0.25.12", + "@esbuild/linux-s390x": "0.25.12", + "@esbuild/linux-x64": "0.25.12", + "@esbuild/netbsd-arm64": "0.25.12", + "@esbuild/netbsd-x64": "0.25.12", + "@esbuild/openbsd-arm64": "0.25.12", + "@esbuild/openbsd-x64": "0.25.12", + "@esbuild/openharmony-arm64": "0.25.12", + "@esbuild/sunos-x64": "0.25.12", + "@esbuild/win32-arm64": "0.25.12", + "@esbuild/win32-ia32": "0.25.12", + "@esbuild/win32-x64": "0.25.12" + } + }, + "node_modules/estree-walker": { + "version": "2.0.2", + "resolved": "https://registry.npmmirror.com/estree-walker/-/estree-walker-2.0.2.tgz", + "integrity": "sha512-Rfkk/Mp/DL7JVje3u18FxFujQlTNR2q6QfMSMB7AvCBx91NGj/ba3kCfza0f6dVDbw7YlRf/nDrn7pQrCCyQ/w==", + "license": "MIT" + }, + "node_modules/fdir": { + "version": "6.5.0", + "resolved": 
"https://registry.npmmirror.com/fdir/-/fdir-6.5.0.tgz", + "integrity": "sha512-tIbYtZbucOs0BRGqPJkshJUYdL+SDH7dVM8gjy+ERp3WAUjLEFJE+02kanyHtwjWOnwrKYBiwAmM0p4kLJAnXg==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=12.0.0" + }, + "peerDependencies": { + "picomatch": "^3 || ^4" + }, + "peerDependenciesMeta": { + "picomatch": { + "optional": true + } + } + }, + "node_modules/frac": { + "version": "1.1.2", + "resolved": "https://registry.npmmirror.com/frac/-/frac-1.1.2.tgz", + "integrity": "sha512-w/XBfkibaTl3YDqASwfDUqkna4Z2p9cFSr1aHDt0WoMTECnRfBOv2WArlZILlqgWlmdIlALXGpM2AOhEk5W3IA==", + "license": "Apache-2.0", + "engines": { + "node": ">=0.8" + } + }, + "node_modules/fsevents": { + "version": "2.3.3", + "resolved": "https://registry.npmmirror.com/fsevents/-/fsevents-2.3.3.tgz", + "integrity": "sha512-5xoDfX+fL7faATnagmWPpbFtwh/R77WmMMqqHGS65C3vvB0YHrgF+B1YmZ3441tMj5n63k0212XNoJwzlhffQw==", + "dev": true, + "hasInstallScript": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": "^8.16.0 || ^10.6.0 || >=11.0.0" + } + }, + "node_modules/magic-string": { + "version": "0.30.21", + "resolved": "https://registry.npmmirror.com/magic-string/-/magic-string-0.30.21.tgz", + "integrity": "sha512-vd2F4YUyEXKGcLHoq+TEyCjxueSeHnFxyyjNp80yg0XV4vUhnDer/lvvlqM/arB5bXQN5K2/3oinyCRyx8T2CQ==", + "license": "MIT", + "dependencies": { + "@jridgewell/sourcemap-codec": "^1.5.5" + } + }, + "node_modules/nanoid": { + "version": "3.3.11", + "resolved": "https://registry.npmmirror.com/nanoid/-/nanoid-3.3.11.tgz", + "integrity": "sha512-N8SpfPUnUp1bK+PMYW8qSWdl9U+wwNWI4QKxOYDy9JAro3WMX7p2OeVRF9v+347pnakNevPmiHhNmZ2HbFA76w==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/ai" + } + ], + "license": "MIT", + "bin": { + "nanoid": "bin/nanoid.cjs" + }, + "engines": { + "node": "^10 || ^12 || ^13.7 || ^14 || >=15.0.1" + } + }, + "node_modules/papaparse": { + "version": "5.5.3", + "resolved": 
"https://registry.npmmirror.com/papaparse/-/papaparse-5.5.3.tgz", + "integrity": "sha512-5QvjGxYVjxO59MGU2lHVYpRWBBtKHnlIAcSe1uNFCkkptUh63NFRj0FJQm7nR67puEruUci/ZkjmEFrjCAyP4A==", + "license": "MIT" + }, + "node_modules/picocolors": { + "version": "1.1.1", + "resolved": "https://registry.npmmirror.com/picocolors/-/picocolors-1.1.1.tgz", + "integrity": "sha512-xceH2snhtb5M9liqDsmEw56le376mTZkEX/jEb/RxNFyegNul7eNslCXP9FDj/Lcu0X8KEyMceP2ntpaHrDEVA==", + "license": "ISC" + }, + "node_modules/picomatch": { + "version": "4.0.3", + "resolved": "https://registry.npmmirror.com/picomatch/-/picomatch-4.0.3.tgz", + "integrity": "sha512-5gTmgEY/sqK6gFXLIsQNH19lWb4ebPDLA4SdLP7dsWkIXHWlG66oPuVvXSGFPppYZz8ZDZq0dYYrbHfBCVUb1Q==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://github.com/sponsors/jonschlinkert" + } + }, + "node_modules/postcss": { + "version": "8.5.6", + "resolved": "https://registry.npmmirror.com/postcss/-/postcss-8.5.6.tgz", + "integrity": "sha512-3Ybi1tAuwAP9s0r1UQ2J4n5Y0G05bJkpUIO0/bI9MhwmD70S5aTWbXGBwxHrelT+XM1k6dM0pk+SwNkpTRN7Pg==", + "funding": [ + { + "type": "opencollective", + "url": "https://opencollective.com/postcss/" + }, + { + "type": "tidelift", + "url": "https://tidelift.com/funding/github/npm/postcss" + }, + { + "type": "github", + "url": "https://github.com/sponsors/ai" + } + ], + "license": "MIT", + "dependencies": { + "nanoid": "^3.3.11", + "picocolors": "^1.1.1", + "source-map-js": "^1.2.1" + }, + "engines": { + "node": "^10 || ^12 || >=14" + } + }, + "node_modules/rollup": { + "version": "4.53.3", + "resolved": "https://registry.npmmirror.com/rollup/-/rollup-4.53.3.tgz", + "integrity": "sha512-w8GmOxZfBmKknvdXU1sdM9NHcoQejwF/4mNgj2JuEEdRaHwwF12K7e9eXn1nLZ07ad+du76mkVsyeb2rKGllsA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/estree": "1.0.8" + }, + "bin": { + "rollup": "dist/bin/rollup" + }, + "engines": { + "node": ">=18.0.0", + "npm": ">=8.0.0" + }, + 
"optionalDependencies": { + "@rollup/rollup-android-arm-eabi": "4.53.3", + "@rollup/rollup-android-arm64": "4.53.3", + "@rollup/rollup-darwin-arm64": "4.53.3", + "@rollup/rollup-darwin-x64": "4.53.3", + "@rollup/rollup-freebsd-arm64": "4.53.3", + "@rollup/rollup-freebsd-x64": "4.53.3", + "@rollup/rollup-linux-arm-gnueabihf": "4.53.3", + "@rollup/rollup-linux-arm-musleabihf": "4.53.3", + "@rollup/rollup-linux-arm64-gnu": "4.53.3", + "@rollup/rollup-linux-arm64-musl": "4.53.3", + "@rollup/rollup-linux-loong64-gnu": "4.53.3", + "@rollup/rollup-linux-ppc64-gnu": "4.53.3", + "@rollup/rollup-linux-riscv64-gnu": "4.53.3", + "@rollup/rollup-linux-riscv64-musl": "4.53.3", + "@rollup/rollup-linux-s390x-gnu": "4.53.3", + "@rollup/rollup-linux-x64-gnu": "4.53.3", + "@rollup/rollup-linux-x64-musl": "4.53.3", + "@rollup/rollup-openharmony-arm64": "4.53.3", + "@rollup/rollup-win32-arm64-msvc": "4.53.3", + "@rollup/rollup-win32-ia32-msvc": "4.53.3", + "@rollup/rollup-win32-x64-gnu": "4.53.3", + "@rollup/rollup-win32-x64-msvc": "4.53.3", + "fsevents": "~2.3.2" + } + }, + "node_modules/source-map-js": { + "version": "1.2.1", + "resolved": "https://registry.npmmirror.com/source-map-js/-/source-map-js-1.2.1.tgz", + "integrity": "sha512-UXWMKhLOwVKb728IUtQPXxfYU+usdybtUrK/8uGE8CQMvrhOpwvzDBwj0QhSL7MQc7vIsISBG8VQ8+IDQxpfQA==", + "license": "BSD-3-Clause", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/ssf": { + "version": "0.11.2", + "resolved": "https://registry.npmmirror.com/ssf/-/ssf-0.11.2.tgz", + "integrity": "sha512-+idbmIXoYET47hH+d7dfm2epdOMUDjqcB4648sTZ+t2JwoyBFL/insLfB/racrDmsKB3diwsDA696pZMieAC5g==", + "license": "Apache-2.0", + "dependencies": { + "frac": "~1.1.2" + }, + "engines": { + "node": ">=0.8" + } + }, + "node_modules/tinyglobby": { + "version": "0.2.15", + "resolved": "https://registry.npmmirror.com/tinyglobby/-/tinyglobby-0.2.15.tgz", + "integrity": "sha512-j2Zq4NyQYG5XMST4cbs02Ak8iJUdxRM0XI5QyxXuZOzKOINmWurp3smXu3y5wDcJrptwpSjgXHzIQxR0omXljQ==", + 
"dev": true, + "license": "MIT", + "dependencies": { + "fdir": "^6.5.0", + "picomatch": "^4.0.3" + }, + "engines": { + "node": ">=12.0.0" + }, + "funding": { + "url": "https://github.com/sponsors/SuperchupuDev" + } + }, + "node_modules/tslib": { + "version": "2.3.0", + "resolved": "https://registry.npmmirror.com/tslib/-/tslib-2.3.0.tgz", + "integrity": "sha512-N82ooyxVNm6h1riLCoyS9e3fuJ3AMG2zIZs2Gd1ATcSFjSA23Q0fzjjZeh0jbJvWVDZ0cJT8yaNNaaXHzueNjg==", + "license": "0BSD" + }, + "node_modules/vite": { + "version": "7.2.7", + "resolved": "https://registry.npmmirror.com/vite/-/vite-7.2.7.tgz", + "integrity": "sha512-ITcnkFeR3+fI8P1wMgItjGrR10170d8auB4EpMLPqmx6uxElH3a/hHGQabSHKdqd4FXWO1nFIp9rRn7JQ34ACQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "esbuild": "^0.25.0", + "fdir": "^6.5.0", + "picomatch": "^4.0.3", + "postcss": "^8.5.6", + "rollup": "^4.43.0", + "tinyglobby": "^0.2.15" + }, + "bin": { + "vite": "bin/vite.js" + }, + "engines": { + "node": "^20.19.0 || >=22.12.0" + }, + "funding": { + "url": "https://github.com/vitejs/vite?sponsor=1" + }, + "optionalDependencies": { + "fsevents": "~2.3.3" + }, + "peerDependencies": { + "@types/node": "^20.19.0 || >=22.12.0", + "jiti": ">=1.21.0", + "less": "^4.0.0", + "lightningcss": "^1.21.0", + "sass": "^1.70.0", + "sass-embedded": "^1.70.0", + "stylus": ">=0.54.8", + "sugarss": "^5.0.0", + "terser": "^5.16.0", + "tsx": "^4.8.1", + "yaml": "^2.4.2" + }, + "peerDependenciesMeta": { + "@types/node": { + "optional": true + }, + "jiti": { + "optional": true + }, + "less": { + "optional": true + }, + "lightningcss": { + "optional": true + }, + "sass": { + "optional": true + }, + "sass-embedded": { + "optional": true + }, + "stylus": { + "optional": true + }, + "sugarss": { + "optional": true + }, + "terser": { + "optional": true + }, + "tsx": { + "optional": true + }, + "yaml": { + "optional": true + } + } + }, + "node_modules/vue": { + "version": "3.5.25", + "resolved": 
"https://registry.npmmirror.com/vue/-/vue-3.5.25.tgz", + "integrity": "sha512-YLVdgv2K13WJ6n+kD5owehKtEXwdwXuj2TTyJMsO7pSeKw2bfRNZGjhB7YzrpbMYj5b5QsUebHpOqR3R3ziy/g==", + "license": "MIT", + "dependencies": { + "@vue/compiler-dom": "3.5.25", + "@vue/compiler-sfc": "3.5.25", + "@vue/runtime-dom": "3.5.25", + "@vue/server-renderer": "3.5.25", + "@vue/shared": "3.5.25" + }, + "peerDependencies": { + "typescript": "*" + }, + "peerDependenciesMeta": { + "typescript": { + "optional": true + } + } + }, + "node_modules/vue-echarts": { + "version": "8.0.1", + "resolved": "https://registry.npmmirror.com/vue-echarts/-/vue-echarts-8.0.1.tgz", + "integrity": "sha512-23rJTFLu1OUEGRWjJGmdGt8fP+8+ja1gVgzMYPIPaHWpXegcO1viIAaeu2H4QHESlVeHzUAHIxKXGrwjsyXAaA==", + "license": "MIT", + "peerDependencies": { + "echarts": "^6.0.0", + "vue": "^3.3.0" + } + }, + "node_modules/wmf": { + "version": "1.0.2", + "resolved": "https://registry.npmmirror.com/wmf/-/wmf-1.0.2.tgz", + "integrity": "sha512-/p9K7bEh0Dj6WbXg4JG0xvLQmIadrner1bi45VMJTfnbVHsc7yIajZyoSoK60/dtVBs12Fm6WkUI5/3WAVsNMw==", + "license": "Apache-2.0", + "engines": { + "node": ">=0.8" + } + }, + "node_modules/word": { + "version": "0.3.0", + "resolved": "https://registry.npmmirror.com/word/-/word-0.3.0.tgz", + "integrity": "sha512-OELeY0Q61OXpdUfTp+oweA/vtLVg5VDOXh+3he3PNzLGG/y0oylSOC1xRVj0+l4vQ3tj/bB1HVHv1ocXkQceFA==", + "license": "Apache-2.0", + "engines": { + "node": ">=0.8" + } + }, + "node_modules/xlsx": { + "version": "0.18.5", + "resolved": "https://registry.npmmirror.com/xlsx/-/xlsx-0.18.5.tgz", + "integrity": "sha512-dmg3LCjBPHZnQp5/F/+nnTa+miPJxUXB6vtk42YjBBKayDNagxGEeIdWApkYPOf3Z3pm3k62Knjzp7lMeTEtFQ==", + "license": "Apache-2.0", + "dependencies": { + "adler-32": "~1.3.0", + "cfb": "~1.2.1", + "codepage": "~1.15.0", + "crc-32": "~1.2.1", + "ssf": "~0.11.2", + "wmf": "~1.0.1", + "word": "~0.3.0" + }, + "bin": { + "xlsx": "bin/xlsx.njs" + }, + "engines": { + "node": ">=0.8" + } + }, + "node_modules/zrender": { + 
"version": "6.0.0", + "resolved": "https://registry.npmmirror.com/zrender/-/zrender-6.0.0.tgz", + "integrity": "sha512-41dFXEEXuJpNecuUQq6JlbybmnHaqqpGlbH1yxnA5V9MMP4SbohSVZsJIwz+zdjQXSSlR1Vc34EgH1zxyTDvhg==", + "license": "BSD-3-Clause", + "dependencies": { + "tslib": "2.3.0" + } + } + } +} diff --git a/SpecForge-ext/docs/spec_bundle/package.json b/SpecForge-ext/docs/spec_bundle/package.json new file mode 100644 index 0000000000000000000000000000000000000000..d4657bbec62cbf2f39add1ccc35c9ce0c7eb412d --- /dev/null +++ b/SpecForge-ext/docs/spec_bundle/package.json @@ -0,0 +1,40 @@ +{ + "name": "specforge-spec-bundle", + "private": false, + "version": "1.0.0", + "description": "Interactive SpecBundle visualization dashboard for SpecForge", + "type": "module", + "scripts": { + "dev": "vite", + "build": "vite build", + "preview": "vite preview", + "deploy": "npm run build && gh-pages -d dist" + }, + "repository": { + "type": "git", + "url": "https://github.com/sgl-project/SpecForge.git", + "directory": "docs/spec_bundle" + }, + "keywords": [ + "specforge", + "specbundle", + "benchmark", + "visualization", + "speculative-decoding", + "llm", + "performance" + ], + "author": "SpecForge Team", + "license": "MIT", + "dependencies": { + "echarts": "^6.0.0", + "papaparse": "^5.5.3", + "vue": "^3.5.24", + "vue-echarts": "^8.0.1", + "xlsx": "^0.18.5" + }, + "devDependencies": { + "@vitejs/plugin-vue": "^6.0.1", + "vite": "^7.2.4" + } +} diff --git a/SpecForge-ext/docs/spec_bundle/public/raw_data/data.json b/SpecForge-ext/docs/spec_bundle/public/raw_data/data.json new file mode 100644 index 0000000000000000000000000000000000000000..f923184be9f11a0be51daa926e7b94ff1821007b --- /dev/null +++ b/SpecForge-ext/docs/spec_bundle/public/raw_data/data.json @@ -0,0 +1,6422 @@ +{ + "Qwen3-30B-A3B-Instruct-2507": { + "gsm8k": { + "benchmark_name": "gsm8k", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", 
+ "output_throughput": 1071.2940027174511, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-30B-A3B-Instruct-2507-SpecForge", + "output_throughput": 1488.3645940190918, + "accept_length": 2.6400593352844486 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 1071.2940027174511, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-30B-A3B-Instruct-2507-SpecForge", + "output_throughput": 1499.6157892300257, + "accept_length": 3.0113471715954674 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 1071.2940027174511, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-30B-A3B-Instruct-2507-SpecForge", + "output_throughput": 1491.1759364152986, + "accept_length": 2.525104073618391 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 1071.2940027174511, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-30B-A3B-Instruct-2507-SpecForge", + "output_throughput": 1438.3989235515564, + "accept_length": 3.1488859094681736 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 1071.2940027174511, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-30B-A3B-Instruct-2507-SpecForge", + "output_throughput": 1478.3371126866896, + "accept_length": 2.515156901620291 + } + ] + } + ] + }, + "math500": { + "benchmark_name": "math500", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 1468.9518188983302, + "accept_length": 1.0 + }, + { + "Name": 
"lmsys/SGLang-EAGLE3-Qwen3-30B-A3B-Instruct-2507-SpecForge", + "output_throughput": 3022.302541558449, + "accept_length": 3.4018400160943374 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 1468.9518188983302, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-30B-A3B-Instruct-2507-SpecForge", + "output_throughput": 3458.7683757488517, + "accept_length": 4.5001277922609 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 1468.9518188983302, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-30B-A3B-Instruct-2507-SpecForge", + "output_throughput": 2710.0700446913434, + "accept_length": 3.83069810232181 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 1468.9518188983302, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-30B-A3B-Instruct-2507-SpecForge", + "output_throughput": 3636.1457092511932, + "accept_length": 5.29297884876688 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 1468.9518188983302, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-30B-A3B-Instruct-2507-SpecForge", + "output_throughput": 2650.9994915668844, + "accept_length": 3.981701201346221 + } + ] + } + ] + }, + "mtbench": { + "benchmark_name": "mtbench", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 1341.3462205459145, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-30B-A3B-Instruct-2507-SpecForge", + "output_throughput": 2048.689292397081, + "accept_length": 
2.495847913511255 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 1341.3462205459145, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-30B-A3B-Instruct-2507-SpecForge", + "output_throughput": 2086.117426859236, + "accept_length": 2.831051301639537 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 1341.3462205459145, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-30B-A3B-Instruct-2507-SpecForge", + "output_throughput": 1698.4151046745978, + "accept_length": 2.5572219713355357 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 1341.3462205459145, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-30B-A3B-Instruct-2507-SpecForge", + "output_throughput": 1998.1600180425269, + "accept_length": 2.9819193324061195 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 1341.3462205459145, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-30B-A3B-Instruct-2507-SpecForge", + "output_throughput": 1742.9797705522778, + "accept_length": 2.7422317575874455 + } + ] + } + ] + }, + "humaneval": { + "benchmark_name": "humaneval", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 1366.6183006362219, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-30B-A3B-Instruct-2507-SpecForge", + "output_throughput": 2618.165602951494, + "accept_length": 3.349328692192939 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ 
+ { + "Name": "Wihtout EAGLE3", + "output_throughput": 1366.6183006362219, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-30B-A3B-Instruct-2507-SpecForge", + "output_throughput": 2912.1392571686956, + "accept_length": 4.384426363785289 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 1366.6183006362219, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-30B-A3B-Instruct-2507-SpecForge", + "output_throughput": 2367.016477367958, + "accept_length": 3.7901897758795298 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 1366.6183006362219, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-30B-A3B-Instruct-2507-SpecForge", + "output_throughput": 3069.9815866099266, + "accept_length": 5.124267515923567 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 1366.6183006362219, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-30B-A3B-Instruct-2507-SpecForge", + "output_throughput": 2363.3377665362655, + "accept_length": 4.030938739532834 + } + ] + } + ] + }, + "livecodebench": { + "benchmark_name": "livecodebench", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 1492.6190597361915, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-30B-A3B-Instruct-2507-SpecForge", + "output_throughput": 2911.405162351629, + "accept_length": 3.1783624121672447 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 1492.6190597361915, + "accept_length": 1.0 + }, + { + 
"Name": "lmsys/SGLang-EAGLE3-Qwen3-30B-A3B-Instruct-2507-SpecForge", + "output_throughput": 3265.2547245227543, + "accept_length": 4.018270197787462 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 1492.6190597361915, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-30B-A3B-Instruct-2507-SpecForge", + "output_throughput": 2455.0885550482017, + "accept_length": 3.295517305362425 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 1492.6190597361915, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-30B-A3B-Instruct-2507-SpecForge", + "output_throughput": 3413.029275629196, + "accept_length": 4.576331556763159 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 1492.6190597361915, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-30B-A3B-Instruct-2507-SpecForge", + "output_throughput": 2355.0941391264764, + "accept_length": 3.3973067623684012 + } + ] + } + ] + }, + "financeqa": { + "benchmark_name": "financeqa", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 1320.1266846132082, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-30B-A3B-Instruct-2507-SpecForge", + "output_throughput": 1778.9653109324079, + "accept_length": 2.0810309937160505 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 1320.1266846132082, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-30B-A3B-Instruct-2507-SpecForge", + "output_throughput": 1778.6778684706662, + 
"accept_length": 2.2730321793789288 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 1320.1266846132082, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-30B-A3B-Instruct-2507-SpecForge", + "output_throughput": 1652.1607344416184, + "accept_length": 2.2703352879266276 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 1320.1266846132082, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-30B-A3B-Instruct-2507-SpecForge", + "output_throughput": 1682.9566856293293, + "accept_length": 2.3032779273841584 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 1320.1266846132082, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-30B-A3B-Instruct-2507-SpecForge", + "output_throughput": 1753.6698041448958, + "accept_length": 2.6092096546804138 + } + ] + } + ] + }, + "gpqa": { + "benchmark_name": "gpqa", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 1410.428038868636, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-30B-A3B-Instruct-2507-SpecForge", + "output_throughput": 2237.792328921565, + "accept_length": 2.5958448251993995 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 1410.428038868636, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-30B-A3B-Instruct-2507-SpecForge", + "output_throughput": 2341.298191039886, + "accept_length": 3.0077922694984913 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + 
"metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 1410.428038868636, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-30B-A3B-Instruct-2507-SpecForge", + "output_throughput": 1961.1700111065113, + "accept_length": 2.6947097860315505 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 1410.428038868636, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-30B-A3B-Instruct-2507-SpecForge", + "output_throughput": 2310.2053834681674, + "accept_length": 3.216540452331778 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 1410.428038868636, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-30B-A3B-Instruct-2507-SpecForge", + "output_throughput": 2008.7425535412629, + "accept_length": 2.91748293468006 + } + ] + } + ] + } + }, + "Qwen3-235B-A22B-Instruct-2507": { + "gsm8k": { + "benchmark_name": "gsm8k", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 469.12940470010284, + "accept_length": 1.0 + }, + { + "Name": "lmsys/Qwen3-235B-A22B-EAGLE3", + "output_throughput": 633.4834448509783, + "accept_length": 2.356716526992789 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-235B-A22B-Instruct-2507-SpecForge", + "output_throughput": 718.620120234308, + "accept_length": 2.8762828246719394 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 469.12940470010284, + "accept_length": 1.0 + }, + { + "Name": "lmsys/Qwen3-235B-A22B-EAGLE3", + "output_throughput": 619.3961515217887, + "accept_length": 2.5325967285309847 + }, + { + "Name": 
"lmsys/SGLang-EAGLE3-Qwen3-235B-A22B-Instruct-2507-SpecForge", + "output_throughput": 740.8090293617215, + "accept_length": 3.351527622767857 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 469.12940470010284, + "accept_length": 1.0 + }, + { + "Name": "lmsys/Qwen3-235B-A22B-EAGLE3", + "output_throughput": 685.8224688133159, + "accept_length": 2.2254637464335056 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-235B-A22B-Instruct-2507-SpecForge", + "output_throughput": 718.5200251720828, + "accept_length": 2.5942242348162705 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 469.12940470010284, + "accept_length": 1.0 + }, + { + "Name": "lmsys/Qwen3-235B-A22B-EAGLE3", + "output_throughput": 622.6877352310961, + "accept_length": 2.577754285484885 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-235B-A22B-Instruct-2507-SpecForge", + "output_throughput": 758.2839780669175, + "accept_length": 3.51144398279758 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 469.12940470010284, + "accept_length": 1.0 + }, + { + "Name": "lmsys/Qwen3-235B-A22B-EAGLE3", + "output_throughput": 696.9862910262393, + "accept_length": 2.2957518385545184 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-235B-A22B-Instruct-2507-SpecForge", + "output_throughput": 692.54543613971, + "accept_length": 2.508131344520406 + } + ] + } + ] + }, + "math500": { + "benchmark_name": "math500", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 587.3767625807179, + "accept_length": 1.0 + }, + { + "Name": "lmsys/Qwen3-235B-A22B-EAGLE3", + "output_throughput": 821.7716217768141, + "accept_length": 
2.2131311175007076 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-235B-A22B-Instruct-2507-SpecForge", + "output_throughput": 1165.3481778903413, + "accept_length": 3.2287879445239853 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 587.3767625807179, + "accept_length": 1.0 + }, + { + "Name": "lmsys/Qwen3-235B-A22B-EAGLE3", + "output_throughput": 786.5291154131861, + "accept_length": 2.3811060693210626 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-235B-A22B-Instruct-2507-SpecForge", + "output_throughput": 1263.6658286467714, + "accept_length": 4.021472447253628 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 587.3767625807179, + "accept_length": 1.0 + }, + { + "Name": "lmsys/Qwen3-235B-A22B-EAGLE3", + "output_throughput": 729.1280796475185, + "accept_length": 2.1641727527768047 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-235B-A22B-Instruct-2507-SpecForge", + "output_throughput": 1012.7228976076004, + "accept_length": 3.3166681444513406 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 587.3767625807179, + "accept_length": 1.0 + }, + { + "Name": "lmsys/Qwen3-235B-A22B-EAGLE3", + "output_throughput": 801.9730196026575, + "accept_length": 2.4202165987905055 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-235B-A22B-Instruct-2507-SpecForge", + "output_throughput": 1399.195876342606, + "accept_length": 4.477737029876627 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 587.3767625807179, + "accept_length": 1.0 + }, + { + "Name": "lmsys/Qwen3-235B-A22B-EAGLE3", + "output_throughput": 728.5917394731794, + "accept_length": 2.180077789251727 + }, + { 
+ "Name": "lmsys/SGLang-EAGLE3-Qwen3-235B-A22B-Instruct-2507-SpecForge", + "output_throughput": 966.5149174357106, + "accept_length": 3.0996346930308336 + } + ] + } + ] + }, + "mtbench": { + "benchmark_name": "mtbench", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 529.8952857212083, + "accept_length": 1.0 + }, + { + "Name": "lmsys/Qwen3-235B-A22B-EAGLE3", + "output_throughput": 642.7287443329789, + "accept_length": 1.8722335837366109 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-235B-A22B-Instruct-2507-SpecForge", + "output_throughput": 814.539845630713, + "accept_length": 2.3454133346915906 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 529.8952857212083, + "accept_length": 1.0 + }, + { + "Name": "lmsys/Qwen3-235B-A22B-EAGLE3", + "output_throughput": 617.9738942581079, + "accept_length": 1.9436368219822697 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-235B-A22B-Instruct-2507-SpecForge", + "output_throughput": 779.531140147999, + "accept_length": 2.571956737666924 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 529.8952857212083, + "accept_length": 1.0 + }, + { + "Name": "lmsys/Qwen3-235B-A22B-EAGLE3", + "output_throughput": 579.7478777831109, + "accept_length": 1.879637550849381 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-235B-A22B-Instruct-2507-SpecForge", + "output_throughput": 684.112380410899, + "accept_length": 2.3538604252889965 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 529.8952857212083, + "accept_length": 1.0 + }, + { + "Name": "lmsys/Qwen3-235B-A22B-EAGLE3", + "output_throughput": 607.3644823224199, + 
"accept_length": 1.9674055586107704 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-235B-A22B-Instruct-2507-SpecForge", + "output_throughput": 789.9679697718769, + "accept_length": 2.6698328935795956 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 529.8952857212083, + "accept_length": 1.0 + }, + { + "Name": "lmsys/Qwen3-235B-A22B-EAGLE3", + "output_throughput": 596.0590450290033, + "accept_length": 1.987328547838102 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-235B-A22B-Instruct-2507-SpecForge", + "output_throughput": 670.0058040199536, + "accept_length": 2.329033512672587 + } + ] + } + ] + }, + "humaneval": { + "benchmark_name": "humaneval", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 553.0503522362385, + "accept_length": 1.0 + }, + { + "Name": "lmsys/Qwen3-235B-A22B-EAGLE3", + "output_throughput": 866.1813723921825, + "accept_length": 2.533027363039563 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-235B-A22B-Instruct-2507-SpecForge", + "output_throughput": 1068.373749600453, + "accept_length": 3.238804311590177 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 553.0503522362385, + "accept_length": 1.0 + }, + { + "Name": "lmsys/Qwen3-235B-A22B-EAGLE3", + "output_throughput": 853.4917713020631, + "accept_length": 2.8369721532226433 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-235B-A22B-Instruct-2507-SpecForge", + "output_throughput": 1176.5192650014792, + "accept_length": 4.083723300745958 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 553.0503522362385, + "accept_length": 1.0 + }, + { + "Name": "lmsys/Qwen3-235B-A22B-EAGLE3", + 
"output_throughput": 772.1684975661775, + "accept_length": 2.5123042505592843 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-235B-A22B-Instruct-2507-SpecForge", + "output_throughput": 1032.477913431608, + "accept_length": 3.6360244115082825 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 553.0503522362385, + "accept_length": 1.0 + }, + { + "Name": "lmsys/Qwen3-235B-A22B-EAGLE3", + "output_throughput": 889.8951303902317, + "accept_length": 2.955997016746898 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-235B-A22B-Instruct-2507-SpecForge", + "output_throughput": 1267.5178598410528, + "accept_length": 4.4874762125186445 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 553.0503522362385, + "accept_length": 1.0 + }, + { + "Name": "lmsys/Qwen3-235B-A22B-EAGLE3", + "output_throughput": 736.1010265214783, + "accept_length": 2.3861131594156686 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-235B-A22B-Instruct-2507-SpecForge", + "output_throughput": 983.9906558013464, + "accept_length": 3.412326127536581 + } + ] + } + ] + }, + "livecodebench": { + "benchmark_name": "livecodebench", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 598.1832041732818, + "accept_length": 1.0 + }, + { + "Name": "lmsys/Qwen3-235B-A22B-EAGLE3", + "output_throughput": 803.7805606947842, + "accept_length": 2.090690935434212 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-235B-A22B-Instruct-2507-SpecForge", + "output_throughput": 1062.9796952555507, + "accept_length": 2.8172381425652917 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 598.1832041732818, + "accept_length": 1.0 
+ }, + { + "Name": "lmsys/Qwen3-235B-A22B-EAGLE3", + "output_throughput": 759.6333115912107, + "accept_length": 2.2179516111790765 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-235B-A22B-Instruct-2507-SpecForge", + "output_throughput": 1093.1979234549972, + "accept_length": 3.268498808394456 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 598.1832041732818, + "accept_length": 1.0 + }, + { + "Name": "lmsys/Qwen3-235B-A22B-EAGLE3", + "output_throughput": 708.4447966909656, + "accept_length": 2.077364507787014 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-235B-A22B-Instruct-2507-SpecForge", + "output_throughput": 874.062642276262, + "accept_length": 2.6670587896561795 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 598.1832041732818, + "accept_length": 1.0 + }, + { + "Name": "lmsys/Qwen3-235B-A22B-EAGLE3", + "output_throughput": 767.8685797664081, + "accept_length": 2.2474642743536366 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-235B-A22B-Instruct-2507-SpecForge", + "output_throughput": 1155.6572987907093, + "accept_length": 3.490068495285106 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 598.1832041732818, + "accept_length": 1.0 + }, + { + "Name": "lmsys/Qwen3-235B-A22B-EAGLE3", + "output_throughput": 711.4663371023372, + "accept_length": 2.129619842542645 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-235B-A22B-Instruct-2507-SpecForge", + "output_throughput": 835.6105646149398, + "accept_length": 2.590646146520392 + } + ] + } + ] + }, + "financeqa": { + "benchmark_name": "financeqa", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 
539.5161023038148, + "accept_length": 1.0 + }, + { + "Name": "lmsys/Qwen3-235B-A22B-EAGLE3", + "output_throughput": 689.4282413740445, + "accept_length": 1.941237358311274 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-235B-A22B-Instruct-2507-SpecForge", + "output_throughput": 872.4508905377182, + "accept_length": 2.556773924332344 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 539.5161023038148, + "accept_length": 1.0 + }, + { + "Name": "lmsys/Qwen3-235B-A22B-EAGLE3", + "output_throughput": 636.4408069963314, + "accept_length": 2.027268079304664 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-235B-A22B-Instruct-2507-SpecForge", + "output_throughput": 885.529748337286, + "accept_length": 2.8442245393804413 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 539.5161023038148, + "accept_length": 1.0 + }, + { + "Name": "lmsys/Qwen3-235B-A22B-EAGLE3", + "output_throughput": 642.4958901994291, + "accept_length": 2.0553746448296777 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-235B-A22B-Instruct-2507-SpecForge", + "output_throughput": 730.7331843587357, + "accept_length": 2.4330876223070512 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 539.5161023038148, + "accept_length": 1.0 + }, + { + "Name": "lmsys/Qwen3-235B-A22B-EAGLE3", + "output_throughput": 641.1037073226237, + "accept_length": 2.0361251069493296 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-235B-A22B-Instruct-2507-SpecForge", + "output_throughput": 889.0304393086461, + "accept_length": 2.965008914078923 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 539.5161023038148, + 
"accept_length": 1.0 + }, + { + "Name": "lmsys/Qwen3-235B-A22B-EAGLE3", + "output_throughput": 654.3422430101997, + "accept_length": 2.1356956699218137 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-235B-A22B-Instruct-2507-SpecForge", + "output_throughput": 742.3749721046132, + "accept_length": 2.5176210584474528 + } + ] + } + ] + }, + "gpqa": { + "benchmark_name": "gpqa", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 563.1619467852893, + "accept_length": 1.0 + }, + { + "Name": "lmsys/Qwen3-235B-A22B-EAGLE3", + "output_throughput": 716.6967887897075, + "accept_length": 2.0240035915598344 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-235B-A22B-Instruct-2507-SpecForge", + "output_throughput": 823.4218898853592, + "accept_length": 2.356617214868455 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 563.1619467852893, + "accept_length": 1.0 + }, + { + "Name": "lmsys/Qwen3-235B-A22B-EAGLE3", + "output_throughput": 680.2044274358036, + "accept_length": 2.14011469258975 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-235B-A22B-Instruct-2507-SpecForge", + "output_throughput": 808.934577824737, + "accept_length": 2.6032639643837037 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 563.1619467852893, + "accept_length": 1.0 + }, + { + "Name": "lmsys/Qwen3-235B-A22B-EAGLE3", + "output_throughput": 630.9312870281678, + "accept_length": 1.9776516235921864 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-235B-A22B-Instruct-2507-SpecForge", + "output_throughput": 698.9315763256182, + "accept_length": 2.2587729126518172 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + 
"output_throughput": 563.1619467852893, + "accept_length": 1.0 + }, + { + "Name": "lmsys/Qwen3-235B-A22B-EAGLE3", + "output_throughput": 685.8554308455039, + "accept_length": 2.1591340093176212 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-235B-A22B-Instruct-2507-SpecForge", + "output_throughput": 826.5168292170538, + "accept_length": 2.6672259363465063 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 563.1619467852893, + "accept_length": 1.0 + }, + { + "Name": "lmsys/Qwen3-235B-A22B-EAGLE3", + "output_throughput": 636.0480501999019, + "accept_length": 2.001480647431386 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-235B-A22B-Instruct-2507-SpecForge", + "output_throughput": 683.7427107159214, + "accept_length": 2.241436629482574 + } + ] + } + ] + } + }, + "Qwen3-Next-80B-A3B-Instruct-FP8": { + "gsm8k": { + "benchmark_name": "gsm8k", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 549.6362180919164, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Next-80B-A3B-Instruct-FP8-SpecForge", + "output_throughput": 683.8795985073891, + "accept_length": 3.13391215089175 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 549.6362180919164, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Next-80B-A3B-Instruct-FP8-SpecForge", + "output_throughput": 753.237074543623, + "accept_length": 3.9038018228889597 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 549.6362180919164, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Next-80B-A3B-Instruct-FP8-SpecForge", + "output_throughput": 
746.7222279174218, + "accept_length": 4.022678679117706 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 549.6362180919164, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Next-80B-A3B-Instruct-FP8-SpecForge", + "output_throughput": 771.153101164556, + "accept_length": 4.345554699994077 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 549.6362180919164, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Next-80B-A3B-Instruct-FP8-SpecForge", + "output_throughput": 773.4012327870145, + "accept_length": 4.607604467310829 + } + ] + } + ] + }, + "math500": { + "benchmark_name": "math500", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 863.7773324206034, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Next-80B-A3B-Instruct-FP8-SpecForge", + "output_throughput": 1478.3001038430784, + "accept_length": 3.498551418454351 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 863.7773324206034, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Next-80B-A3B-Instruct-FP8-SpecForge", + "output_throughput": 1764.2064514729698, + "accept_length": 4.677160426045899 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 863.7773324206034, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Next-80B-A3B-Instruct-FP8-SpecForge", + "output_throughput": 1758.0166003158934, + "accept_length": 4.755809947207558 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + 
"topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 863.7773324206034, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Next-80B-A3B-Instruct-FP8-SpecForge", + "output_throughput": 1912.6838622508392, + "accept_length": 5.554967332076544 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 863.7773324206034, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Next-80B-A3B-Instruct-FP8-SpecForge", + "output_throughput": 1853.434631732593, + "accept_length": 5.756492370623537 + } + ] + } + ] + }, + "mtbench": { + "benchmark_name": "mtbench", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 803.4970369348379, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Next-80B-A3B-Instruct-FP8-SpecForge", + "output_throughput": 1095.5102974622082, + "accept_length": 2.581125058112506 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 803.4970369348379, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Next-80B-A3B-Instruct-FP8-SpecForge", + "output_throughput": 1157.636689246293, + "accept_length": 2.9156972910237133 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 803.4970369348379, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Next-80B-A3B-Instruct-FP8-SpecForge", + "output_throughput": 1197.112468072539, + "accept_length": 3.1331585165547646 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 
803.4970369348379, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Next-80B-A3B-Instruct-FP8-SpecForge", + "output_throughput": 1127.4364940073876, + "accept_length": 3.0475279197966354 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 803.4970369348379, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Next-80B-A3B-Instruct-FP8-SpecForge", + "output_throughput": 1198.9417562126052, + "accept_length": 3.4190589216409535 + } + ] + } + ] + }, + "humaneval": { + "benchmark_name": "humaneval", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 788.4509521573036, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Next-80B-A3B-Instruct-FP8-SpecForge", + "output_throughput": 1245.6702060145312, + "accept_length": 3.4647713687985653 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 788.4509521573036, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Next-80B-A3B-Instruct-FP8-SpecForge", + "output_throughput": 1527.7120587214345, + "accept_length": 4.612265133111893 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 788.4509521573036, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Next-80B-A3B-Instruct-FP8-SpecForge", + "output_throughput": 1536.7723048769212, + "accept_length": 4.676180904522613 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 788.4509521573036, + "accept_length": 1.0 + }, + { + "Name": 
"lmsys/SGLang-EAGLE3-Qwen3-Next-80B-A3B-Instruct-FP8-SpecForge", + "output_throughput": 1628.1293604862747, + "accept_length": 5.4577785667790994 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 788.4509521573036, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Next-80B-A3B-Instruct-FP8-SpecForge", + "output_throughput": 1629.7244930267507, + "accept_length": 5.621873496873497 + } + ] + } + ] + }, + "livecodebench": { + "benchmark_name": "livecodebench", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 916.0337036761792, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Next-80B-A3B-Instruct-FP8-SpecForge", + "output_throughput": 1463.1234977160723, + "accept_length": 3.1058026902179443 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 916.0337036761792, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Next-80B-A3B-Instruct-FP8-SpecForge", + "output_throughput": 1724.2207417984275, + "accept_length": 3.8462516284893944 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 916.0337036761792, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Next-80B-A3B-Instruct-FP8-SpecForge", + "output_throughput": 1734.4894352951553, + "accept_length": 3.9821418050654955 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 916.0337036761792, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Next-80B-A3B-Instruct-FP8-SpecForge", + "output_throughput": 
1786.8774464735384, + "accept_length": 4.2761952310299485 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 916.0337036761792, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Next-80B-A3B-Instruct-FP8-SpecForge", + "output_throughput": 1829.5532782765572, + "accept_length": 4.590307145700787 + } + ] + } + ] + }, + "financeqa": { + "benchmark_name": "financeqa", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 827.3050477430119, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Next-80B-A3B-Instruct-FP8-SpecForge", + "output_throughput": 986.4282909200625, + "accept_length": 2.0752097090844193 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 827.3050477430119, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Next-80B-A3B-Instruct-FP8-SpecForge", + "output_throughput": 981.0983772859984, + "accept_length": 2.1801329261720857 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 827.3050477430119, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Next-80B-A3B-Instruct-FP8-SpecForge", + "output_throughput": 1057.6549922432027, + "accept_length": 2.439575219817722 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 827.3050477430119, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Next-80B-A3B-Instruct-FP8-SpecForge", + "output_throughput": 956.6098887389447, + "accept_length": 2.2457481515800852 + } + ] + }, + { + "batch_size": 8, + "steps": 
7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 827.3050477430119, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Next-80B-A3B-Instruct-FP8-SpecForge", + "output_throughput": 1041.5277102267419, + "accept_length": 2.606484877248997 + } + ] + } + ] + }, + "gpqa": { + "benchmark_name": "gpqa", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 909.8620481543201, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Next-80B-A3B-Instruct-FP8-SpecForge", + "output_throughput": 1368.9499756838852, + "accept_length": 2.7362548025140208 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 909.8620481543201, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Next-80B-A3B-Instruct-FP8-SpecForge", + "output_throughput": 1457.9918429280988, + "accept_length": 3.1803662497541225 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 909.8620481543201, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Next-80B-A3B-Instruct-FP8-SpecForge", + "output_throughput": 1511.274616068283, + "accept_length": 3.3682366894832594 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 909.8620481543201, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Next-80B-A3B-Instruct-FP8-SpecForge", + "output_throughput": 1463.9444559000415, + "accept_length": 3.380290412894046 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 
909.8620481543201, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Next-80B-A3B-Instruct-FP8-SpecForge", + "output_throughput": 1541.4580844550508, + "accept_length": 3.7385501251645787 + } + ] + } + ] + } + }, + "Qwen3-Coder-30B-A3B-Instruct": { + "humaneval": { + "benchmark_name": "humaneval", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 1296.1854608851213, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Coder-30B-A3B-Instruct-SpecForge", + "output_throughput": 2621.7139434700584, + "accept_length": 3.394971072541166 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 1296.1854608851213, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Coder-30B-A3B-Instruct-SpecForge", + "output_throughput": 2966.4459091363574, + "accept_length": 4.5011526953450725 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 1296.1854608851213, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Coder-30B-A3B-Instruct-SpecForge", + "output_throughput": 2236.868611380527, + "accept_length": 3.9489230027326796 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 1296.1854608851213, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Coder-30B-A3B-Instruct-SpecForge", + "output_throughput": 3205.2025971977832, + "accept_length": 5.306789266712931 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 1296.1854608851213, + "accept_length": 1.0 + }, + { + "Name": 
"lmsys/SGLang-EAGLE3-Qwen3-Coder-30B-A3B-Instruct-SpecForge", + "output_throughput": 2553.012134540716, + "accept_length": 4.221071958746777 + } + ] + } + ] + }, + "livecodebench": { + "benchmark_name": "livecodebench", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 1506.2936922288973, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Coder-30B-A3B-Instruct-SpecForge", + "output_throughput": 2992.02067556649, + "accept_length": 3.138553878632709 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 1506.2936922288973, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Coder-30B-A3B-Instruct-SpecForge", + "output_throughput": 3328.9058789398114, + "accept_length": 3.9449129401751835 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 1506.2936922288973, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Coder-30B-A3B-Instruct-SpecForge", + "output_throughput": 2541.3931549111803, + "accept_length": 3.336379596827288 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 1506.2936922288973, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Coder-30B-A3B-Instruct-SpecForge", + "output_throughput": 3472.3919294148427, + "accept_length": 4.477776008915068 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 1506.2936922288973, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Coder-30B-A3B-Instruct-SpecForge", + "output_throughput": 2552.5518885328293, + 
"accept_length": 3.5865930607956185 + } + ] + } + ] + } + }, + "Qwen3-Coder-480B-A35B-Instruct": { + "humaneval": { + "benchmark_name": "humaneval", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 470.6571664751315, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Coder-480B-A35B-Instruct-SpecForge-EigenAI", + "output_throughput": 867.5261370310272, + "accept_length": 3.4954686382065345 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 470.6571664751315, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Coder-480B-A35B-Instruct-SpecForge-EigenAI", + "output_throughput": 1044.4475556194586, + "accept_length": 4.68614810868407 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 470.6571664751315, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Coder-480B-A35B-Instruct-SpecForge-EigenAI", + "output_throughput": 945.2207076385645, + "accept_length": 4.2835241878943675 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 470.6571664751315, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Coder-480B-A35B-Instruct-SpecForge-EigenAI", + "output_throughput": 1165.0727231905212, + "accept_length": 5.626203379024545 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 470.6571664751315, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Coder-480B-A35B-Instruct-SpecForge-EigenAI", + "output_throughput": 956.5336674844815, + "accept_length": 
4.574128043621322 + } + ] + } + ] + }, + "livecodebench": { + "benchmark_name": "livecodebench", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 500.99996954994094, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Coder-480B-A35B-Instruct-SpecForge-EigenAI", + "output_throughput": 846.6405796214389, + "accept_length": 3.0936425388083757 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 500.99996954994094, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Coder-480B-A35B-Instruct-SpecForge-EigenAI", + "output_throughput": 946.3806786937351, + "accept_length": 3.8547162126548313 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 500.99996954994094, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Coder-480B-A35B-Instruct-SpecForge-EigenAI", + "output_throughput": 817.5432981932123, + "accept_length": 3.3539182909649066 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 500.99996954994094, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Coder-480B-A35B-Instruct-SpecForge-EigenAI", + "output_throughput": 983.2554936551461, + "accept_length": 4.260473117512835 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 500.99996954994094, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Coder-480B-A35B-Instruct-SpecForge-EigenAI", + "output_throughput": 790.2818911646486, + "accept_length": 3.379611891844464 + } + ] + } + ] + } + }, + 
"Kimi-K2-Instruct": { + "gsm8k": { + "benchmark_name": "gsm8k", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 337.92445122816076, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Kimi-K2-Instruct-eagle3", + "output_throughput": 498.355967400969, + "accept_length": 3.271389121751566 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 337.92445122816076, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Kimi-K2-Instruct-eagle3", + "output_throughput": 538.7660861191819, + "accept_length": 4.120435815920245 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 337.92445122816076, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Kimi-K2-Instruct-eagle3", + "output_throughput": 476.5166831456105, + "accept_length": 3.5748305647840533 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 337.92445122816076, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Kimi-K2-Instruct-eagle3", + "output_throughput": 544.16588655688, + "accept_length": 4.655279611582661 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 337.92445122816076, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Kimi-K2-Instruct-eagle3", + "output_throughput": 459.1757114935756, + "accept_length": 3.4419677544677545 + } + ] + } + ] + }, + "math500": { + "benchmark_name": "math500", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 492.06079685961566, + 
"accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Kimi-K2-Instruct-eagle3", + "output_throughput": 877.2113745892083, + "accept_length": 3.46806357521281 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 492.06079685961566, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Kimi-K2-Instruct-eagle3", + "output_throughput": 995.8769550545389, + "accept_length": 4.610169876195772 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 492.06079685961566, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Kimi-K2-Instruct-eagle3", + "output_throughput": 772.6100737625807, + "accept_length": 3.527844083399639 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 492.06079685961566, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Kimi-K2-Instruct-eagle3", + "output_throughput": 1022.7285831443611, + "accept_length": 5.383128673454291 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 492.06079685961566, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Kimi-K2-Instruct-eagle3", + "output_throughput": 649.083231514055, + "accept_length": 3.1435862587473253 + } + ] + } + ] + }, + "mtbench": { + "benchmark_name": "mtbench", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 430.9240376244664, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Kimi-K2-Instruct-eagle3", + "output_throughput": 533.8166177911393, + "accept_length": 2.3897198230461343 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": 
[ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 430.9240376244664, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Kimi-K2-Instruct-eagle3", + "output_throughput": 526.1187611377575, + "accept_length": 2.738876732312181 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 430.9240376244664, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Kimi-K2-Instruct-eagle3", + "output_throughput": 473.3129895327435, + "accept_length": 2.394141207153502 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 430.9240376244664, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Kimi-K2-Instruct-eagle3", + "output_throughput": 488.46384825810924, + "accept_length": 2.7821796546219706 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 430.9240376244664, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Kimi-K2-Instruct-eagle3", + "output_throughput": 451.126180366313, + "accept_length": 2.536454493323503 + } + ] + } + ] + }, + "humaneval": { + "benchmark_name": "humaneval", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 466.0584238730984, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Kimi-K2-Instruct-eagle3", + "output_throughput": 779.7838793636296, + "accept_length": 3.364936827816644 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 466.0584238730984, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Kimi-K2-Instruct-eagle3", + "output_throughput": 868.550857852841, + "accept_length": 4.423030465709301 + } + ] + }, + { + 
"batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 466.0584238730984, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Kimi-K2-Instruct-eagle3", + "output_throughput": 729.1217213710999, + "accept_length": 3.7321711568938194 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 466.0584238730984, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Kimi-K2-Instruct-eagle3", + "output_throughput": 897.9039799990946, + "accept_length": 5.162398550153652 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 466.0584238730984, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Kimi-K2-Instruct-eagle3", + "output_throughput": 669.271164663664, + "accept_length": 3.7044178210408085 + } + ] + } + ] + }, + "livecodebench": { + "benchmark_name": "livecodebench", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 500.12137141510016, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Kimi-K2-Instruct-eagle3", + "output_throughput": 841.5023790421864, + "accept_length": 3.162685632492396 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 500.12137141510016, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Kimi-K2-Instruct-eagle3", + "output_throughput": 904.3910288246204, + "accept_length": 3.943605886942718 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 500.12137141510016, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Kimi-K2-Instruct-eagle3", + 
"output_throughput": 716.7319007181034, + "accept_length": 3.1374681580049573 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 500.12137141510016, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Kimi-K2-Instruct-eagle3", + "output_throughput": 896.7006322822839, + "accept_length": 4.400262176061309 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 500.12137141510016, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Kimi-K2-Instruct-eagle3", + "output_throughput": 650.4333056536461, + "accept_length": 3.0780193205478037 + } + ] + } + ] + }, + "financeqa": { + "benchmark_name": "financeqa", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 433.44658979995484, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Kimi-K2-Instruct-eagle3", + "output_throughput": 647.3644717982133, + "accept_length": 2.9848269628099175 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 433.44658979995484, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Kimi-K2-Instruct-eagle3", + "output_throughput": 660.0254297132984, + "accept_length": 3.594056395834917 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 433.44658979995484, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Kimi-K2-Instruct-eagle3", + "output_throughput": 523.0340443308603, + "accept_length": 2.8796471741261027 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 
433.44658979995484, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Kimi-K2-Instruct-eagle3", + "output_throughput": 630.5425124127137, + "accept_length": 3.944647875329984 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 433.44658979995484, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Kimi-K2-Instruct-eagle3", + "output_throughput": 389.47080223360666, + "accept_length": 2.5096594789735582 + } + ] + } + ] + }, + "gpqa": { + "benchmark_name": "gpqa", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 505.3742994094499, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Kimi-K2-Instruct-eagle3", + "output_throughput": 783.436424568974, + "accept_length": 2.904452196823693 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 505.3742994094499, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Kimi-K2-Instruct-eagle3", + "output_throughput": 811.3642458480507, + "accept_length": 3.4622853609057755 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 505.3742994094499, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Kimi-K2-Instruct-eagle3", + "output_throughput": 699.8111934038128, + "accept_length": 3.0198274205132876 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 505.3742994094499, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Kimi-K2-Instruct-eagle3", + "output_throughput": 770.4892578818251, + "accept_length": 3.6995331477421103 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 
8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 505.3742994094499, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Kimi-K2-Instruct-eagle3", + "output_throughput": 596.3162033813331, + "accept_length": 2.7901899604967983 + } + ] + } + ] + } + }, + "Ling-flash-2.0": { + "gsm8k": { + "benchmark_name": "gsm8k", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 674.3464018618124, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Ling-Flash-2.0-eagle3", + "output_throughput": 1144.7606179148752, + "accept_length": 3.4351661916604646 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 674.3464018618124, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Ling-Flash-2.0-eagle3", + "output_throughput": 1253.4000030615975, + "accept_length": 4.487906489549112 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 674.3464018618124, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Ling-Flash-2.0-eagle3", + "output_throughput": 1059.7381115819003, + "accept_length": 3.331830155824441 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 674.3464018618124, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Ling-Flash-2.0-eagle3", + "output_throughput": 1323.0093663978187, + "accept_length": 5.148644964283767 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 674.3464018618124, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Ling-Flash-2.0-eagle3", + "output_throughput": 1026.8025294413142, + "accept_length": 
3.126593214481735 + } + ] + } + ] + }, + "math500": { + "benchmark_name": "math500", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 762.7113399535667, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Ling-Flash-2.0-eagle3", + "output_throughput": 1434.6065070935829, + "accept_length": 3.4340471141971713 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 762.7113399535667, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Ling-Flash-2.0-eagle3", + "output_throughput": 1607.3212268988339, + "accept_length": 4.493397164127635 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 762.7113399535667, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Ling-Flash-2.0-eagle3", + "output_throughput": 1383.6720582197756, + "accept_length": 3.7931376508179415 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 762.7113399535667, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Ling-Flash-2.0-eagle3", + "output_throughput": 1685.5692612687462, + "accept_length": 5.218245374511558 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 762.7113399535667, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Ling-Flash-2.0-eagle3", + "output_throughput": 1330.1086623703009, + "accept_length": 3.793696144088135 + } + ] + } + ] + }, + "mtbench": { + "benchmark_name": "mtbench", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 728.5278345617202, 
+ "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Ling-Flash-2.0-eagle3", + "output_throughput": 1022.5890920470158, + "accept_length": 2.392568385378843 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 728.5278345617202, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Ling-Flash-2.0-eagle3", + "output_throughput": 990.0430932236113, + "accept_length": 2.648161574313827 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 728.5278345617202, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Ling-Flash-2.0-eagle3", + "output_throughput": 914.3899001110539, + "accept_length": 2.5161251562049407 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 728.5278345617202, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Ling-Flash-2.0-eagle3", + "output_throughput": 942.3914903299366, + "accept_length": 2.771332137960131 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 728.5278345617202, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Ling-Flash-2.0-eagle3", + "output_throughput": 968.0479918450316, + "accept_length": 2.8558805412179527 + } + ] + } + ] + }, + "humaneval": { + "benchmark_name": "humaneval", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 740.2477168580639, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Ling-Flash-2.0-eagle3", + "output_throughput": 1271.2889448808319, + "accept_length": 3.1471241394625804 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { 
+ "Name": "Wihtout EAGLE3", + "output_throughput": 740.2477168580639, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Ling-Flash-2.0-eagle3", + "output_throughput": 1353.1437889143726, + "accept_length": 3.9318483282257697 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 740.2477168580639, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Ling-Flash-2.0-eagle3", + "output_throughput": 1175.4192382338058, + "accept_length": 3.29687986547923 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 740.2477168580639, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Ling-Flash-2.0-eagle3", + "output_throughput": 1358.9726439538854, + "accept_length": 4.370163501574083 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 740.2477168580639, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Ling-Flash-2.0-eagle3", + "output_throughput": 1141.7913416362687, + "accept_length": 3.3590013964490297 + } + ] + } + ] + }, + "livecodebench": { + "benchmark_name": "livecodebench", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 770.3957537752161, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Ling-Flash-2.0-eagle3", + "output_throughput": 1305.1833791876973, + "accept_length": 2.9790301516097895 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 770.3957537752161, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Ling-Flash-2.0-eagle3", + "output_throughput": 1366.417326281792, + "accept_length": 3.6103649876590875 + } + ] + }, + { + 
"batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 770.3957537752161, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Ling-Flash-2.0-eagle3", + "output_throughput": 1130.7868943433502, + "accept_length": 2.8933133857317164 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 770.3957537752161, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Ling-Flash-2.0-eagle3", + "output_throughput": 1345.6741018953574, + "accept_length": 3.9330923185867093 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 770.3957537752161, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Ling-Flash-2.0-eagle3", + "output_throughput": 1061.6897228931932, + "accept_length": 2.902182106883942 + } + ] + } + ] + }, + "financeqa": { + "benchmark_name": "financeqa", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 747.7098566179897, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Ling-Flash-2.0-eagle3", + "output_throughput": 863.8565336005082, + "accept_length": 1.907102314310342 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 747.7098566179897, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Ling-Flash-2.0-eagle3", + "output_throughput": 833.1235940586521, + "accept_length": 2.047546254809973 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 747.7098566179897, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Ling-Flash-2.0-eagle3", + "output_throughput": 
798.9811798480557, + "accept_length": 1.9372590117256243 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 747.7098566179897, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Ling-Flash-2.0-eagle3", + "output_throughput": 763.2761511276084, + "accept_length": 2.0470985454359427 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 747.7098566179897, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Ling-Flash-2.0-eagle3", + "output_throughput": 779.3060665006524, + "accept_length": 2.045476819601249 + } + ] + } + ] + }, + "gpqa": { + "benchmark_name": "gpqa", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 794.1289733679167, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Ling-Flash-2.0-eagle3", + "output_throughput": 1185.7250147683403, + "accept_length": 2.562389392369937 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 794.1289733679167, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Ling-Flash-2.0-eagle3", + "output_throughput": 1161.8732670284553, + "accept_length": 2.886871902842324 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 794.1289733679167, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Ling-Flash-2.0-eagle3", + "output_throughput": 1052.640023467198, + "accept_length": 2.6017604302340236 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 794.1289733679167, + "accept_length": 1.0 + }, + { + 
"Name": "AQ-MedAI/Ling-Flash-2.0-eagle3", + "output_throughput": 1111.996259596397, + "accept_length": 3.0648124985786733 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 794.1289733679167, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Ling-Flash-2.0-eagle3", + "output_throughput": 1004.4992021266573, + "accept_length": 2.6709053367549105 + } + ] + } + ] + } + }, + "Llama-3.1-8B-Instruct": { + "gsm8k": { + "benchmark_name": "gsm8k", + "results": [ + { + "batch_size": 1, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 181.81151788749455, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.1-Instruct-8B", + "output_throughput": 228.64232714994796, + "accept_length": 1.7165139181419709 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.1-8B-Instruct-SpecForge", + "output_throughput": 321.2528041157779, + "accept_length": 2.5481878001819607 + } + ] + }, + { + "batch_size": 1, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 181.81151788749455, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.1-Instruct-8B", + "output_throughput": 213.550264904667, + "accept_length": 1.7634936642258956 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.1-8B-Instruct-SpecForge", + "output_throughput": 329.6873220645443, + "accept_length": 2.8537845395516377 + } + ] + }, + { + "batch_size": 1, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 181.81151788749455, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.1-Instruct-8B", + "output_throughput": 195.13619448514442, + "accept_length": 1.7528912619638426 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.1-8B-Instruct-SpecForge", + "output_throughput": 
251.43922505539766, + "accept_length": 2.2820562939796716 + } + ] + }, + { + "batch_size": 1, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 181.81151788749455, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.1-Instruct-8B", + "output_throughput": 197.901650893672, + "accept_length": 1.7742552127753433 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.1-8B-Instruct-SpecForge", + "output_throughput": 317.61058794222197, + "accept_length": 2.9733251079580505 + } + ] + }, + { + "batch_size": 1, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 181.81151788749455, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.1-Instruct-8B", + "output_throughput": 182.0257072155964, + "accept_length": 1.789228234172427 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.1-8B-Instruct-SpecForge", + "output_throughput": 240.85801894998306, + "accept_length": 2.367398432594591 + } + ] + } + ] + }, + "math500": { + "benchmark_name": "math500", + "results": [ + { + "batch_size": 1, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 191.04076784280642, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.1-Instruct-8B", + "output_throughput": 399.2995452070592, + "accept_length": 2.7825411590459592 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.1-8B-Instruct-SpecForge", + "output_throughput": 492.28246574028134, + "accept_length": 3.4786948176583494 + } + ] + }, + { + "batch_size": 1, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 191.04076784280642, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.1-Instruct-8B", + "output_throughput": 422.40466722576286, + "accept_length": 3.254684892147128 + }, + { + "Name": 
"lmsys/SGLang-EAGLE3-Llama-3.1-8B-Instruct-SpecForge", + "output_throughput": 594.5033645961273, + "accept_length": 4.624857400180126 + } + ] + }, + { + "batch_size": 1, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 191.04076784280642, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.1-Instruct-8B", + "output_throughput": 387.0489467031037, + "accept_length": 3.3070174292508296 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.1-8B-Instruct-SpecForge", + "output_throughput": 480.43534296060534, + "accept_length": 4.116159164796923 + } + ] + }, + { + "batch_size": 1, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 191.04076784280642, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.1-Instruct-8B", + "output_throughput": 413.57783551553456, + "accept_length": 3.489213277012106 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.1-8B-Instruct-SpecForge", + "output_throughput": 638.0439777096752, + "accept_length": 5.402844266750837 + } + ] + }, + { + "batch_size": 1, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 191.04076784280642, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.1-Instruct-8B", + "output_throughput": 326.8790406711244, + "accept_length": 3.072066504990206 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.1-8B-Instruct-SpecForge", + "output_throughput": 453.306808098541, + "accept_length": 4.25573095185686 + } + ] + } + ] + }, + "mtbench": { + "benchmark_name": "mtbench", + "results": [ + { + "batch_size": 1, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 189.98120707576373, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.1-Instruct-8B", + "output_throughput": 414.90616666264776, 
+ "accept_length": 2.930670028119849 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.1-8B-Instruct-SpecForge", + "output_throughput": 404.24667749722187, + "accept_length": 2.8980726819445777 + } + ] + }, + { + "batch_size": 1, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 189.98120707576373, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.1-Instruct-8B", + "output_throughput": 453.73692243041774, + "accept_length": 3.554148008484563 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.1-8B-Instruct-SpecForge", + "output_throughput": 446.6366476858434, + "accept_length": 3.5164393144456105 + } + ] + }, + { + "batch_size": 1, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 189.98120707576373, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.1-Instruct-8B", + "output_throughput": 338.6308027570883, + "accept_length": 2.9393909722902185 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.1-8B-Instruct-SpecForge", + "output_throughput": 346.46724606666106, + "accept_length": 3.0061221366256823 + } + ] + }, + { + "batch_size": 1, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 189.98120707576373, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.1-Instruct-8B", + "output_throughput": 454.730035166582, + "accept_length": 3.906676145543851 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.1-8B-Instruct-SpecForge", + "output_throughput": 450.03198538047087, + "accept_length": 3.855839765261211 + } + ] + }, + { + "batch_size": 1, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 189.98120707576373, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.1-Instruct-8B", + "output_throughput": 305.1648971387325, + 
"accept_length": 2.9089536379397125 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.1-8B-Instruct-SpecForge", + "output_throughput": 308.00561770283963, + "accept_length": 2.938163437236731 + } + ] + } + ] + }, + "humaneval": { + "benchmark_name": "humaneval", + "results": [ + { + "batch_size": 1, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 190.91017930680567, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.1-Instruct-8B", + "output_throughput": 432.8677712430711, + "accept_length": 3.0469174293472796 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.1-8B-Instruct-SpecForge", + "output_throughput": 465.1765542307934, + "accept_length": 3.3398192040568846 + } + ] + }, + { + "batch_size": 1, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 190.91017930680567, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.1-Instruct-8B", + "output_throughput": 479.1212006261437, + "accept_length": 3.7445769729930163 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.1-8B-Instruct-SpecForge", + "output_throughput": 548.9370103875078, + "accept_length": 4.318366474235621 + } + ] + }, + { + "batch_size": 1, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 190.91017930680567, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.1-Instruct-8B", + "output_throughput": 340.2704451839945, + "accept_length": 2.9425913908717285 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.1-8B-Instruct-SpecForge", + "output_throughput": 377.47349118830954, + "accept_length": 3.2519286521546853 + } + ] + }, + { + "batch_size": 1, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 190.91017930680567, + "accept_length": 1.0 + }, + { + "Name": 
"lmsys/sglang-EAGLE3-LLaMA3.1-Instruct-8B", + "output_throughput": 480.3152659024827, + "accept_length": 4.0959237477185155 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.1-8B-Instruct-SpecForge", + "output_throughput": 571.4886457684788, + "accept_length": 4.910129659643436 + } + ] + }, + { + "batch_size": 1, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 190.91017930680567, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.1-Instruct-8B", + "output_throughput": 311.1051926955927, + "accept_length": 2.9338537387017256 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.1-8B-Instruct-SpecForge", + "output_throughput": 330.15665770360005, + "accept_length": 3.126203604641593 + } + ] + } + ] + }, + "livecodebench": { + "benchmark_name": "livecodebench", + "results": [ + { + "batch_size": 1, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 189.70410640395912, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.1-Instruct-8B", + "output_throughput": 380.6915537026263, + "accept_length": 2.6893540748536475 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.1-8B-Instruct-SpecForge", + "output_throughput": 439.67672671912396, + "accept_length": 3.16861704188786 + } + ] + }, + { + "batch_size": 1, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 189.70410640395912, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.1-Instruct-8B", + "output_throughput": 398.3738662742165, + "accept_length": 3.1199565043209523 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.1-8B-Instruct-SpecForge", + "output_throughput": 506.22686693578754, + "accept_length": 3.9957244075250427 + } + ] + }, + { + "batch_size": 1, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + 
"output_throughput": 189.70410640395912, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.1-Instruct-8B", + "output_throughput": 322.29847741557273, + "accept_length": 2.771756050751679 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.1-8B-Instruct-SpecForge", + "output_throughput": 375.34956052924895, + "accept_length": 3.236171472299629 + } + ] + }, + { + "batch_size": 1, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 189.70410640395912, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.1-Instruct-8B", + "output_throughput": 391.25705242634194, + "accept_length": 3.334862665932587 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.1-8B-Instruct-SpecForge", + "output_throughput": 516.904537338255, + "accept_length": 4.466856034741759 + } + ] + }, + { + "batch_size": 1, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 189.70410640395912, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.1-Instruct-8B", + "output_throughput": 287.68205157705233, + "accept_length": 2.7148899046029547 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.1-8B-Instruct-SpecForge", + "output_throughput": 378.8468257829908, + "accept_length": 3.585376494197714 + } + ] + } + ] + }, + "financeqa": { + "benchmark_name": "financeqa", + "results": [ + { + "batch_size": 1, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 185.6534194378935, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.1-Instruct-8B", + "output_throughput": 237.18050733350836, + "accept_length": 1.713236561734993 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.1-8B-Instruct-SpecForge", + "output_throughput": 258.6437346257605, + "accept_length": 1.9050339301460721 + } + ] + }, + { + "batch_size": 1, + "steps": 5, + "topk": 1, + 
"num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 185.6534194378935, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.1-Instruct-8B", + "output_throughput": 226.67848476067016, + "accept_length": 1.8075300109130592 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.1-8B-Instruct-SpecForge", + "output_throughput": 254.48969338840087, + "accept_length": 2.043805528134255 + } + ] + }, + { + "batch_size": 1, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 185.6534194378935, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.1-Instruct-8B", + "output_throughput": 210.94791438286492, + "accept_length": 1.8654798891594593 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.1-8B-Instruct-SpecForge", + "output_throughput": 251.07710462288492, + "accept_length": 2.2264818220398923 + } + ] + }, + { + "batch_size": 1, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 185.6534194378935, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.1-Instruct-8B", + "output_throughput": 211.18454065719607, + "accept_length": 1.8434056761268782 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.1-8B-Instruct-SpecForge", + "output_throughput": 240.6034453504167, + "accept_length": 2.1029710512950737 + } + ] + }, + { + "batch_size": 1, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 185.6534194378935, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.1-Instruct-8B", + "output_throughput": 183.72672690273865, + "accept_length": 1.7817737292479987 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.1-8B-Instruct-SpecForge", + "output_throughput": 229.82170237350869, + "accept_length": 2.250341575212658 + } + ] + } + ] + }, + "gpqa": { + "benchmark_name": "gpqa", + 
"results": [ + { + "batch_size": 1, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 190.4500188461883, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.1-Instruct-8B", + "output_throughput": 409.86415544506445, + "accept_length": 2.8552892726009724 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.1-8B-Instruct-SpecForge", + "output_throughput": 442.54523731909666, + "accept_length": 3.135712400558006 + } + ] + }, + { + "batch_size": 1, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 190.4500188461883, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.1-Instruct-8B", + "output_throughput": 438.0519648397228, + "accept_length": 3.3792158666871135 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.1-8B-Instruct-SpecForge", + "output_throughput": 507.1290934019136, + "accept_length": 3.936040126357265 + } + ] + }, + { + "batch_size": 1, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 190.4500188461883, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.1-Instruct-8B", + "output_throughput": 352.1689105895484, + "accept_length": 3.026258098612226 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.1-8B-Instruct-SpecForge", + "output_throughput": 413.1686528229548, + "accept_length": 3.5475168823860437 + } + ] + }, + { + "batch_size": 1, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 190.4500188461883, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.1-Instruct-8B", + "output_throughput": 434.1788724748705, + "accept_length": 3.6819800875461333 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.1-8B-Instruct-SpecForge", + "output_throughput": 514.2312383540044, + "accept_length": 4.357665531437638 + } + ] + }, 
+ { + "batch_size": 1, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 190.4500188461883, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.1-Instruct-8B", + "output_throughput": 311.5910755177637, + "accept_length": 2.9283727399165507 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.1-8B-Instruct-SpecForge", + "output_throughput": 390.64506651929287, + "accept_length": 3.692280754414928 + } + ] + } + ] + } + }, + "Llama-3.3-70B-Instruct": { + "gsm8k": { + "benchmark_name": "gsm8k", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 453.2156138501392, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.3-Instruct-70B", + "output_throughput": 521.4502791575164, + "accept_length": 1.2760798037239203 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.3-70B-Instruct-Specforge", + "output_throughput": 837.9426300003847, + "accept_length": 2.3179247901200304 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 453.2156138501392, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.3-Instruct-70B", + "output_throughput": 500.5534332009228, + "accept_length": 1.2836005168205962 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.3-70B-Instruct-Specforge", + "output_throughput": 855.6400225608106, + "accept_length": 2.4851382017038057 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 453.2156138501392, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.3-Instruct-70B", + "output_throughput": 500.33326156436937, + "accept_length": 1.3482255389718076 + }, + { + "Name":
"lmsys/SGLang-EAGLE3-Llama-3.3-70B-Instruct-Specforge", + "output_throughput": 758.9001336688345, + "accept_length": 2.12511673151751 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 453.2156138501392, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.3-Instruct-70B", + "output_throughput": 483.12653680688, + "accept_length": 1.2856745693167546 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.3-70B-Instruct-Specforge", + "output_throughput": 820.5175400063332, + "accept_length": 2.516910489405022 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 453.2156138501392, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.3-Instruct-70B", + "output_throughput": 480.4218686725539, + "accept_length": 1.3936331604189096 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.3-70B-Instruct-Specforge", + "output_throughput": 739.405741336959, + "accept_length": 2.222061210294459 + } + ] + } + ] + }, + "math500": { + "benchmark_name": "math500", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 567.3739460148672, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.3-Instruct-70B", + "output_throughput": 1088.844896763402, + "accept_length": 2.3720131878590123 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.3-70B-Instruct-Specforge", + "output_throughput": 1273.7733416283656, + "accept_length": 2.841736535013628 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 567.3739460148672, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.3-Instruct-70B", + "output_throughput": 
1122.2476729474943, + "accept_length": 2.5920045204124875 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.3-70B-Instruct-Specforge", + "output_throughput": 1382.9357431087456, + "accept_length": 3.243898689873717 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 567.3739460148672, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.3-Instruct-70B", + "output_throughput": 1112.8479569335152, + "accept_length": 2.792588962605549 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.3-70B-Instruct-Specforge", + "output_throughput": 1274.2110431983278, + "accept_length": 3.2416170775479363 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 567.3739460148672, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.3-Instruct-70B", + "output_throughput": 1079.9951811356827, + "accept_length": 2.6718376973892366 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.3-70B-Instruct-Specforge", + "output_throughput": 1327.6044700788502, + "accept_length": 3.3766338373668217 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 567.3739460148672, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.3-Instruct-70B", + "output_throughput": 1090.3170854344964, + "accept_length": 2.966812280063099 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.3-70B-Instruct-Specforge", + "output_throughput": 1215.8347875575441, + "accept_length": 3.3641021480547684 + } + ] + } + ] + }, + "mtbench": { + "benchmark_name": "mtbench", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 540.4640557255416, + "accept_length": 1.0 + }, + { + 
"Name": "lmsys/sglang-EAGLE3-LLaMA3.3-Instruct-70B", + "output_throughput": 1234.647877556777, + "accept_length": 2.9232673267326734 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.3-70B-Instruct-Specforge", + "output_throughput": 1238.4736758319698, + "accept_length": 2.9606951984177083 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 540.4640557255416, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.3-Instruct-70B", + "output_throughput": 1377.8052334866013, + "accept_length": 3.5324281309061973 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.3-70B-Instruct-Specforge", + "output_throughput": 1409.5100765643524, + "accept_length": 3.6175162329362442 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 540.4640557255416, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.3-Instruct-70B", + "output_throughput": 1129.6661036217977, + "accept_length": 3.143848893296669 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.3-70B-Instruct-Specforge", + "output_throughput": 1108.3072501756835, + "accept_length": 3.2248797608215263 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 540.4640557255416, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.3-Instruct-70B", + "output_throughput": 1425.2993761886291, + "accept_length": 3.8789368991048736 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.3-70B-Instruct-Specforge", + "output_throughput": 1440.3671955624673, + "accept_length": 3.97791186891054 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 540.4640557255416, + "accept_length": 1.0 + }, + { + 
"Name": "lmsys/sglang-EAGLE3-LLaMA3.3-Instruct-70B", + "output_throughput": 1069.4986663607351, + "accept_length": 3.1943331425300516 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.3-70B-Instruct-Specforge", + "output_throughput": 1033.773238205561, + "accept_length": 3.2422141262192974 + } + ] + } + ] + }, + "humaneval": { + "benchmark_name": "humaneval", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 560.9500728009846, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.3-Instruct-70B", + "output_throughput": 1194.0875984832494, + "accept_length": 2.6663626344392504 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.3-70B-Instruct-Specforge", + "output_throughput": 1290.1122375104421, + "accept_length": 2.925804965875309 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 560.9500728009846, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.3-Instruct-70B", + "output_throughput": 1282.7936401185236, + "accept_length": 3.0671719811813904 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.3-70B-Instruct-Specforge", + "output_throughput": 1426.372333907719, + "accept_length": 3.436568804650481 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 560.9500728009846, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.3-Instruct-70B", + "output_throughput": 1090.1088508973057, + "accept_length": 2.8127895941495002 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.3-70B-Instruct-Specforge", + "output_throughput": 1174.0867819009864, + "accept_length": 3.0611013660766493 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + 
"output_throughput": 560.9500728009846, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.3-Instruct-70B", + "output_throughput": 1267.8737053510965, + "accept_length": 3.1906793120660706 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.3-70B-Instruct-Specforge", + "output_throughput": 1407.8140138598972, + "accept_length": 3.6735002608242047 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 560.9500728009846, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.3-Instruct-70B", + "output_throughput": 1013.2705272855593, + "accept_length": 2.7776112847805305 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.3-70B-Instruct-Specforge", + "output_throughput": 968.2027451202639, + "accept_length": 2.742653690956563 + } + ] + } + ] + }, + "livecodebench": { + "benchmark_name": "livecodebench", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 560.8834615148919, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.3-Instruct-70B", + "output_throughput": 1210.6010917932015, + "accept_length": 2.723797958423008 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.3-70B-Instruct-Specforge", + "output_throughput": 1295.014267720614, + "accept_length": 2.952023346303502 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 560.8834615148919, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.3-Instruct-70B", + "output_throughput": 1303.4195570335166, + "accept_length": 3.133414966360772 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.3-70B-Instruct-Specforge", + "output_throughput": 1423.2736941362525, + "accept_length": 3.4980468448438247 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + 
"topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 560.8834615148919, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.3-Instruct-70B", + "output_throughput": 1070.711661408102, + "accept_length": 2.735034762087001 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.3-70B-Instruct-Specforge", + "output_throughput": 1154.785652335772, + "accept_length": 2.9811645516106386 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 560.8834615148919, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.3-Instruct-70B", + "output_throughput": 1279.5345355421975, + "accept_length": 3.284394784770605 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.3-70B-Instruct-Specforge", + "output_throughput": 1399.3991191944933, + "accept_length": 3.716324359708698 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 560.8834615148919, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.3-Instruct-70B", + "output_throughput": 1013.3765756840332, + "accept_length": 2.773990564681233 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.3-70B-Instruct-Specforge", + "output_throughput": 1035.4140338795994, + "accept_length": 2.933293078243183 + } + ] + } + ] + }, + "financeqa": { + "benchmark_name": "financeqa", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 512.5751663875466, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.3-Instruct-70B", + "output_throughput": 704.0737829344649, + "accept_length": 1.645732050137249 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.3-70B-Instruct-Specforge", + "output_throughput": 936.4940018423655, + "accept_length": 
2.2541347317466722 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 512.5751663875466, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.3-Instruct-70B", + "output_throughput": 684.0195321200449, + "accept_length": 1.702027072988232 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.3-70B-Instruct-Specforge", + "output_throughput": 933.0572305312112, + "accept_length": 2.39442380929992 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 512.5751663875466, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.3-Instruct-70B", + "output_throughput": 618.4946534541955, + "accept_length": 1.7860533893688224 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.3-70B-Instruct-Specforge", + "output_throughput": 700.886442439991, + "accept_length": 2.281622206910129 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 512.5751663875466, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.3-Instruct-70B", + "output_throughput": 652.1412786559076, + "accept_length": 1.7116903633491312 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.3-70B-Instruct-Specforge", + "output_throughput": 887.7001871678323, + "accept_length": 2.452738257649581 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 512.5751663875466, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.3-Instruct-70B", + "output_throughput": 635.2599880909434, + "accept_length": 1.9610333607746286 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.3-70B-Instruct-Specforge", + "output_throughput": 854.0347909075315, + "accept_length": 
2.589833798374378 + } + ] + } + ] + }, + "gpqa": { + "benchmark_name": "gpqa", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 575.6879373469175, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.3-Instruct-70B", + "output_throughput": 962.5545831639148, + "accept_length": 2.0451300999292217 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.3-70B-Instruct-Specforge", + "output_throughput": 1020.0538308626681, + "accept_length": 2.1911976817371235 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 575.6879373469175, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.3-Instruct-70B", + "output_throughput": 963.8356757692138, + "accept_length": 2.1687507495755036 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.3-70B-Instruct-Specforge", + "output_throughput": 1039.643962895085, + "accept_length": 2.3552079123829617 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 575.6879373469175, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.3-Instruct-70B", + "output_throughput": 890.1003387342033, + "accept_length": 2.226321240698847 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.3-70B-Instruct-Specforge", + "output_throughput": 960.5616523564485, + "accept_length": 2.4811411267352264 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 575.6879373469175, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.3-Instruct-70B", + "output_throughput": 916.6826693888017, + "accept_length": 2.1849745643049188 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.3-70B-Instruct-Specforge", + 
"output_throughput": 984.4877550429275, + "accept_length": 2.4152394292465176 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 575.6879373469175, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.3-Instruct-70B", + "output_throughput": 838.0962787179271, + "accept_length": 2.3145643059121785 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.3-70B-Instruct-Specforge", + "output_throughput": 924.0808096194634, + "accept_length": 2.573260793115575 + } + ] + } + ] + } + }, + "Llama-4-Scout-17B-16E-Instruct": { + "gsm8k": { + "benchmark_name": "gsm8k", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 455.9311905316165, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-v1", + "output_throughput": 816.6176343207234, + "accept_length": 2.435108707729916 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-SpecForge", + "output_throughput": 908.8655650704263, + "accept_length": 3.1118742007294085 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 455.9311905316165, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-v1", + "output_throughput": 806.5328373116205, + "accept_length": 2.6234459324405357 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-SpecForge", + "output_throughput": 971.8534490877095, + "accept_length": 3.8715801886792454 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 455.9311905316165, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-v1", + 
"output_throughput": 708.8133468064259, + "accept_length": 2.146746247607535 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-SpecForge", + "output_throughput": 818.3072714693558, + "accept_length": 2.918526679710503 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 455.9311905316165, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-v1", + "output_throughput": 765.9810114809961, + "accept_length": 2.675257522087863 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-SpecForge", + "output_throughput": 957.227019602509, + "accept_length": 4.307217442700466 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 455.9311905316165, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-v1", + "output_throughput": 675.0775309782273, + "accept_length": 2.144316290813106 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-SpecForge", + "output_throughput": 814.5839518607636, + "accept_length": 2.627502101582583 + } + ] + } + ] + }, + "math500": { + "benchmark_name": "math500", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 561.835811548351, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-v1", + "output_throughput": 1478.9989946720648, + "accept_length": 2.366719134681358 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-SpecForge", + "output_throughput": 1884.3462895109676, + "accept_length": 3.238557789111507 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + 
"output_throughput": 561.835811548351, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-v1", + "output_throughput": 1447.5513200323323, + "accept_length": 2.5898901840327406 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-SpecForge", + "output_throughput": 2100.7682204066577, + "accept_length": 4.153214423200308 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 561.835811548351, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-v1", + "output_throughput": 1199.1485073659853, + "accept_length": 2.489558557182447 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-SpecForge", + "output_throughput": 1457.2169829849418, + "accept_length": 3.2046972238757507 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 561.835811548351, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-v1", + "output_throughput": 1330.0337890073868, + "accept_length": 2.648556845221877 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-SpecForge", + "output_throughput": 2110.3314050998847, + "accept_length": 4.7805795395081105 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 561.835811548351, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-v1", + "output_throughput": 1153.7706965189202, + "accept_length": 2.6314392278632304 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-SpecForge", + "output_throughput": 1369.6607164745208, + "accept_length": 3.2076523352436657 + } + ] + } + ] + }, + "mtbench": { + "benchmark_name": 
"mtbench", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 502.10114738381606, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-v1", + "output_throughput": 1252.9681990096112, + "accept_length": 2.3541095408844828 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-SpecForge", + "output_throughput": 1302.3829223511154, + "accept_length": 2.4913843888070693 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 502.10114738381606, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-v1", + "output_throughput": 1225.4607594389363, + "accept_length": 2.5648559607722956 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-SpecForge", + "output_throughput": 1312.399917450856, + "accept_length": 2.836414637256152 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 502.10114738381606, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-v1", + "output_throughput": 953.148992300308, + "accept_length": 2.222710749523974 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-SpecForge", + "output_throughput": 967.1281111811169, + "accept_length": 2.3256101583113455 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 502.10114738381606, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-v1", + "output_throughput": 1157.0433602013916, + "accept_length": 2.649528603387664 + }, + { + "Name": 
"lmsys/SGLang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-SpecForge", + "output_throughput": 1276.9552963643773, + "accept_length": 3.0189181867437243 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 502.10114738381606, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-v1", + "output_throughput": 940.9893388280037, + "accept_length": 2.3959043407227965 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-SpecForge", + "output_throughput": 1010.4098410869198, + "accept_length": 2.7008052625609618 + } + ] + } + ] + }, + "humaneval": { + "benchmark_name": "humaneval", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 631.8746804703884, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-v1", + "output_throughput": 1515.800628974162, + "accept_length": 2.664927494512612 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-SpecForge", + "output_throughput": 1749.0012751674196, + "accept_length": 3.224152798137449 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 631.8746804703884, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-v1", + "output_throughput": 1556.515161340629, + "accept_length": 3.085438335809807 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-SpecForge", + "output_throughput": 1921.2922045342316, + "accept_length": 4.140846637369973 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 631.8746804703884, + "accept_length": 1.0 + }, + { + "Name": 
"lmsys/sglang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-v1", + "output_throughput": 1201.849883743592, + "accept_length": 2.6006220481511346 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-SpecForge", + "output_throughput": 1393.1592557980014, + "accept_length": 3.1744799971652315 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 631.8746804703884, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-v1", + "output_throughput": 1456.346786965349, + "accept_length": 3.2582381225462083 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-SpecForge", + "output_throughput": 1944.8214954525663, + "accept_length": 4.7947306331104995 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 631.8746804703884, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-v1", + "output_throughput": 1109.058302621911, + "accept_length": 2.6508010386556267 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-SpecForge", + "output_throughput": 1234.7042057027743, + "accept_length": 3.0442784990549376 + } + ] + } + ] + }, + "livecodebench": { + "benchmark_name": "livecodebench", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 484.2501137181978, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-v1", + "output_throughput": 1598.2921930690502, + "accept_length": 2.487202280374381 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-SpecForge", + "output_throughput": 1933.9962764283844, + "accept_length": 3.14740116583215 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 
1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 484.2501137181978, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-v1", + "output_throughput": 1601.2688464385185, + "accept_length": 2.8043640587405627 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-SpecForge", + "output_throughput": 2144.3319751584095, + "accept_length": 3.983057732747085 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 484.2501137181978, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-v1", + "output_throughput": 1051.7266219288254, + "accept_length": 2.1138485934104656 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-SpecForge", + "output_throughput": 1320.656674087923, + "accept_length": 2.7145795398417976 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 484.2501137181978, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-v1", + "output_throughput": 1501.558947290443, + "accept_length": 2.929916684169992 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-SpecForge", + "output_throughput": 2170.188140733029, + "accept_length": 4.55060712303548 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 484.2501137181978, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-v1", + "output_throughput": 1009.5574686537159, + "accept_length": 2.2590065740745002 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-SpecForge", + "output_throughput": 1249.8114756626915, + "accept_length": 
2.8130523194007555 + } + ] + } + ] + }, + "financeqa": { + "benchmark_name": "financeqa", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 288.9007335547823, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-v1", + "output_throughput": 1022.713052476267, + "accept_length": 1.7952034022379475 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-SpecForge", + "output_throughput": 1189.61672405822, + "accept_length": 2.2164571332464367 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 288.9007335547823, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-v1", + "output_throughput": 963.8209003406079, + "accept_length": 1.8240590609583607 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-SpecForge", + "output_throughput": 1171.8275957081507, + "accept_length": 2.408275220827522 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 288.9007335547823, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-v1", + "output_throughput": 755.8055387643059, + "accept_length": 1.780077619663648 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-SpecForge", + "output_throughput": 887.65933899505, + "accept_length": 2.1907344347752975 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 288.9007335547823, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-v1", + "output_throughput": 885.0003924094965, + "accept_length": 1.864155494076754 
+ }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-SpecForge", + "output_throughput": 1084.5573704005851, + "accept_length": 2.459442783236034 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 288.9007335547823, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-v1", + "output_throughput": 773.7660016870891, + "accept_length": 2.05643096671835 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-SpecForge", + "output_throughput": 838.3207906571789, + "accept_length": 2.1910908349096845 + } + ] + } + ] + }, + "gpqa": { + "benchmark_name": "gpqa", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 541.0010469896803, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-v1", + "output_throughput": 1320.0198779778916, + "accept_length": 2.0166714112874526 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-SpecForge", + "output_throughput": 1482.2781495871964, + "accept_length": 2.3200242800296755 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 541.0010469896803, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-v1", + "output_throughput": 1258.0775283103167, + "accept_length": 2.135039169677331 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-SpecForge", + "output_throughput": 1468.3432054658438, + "accept_length": 2.5528455284552845 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 541.0010469896803, + "accept_length": 1.0 + }, + { + 
"Name": "lmsys/sglang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-v1", + "output_throughput": 1405.110892125768, + "accept_length": 2.8834021014937705 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-SpecForge", + "output_throughput": 1502.213627081269, + "accept_length": 3.0623772161357583 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 541.0010469896803, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-v1", + "output_throughput": 1148.5409144989237, + "accept_length": 2.1684843736177633 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-SpecForge", + "output_throughput": 1379.1223204247422, + "accept_length": 2.672381928590287 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 541.0010469896803, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-v1", + "output_throughput": 1345.7377508882935, + "accept_length": 3.044341630328194 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-SpecForge", + "output_throughput": 1474.1967930541948, + "accept_length": 3.315005686664771 + } + ] + } + ] + } + } +} diff --git a/SpecForge-ext/docs/spec_bundle/public/vite.svg b/SpecForge-ext/docs/spec_bundle/public/vite.svg new file mode 100644 index 0000000000000000000000000000000000000000..ee9fadaf9c4a762ac0ec010ca16ce8fa39a09e56 --- /dev/null +++ b/SpecForge-ext/docs/spec_bundle/public/vite.svg @@ -0,0 +1 @@ + diff --git a/SpecForge-ext/docs/spec_bundle/src/App.vue b/SpecForge-ext/docs/spec_bundle/src/App.vue new file mode 100644 index 0000000000000000000000000000000000000000..9dcb55de3bd93c89fa2d56f202e66a7dce8f7bb3 --- /dev/null +++ b/SpecForge-ext/docs/spec_bundle/src/App.vue @@ -0,0 +1,17 @@ + + + + + diff --git 
a/SpecForge-ext/docs/spec_bundle/src/components/BenchmarkChart.vue b/SpecForge-ext/docs/spec_bundle/src/components/BenchmarkChart.vue new file mode 100644 index 0000000000000000000000000000000000000000..c8022be25e3da8ad72760690c177075d12ec607c --- /dev/null +++ b/SpecForge-ext/docs/spec_bundle/src/components/BenchmarkChart.vue @@ -0,0 +1,289 @@ + + + + + diff --git a/SpecForge-ext/docs/spec_bundle/src/components/BenchmarkDashboard.vue b/SpecForge-ext/docs/spec_bundle/src/components/BenchmarkDashboard.vue new file mode 100644 index 0000000000000000000000000000000000000000..a5d33cc912211e9311f78422b0436a3e605a0dbd --- /dev/null +++ b/SpecForge-ext/docs/spec_bundle/src/components/BenchmarkDashboard.vue @@ -0,0 +1,601 @@ + + + + + diff --git a/SpecForge-ext/docs/spec_bundle/src/components/BenchmarkTable.vue b/SpecForge-ext/docs/spec_bundle/src/components/BenchmarkTable.vue new file mode 100644 index 0000000000000000000000000000000000000000..69bd280e025b1158882f77828e850959a51e3876 --- /dev/null +++ b/SpecForge-ext/docs/spec_bundle/src/components/BenchmarkTable.vue @@ -0,0 +1,364 @@ + + + + + diff --git a/SpecForge-ext/docs/spec_bundle/src/components/FilterControls.vue b/SpecForge-ext/docs/spec_bundle/src/components/FilterControls.vue new file mode 100644 index 0000000000000000000000000000000000000000..d5b2ff6a43807e872b1c6e316e0f420ad526fa19 --- /dev/null +++ b/SpecForge-ext/docs/spec_bundle/src/components/FilterControls.vue @@ -0,0 +1,189 @@ + + + + + diff --git a/SpecForge-ext/docs/spec_bundle/src/components/HelloWorld.vue b/SpecForge-ext/docs/spec_bundle/src/components/HelloWorld.vue new file mode 100644 index 0000000000000000000000000000000000000000..546ebbc624b0e3baf58efc6a8dd149ac5e6074e6 --- /dev/null +++ b/SpecForge-ext/docs/spec_bundle/src/components/HelloWorld.vue @@ -0,0 +1,43 @@ + + + + + diff --git a/SpecForge-ext/docs/spec_bundle/src/main.js b/SpecForge-ext/docs/spec_bundle/src/main.js new file mode 100644 index 
0000000000000000000000000000000000000000..2425c0f745bef4d009cb6661b62fd9dfd62960b0 --- /dev/null +++ b/SpecForge-ext/docs/spec_bundle/src/main.js @@ -0,0 +1,5 @@ +import { createApp } from 'vue' +import './style.css' +import App from './App.vue' + +createApp(App).mount('#app') diff --git a/SpecForge-ext/docs/spec_bundle/src/style.css b/SpecForge-ext/docs/spec_bundle/src/style.css new file mode 100644 index 0000000000000000000000000000000000000000..7d3583b0d14cd41cf4c13824b049e80b7e16f17b --- /dev/null +++ b/SpecForge-ext/docs/spec_bundle/src/style.css @@ -0,0 +1,82 @@ +:root { + --font-sans: 'Inter', system-ui, -apple-system, sans-serif; + --font-display: 'Outfit', system-ui, -apple-system, sans-serif; + + --color-primary: #4F46E5; + /* Indigo 600 */ + --color-primary-dark: #4338CA; + --color-primary-light: #818CF8; + + --color-background: #F8FAFC; + /* Slate 50 */ + --color-surface: #FFFFFF; + + --color-text-main: #0F172A; + /* Slate 900 */ + --color-text-secondary: #64748B; + /* Slate 500 */ + --color-text-muted: #94A3B8; + /* Slate 400 */ + + --color-success: #10B981; + --color-warning: #F59E0B; + --color-danger: #EF4444; + + /* Premium Shadows - Softer and tinted */ + --shadow-sm: 0 1px 2px 0 rgba(15, 23, 42, 0.05); + --shadow-md: 0 4px 6px -1px rgba(15, 23, 42, 0.05), 0 2px 4px -2px rgba(15, 23, 42, 0.05); + --shadow-lg: 0 10px 15px -3px rgba(15, 23, 42, 0.05), 0 4px 6px -4px rgba(15, 23, 42, 0.04); + --shadow-xl: 0 20px 25px -5px rgba(15, 23, 42, 0.05), 0 8px 10px -6px rgba(15, 23, 42, 0.04); + + /* Feature Shadow - Diffuse Glow */ + --shadow-glow: 0 0 40px -10px rgba(79, 70, 229, 0.15); + + /* Tighter, more technical radii */ + --radius-lg: 8px; + --radius-xl: 12px; + --radius-2xl: 16px; +} + +* { + box-sizing: border-box; + margin: 0; + padding: 0; +} + +body { + font-family: var(--font-sans); + background-color: var(--color-background); + color: var(--color-text-main); + line-height: 1.5; + -webkit-font-smoothing: antialiased; + -moz-osx-font-smoothing: 
/**
 * Data helpers for the benchmark dashboard: loading the raw JSON, reshaping
 * per-benchmark results into one row per (draft model, config), and small
 * formatting utilities.
 */

// Names that mark the baseline (no speculative decoding) row inside a
// result's `metrics` array. The first spelling is the one this module has
// always matched, so it must stay; the corrected spelling is accepted as well
// so that fixing the typo in the data does not silently drop every baseline.
const BASELINE_METRIC_NAMES = new Set(['Wihtout EAGLE3', 'Without EAGLE3']);

/** Return true when `metric` is the baseline row of a result. */
function isBaselineMetric(metric) {
  return Boolean(metric) && BASELINE_METRIC_NAMES.has(metric.Name);
}

/**
 * Fetch and parse `./raw_data/data.json`.
 *
 * @returns {Promise<object>} The parsed JSON, or `{}` when the request or the
 *   parsing fails (the error is logged, never thrown).
 */
export async function loadAllData() {
  try {
    const response = await fetch('./raw_data/data.json');
    // fetch() only rejects on network failure; surface HTTP errors explicitly
    // instead of letting them show up as an opaque JSON syntax error.
    if (!response.ok) {
      throw new Error(`HTTP ${response.status} while fetching data.json`);
    }
    return await response.json();
  } catch (error) {
    console.error('Error loading JSON data:', error);
    return {};
  }
}

/**
 * Ratio of a speculative value to its baseline, formatted with two decimals.
 *
 * @param {number} specValue - Value measured with the draft model.
 * @param {number} baselineValue - Value measured without it.
 * @returns {string|null} e.g. `"2.50"`, or `null` when either input is
 *   missing/zero or the ratio is not a finite number.
 */
export function calculateSpeedup(specValue, baselineValue) {
  if (!specValue || !baselineValue) return null;
  const ratio = specValue / baselineValue;
  return Number.isFinite(ratio) ? ratio.toFixed(2) : null;
}

/**
 * Flatten one target model's benchmark data into table rows.
 *
 * Rows are keyed by `draftModel|config`, so every benchmark contributes its
 * metrics to the same row for a given draft model and configuration. The
 * baseline row uses draft model `'None'` and config `'baseline'`.
 *
 * @param {object} modelData - Map of benchmark id to
 *   `{ benchmark_name, results: [...] }`.
 * @param {string} targetModelName - Name stored on every row.
 * @returns {Array<object>} Rows in first-seen order; `[]` when an argument
 *   is missing.
 */
export function processModelData(modelData, targetModelName) {
  if (!modelData || !targetModelName) return [];

  // Aggregated rows, keyed by draft model + configuration.
  const entriesMap = new Map();

  Object.values(modelData).forEach(benchmarkData => {
    if (!benchmarkData) return;
    const benchmarkName = benchmarkData.benchmark_name;
    const results = benchmarkData.results || [];

    results.forEach(result => {
      if (!result) return;
      const { batch_size, steps, topk, num_draft_tokens } = result;
      // A result without a metrics array contributes nothing instead of
      // aborting the whole table.
      const metrics = Array.isArray(result.metrics) ? result.metrics : [];

      const baselineMetric = metrics.find(isBaselineMetric);

      metrics.forEach(metric => {
        if (!metric) return;
        const isBaseline = isBaselineMetric(metric);
        const config = isBaseline
          ? 'baseline'
          : `${batch_size}-${steps}-${topk}-${num_draft_tokens}`;

        // The metric's Name doubles as the draft model identifier.
        const draftModel = isBaseline ? 'None' : metric.Name;

        // Keeps the baseline and every draft configuration in separate rows.
        const key = `${draftModel}|${config}`;

        if (!entriesMap.has(key)) {
          entriesMap.set(key, {
            targetModel: targetModelName,
            draftModel,
            config,
            batch_size,
            steps,
            topk,
            num_draft_tokens,
            metrics: {},
            baseline: {}
          });
        }

        const entry = entriesMap.get(key);

        entry.metrics[benchmarkName] = {
          throughput: metric.output_throughput,
          accLen: metric.accept_length
        };

        // Attach the baseline of the same result so speedups can be derived.
        if (baselineMetric) {
          entry.baseline[benchmarkName] = {
            throughput: baselineMetric.output_throughput,
            accLen: baselineMetric.accept_length
          };
        }
      });
    });
  });

  return Array.from(entriesMap.values());
}

/** Return the target model names, i.e. the top-level keys of `allData`. */
export function getTargetModels(allData) {
  return Object.keys(allData);
}

/** Return the distinct, non-empty `targetModel` values of processed rows. */
export function extractUniqueTargetModels(processedData) {
  return [...new Set(processedData.map(d => d.targetModel).filter(Boolean))];
}

/**
 * Strip the "SGLang-EAGLE3" marker from a model name (case-insensitive).
 *
 * Examples: "lmsys/SGLang-EAGLE3-foo" -> "lmsys/foo",
 *           "SGLang-EAGLE3/foo"       -> "foo",
 *           "SGLang-EAGLE3-foo"       -> "foo".
 *
 * @param {string} modelName
 * @returns {string} The cleaned name; the original value when it is falsy or
 *   when stripping would leave an empty string.
 */
export function removeSGLangPrefix(modelName) {
  if (!modelName) return modelName;

  let cleaned = String(modelName);

  // "SGLang-EAGLE3-" at the start or right after a slash.
  cleaned = cleaned.replace(/(^|\/)SGLang-EAGLE3-/gi, '$1');

  // "SGLang-EAGLE3/" at the start or right after a slash.
  cleaned = cleaned.replace(/(^|\/)SGLang-EAGLE3\//gi, '$1');

  // Bare "SGLang-EAGLE3" at the start, not followed by "-" or "/".
  cleaned = cleaned.replace(/^SGLang-EAGLE3(?![-\/])/gi, '');

  // Collapse any run of slashes left behind.
  cleaned = cleaned.replace(/\/+/g, '/');

  // Drop a leading slash unless it is the only character.
  if (cleaned.length > 1) {
    cleaned = cleaned.replace(/^\//, '');
  }

  return cleaned || modelName;
}
"attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 151643, + "draft_vocab_size": 32000, + "dtype": "bfloat16", + "eos_token_id": 151645, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 12288, + "max_position_embeddings": 40960, + "max_window_layers": 36, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 5, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000, + "sliding_window": null, + "tie_word_embeddings": false, + "transformers_version": "4.57.1", + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151936 +} diff --git a/SpecForge-ext/outputs/qwen3-8b-qwen3eagle-5layer/epoch_8_step_10000/config.json b/SpecForge-ext/outputs/qwen3-8b-qwen3eagle-5layer/epoch_8_step_10000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..4b678eea689465af6c679eb21e7c9c1ad9cfaf09 --- /dev/null +++ b/SpecForge-ext/outputs/qwen3-8b-qwen3eagle-5layer/epoch_8_step_10000/config.json @@ -0,0 +1,33 @@ +{ + "architectures": [ + "LlamaForCausalLMEagle3" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 151643, + "draft_vocab_size": 32000, + "dtype": "bfloat16", + "eos_token_id": 151645, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 12288, + "max_position_embeddings": 40960, + "max_window_layers": 36, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 5, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000, + "sliding_window": null, + "tie_word_embeddings": false, + "transformers_version": "4.57.1", + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151936 +} diff --git 
"""Run a command while holding exclusive per-GPU file locks.

One lock file per GPU (see ``--lock-path-pattern``) is locked with
``flock``; the locked descriptors are marked inheritable so they stay open
across ``os.execvp`` and the exec'ed command keeps holding the GPUs.
"""
import argparse
import fcntl
import os
import random
import sys
import time
from typing import List

# Upper bound (seconds) of the random sleep between acquisition retries.
SLEEP_BACKOFF = 5.0


def main():
    """
    Acquire the requested GPUs, export the environment and exec the command.

    Remark: Can use `lslocks` to debug
    """
    args = _parse_args()

    if args.print_only:
        _execute_print_only(args)
        return

    # Validate --env before waiting for locks so a malformed entry fails
    # immediately instead of after a potentially long acquisition.
    env_pairs = [_parse_env_assignment(env_var) for env_var in (args.env or [])]

    fd_locks = _try_acquire(args)

    dev_list = ",".join(str(x.gpu_id) for x in fd_locks)
    os.environ["CUDA_VISIBLE_DEVICES"] = dev_list

    for name, value in env_pairs:
        os.environ[name] = value
        # NOTE(review): this echoes the value verbatim; if secrets such as
        # tokens are passed through --env they end up in the job log.
        print(
            f"[gpu_lock_exec] Setting environment variable: {name}={value}",
            flush=True,
        )
    print(f"[gpu_lock_exec] Acquired GPUs: {dev_list}", flush=True)

    _os_execvp(args)


def _parse_env_assignment(env_var: str) -> tuple:
    """Split one ``NAME=VALUE`` item into ``(name, value)``.

    Only the first ``=`` separates name and value, so values may themselves
    contain ``=`` (for example base64 padding).

    Raises:
        ValueError: if there is no ``=`` or the name is empty.
    """
    name, sep, value = env_var.partition("=")
    if not sep or not name:
        raise ValueError(f"ERROR: --env expects NAME=VALUE, got {env_var!r}.")
    return name, value


def _strip_separator(cmd: List[str]) -> List[str]:
    """Return ``cmd`` without a leading ``--`` separator, if present."""
    return cmd[1:] if cmd[:1] == ["--"] else cmd


def _os_execvp(args):
    """Replace the current process with the user command.

    Raises:
        ValueError: if no command remains after dropping a leading ``--``.
    """
    cmd = _strip_separator(args.cmd)
    if not cmd:
        raise ValueError("ERROR: missing command to run. Use -- before command.")

    # propagate the environment variables
    os.execvp(cmd[0], cmd)


def _parse_args():
    """Parse and validate the command line.

    Raises:
        ValueError: if the lock path pattern lacks ``{gpu_id}`` or no command
            was given (unless ``--print-only``).
    """
    p = argparse.ArgumentParser()
    p.add_argument(
        "--count", type=int, default=None, help="Acquire this many GPUs (any free ones)"
    )
    p.add_argument(
        "--devices",
        type=str,
        default=None,
        help="Comma separated explicit devices to acquire (e.g. 0,1)",
    )
    p.add_argument(
        "--total-gpus", type=int, default=8, help="Total GPUs on the machine"
    )
    p.add_argument(
        "--timeout",
        type=int,
        default=3600,
        help="Seconds to wait for locks before failing",
    )
    p.add_argument(
        "--env",
        type=str,
        default=None,
        nargs="*",
        help="Environment variables to set (e.g. HF_TOKEN=1234567890)",
    )
    p.add_argument(
        "--lock-path-pattern",
        type=str,
        default="/dev/shm/custom_gpu_lock_{gpu_id}.lock",
        help='Filename pattern with "{gpu_id}" placeholder',
    )
    p.add_argument(
        "--print-only",
        action="store_true",
        help="Probe free devices and print them (does NOT hold locks)",
    )
    p.add_argument(
        "cmd",
        nargs=argparse.REMAINDER,
        help="Command to exec after '--' (required unless --print-only)",
    )
    args = p.parse_args()

    if "{gpu_id}" not in args.lock_path_pattern:
        raise ValueError(
            "ERROR: --lock-path-pattern must contain '{gpu_id}' placeholder."
        )

    # A lone "--" is not a command either.
    if not _strip_separator(args.cmd) and not args.print_only:
        raise ValueError("ERROR: missing command to run. Use -- before command.")

    return args


def _execute_print_only(args):
    """Print the GPUs whose lock is currently free, without keeping any lock."""
    free = []
    _ensure_lock_files(path_pattern=args.lock_path_pattern, total_gpus=args.total_gpus)
    for i in range(args.total_gpus):
        try:
            fd_lock = FdLock(args.lock_path_pattern, i)
            fd_lock.open()
            try:
                fd_lock.lock()
                fcntl.flock(fd_lock.fd, fcntl.LOCK_UN)
                free.append(i)
            except BlockingIOError:
                # Held by someone else: simply not free.
                pass
            finally:
                # Close on every path so a failed probe does not leak the fd.
                fd_lock.close()
        except Exception as e:
            print(
                f"Warning: Error while probing lock: {e}", file=sys.stderr, flush=True
            )

    print("Free GPUs:", ",".join(str(x) for x in free), flush=True)


def _try_acquire(args):
    """Dispatch to explicit-device or count-based acquisition."""
    if args.devices:
        devs = _parse_devices(args.devices)
        return _try_acquire_specific(devs, args.lock_path_pattern, args.timeout)
    else:
        return _try_acquire_count(
            args.count, args.total_gpus, args.lock_path_pattern, args.timeout
        )


def _try_acquire_specific(devs: List[int], path_pattern: str, timeout: int):
    """Lock exactly the GPUs in ``devs``, waiting up to ``timeout`` seconds.

    Returns:
        The list of locked ``FdLock`` objects, in the order requested.

    Raises:
        ValueError: if ``devs`` is empty.
        TimeoutError: if a GPU stays busy past ``timeout``.
    """
    # A repeated id would make this process wait on a lock it already holds
    # (each open() gets its own flock), so drop duplicates, keeping order.
    devs = list(dict.fromkeys(devs))
    if not devs:
        raise ValueError("ERROR: --devices did not contain any GPU id.")

    fd_locks = []
    start = time.time()
    try:
        _ensure_lock_files(path_pattern, max(devs) + 1)
        for gpu_id in devs:
            fd_lock = FdLock(path_pattern, gpu_id=gpu_id)
            fd_lock.open()
            # Register before locking so the cleanup below also closes a
            # descriptor that was opened but never locked.
            fd_locks.append(fd_lock)
            while True:
                try:
                    fd_lock.lock()
                    break
                except BlockingIOError:
                    if time.time() - start > timeout:
                        raise TimeoutError(f"Timeout while waiting for GPU {gpu_id}")
                    time.sleep(SLEEP_BACKOFF * random.random())
        return fd_locks
    except Exception as e:
        print(
            f"Error during specific GPU acquisition: {e}", file=sys.stderr, flush=True
        )
        for fd_lock in fd_locks:
            fd_lock.close()
        raise


def _try_acquire_count(count: int, total_gpus: int, path_pattern: str, timeout: int):
    """Lock any ``count`` free GPUs out of ``range(total_gpus)``.

    Each attempt scans the GPUs in order; if fewer than ``count`` could be
    locked, everything is released and the scan is retried after a random
    sleep, until ``timeout`` seconds have passed.

    Raises:
        ValueError: if ``count`` is missing, not positive, or larger than
            ``total_gpus`` (requests that could never be satisfied).
        TimeoutError: if not enough GPUs became free in time.
    """
    if count is None or count <= 0:
        raise ValueError(
            "ERROR: --count must be a positive integer when --devices is not given."
        )
    if count > total_gpus:
        raise ValueError(
            f"ERROR: cannot acquire {count} GPUs, only {total_gpus} exist (--total-gpus)."
        )

    start = time.time()
    _ensure_lock_files(path_pattern, total_gpus)
    while True:
        fd_locks: List = []
        for gpu_id in range(total_gpus):
            fd_lock = FdLock(path_pattern, gpu_id=gpu_id)
            fd_lock.open()
            try:
                fd_lock.lock()
            except BlockingIOError:
                fd_lock.close()
                continue

            fd_locks.append(fd_lock)
            if len(fd_locks) == count:
                return fd_locks

        # Not enough this round: release so other waiters are not starved.
        gotten_gpu_ids = [x.gpu_id for x in fd_locks]
        for fd_lock in fd_locks:
            fd_lock.close()

        if time.time() - start > timeout:
            raise TimeoutError(f"Timeout acquiring {count} GPUs (out of {total_gpus})")

        print(
            f"[gpu_lock_exec] try_acquire_count failed, sleep and retry (only got: {gotten_gpu_ids})",
            flush=True,
        )
        time.sleep(SLEEP_BACKOFF * random.random())


class FdLock:
    """An ``flock``-based exclusive lock on the lock file of one GPU."""

    def __init__(self, path_pattern, gpu_id: int):
        self.gpu_id = gpu_id
        self.path = _get_lock_path(path_pattern, self.gpu_id)
        # File object while open, None otherwise.
        self.fd = None

    def open(self):
        """Open the lock file and mark its descriptor inheritable."""
        assert self.fd is None
        self.fd = open(self.path, "a+")
        # try to avoid lock disappear when execvp
        os.set_inheritable(self.fd.fileno(), True)

    def lock(self):
        """Take the exclusive lock without blocking.

        Raises:
            BlockingIOError: if the lock is currently held elsewhere.
        """
        assert self.fd is not None
        fcntl.flock(self.fd, fcntl.LOCK_EX | fcntl.LOCK_NB)

    def close(self):
        """Close the file (which also drops the lock); failures only warn."""
        assert self.fd is not None
        try:
            self.fd.close()
        except Exception as e:
            print(
                f"Warning: Failed to close file descriptor: {e}",
                file=sys.stderr,
                flush=True,
            )
        self.fd = None


def _ensure_lock_files(path_pattern: str, total_gpus: int):
    """Create the lock directory and one lock file per GPU (best effort)."""
    lock_dir = os.path.dirname(path_pattern)
    if lock_dir:
        os.makedirs(lock_dir, exist_ok=True)
    for gpu_id in range(total_gpus):
        p = _get_lock_path(path_pattern, gpu_id)
        try:
            with open(p, "a"):
                pass
        except Exception as e:
            print(
                f"Warning: Could not create lock file {p}: {e}",
                file=sys.stderr,
                flush=True,
            )


def _get_lock_path(path_pattern: str, gpu_id: int) -> str:
    """Fill the ``{gpu_id}`` placeholder of ``path_pattern``."""
    return path_pattern.format(gpu_id=gpu_id)


def _parse_devices(s: str) -> List[int]:
    """Parse a comma separated device list such as ``"0,1"``; blanks are skipped."""
    return [int(x) for x in s.split(",") if x.strip() != ""]


if __name__ == "__main__":
    main()
class TestTemplatePreprocessing(unittest.TestCase):
    """Regression tests pinning ``input_ids`` / ``loss_mask`` per chat template.

    Every test preprocesses a fixed conversation with one tokenizer and one
    registered template and compares the first sample against a JSON file in
    ``REF_DIR``.
    """

    # Set to True to write the reference files instead of comparing to them.
    SAVE_REFERENCE = False
    REF_DIR = os.path.join(os.path.dirname(__file__), "test_references")

    @classmethod
    def setUpClass(cls):
        """Create the reference directory and the shared conversations."""

        def turn(role: str, content: str) -> Dict[str, str]:
            return {"role": role, "content": content}

        cls.max_length = 65535
        if not os.path.exists(cls.REF_DIR):
            os.makedirs(cls.REF_DIR)

        # Plain user/assistant exchange used by most templates.
        cls.standard_messages = [
            [
                turn("user", "Who are you?"),
                turn("assistant", "My name is Qwen2."),
                turn("user", "How old are you?"),
                turn("assistant", "11 years old."),
            ]
        ]

        # GPT-OSS conversation with separate analysis and final roles.
        cls.gpt_oss_messages = [
            [
                turn("user", "Explain Quantum Physics."),
                turn(
                    "assistant_analysis",
                    "The user wants a summary of quantum physics. I should cover wave-particle duality and uncertainty principle.",
                ),
                turn(
                    "assistant_final",
                    "Quantum physics is the study of matter and energy at the most fundamental level...",
                ),
                turn("user", "Explain Quantum Physics."),
                turn("assistant_final", "I'm Qwen"),
            ]
        ]

        # Conversation containing tool turns, including two in a row.
        forecast_payload = '{"unit": "Celsius", "forecast": "Clear skies all day."}'
        cls.tool_use_messages = [
            [
                turn("user", "What's the weather like in Beijing today?"),
                turn(
                    "assistant",
                    "I'll check the current weather in Beijing for you.",
                ),
                turn(
                    "tool",
                    '{"location": "Beijing", "temperature": 22, "condition": "Sunny"}',
                ),
                turn(
                    "assistant",
                    "The current weather in Beijing is sunny with a temperature of 22°C.",
                ),
                turn("tool", forecast_payload),
                turn("tool", forecast_payload),
                turn(
                    "user",
                    "Great! Can you also tell me if it will rain tomorrow?",
                ),
                turn(
                    "assistant",
                    "Based on the forecast, there will be no rain tomorrow—expect clear skies all day.",
                ),
            ]
        ]

    def _get_ref_path(self, template_key: str, message_label: str = "standard"):
        """Return the reference file path for a template / message-set pair."""
        filename = f"{template_key}_{message_label}_ref.json"
        return os.path.join(self.REF_DIR, filename)

    def _run_template_test(
        self,
        model_name: str,
        template_key: str,
        messages: Optional[List[List[Dict[str, str]]]] = None,
    ):
        """Preprocess one message set and compare (or save) the reference.

        ``messages`` must be one of the class-level message sets; ``None``
        selects ``standard_messages``. Anything else raises ``ValueError``.
        """
        target_messages = self.standard_messages if messages is None else messages

        # The label is part of the reference file name.
        known_sets = (
            ("standard", self.standard_messages),
            ("gpt-oss", self.gpt_oss_messages),
            ("tool-use", self.tool_use_messages),
        )
        message_label = next(
            (label for label, known in known_sets if target_messages == known),
            None,
        )
        if message_label is None:
            raise ValueError("Invalid message set")
        print(f"\n>>> Running: {template_key} ({model_name}) {message_label}")

        # Tokenizer and template under test.
        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
        chat_template = TEMPLATE_REGISTRY.get(template_key)

        res = preprocess_conversations(
            tokenizer, target_messages, chat_template, self.max_length
        )
        current_data = {
            field: res[field][0][0].tolist() for field in ("input_ids", "loss_mask")
        }

        ref_path = self._get_ref_path(template_key, message_label)
        if self.SAVE_REFERENCE:
            with open(ref_path, "w", encoding="utf-8") as f:
                json.dump(current_data, f)
            print(f"  [INFO] Reference saved to {ref_path}")
        else:
            if not os.path.exists(ref_path):
                self.fail(
                    f"Reference file not found for {template_key}. Set SAVE_REFERENCE=True."
                )

            with open(ref_path, "r", encoding="utf-8") as f:
                ref_data = json.load(f)

            for field in ("input_ids", "loss_mask"):
                self.assertListEqual(current_data[field], ref_data[field])
            print(f"  [PASS] Regression test passed for {template_key}")

        # Colored dump of the mask for eyeballing the result.
        self.debug_show_loss_mask(res, tokenizer)

    @staticmethod
    def debug_show_loss_mask(res: Dict[str, Any], tokenizer: AutoTokenizer):
        """Print the first sample token by token, red where ``loss_mask`` is 1."""
        RED, RESET = "\033[91m", "\033[0m"
        token_ids = res["input_ids"][0][0].tolist()
        mask_values = res["loss_mask"][0][0].tolist()
        rule = "-" * 30

        print(rule)
        for token_id, flag in zip(token_ids, mask_values):
            piece = tokenizer.decode([token_id]).replace("\n", "\\n")
            color = RED if flag == 1 else ""
            print(f"{color}{piece}{RESET}", end="")
        print("\n" + rule)

    ## The Following are tests. Each test corresponds to a specific model and template.

    def test_deepseek(self):
        self._run_template_test("deepseek-ai/DeepSeek-V3", "deepseek-v3")

    def test_deepseek_v32(self):
        self._run_template_test("deepseek-ai/DeepSeek-V3.2", "deepseek-v32")

    def test_qwen3_thinking(self):
        self._run_template_test("Qwen/Qwen3-0.6B", "qwen3-thinking")

    def test_qwen3_instruct(self):
        self._run_template_test("Qwen/Qwen3-0.6B", "qwen3-instruct")

    def test_qwen3_next_instruct(self):
        self._run_template_test("Qwen/Qwen3-Next-80B-A3B-Instruct", "qwen")

    def test_kimi_k2_thinking(self):
        self._run_template_test("moonshotai/Kimi-K2-Thinking", "kimi-k2-thinking")

    def test_kimi_k2_instruct(self):
        self._run_template_test("moonshotai/Kimi-K2-Instruct", "kimi-k2-instruct")

    def test_qwen3_next_thinking(self):
        self._run_template_test(
            "Qwen/Qwen3-Next-80B-A3B-Thinking", "qwen3-next-thinking"
        )

    def test_gpt_oss(self):
        self._run_template_test(
            "openai/gpt-oss-120b", "gpt-oss", messages=self.gpt_oss_messages
        )

    def test_ling_flash_2_0(self):
        self._run_template_test("inclusionAI/Ling-flash-2.0", "ling-flash-2.0")

    def test_qwen3_instruct_with_tools(self):
        self._run_template_test(
            "Qwen/Qwen3-0.6B", "qwen3-instruct", messages=self.tool_use_messages
        )
class TestPreprocessing(unittest.TestCase):
    """Checks that preprocessing marks exactly the assistant spans for loss."""

    def setUp(self):
        """Load the Qwen3-8B tokenizer and the "qwen" template for each test."""
        self.model_path = "Qwen/Qwen3-8B"
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_path)
        self.chat_template = TEMPLATE_REGISTRY.get("qwen")
        self.max_length = 512

    def _preprocess(self, conversations, is_preformatted=False):
        """Run ``preprocess_conversations`` with this fixture's settings."""
        return preprocess_conversations(
            tokenizer=self.tokenizer,
            conversations=conversations,
            chat_template=self.chat_template,
            max_length=self.max_length,
            is_preformatted=is_preformatted,
        )

    def _decode_positions(self, input_ids, positions):
        """Decode the tokens at ``positions``, keeping special tokens."""
        return self.tokenizer.decode(input_ids[positions], skip_special_tokens=False)

    def test_conversation_preprocessing_basic(self):
        """Test basic conversation preprocessing with assistant response identification."""
        results = self._preprocess(
            [
                [
                    {"role": "user", "content": "What is 2+2?"},
                    {"role": "assistant", "content": "The answer is 4."},
                ]
            ]
        )

        # The result exposes the three fields...
        fields = ("input_ids", "loss_mask", "attention_mask")
        for field in fields:
            self.assertIn(field, results)
        # ...each holding exactly one sample.
        for field in fields:
            self.assertEqual(len(results[field]), 1)

        input_ids = results["input_ids"][0].squeeze()
        loss_mask = results["loss_mask"][0].squeeze()
        attention_mask = results["attention_mask"][0].squeeze()

        # All three tensors describe the same sequence.
        self.assertEqual(input_ids.shape, loss_mask.shape)
        self.assertEqual(input_ids.shape, attention_mask.shape)

        # The assistant reply contributes to the loss...
        self.assertTrue(
            torch.any(loss_mask == 1), "No tokens marked for loss computation"
        )
        # ...and the rest of the prompt does not.
        self.assertTrue(
            torch.any(loss_mask == 0), "All tokens marked for loss computation"
        )

        # The masked tokens decode to the complete assistant reply.
        assistant_token_indices = torch.where(loss_mask == 1)[0]
        if len(assistant_token_indices) > 0:
            assistant_text = self._decode_positions(input_ids, assistant_token_indices)
            expected_assistant_text = "\n\n\n\nThe answer is 4.<|im_end|>\n"
            self.assertEqual(
                assistant_text,
                expected_assistant_text,
                f"Assistant text does not match exactly. Expected: {expected_assistant_text!r}, Got: {assistant_text!r}",
            )

    def test_multiple_turns_conversation(self):
        """Test conversation with multiple user-assistant turns."""
        results = self._preprocess(
            [
                [
                    {"role": "user", "content": "What is 2+2?"},
                    {"role": "assistant", "content": "The answer is 4."},
                    {"role": "user", "content": "Are you sure?"},
                    {"role": "assistant", "content": "Yes, I'm certain."},
                ]
            ]
        )

        input_ids = results["input_ids"][0].squeeze()
        loss_mask = results["loss_mask"][0].squeeze()

        assistant_token_indices = torch.where(loss_mask == 1)[0]
        self.assertTrue(
            len(assistant_token_indices) > 0, "No assistant tokens identified"
        )

        # Both assistant turns, concatenated in order.
        assistant_text = self._decode_positions(input_ids, assistant_token_indices)
        expected_assistant_text = "The answer is 4.<|im_end|>\n\n\n\n\nYes, I'm certain.<|im_end|>\n"
        self.assertEqual(
            assistant_text,
            expected_assistant_text,
            f"Assistant text does not match exactly. Expected: {expected_assistant_text!r}, Got: {assistant_text!r}",
        )

    def test_preformatted_conversation(self):
        """Test preprocessing of pre-formatted conversation strings."""
        preformatted_conversations = [
            "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nWhat is Python?<|im_end|>\n<|im_start|>assistant\nPython is a programming language.<|im_end|>\n"
        ]

        results = self._preprocess(preformatted_conversations, is_preformatted=True)

        self.assertEqual(len(results["input_ids"]), 1)

        input_ids = results["input_ids"][0].squeeze()
        loss_mask = results["loss_mask"][0].squeeze()

        self.assertTrue(
            torch.any(loss_mask == 1),
            "No assistant tokens marked in preformatted input",
        )

        assistant_text = self._decode_positions(
            input_ids, torch.where(loss_mask == 1)[0]
        )
        expected_assistant_text = "Python is a programming language.<|im_end|>\n"
        self.assertEqual(
            assistant_text,
            expected_assistant_text,
            f"Assistant text does not match exactly. Expected: {expected_assistant_text!r}, Got: {assistant_text!r}",
        )

    def test_assistant_span_boundaries(self):
        """Test that assistant span boundaries are correctly identified without truncation."""
        test_cases = [
            {
                "name": "Short response",
                "conversation": [
                    {"role": "user", "content": "Hi"},
                    {"role": "assistant", "content": "Hello!"},
                ],
                "expected_assistant_text": "\n\n\n\nHello!<|im_end|>\n",
            },
            {
                "name": "Response with punctuation",
                "conversation": [
                    {"role": "user", "content": "What's your name?"},
                    {"role": "assistant", "content": "I'm Claude, an AI assistant."},
                ],
                "expected_assistant_text": "\n\n\n\nI'm Claude, an AI assistant.<|im_end|>\n",
            },
            {
                "name": "Multi-sentence response",
                "conversation": [
                    {"role": "user", "content": "Tell me about Python."},
                    {
                        "role": "assistant",
                        "content": "Python is a programming language. It's very popular for AI.",
                    },
                ],
                "expected_assistant_text": "\n\n\n\nPython is a programming language. It's very popular for AI.<|im_end|>\n",
            },
            {
                "name": "Response with special characters",
                "conversation": [
                    {"role": "user", "content": "Show me math."},
                    {
                        "role": "assistant",
                        "content": "Sure! Here's an example: 2 + 2 = 4, and π ≈ 3.14159.",
                    },
                ],
                "expected_assistant_text": "\n\n\n\nSure! Here's an example: 2 + 2 = 4, and π ≈ 3.14159.<|im_end|>\n",
            },
        ]

        for test_case in test_cases:
            case_name = test_case["name"]
            with self.subTest(case_name):
                results = self._preprocess([test_case["conversation"]])

                input_ids = results["input_ids"][0].squeeze()
                loss_mask = results["loss_mask"][0].squeeze()

                assistant_token_indices = torch.where(loss_mask == 1)[0]
                self.assertTrue(
                    len(assistant_token_indices) > 0,
                    f"No assistant tokens found for test case: {case_name}",
                )

                assistant_text = self._decode_positions(
                    input_ids, assistant_token_indices
                )

                # The masked span must be exactly the assistant reply.
                expected_assistant_text = test_case["expected_assistant_text"]
                self.assertEqual(
                    assistant_text,
                    expected_assistant_text,
                    f"Assistant text does not match exactly for test case '{case_name}'. Expected: {expected_assistant_text!r}, Got: {assistant_text!r}",
                )

                # The user's message must not leak into the masked span.
                user_content = test_case["conversation"][0]["content"]
                self.assertNotIn(
                    user_content,
                    assistant_text,
                    f"User content '{user_content}' found in assistant spans for test case '{case_name}': '{assistant_text}'",
                )
2+2 is 4\n\nI'm pretty sure it's 4.<|im_end|>\n", + ] + results = preprocess_conversations( + tokenizer=tokenizer, + conversations=preformatted_conversations, + chat_template=chat_template, + max_length=512, + is_preformatted=True, + ) + for i in range(len(results["input_ids"])): + visualize_loss_mask(tokenizer, results["input_ids"][i], results["loss_mask"][i]) + """ diff --git a/SpecForge-ext/tests/test_data/test_references/deepseek-v32_standard_ref.json b/SpecForge-ext/tests/test_data/test_references/deepseek-v32_standard_ref.json new file mode 100644 index 0000000000000000000000000000000000000000..3aadda5a94d02c25626d394173a7d587400c57b7 --- /dev/null +++ b/SpecForge-ext/tests/test_data/test_references/deepseek-v32_standard_ref.json @@ -0,0 +1 @@ +{"input_ids": [0, 128803, 18387, 477, 440, 33, 128804, 6759, 2329, 344, 1646, 19566, 20, 16, 1, 128803, 4117, 3072, 477, 440, 33, 128804, 779, 1737, 3072, 16, 1], "loss_mask": [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1]} diff --git a/SpecForge-ext/tests/test_data/test_references/deepseek-v3_standard_ref.json b/SpecForge-ext/tests/test_data/test_references/deepseek-v3_standard_ref.json new file mode 100644 index 0000000000000000000000000000000000000000..1d759a83637eba8dab677fbe4773ed2277519076 --- /dev/null +++ b/SpecForge-ext/tests/test_data/test_references/deepseek-v3_standard_ref.json @@ -0,0 +1 @@ +{"input_ids": [0, 3476, 477, 260, 11502, 22896, 16, 128803, 18387, 477, 440, 33, 128804, 6759, 2329, 344, 1646, 19566, 20, 16, 1, 128803, 4117, 3072, 477, 440, 33, 128804, 779, 1737, 3072, 16, 1], "loss_mask": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1]} diff --git a/SpecForge-ext/tests/test_data/test_references/gpt-oss_gpt-oss_ref.json b/SpecForge-ext/tests/test_data/test_references/gpt-oss_gpt-oss_ref.json new file mode 100644 index 0000000000000000000000000000000000000000..88344bf001ebb7891ab9b437d4683fd913cadfbf --- /dev/null 
+++ b/SpecForge-ext/tests/test_data/test_references/gpt-oss_gpt-oss_ref.json @@ -0,0 +1 @@ +{"input_ids": [200006, 17360, 200008, 3575, 553, 17554, 162016, 11, 261, 4410, 6439, 2359, 22203, 656, 7788, 17527, 558, 87447, 100594, 25, 220, 1323, 19, 12, 3218, 198, 6576, 3521, 25, 220, 1323, 20, 12, 3218, 12, 2029, 279, 30377, 289, 25, 4465, 279, 2, 13888, 18403, 25, 8450, 11, 49159, 11, 1721, 13, 21030, 2804, 413, 7360, 395, 1753, 3176, 13, 200007, 200006, 1428, 200008, 176289, 90765, 48711, 13, 200007, 200006, 173781, 200005, 35644, 200008, 976, 1825, 10648, 261, 18522, 328, 48889, 35438, 13, 357, 1757, 4321, 20485, 3161, 12608, 25399, 536, 326, 44942, 30540, 13, 200007, 200006, 173781, 200005, 17196, 200008, 170939, 35438, 382, 290, 5012, 328, 7165, 326, 5954, 540, 290, 1645, 18864, 3211, 1008, 200007, 200006, 1428, 200008, 176289, 90765, 48711, 13, 200007, 200006, 173781, 200005, 17196, 200008, 15390, 1486, 11027, 200007], "loss_mask": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]} diff --git a/SpecForge-ext/tests/test_data/test_references/kimi-k2-instruct_standard_ref.json b/SpecForge-ext/tests/test_data/test_references/kimi-k2-instruct_standard_ref.json new file mode 100644 index 0000000000000000000000000000000000000000..932853d41e351337e961aff93256580cc95ae0a9 --- /dev/null +++ b/SpecForge-ext/tests/test_data/test_references/kimi-k2-instruct_standard_ref.json @@ -0,0 +1 @@ +{"input_ids": [163594, 14062, 163601, 3900, 554, 261, 13205, 26626, 13, 163586, 163587, 2482, 163601, 24328, 554, 398, 30, 163586, 163588, 69702, 163601, 6725, 1530, 387, 1999, 33249, 17, 13, 163586, 163587, 2482, 163601, 6034, 3410, 554, 
398, 30, 163586, 163588, 69702, 163601, 1228, 2285, 3410, 13, 163586], "loss_mask": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1]} diff --git a/SpecForge-ext/tests/test_data/test_references/kimi-k2-thinking_standard_ref.json b/SpecForge-ext/tests/test_data/test_references/kimi-k2-thinking_standard_ref.json new file mode 100644 index 0000000000000000000000000000000000000000..dc35abd43f6cc3f265b3b8f1d82fc5eb506565c7 --- /dev/null +++ b/SpecForge-ext/tests/test_data/test_references/kimi-k2-thinking_standard_ref.json @@ -0,0 +1 @@ +{"input_ids": [163594, 14062, 163601, 3900, 554, 261, 13205, 26626, 13, 163586, 163587, 2482, 163601, 24328, 554, 398, 30, 163586, 163588, 69702, 163601, 163606, 163607, 6725, 1530, 387, 1999, 33249, 17, 13, 163586, 163587, 2482, 163601, 6034, 3410, 554, 398, 30, 163586, 163588, 69702, 163601, 1228, 2285, 3410, 13, 163586], "loss_mask": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1]} diff --git a/SpecForge-ext/tests/test_data/test_references/ling-flash-2.0_standard_ref.json b/SpecForge-ext/tests/test_data/test_references/ling-flash-2.0_standard_ref.json new file mode 100644 index 0000000000000000000000000000000000000000..b0b4a4e1988ec0666795f1cc113436bb97fb019e --- /dev/null +++ b/SpecForge-ext/tests/test_data/test_references/ling-flash-2.0_standard_ref.json @@ -0,0 +1 @@ +{"input_ids": [157151, 90827, 157152, 2496, 449, 259, 9031, 16841, 13, 198, 14136, 5381, 6350, 928, 156895, 157151, 39, 116171, 157152, 13328, 449, 362, 30, 156895, 157151, 8469, 7342, 5468, 157152, 4653, 1717, 341, 1834, 36364, 17, 13, 156895, 157151, 39, 116171, 157152, 3115, 2622, 449, 362, 30, 156895, 157151, 8469, 7342, 5468, 157152, 16, 16, 1594, 2622, 13, 156895], "loss_mask": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 
1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]} diff --git a/SpecForge-ext/tests/test_data/test_references/qwen3-instruct_standard_ref.json b/SpecForge-ext/tests/test_data/test_references/qwen3-instruct_standard_ref.json new file mode 100644 index 0000000000000000000000000000000000000000..2d8de2ea90f5ada05e36e9fbc71304ebab64e6e0 --- /dev/null +++ b/SpecForge-ext/tests/test_data/test_references/qwen3-instruct_standard_ref.json @@ -0,0 +1 @@ +{"input_ids": [151644, 8948, 198, 2610, 525, 264, 10950, 17847, 13, 151645, 198, 151644, 872, 198, 15191, 525, 498, 30, 151645, 198, 151644, 77091, 198, 5050, 829, 374, 1207, 16948, 17, 13, 151645, 198, 151644, 872, 198, 4340, 2310, 525, 498, 30, 151645, 198, 151644, 77091, 198, 151667, 271, 151668, 271, 16, 16, 1635, 2310, 13, 151645, 198], "loss_mask": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]} diff --git a/SpecForge-ext/tests/test_data/test_references/qwen3-instruct_tool-use_ref.json b/SpecForge-ext/tests/test_data/test_references/qwen3-instruct_tool-use_ref.json new file mode 100644 index 0000000000000000000000000000000000000000..6272088047096d71cc59071caf691b448c613e7e --- /dev/null +++ b/SpecForge-ext/tests/test_data/test_references/qwen3-instruct_tool-use_ref.json @@ -0,0 +1 @@ +{"input_ids": [151644, 8948, 198, 2610, 525, 264, 10950, 17847, 13, 151645, 198, 151644, 872, 198, 3838, 594, 279, 9104, 1075, 304, 26549, 3351, 30, 151645, 198, 151644, 77091, 198, 40, 3278, 1779, 279, 1482, 9104, 304, 26549, 369, 498, 13, 151645, 198, 151644, 872, 198, 151665, 198, 4913, 2527, 788, 330, 3430, 23649, 497, 330, 34558, 788, 220, 17, 17, 11, 330, 9056, 788, 330, 50, 27297, 16707, 151666, 151645, 198, 151644, 77091, 198, 785, 1482, 9104, 304, 26549, 374, 39698, 448, 264, 9315, 315, 220, 17, 17, 30937, 13, 151645, 198, 151644, 872, 198, 151665, 198, 4913, 3843, 788, 330, 34, 40247, 
497, 330, 58984, 788, 330, 14008, 49293, 678, 1899, 1189, 532, 151666, 198, 151665, 198, 4913, 3843, 788, 330, 34, 40247, 497, 330, 58984, 788, 330, 14008, 49293, 678, 1899, 1189, 532, 151666, 151645, 198, 151644, 872, 198, 21396, 0, 2980, 498, 1083, 3291, 752, 421, 432, 686, 11174, 16577, 30, 151645, 198, 151644, 77091, 198, 151667, 271, 151668, 271, 28715, 389, 279, 17595, 11, 1052, 686, 387, 902, 11174, 16577, 2293, 17119, 2797, 49293, 678, 1899, 13, 151645, 198], "loss_mask": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]} diff --git a/SpecForge-ext/tests/test_data/test_references/qwen3-next-thinking_standard_ref.json b/SpecForge-ext/tests/test_data/test_references/qwen3-next-thinking_standard_ref.json new file mode 100644 index 0000000000000000000000000000000000000000..fe12355606c8c63f489913753e215863f4fba1ce --- /dev/null +++ b/SpecForge-ext/tests/test_data/test_references/qwen3-next-thinking_standard_ref.json @@ -0,0 +1 @@ +{"input_ids": [151644, 8948, 198, 2610, 525, 264, 10950, 17847, 13, 151645, 198, 151644, 872, 198, 15191, 525, 498, 30, 151645, 198, 151644, 77091, 198, 5050, 829, 374, 1207, 16948, 17, 13, 151645, 198, 151644, 872, 198, 4340, 2310, 525, 498, 30, 151645, 198, 151644, 77091, 198, 151667, 198, 16, 16, 1635, 2310, 13, 151645, 198], "loss_mask": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]} diff --git 
a/SpecForge-ext/tests/test_data/test_references/qwen3-thinking_standard_ref.json b/SpecForge-ext/tests/test_data/test_references/qwen3-thinking_standard_ref.json new file mode 100644 index 0000000000000000000000000000000000000000..61885ced3006f2cdc6334656511caf373361bb0a --- /dev/null +++ b/SpecForge-ext/tests/test_data/test_references/qwen3-thinking_standard_ref.json @@ -0,0 +1 @@ +{"input_ids": [151644, 8948, 198, 2610, 525, 264, 10950, 17847, 13, 151645, 198, 151644, 872, 198, 15191, 525, 498, 30, 151645, 198, 151644, 77091, 198, 5050, 829, 374, 1207, 16948, 17, 13, 151645, 198, 151644, 872, 198, 4340, 2310, 525, 498, 30, 151645, 198, 151644, 77091, 198, 16, 16, 1635, 2310, 13, 151645, 198], "loss_mask": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]} diff --git a/SpecForge-ext/tests/test_data/test_references/qwen_standard_ref.json b/SpecForge-ext/tests/test_data/test_references/qwen_standard_ref.json new file mode 100644 index 0000000000000000000000000000000000000000..61885ced3006f2cdc6334656511caf373361bb0a --- /dev/null +++ b/SpecForge-ext/tests/test_data/test_references/qwen_standard_ref.json @@ -0,0 +1 @@ +{"input_ids": [151644, 8948, 198, 2610, 525, 264, 10950, 17847, 13, 151645, 198, 151644, 872, 198, 15191, 525, 498, 30, 151645, 198, 151644, 77091, 198, 5050, 829, 374, 1207, 16948, 17, 13, 151645, 198, 151644, 872, 198, 4340, 2310, 525, 498, 30, 151645, 198, 151644, 77091, 198, 16, 16, 1635, 2310, 13, 151645, 198], "loss_mask": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]} diff --git a/SpecForge-ext/tests/test_data/test_references/qwen_tool-use_ref.json b/SpecForge-ext/tests/test_data/test_references/qwen_tool-use_ref.json new file mode 100644 index 0000000000000000000000000000000000000000..b9b96e47476f270d9bbcfc42f93b40b5ec46aebe --- 
/dev/null +++ b/SpecForge-ext/tests/test_data/test_references/qwen_tool-use_ref.json @@ -0,0 +1 @@ +{"input_ids": [151644, 8948, 198, 2610, 525, 264, 10950, 17847, 429, 646, 2711, 279, 3482, 323, 1779, 279, 9104, 13, 151645, 198, 151644, 872, 198, 3838, 594, 279, 9104, 1075, 304, 26549, 3351, 30, 151645, 198, 151644, 77091, 198, 40, 3278, 1779, 279, 1482, 9104, 304, 26549, 369, 498, 13, 151645, 198, 151644, 872, 198, 151665, 198, 4913, 2527, 788, 330, 3430, 23649, 497, 330, 34558, 788, 220, 17, 17, 11, 330, 9056, 788, 330, 50, 27297, 16707, 151666, 151645, 198, 151644, 77091, 198, 785, 1482, 9104, 304, 26549, 374, 39698, 448, 264, 9315, 315, 220, 17, 17, 30937, 13, 151645, 198, 151644, 872, 198, 151665, 198, 4913, 3843, 788, 330, 34, 40247, 497, 330, 58984, 788, 330, 14008, 49293, 678, 1899, 1189, 532, 151666, 198, 151665, 198, 4913, 3843, 788, 330, 34, 40247, 497, 330, 58984, 788, 330, 14008, 49293, 678, 1899, 1189, 532, 151666, 151645, 198, 151644, 872, 198, 21396, 0, 2980, 498, 1083, 3291, 752, 421, 432, 686, 11174, 16577, 30, 151645, 198, 151644, 77091, 198, 151667, 271, 151668, 271, 28715, 389, 279, 17595, 11, 1052, 686, 387, 902, 11174, 16577, 2293, 17119, 2797, 49293, 678, 1899, 13, 151645, 198], "loss_mask": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]} diff --git a/SpecForge-ext/tests/test_data/test_references/repo-wiki_standard_ref.json b/SpecForge-ext/tests/test_data/test_references/repo-wiki_standard_ref.json new file mode 100644 
index 0000000000000000000000000000000000000000..a12f427fe74fbd1f9afb08e3b9bcbc7c7feeed85 --- /dev/null +++ b/SpecForge-ext/tests/test_data/test_references/repo-wiki_standard_ref.json @@ -0,0 +1 @@ +{"input_ids": [151644, 872, 271, 334, 98743, 25, 1446, 27732, 5889, 304, 8453, 7, 104811, 8, 659, 56177, 27, 5778, 397, 2610, 525, 458, 20685, 10916, 9705, 11470, 448, 5538, 18726, 304, 3162, 4401, 11, 1849, 2884, 11, 323, 15754, 3139, 25262, 13, 4615, 35874, 15448, 304, 41018, 6351, 2038, 78267, 323, 6825, 9705, 14389, 429, 5165, 36967, 10916, 6540, 1119, 41679, 11, 91078, 6832, 42914, 13, 1446, 3535, 429, 24364, 9705, 17045, 438, 279, 14164, 1948, 2038, 23094, 323, 15754, 25148, 624, 522, 5778, 1339, 27, 8202, 8467, 397, 7771, 8954, 374, 311, 23643, 279, 3897, 12542, 323, 6923, 264, 15817, 9705, 6220, 5944, 429, 17045, 438, 279, 16266, 369, 264, 1879, 14800, 9705, 3910, 13, 1096, 5944, 1969, 27968, 311, 13402, 3941, 678, 3139, 5866, 11, 504, 67458, 10887, 3974, 389, 37569, 311, 11647, 22703, 11682, 5785, 7236, 382, 334, 10234, 419, 12850, 95518, 8325, 12, 51143, 9705, 27957, 25271, 15754, 389, 37569, 882, 11, 42054, 1824, 22305, 11, 323, 14177, 973, 4565, 24376, 13, 4615, 6358, 686, 8253, 1246, 13444, 7263, 646, 3535, 11, 4211, 11, 323, 13036, 419, 2038, 3152, 624, 522, 8202, 8467, 1339, 5338, 11, 1401, 1119, 279, 2701, 1995, 911, 279, 12542, 498, 1184, 311, 975, 389, 1447, 4624, 1034, 6220, 4916, 510, 27, 4987, 38283, 397, 624, 144663, 16991, 8315, 198, 262, 80493, 8315, 2972, 198, 286, 22612, 242, 16991, 2943, 18002, 198, 262, 80493, 8315, 4030, 198, 286, 80493, 3538, 18002, 198, 286, 22612, 242, 16991, 3538, 4452, 18002, 198, 262, 80493, 5439, 198, 286, 80493, 5439, 18002, 198, 286, 22612, 242, 16991, 2193, 18002, 198, 262, 22612, 242, 16991, 1887, 18002, 198, 144663, 16991, 1936, 21492, 198, 262, 80493, 5439, 198, 286, 80493, 5439, 18002, 198, 286, 22612, 242, 16991, 2193, 18002, 198, 262, 80493, 4772, 2972, 198, 286, 80493, 2943, 18002, 198, 286, 80493, 9109, 
18002, 198, 286, 22612, 242, 16991, 7497, 18002, 198, 262, 80493, 4772, 6507, 198, 286, 22612, 242, 16991, 4119, 18002, 198, 262, 80493, 4772, 4030, 198, 286, 80493, 2193, 18002, 198, 286, 80493, 3538, 18002, 198, 286, 22612, 242, 16991, 3538, 4452, 18002, 198, 262, 80493, 4772, 4314, 198, 286, 80493, 2193, 18002, 198, 286, 80493, 3553, 18002, 198, 286, 22612, 242, 16991, 3553, 4452, 18002, 198, 262, 80493, 4772, 1313, 198, 286, 80493, 1638, 91578, 18002, 198, 286, 80493, 26588, 91578, 18002, 198, 286, 80493, 2415, 18002, 198, 286, 22612, 242, 16991, 2415, 4452, 18002, 198, 262, 22612, 242, 16991, 1887, 18002, 198, 144663, 16991, 2193, 198, 262, 80493, 8315, 198, 286, 22612, 242, 16991, 2331, 33406, 198, 262, 80493, 1936, 21492, 198, 286, 22612, 242, 16991, 2331, 33406, 198, 262, 80493, 6238, 198, 286, 22612, 242, 16991, 2331, 33406, 198, 262, 80493, 13291, 198, 286, 22612, 242, 16991, 2331, 33406, 198, 262, 22612, 242, 16991, 28331, 198, 286, 22612, 242, 16991, 2331, 33406, 198, 144663, 16991, 6200, 198, 262, 80493, 23404, 2733, 18002, 198, 262, 80493, 20882, 18002, 198, 262, 80493, 20882, 4452, 18002, 198, 262, 80493, 4078, 5191, 18002, 198, 262, 80493, 4078, 5191, 4452, 18002, 198, 262, 80493, 37664, 18002, 198, 262, 80493, 3546, 8296, 18002, 198, 262, 80493, 3546, 8296, 4452, 18002, 198, 262, 80493, 2270, 466, 824, 18002, 198, 262, 80493, 2270, 466, 824, 4452, 18002, 198, 262, 80493, 14397, 8467, 18002, 198, 262, 80493, 14397, 8467, 4452, 18002, 198, 262, 80493, 14397, 842, 18002, 198, 262, 80493, 14397, 842, 4452, 18002, 198, 262, 80493, 14397, 3109, 18002, 198, 262, 80493, 14397, 3109, 4452, 18002, 198, 262, 22612, 242, 16991, 6573, 8950, 18002, 198, 144663, 16991, 26588, 198, 262, 80493, 8315, 198, 286, 22612, 242, 16991, 40549, 1192, 198, 262, 80493, 1936, 21492, 198, 286, 22612, 242, 16991, 40549, 1192, 198, 262, 80493, 58113, 198, 286, 22612, 242, 16991, 40549, 1192, 198, 262, 80493, 6238, 198, 286, 22612, 242, 16991, 40549, 1192, 198, 262, 80493, 13291, 
198, 286, 22612, 242, 16991, 40549, 1192, 198, 262, 80493, 1273, 3848, 198, 286, 22612, 242, 16991, 40549, 1192, 198, 262, 80493, 28331, 198, 286, 22612, 242, 16991, 40549, 1192, 198, 262, 22612, 242, 16991, 6505, 62, 73561, 2395, 198, 144663, 16991, 10295, 198, 262, 80493, 3483, 18855, 198, 286, 80493, 2193, 198, 310, 80493, 8315, 198, 394, 22612, 242, 16991, 4401, 33406, 198, 310, 80493, 1936, 21492, 198, 394, 22612, 242, 16991, 4401, 33406, 198, 310, 80493, 6238, 198, 394, 22612, 242, 16991, 4401, 33406, 198, 310, 80493, 13291, 198, 394, 22612, 242, 16991, 4401, 33406, 198, 310, 22612, 242, 16991, 28331, 198, 394, 22612, 242, 16991, 4401, 33406, 198, 286, 80493, 61945, 21324, 198, 286, 80493, 8315, 11667, 4090, 2395, 198, 286, 80493, 8315, 11667, 4906, 15847, 2395, 198, 286, 80493, 8315, 23241, 4090, 2395, 198, 286, 80493, 8315, 23241, 4906, 15847, 2395, 198, 286, 80493, 58113, 4090, 2395, 198, 286, 80493, 58113, 4906, 15847, 2395, 198, 286, 22612, 242, 16991, 58113, 4906, 80143, 2395, 198, 262, 22612, 242, 16991, 595, 23, 82, 198, 286, 80493, 61945, 21324, 198, 286, 22612, 242, 16991, 16661, 4323, 198, 144663, 16991, 33765, 198, 262, 80493, 2193, 198, 286, 80493, 8315, 33406, 198, 286, 80493, 1936, 21492, 33406, 198, 286, 80493, 6238, 33406, 198, 286, 80493, 13291, 33406, 198, 286, 22612, 242, 16991, 28331, 33406, 198, 262, 80493, 19911, 198, 286, 80493, 13009, 33406, 198, 286, 80493, 1936, 21492, 33406, 198, 286, 80493, 2193, 33406, 198, 286, 80493, 32372, 33406, 198, 286, 80493, 13291, 33406, 198, 286, 80493, 1273, 3848, 33406, 198, 286, 22612, 242, 16991, 90982, 33406, 198, 262, 80493, 21266, 33406, 198, 262, 22612, 242, 16991, 2750, 33406, 198, 144663, 16991, 3051, 198, 262, 80493, 19163, 198, 286, 80493, 19163, 7650, 198, 310, 22612, 242, 16991, 5975, 18002, 198, 286, 80493, 342, 4837, 20942, 198, 310, 80493, 2943, 18002, 198, 310, 80493, 2943, 4452, 18002, 198, 310, 80493, 2193, 18002, 198, 310, 22612, 242, 16991, 342, 4837, 18002, 198, 286, 80493, 305, 
34378, 20942, 198, 310, 80493, 3482, 71, 34378, 198, 394, 80493, 2943, 18002, 198, 394, 80493, 2943, 4452, 18002, 198, 394, 80493, 2193, 18002, 198, 394, 22612, 242, 16991, 2951, 18002, 198, 310, 80493, 2943, 18002, 198, 310, 80493, 2943, 4452, 18002, 198, 310, 22612, 242, 16991, 2193, 18002, 198, 286, 80493, 1758, 20942, 198, 310, 80493, 1758, 18002, 198, 310, 22612, 242, 16991, 1758, 4452, 18002, 198, 286, 80493, 829, 2343, 198, 310, 80493, 1815, 261, 18002, 198, 310, 22612, 242, 16991, 1815, 261, 4452, 18002, 198, 286, 80493, 19424, 20942, 198, 310, 80493, 4763, 198, 394, 22612, 242, 16991, 4763, 18002, 198, 310, 80493, 23404, 2972, 18002, 198, 310, 80493, 23404, 2972, 4452, 18002, 198, 310, 80493, 2193, 18002, 198, 310, 80493, 2193, 4452, 18002, 198, 310, 80493, 4772, 2972, 18002, 198, 310, 22612, 242, 16991, 4772, 2972, 4452, 18002, 198, 286, 80493, 274, 18, 20942, 198, 310, 80493, 2943, 18002, 198, 310, 80493, 2943, 4452, 18002, 198, 310, 80493, 2193, 18002, 198, 310, 22612, 242, 16991, 274, 18, 18002, 198, 286, 80493, 12455, 20942, 198, 310, 80493, 61945, 21324, 198, 310, 80493, 2943, 18002, 198, 310, 80493, 2943, 4452, 18002, 198, 310, 22612, 242, 16991, 2193, 18002, 198, 286, 80493, 5704, 20942, 198, 310, 80493, 28431, 198, 394, 22612, 242, 16991, 2943, 4452, 18002, 198, 310, 80493, 61945, 21324, 198, 310, 80493, 2943, 18002, 198, 310, 80493, 2943, 4452, 18002, 198, 310, 80493, 2193, 18002, 198, 310, 22612, 242, 16991, 10802, 18002, 198, 286, 80493, 1273, 3848, 198, 310, 80493, 2943, 18002, 198, 310, 80493, 2943, 4452, 18002, 198, 310, 80493, 2193, 18002, 198, 310, 80493, 3538, 18002, 198, 310, 22612, 242, 16991, 3538, 4452, 18002, 198, 286, 80493, 2943, 18002, 198, 286, 80493, 2193, 18002, 198, 286, 80493, 18021, 18002, 198, 286, 80493, 37664, 18002, 198, 286, 80493, 6645, 18002, 198, 286, 80493, 6645, 4452, 18002, 198, 286, 80493, 60829, 18002, 198, 286, 80493, 2606, 18002, 198, 286, 80493, 3059, 18002, 198, 286, 22612, 242, 16991, 42166, 18002, 198, 
262, 80493, 23404, 17168, 198, 286, 80493, 2193, 18002, 198, 286, 80493, 2053, 85424, 18002, 198, 286, 22612, 242, 16991, 2053, 85424, 4452, 18002, 198, 262, 80493, 6644, 615, 4466, 198, 286, 80493, 5476, 67, 198, 310, 80493, 2943, 18002, 198, 310, 22612, 242, 16991, 2193, 18002, 198, 286, 80493, 26588, 75841, 198, 310, 80493, 21348, 18002, 198, 310, 22612, 242, 16991, 2193, 18002, 198, 286, 22612, 242, 16991, 8633, 18002, 198, 262, 80493, 26588, 29172, 198, 286, 80493, 8317, 198, 310, 80493, 5975, 18002, 198, 310, 80493, 926, 35403, 261, 18002, 198, 310, 80493, 926, 35403, 261, 4452, 18002, 198, 310, 80493, 25991, 35403, 261, 18002, 198, 310, 80493, 25991, 35403, 261, 4452, 18002, 198, 310, 80493, 7497, 18002, 198, 310, 22612, 242, 16991, 8317, 261, 18002, 198, 286, 80493, 76899, 18002, 198, 286, 80493, 2193, 18002, 198, 286, 80493, 83232, 18002, 198, 286, 80493, 83232, 84245, 4452, 18002, 198, 286, 80493, 11160, 18002, 198, 286, 80493, 11160, 4452, 18002, 198, 286, 80493, 12716, 18002, 198, 286, 80493, 12716, 4452, 18002, 198, 286, 80493, 5819, 20602, 18002, 198, 286, 80493, 5819, 20602, 4452, 18002, 198, 286, 80493, 1273, 6031, 4452, 18002, 198, 286, 22612, 242, 16991, 66563, 18002, 198, 262, 80493, 702, 4079, 287, 198, 286, 80493, 2193, 18002, 198, 286, 80493, 27879, 34683, 18002, 198, 286, 80493, 27879, 34683, 4452, 18002, 198, 286, 80493, 10058, 18002, 198, 286, 80493, 10058, 4452, 18002, 198, 286, 22612, 242, 16991, 7497, 18002, 198, 262, 80493, 2820, 2028, 198, 286, 80493, 40915, 18002, 198, 286, 80493, 2193, 18002, 198, 286, 80493, 4051, 18002, 198, 286, 80493, 4051, 4452, 18002, 198, 286, 80493, 1140, 18002, 198, 286, 80493, 8718, 18002, 198, 286, 80493, 8718, 4452, 18002, 198, 286, 80493, 27879, 18002, 198, 286, 80493, 27879, 8727, 18002, 198, 286, 80493, 27879, 8727, 4452, 18002, 198, 286, 80493, 27879, 4452, 18002, 198, 286, 80493, 1584, 18002, 198, 286, 80493, 1584, 4452, 18002, 198, 286, 22612, 242, 16991, 7497, 18002, 198, 262, 80493, 3468, 1607, 
198, 286, 80493, 2193, 18002, 198, 286, 80493, 37664, 18002, 198, 286, 80493, 1140, 18002, 198, 286, 22612, 242, 16991, 1140, 4452, 18002, 198, 262, 80493, 18026, 86, 198, 286, 80493, 37664, 18002, 198, 286, 80493, 65551, 57445, 18002, 198, 286, 80493, 65551, 57445, 4452, 18002, 198, 286, 22612, 242, 16991, 1273, 6031, 4452, 18002, 198, 262, 80493, 2270, 466, 69, 11706, 198, 286, 80493, 2193, 18002, 198, 286, 80493, 2193, 4452, 18002, 198, 286, 80493, 37664, 18002, 198, 286, 80493, 13823, 18002, 198, 286, 22612, 242, 16991, 13823, 4452, 18002, 198, 262, 80493, 29679, 198, 286, 80493, 29679, 18002, 198, 286, 22612, 242, 16991, 29679, 4452, 18002, 198, 262, 80493, 52995, 44848, 198, 286, 80493, 4772, 9995, 1693, 198, 310, 80493, 31558, 18002, 198, 310, 80493, 31558, 4452, 18002, 198, 310, 80493, 37664, 18002, 198, 310, 80493, 1299, 6295, 18002, 198, 310, 80493, 1299, 6295, 4452, 18002, 198, 310, 80493, 3553, 18002, 198, 310, 80493, 3553, 4452, 18002, 198, 310, 80493, 3383, 18002, 198, 310, 22612, 242, 16991, 7497, 18002, 198, 286, 80493, 3270, 1419, 198, 310, 80493, 31558, 18002, 198, 310, 80493, 31558, 4452, 18002, 198, 310, 80493, 37664, 18002, 198, 310, 80493, 3239, 18002, 198, 310, 80493, 3553, 18002, 198, 310, 80493, 3553, 4452, 18002, 198, 310, 80493, 3383, 18002, 198, 310, 22612, 242, 16991, 7497, 18002, 198, 286, 80493, 2193, 18002, 198, 286, 80493, 5975, 18002, 198, 286, 80493, 24099, 18002, 198, 286, 80493, 6645, 18002, 198, 286, 22612, 242, 16991, 6645, 4452, 18002, 198, 262, 80493, 3553, 198, 286, 80493, 2331, 198, 310, 80493, 733, 18002, 198, 310, 80493, 5975, 18002, 198, 310, 80493, 1034, 9078, 18002, 198, 310, 80493, 1034, 9078, 4452, 18002, 198, 310, 80493, 1034, 5376, 18002, 198, 310, 80493, 1034, 5376, 4452, 18002, 198, 310, 80493, 1034, 10287, 18002, 198, 310, 80493, 1034, 10287, 4452, 18002, 198, 310, 80493, 1034, 6443, 18189, 18002, 198, 310, 80493, 1034, 14809, 18002, 198, 310, 80493, 37664, 18002, 198, 310, 22612, 242, 16991, 1273, 6031, 4452, 
18002, 198, 286, 80493, 11160, 198, 310, 80493, 1537, 12759, 3009, 18002, 198, 310, 80493, 1537, 12759, 3009, 4452, 18002, 198, 310, 80493, 11160, 18002, 198, 310, 80493, 22334, 18002, 198, 310, 80493, 22334, 4452, 18002, 198, 310, 80493, 30575, 5490, 18002, 198, 310, 22612, 242, 16991, 30575, 5490, 4452, 18002, 198, 286, 80493, 2162, 35939, 14809, 18002, 198, 286, 80493, 2162, 35939, 14809, 4452, 18002, 198, 286, 80493, 2162, 14809, 18002, 198, 286, 80493, 2162, 14809, 4452, 18002, 198, 286, 80493, 6500, 14809, 18002, 198, 286, 80493, 21290, 18002, 198, 286, 80493, 21290, 4452, 18002, 198, 286, 80493, 2193, 18002, 198, 286, 80493, 1034, 18002, 198, 286, 80493, 37664, 18002, 198, 286, 80493, 4285, 14809, 18002, 198, 286, 80493, 4285, 14809, 4452, 18002, 198, 286, 80493, 7497, 18002, 198, 286, 80493, 8135, 14809, 18002, 198, 286, 22612, 242, 16991, 12439, 18002, 198, 262, 80493, 30575, 198, 286, 80493, 4179, 49710, 440, 684, 198, 310, 80493, 2193, 18002, 198, 310, 80493, 4357, 18002, 198, 310, 80493, 16851, 18002, 198, 310, 80493, 16851, 4452, 18002, 198, 310, 80493, 7497, 18002, 198, 310, 22612, 242, 16991, 4094, 18002, 198, 286, 80493, 28809, 198, 310, 80493, 21483, 4584, 198, 394, 80493, 7177, 18002, 198, 394, 22612, 242, 16991, 7177, 4452, 18002, 198, 310, 80493, 5353, 261, 198, 394, 80493, 5353, 261, 18002, 198, 394, 22612, 242, 16991, 5353, 261, 4452, 18002, 198, 310, 80493, 4534, 198, 394, 80493, 2193, 18002, 198, 394, 80493, 4534, 18002, 198, 394, 80493, 4534, 4452, 18002, 198, 394, 80493, 12418, 45159, 18002, 198, 394, 80493, 12418, 45159, 4452, 18002, 198, 394, 80493, 37664, 18002, 198, 394, 80493, 1424, 927, 4407, 18002, 198, 394, 80493, 1424, 927, 4407, 4452, 18002, 198, 394, 22612, 242, 16991, 1943, 18002, 198, 310, 80493, 4534, 2454, 198, 394, 80493, 2193, 18002, 198, 394, 80493, 1584, 18002, 198, 394, 22612, 242, 16991, 1584, 4452, 18002, 198, 310, 80493, 6845, 198, 394, 80493, 4349, 66, 485, 719, 198, 503, 80493, 1638, 22773, 18002, 198, 503, 80493, 
6645, 18002, 198, 503, 80493, 6645, 4452, 18002, 198, 503, 80493, 4842, 18002, 198, 503, 22612, 242, 16991, 8848, 267, 12978, 22773, 18002, 198, 394, 80493, 2193, 18002, 198, 394, 80493, 38799, 18002, 198, 394, 80493, 38799, 4452, 18002, 198, 394, 80493, 14397, 18002, 198, 394, 80493, 12811, 13996, 2566, 18002, 198, 394, 80493, 12811, 13996, 2566, 4452, 18002, 198, 394, 22612, 242, 16991, 30575, 12759, 1670, 28058, 18002, 198, 310, 80493, 30575, 839, 198, 394, 22612, 242, 16991, 5925, 18002, 198, 310, 80493, 42112, 18002, 198, 310, 80493, 42112, 4452, 18002, 198, 310, 80493, 2193, 18002, 198, 310, 80493, 54717, 18002, 198, 310, 80493, 4357, 18002, 198, 310, 80493, 4357, 4452, 18002, 198, 310, 80493, 18646, 18002, 198, 310, 80493, 28809, 18002, 198, 310, 80493, 28809, 4452, 18002, 198, 310, 80493, 1584, 18002, 198, 310, 22612, 242, 16991, 1273, 6031, 4452, 18002, 198, 286, 22612, 242, 16991, 5819, 198, 310, 80493, 8315, 16172, 198, 394, 80493, 37664, 18002, 198, 394, 80493, 9666, 18002, 198, 394, 80493, 30575, 18002, 198, 394, 80493, 30575, 42873, 18002, 198, 394, 80493, 30575, 42873, 4452, 18002, 198, 394, 22612, 242, 16991, 30575, 4452, 18002, 198, 310, 80493, 6238, 16172, 198, 394, 80493, 30575, 18002, 198, 394, 80493, 30575, 42873, 18002, 198, 394, 80493, 30575, 42873, 4452, 18002, 198, 394, 22612, 242, 16991, 30575, 4452, 18002, 198, 310, 80493, 4349, 66, 485, 998, 198, 394, 80493, 4147, 18002, 198, 394, 22612, 242, 16991, 1034, 18002, 198, 310, 80493, 37664, 18002, 198, 310, 80493, 5819, 18002, 198, 310, 80493, 30575, 3109, 18002, 198, 310, 22612, 242, 16991, 30575, 3109, 4452, 18002, 198, 262, 22612, 242, 16991, 41730, 198, 286, 22612, 242, 16991, 2193, 18002, 198, 144663, 16991, 2205, 1999, 198, 262, 80493, 17063, 198, 286, 80493, 220, 15, 15, 15, 15, 16, 9372, 9995, 1693, 6137, 18002, 198, 286, 22612, 242, 16991, 220, 15, 15, 15, 15, 17, 9165, 1419, 6137, 18002, 198, 262, 80493, 2193, 18002, 198, 262, 80493, 4625, 18002, 198, 262, 22612, 242, 16991, 37664, 
18002, 198, 144663, 16991, 16734, 198, 262, 80493, 2193, 18002, 198, 262, 80493, 8386, 18002, 198, 262, 80493, 296, 18, 18002, 198, 262, 80493, 16734, 18002, 198, 262, 22612, 242, 16991, 10472, 67, 18002, 198, 144663, 16991, 68909, 198, 262, 80493, 8315, 14, 8092, 2972, 198, 286, 22612, 242, 16991, 2943, 18002, 198, 262, 80493, 1936, 21492, 198, 286, 80493, 4772, 2972, 198, 310, 80493, 2943, 18002, 198, 310, 22612, 242, 16991, 9109, 18002, 198, 286, 80493, 4772, 4314, 198, 310, 80493, 1034, 4314, 18002, 198, 310, 22612, 242, 16991, 3553, 18002, 198, 286, 22612, 242, 16991, 4772, 1313, 198, 310, 22612, 242, 16991, 24036, 48943, 18002, 198, 262, 80493, 3051, 198, 286, 80493, 19163, 198, 310, 80493, 342, 4837, 20942, 198, 394, 22612, 242, 16991, 342, 4837, 18002, 198, 310, 80493, 305, 34378, 20942, 21808, 71, 34378, 198, 394, 22612, 242, 16991, 2943, 18002, 198, 310, 80493, 274, 18, 20942, 198, 394, 22612, 242, 16991, 274, 18, 18002, 198, 310, 22612, 242, 16991, 2943, 18002, 198, 286, 80493, 6644, 615, 4466, 198, 310, 80493, 5476, 67, 198, 394, 22612, 242, 16991, 2943, 18002, 198, 310, 80493, 26588, 75841, 198, 394, 22612, 242, 16991, 26588, 2972, 18002, 198, 310, 22612, 242, 16991, 8633, 18002, 198, 286, 80493, 26588, 29172, 14, 24188, 198, 310, 22612, 242, 16991, 6532, 17366, 596, 802, 261, 18002, 198, 286, 80493, 702, 4079, 287, 198, 310, 80493, 10058, 18002, 198, 310, 22612, 242, 16991, 55727, 18002, 198, 286, 80493, 2820, 2028, 198, 310, 80493, 40915, 18002, 198, 310, 80493, 4051, 18002, 198, 310, 22612, 242, 16991, 27879, 5315, 18002, 198, 286, 80493, 3468, 1607, 198, 310, 22612, 242, 16991, 1140, 18002, 198, 286, 80493, 52995, 44848, 198, 310, 80493, 4772, 9995, 1693, 198, 394, 22612, 242, 16991, 8699, 16112, 18002, 198, 310, 80493, 31558, 18002, 198, 310, 80493, 6645, 18002, 198, 310, 80493, 3553, 18002, 198, 310, 22612, 242, 16991, 3383, 18002, 198, 286, 80493, 3553, 198, 310, 22612, 242, 16991, 1461, 485, 329, 18189, 18002, 198, 286, 22612, 242, 16991, 
30575, 2687, 15222, 198, 310, 80493, 18646, 4788, 15222, 18002, 198, 310, 22612, 242, 16991, 28809, 18002, 198, 262, 80493, 6238, 34827, 2972, 198, 286, 80493, 2943, 18002, 198, 286, 80493, 2943, 48943, 18002, 198, 286, 80493, 10652, 2972, 18002, 198, 286, 80493, 10652, 19979, 18002, 198, 286, 22612, 242, 16991, 9109, 18002, 198, 262, 80493, 28331, 198, 286, 80493, 21483, 2972, 198, 310, 22612, 242, 16991, 2943, 18002, 198, 286, 80493, 2270, 466, 69, 509, 1451, 198, 310, 22612, 242, 16991, 2943, 18002, 198, 286, 80493, 6238, 4314, 198, 310, 22612, 242, 16991, 3553, 18002, 198, 286, 22612, 242, 16991, 14397, 4314, 198, 310, 22612, 242, 16991, 3553, 18002, 198, 262, 22612, 242, 16991, 12439, 198, 286, 80493, 7681, 454, 198, 310, 80493, 9873, 8202, 18002, 198, 310, 22612, 242, 16991, 3383, 41736, 18002, 198, 286, 22612, 242, 16991, 54320, 628, 321, 198, 310, 22612, 242, 16991, 4778, 32981, 712, 18002, 198, 144663, 16991, 70482, 198, 262, 80493, 2193, 198, 286, 80493, 8315, 18002, 198, 286, 80493, 2331, 18002, 198, 286, 80493, 1936, 21492, 18002, 198, 286, 80493, 1638, 18002, 198, 286, 80493, 6238, 18002, 198, 286, 80493, 13291, 18002, 198, 286, 22612, 242, 16991, 28331, 18002, 198, 262, 22612, 242, 16991, 70482, 18002, 198, 144663, 16991, 6238, 198, 262, 80493, 23404, 2972, 198, 286, 80493, 2943, 18002, 198, 286, 80493, 10652, 8179, 18002, 198, 286, 80493, 5975, 18002, 198, 286, 80493, 9109, 18002, 198, 286, 22612, 242, 16991, 82257, 18002, 198, 262, 80493, 23404, 4030, 198, 286, 80493, 10652, 8179, 4452, 18002, 198, 286, 80493, 2193, 18002, 198, 286, 80493, 3538, 18002, 198, 286, 80493, 3538, 4452, 18002, 198, 286, 80493, 1273, 6031, 4452, 18002, 198, 286, 80493, 82257, 18002, 198, 286, 80493, 12439, 18002, 198, 286, 22612, 242, 16991, 12439, 4452, 18002, 198, 262, 80493, 5439, 198, 286, 80493, 5439, 18002, 198, 286, 22612, 242, 16991, 2193, 18002, 198, 262, 22612, 242, 16991, 1887, 18002, 198, 144663, 16991, 18433, 4322, 17, 79, 198, 262, 22612, 242, 16991, 281, 17, 
79, 57322, 198, 144663, 16991, 13291, 198, 262, 80493, 5439, 198, 286, 80493, 5439, 18002, 198, 286, 22612, 242, 16991, 2193, 18002, 198, 262, 80493, 13291, 4030, 198, 286, 80493, 2193, 18002, 198, 286, 80493, 90477, 18002, 198, 286, 80493, 855, 19963, 18002, 198, 286, 80493, 19424, 19691, 18002, 198, 286, 80493, 3538, 18002, 198, 286, 80493, 3538, 4452, 18002, 198, 286, 22612, 242, 16991, 1273, 6031, 4452, 18002, 198, 262, 80493, 19424, 9199, 198, 286, 80493, 2193, 18002, 198, 286, 22612, 242, 16991, 3538, 18002, 198, 262, 22612, 242, 16991, 1887, 18002, 198, 144663, 16991, 19502, 198, 262, 22612, 242, 16991, 23789, 14120, 2395, 198, 144663, 16991, 1273, 198, 262, 80493, 10135, 198, 286, 80493, 1304, 2327, 18725, 3288, 198, 286, 80493, 6813, 7197, 198, 286, 80493, 390, 723, 477, 7197, 198, 286, 80493, 1273, 15467, 7197, 198, 286, 80493, 1273, 814, 13659, 7197, 198, 286, 80493, 1273, 25533, 1693, 7197, 198, 286, 80493, 82257, 7197, 198, 286, 22612, 242, 16991, 12439, 7197, 198, 262, 22612, 242, 16991, 55026, 198, 286, 80493, 61945, 21324, 198, 286, 22612, 242, 16991, 79451, 666, 15546, 2395, 198, 144663, 16991, 7375, 198, 262, 80493, 9544, 198, 286, 80493, 6815, 261, 198, 310, 80493, 1887, 18002, 198, 310, 80493, 11540, 18002, 198, 310, 22612, 242, 16991, 6815, 18002, 198, 286, 80493, 18646, 198, 310, 22612, 242, 16991, 1887, 18002, 198, 286, 80493, 19038, 198, 310, 80493, 61681, 67313, 14738, 7197, 198, 310, 22612, 242, 16991, 4194, 49443, 14738, 7197, 198, 286, 80493, 1273, 3848, 198, 310, 22612, 242, 16991, 1887, 18002, 198, 286, 22612, 242, 16991, 41048, 198, 310, 80493, 1099, 198, 394, 80493, 15877, 198, 503, 22612, 242, 16991, 906, 4327, 198, 394, 80493, 5272, 198, 503, 22612, 242, 16991, 906, 2564, 198, 394, 22612, 242, 16991, 6994, 198, 503, 22612, 242, 16991, 906, 2857, 198, 310, 80493, 1887, 18002, 198, 310, 22612, 242, 16991, 3538, 18002, 198, 262, 22612, 242, 16991, 3051, 198, 286, 80493, 2168, 198, 310, 22612, 242, 16991, 2168, 18002, 198, 286, 22612, 
242, 16991, 55026, 1314, 18002, 198, 144663, 16991, 28331, 198, 262, 80493, 21483, 2972, 198, 286, 22612, 242, 16991, 2943, 18002, 198, 262, 80493, 5439, 198, 286, 80493, 5439, 18002, 198, 286, 22612, 242, 16991, 2193, 18002, 198, 262, 80493, 2270, 466, 69, 509, 1451, 198, 286, 80493, 2943, 18002, 198, 286, 22612, 242, 16991, 7497, 18002, 198, 262, 80493, 6238, 4314, 198, 286, 80493, 2193, 18002, 198, 286, 80493, 37664, 18002, 198, 286, 80493, 3553, 18002, 198, 286, 22612, 242, 16991, 3553, 4452, 18002, 198, 262, 80493, 14397, 10661, 411, 34790, 198, 286, 80493, 79314, 22773, 18002, 198, 286, 80493, 79314, 22773, 4452, 18002, 198, 286, 80493, 2193, 18002, 198, 286, 80493, 1638, 22773, 18002, 198, 286, 80493, 1638, 22773, 4452, 18002, 198, 286, 80493, 37664, 18002, 198, 286, 80493, 14397, 10661, 411, 34790, 18002, 198, 286, 22612, 242, 16991, 14397, 10661, 411, 34790, 4452, 18002, 198, 262, 80493, 14397, 4314, 198, 286, 80493, 2193, 18002, 198, 286, 80493, 2205, 18002, 198, 286, 80493, 2205, 4452, 18002, 198, 286, 80493, 20870, 18002, 198, 286, 80493, 20870, 4452, 18002, 198, 286, 80493, 3553, 18002, 198, 286, 22612, 242, 16991, 7497, 18002, 198, 262, 80493, 90982, 2836, 198, 286, 80493, 21483, 18002, 198, 286, 80493, 21483, 4452, 18002, 198, 286, 80493, 2193, 18002, 198, 286, 80493, 37664, 18002, 198, 286, 80493, 2270, 466, 824, 18002, 198, 286, 80493, 2270, 466, 824, 4452, 18002, 198, 286, 80493, 3538, 18002, 198, 286, 22612, 242, 16991, 1273, 6031, 4452, 18002, 198, 262, 22612, 242, 16991, 1887, 18002, 198, 144663, 16991, 12439, 198, 262, 80493, 33394, 198, 286, 80493, 4568, 2015, 18002, 198, 286, 22612, 242, 16991, 4568, 2015, 4452, 18002, 198, 262, 80493, 2699, 746, 1314, 198, 286, 22612, 242, 16991, 2699, 746, 1314, 18002, 198, 262, 80493, 2193, 1314, 198, 286, 80493, 1273, 691, 198, 310, 80493, 2193, 9432, 35, 198, 394, 80493, 2331, 33406, 198, 394, 80493, 5670, 77763, 17, 33406, 198, 394, 22612, 242, 16991, 5670, 33406, 198, 310, 80493, 2193, 9432, 36, 198, 
394, 80493, 2331, 33406, 198, 394, 80493, 5670, 77763, 17, 33406, 198, 394, 22612, 242, 16991, 5670, 33406, 198, 310, 80493, 5248, 198, 394, 80493, 2193, 32, 198, 503, 80493, 2331, 33406, 198, 503, 80493, 5670, 77763, 17, 33406, 198, 503, 22612, 242, 16991, 5670, 33406, 198, 394, 80493, 2193, 33, 198, 503, 80493, 2331, 33406, 198, 503, 80493, 5670, 77763, 17, 33406, 198, 503, 22612, 242, 16991, 5670, 33406, 198, 394, 80493, 2193, 34, 198, 503, 80493, 2331, 33406, 198, 503, 80493, 5670, 77763, 17, 33406, 198, 503, 22612, 242, 16991, 5670, 33406, 198, 394, 80493, 2193, 37, 198, 503, 80493, 2331, 33406, 198, 503, 22612, 242, 16991, 5670, 77763, 17, 33406, 198, 394, 22612, 242, 16991, 2193, 38, 198, 503, 80493, 2331, 33406, 198, 503, 80493, 5670, 77763, 17, 33406, 198, 503, 22612, 242, 16991, 5670, 33406, 198, 310, 22612, 242, 16991, 3175, 198, 394, 80493, 2331, 33406, 198, 394, 80493, 23594, 33406, 198, 394, 22612, 242, 16991, 1273, 33406, 198, 286, 80493, 2193, 18002, 198, 286, 22612, 242, 16991, 2193, 4452, 18002, 198, 262, 80493, 7681, 454, 198, 286, 80493, 9873, 88536, 18002, 198, 286, 80493, 9873, 88536, 4452, 18002, 198, 286, 80493, 4568, 2015, 18002, 198, 286, 80493, 4568, 2015, 4452, 18002, 198, 286, 80493, 1681, 11529, 18002, 198, 286, 22612, 242, 16991, 1681, 11529, 4452, 18002, 198, 262, 80493, 53758, 1306, 1314, 198, 286, 80493, 53758, 1306, 1314, 18002, 198, 286, 22612, 242, 16991, 53758, 1306, 1314, 4452, 18002, 198, 262, 80493, 26588, 1314, 198, 286, 80493, 26588, 1314, 18002, 198, 286, 80493, 26588, 1314, 4452, 18002, 198, 286, 22612, 242, 16991, 37664, 18002, 198, 262, 80493, 1848, 1314, 198, 286, 80493, 1848, 1314, 18002, 198, 286, 22612, 242, 16991, 1848, 1314, 4452, 18002, 198, 262, 80493, 5181, 1314, 198, 286, 22612, 242, 16991, 5181, 1314, 18002, 198, 262, 80493, 7013, 198, 286, 22612, 242, 16991, 7013, 18002, 198, 262, 80493, 17364, 198, 286, 80493, 10619, 10841, 18002, 198, 286, 22612, 242, 16991, 10619, 10841, 4452, 18002, 198, 262, 80493, 
54320, 628, 321, 198, 286, 80493, 1182, 1847, 18002, 198, 286, 80493, 54320, 628, 321, 18002, 198, 286, 80493, 54320, 628, 321, 4452, 18002, 198, 286, 80493, 55026, 18002, 198, 286, 22612, 242, 16991, 55026, 4452, 18002, 198, 262, 80493, 11446, 198, 286, 80493, 2193, 18002, 198, 286, 22612, 242, 16991, 8844, 18002, 198, 262, 80493, 34679, 2186, 198, 286, 80493, 2415, 18002, 198, 286, 22612, 242, 16991, 2415, 4452, 18002, 198, 262, 80493, 1487, 198, 286, 80493, 1487, 18002, 198, 286, 22612, 242, 16991, 5925, 18002, 198, 262, 80493, 1833, 2141, 198, 286, 80493, 1833, 2141, 18002, 198, 286, 22612, 242, 16991, 1833, 2141, 4452, 18002, 198, 262, 80493, 7860, 1314, 198, 286, 80493, 7860, 1314, 18002, 198, 286, 22612, 242, 16991, 7860, 1314, 4452, 18002, 198, 262, 80493, 4179, 1314, 198, 286, 22612, 242, 16991, 4179, 1314, 18002, 198, 262, 80493, 2643, 1314, 198, 286, 22612, 242, 16991, 2643, 1314, 18002, 198, 262, 80493, 10382, 1314, 198, 286, 22612, 242, 16991, 10382, 1314, 18002, 198, 262, 80493, 25991, 1314, 198, 286, 80493, 60146, 7573, 18002, 198, 286, 80493, 60146, 7573, 4452, 18002, 198, 286, 80493, 25991, 1314, 18002, 198, 286, 22612, 242, 16991, 25991, 1314, 4452, 18002, 198, 262, 80493, 914, 746, 198, 286, 22612, 242, 16991, 914, 746, 18002, 198, 262, 80493, 12811, 1314, 198, 286, 80493, 31532, 18002, 198, 286, 22612, 242, 16991, 31532, 4452, 18002, 198, 262, 80493, 1273, 1314, 198, 286, 22612, 242, 16991, 1273, 1314, 18002, 198, 262, 22612, 242, 16991, 882, 1314, 198, 286, 80493, 9021, 18002, 198, 286, 80493, 9021, 4452, 18002, 198, 286, 80493, 882, 1314, 18002, 198, 286, 22612, 242, 16991, 882, 1314, 4452, 18002, 198, 144663, 16991, 7405, 1192, 198, 144663, 16991, 61945, 21324, 198, 144663, 16991, 23789, 14120, 33936, 198, 144663, 16991, 8502, 11527, 82, 3909, 198, 144798, 16991, 8502, 81094, 3909, 198, 522, 4987, 38283, 1339, 4624, 3988, 510, 27, 23319, 1269, 397, 29870, 4698, 81, 3366, 198, 522, 23319, 1269, 1339, 4624, 70934, 7933, 510, 27, 42909, 2638, 
397, 9612, 300, 10758, 17286, 14, 29870, 4698, 81, 3366, 198, 27, 42909, 2638, 1339, 29019, 18320, 389, 279, 12542, 594, 5944, 323, 8794, 510, 27, 35499, 42682, 397, 2, 27612, 16447, 3366, 220, 73345, 111116, 106379, 101042, 271, 565, 220, 16, 13, 220, 106871, 31905, 101978, 271, 56296, 16447, 3366, 54851, 46944, 334, 33447, 78882, 112896, 72448, 14, 102724, 334, 73345, 3837, 102093, 100751, 40549, 18137, 243, 250, 65101, 9370, 393, 17, 47, 58657, 28291, 1773, 105464, 102226, 104587, 52334, 72448, 3837, 102298, 101213, 48934, 47874, 110195, 3837, 100629, 105896, 106379, 70500, 5373, 85767, 39352, 5373, 104814, 5373, 81705, 33108, 102121, 99788, 1773, 73345, 100176, 110053, 3837, 102298, 20, 24, 22, 18947, 88086, 26898, 3837, 101909, 106888, 112896, 72448, 3407, 565, 220, 17, 13, 220, 46100, 44956, 99706, 99778, 14, 101042, 271, 14374, 51461, 116, 63109, 98380, 33108, 99558, 11622, 26355, 198, 12, 3070, 47, 17, 47, 40549, 18137, 243, 250, 65101, 17177, 28291, 334, 5122, 104210, 6495, 67892, 66521, 237, 96422, 9370, 102202, 100811, 65101, 17177, 28291, 72448, 198, 12, 3070, 44636, 30440, 106375, 33071, 334, 5122, 100143, 104102, 220, 16, 20, 11, 15, 15, 15, 26853, 108, 110293, 9370, 103414, 3837, 102443, 17177, 28291, 100381, 220, 16, 15, 15, 220, 110156, 23404, 198, 12, 3070, 105063, 99718, 100143, 334, 5122, 100143, 99960, 103414, 42192, 99644, 105173, 33108, 101312, 105653, 33447, 78882, 198, 12, 3070, 44636, 107769, 33071, 334, 5122, 42192, 23990, 27442, 105716, 9370, 112896, 106379, 70500, 271, 14374, 89982, 30534, 110569, 102064, 198, 12, 3070, 10850, 334, 5122, 99558, 110569, 102064, 3837, 100751, 55338, 100185, 110195, 198, 12, 3070, 30280, 334, 5122, 100751, 102705, 81705, 33108, 102011, 100037, 21894, 198, 12, 3070, 25287, 13710, 334, 5122, 100751, 102121, 33108, 104004, 100037, 21894, 198, 12, 3070, 56, 31102, 334, 5122, 100751, 85767, 26898, 198, 12, 3070, 29475, 14, 5835, 11295, 1220, 334, 5122, 100751, 118417, 102011, 271, 14374, 91417, 60949, 102724, 
33108, 44956, 105537, 198, 12, 3070, 71356, 104516, 334, 5122, 104210, 10130, 14, 82354, 58143, 35926, 91282, 393, 17, 47, 66521, 237, 96422, 198, 12, 3070, 105653, 33447, 78882, 334, 5122, 50, 18, 5373, 38, 6412, 5373, 39, 62266, 5373, 35, 13659, 32112, 10236, 255, 231, 101312, 33447, 78882, 100143, 198, 12, 3070, 74393, 334, 5122, 81772, 9909, 100751, 104603, 108418, 32108, 23083, 12, 3070, 109300, 104001, 13343, 334, 5122, 35, 13659, 5373, 4502, 67, 18137, 249, 228, 12857, 198, 12, 3070, 104814, 104118, 334, 5122, 51, 745, 5373, 44, 18, 6567, 234, 229, 30844, 72448, 198, 12, 3070, 8903, 77128, 334, 5122, 57, 391, 89254, 77835, 32108, 8903, 77128, 198, 12, 3070, 81705, 334, 5122, 10850, 4891, 236, 253, 21287, 81705, 488, 13027, 27764, 271, 565, 220, 18, 13, 220, 73345, 106130, 106379, 100261, 99759, 14, 101042, 271, 14374, 18137, 40419, 99371, 106130, 100166, 198, 13874, 3989, 29870, 4698, 81, 3366, 5894, 144663, 16991, 8315, 14, 1843, 671, 20713, 44054, 93437, 9909, 102121, 18493, 103991, 110293, 23083, 144663, 16991, 6238, 14, 688, 671, 17116, 44054, 93437, 9909, 105223, 105975, 92374, 23083, 144663, 16991, 28331, 14, 260, 671, 40179, 44054, 93437, 9909, 47, 17, 47, 66521, 237, 47872, 31548, 23083, 144663, 16991, 13291, 14, 1843, 671, 32778, 44054, 93437, 9909, 35, 13659, 62579, 49026, 20742, 107736, 23083, 144663, 16991, 1936, 21492, 14, 257, 671, 7854, 12, 1552, 44054, 93437, 9909, 105151, 100261, 99759, 47874, 23083, 144663, 16991, 6200, 14, 310, 671, 51461, 116, 63109, 20074, 100166, 33108, 107018, 198, 144663, 16991, 3051, 14, 1797, 671, 34369, 109, 71743, 44956, 33108, 102011, 198, 144663, 16991, 2193, 14, 688, 671, 18137, 44104, 21596, 26898, 108421, 198, 144663, 16991, 26588, 14, 688, 671, 40549, 18137, 243, 250, 65101, 104004, 26898, 198, 144663, 16991, 10295, 14, 286, 671, 18137, 225, 101, 100463, 19793, 26355, 33108, 100013, 99719, 198, 144663, 16991, 33765, 14, 310, 671, 66374, 62042, 90867, 20742, 198, 144663, 16991, 7375, 14, 1843, 671, 42849, 
227, 99262, 102011, 33108, 118417, 198, 144663, 16991, 1273, 14, 310, 671, 18137, 249, 228, 12857, 81705, 198, 144663, 16991, 12439, 14, 1843, 671, 220, 105600, 102011, 44956, 198, 144798, 16991, 68909, 14, 1843, 671, 98313, 66635, 105717, 64429, 198, 13874, 19324, 14374, 91417, 60949, 85767, 26898, 198, 12, 3070, 8078, 1192, 334, 5122, 99960, 100133, 104004, 5373, 35, 13659, 18137, 243, 250, 65101, 104004, 5373, 81705, 33108, 102121, 198, 12, 3070, 1676, 23540, 41466, 334, 5122, 99200, 110195, 105549, 85767, 108421, 198, 12, 3070, 51899, 3663, 5122, 42, 29827, 18137, 225, 101, 100463, 85767, 198, 12, 3070, 28648, 3663, 5122, 109300, 32108, 85767, 271, 14374, 220, 106008, 106130, 98380, 101042, 271, 820, 51461, 116, 63109, 110195, 106130, 198, 12, 3070, 8092, 3663, 5122, 103991, 110293, 101913, 101259, 47874, 3837, 101884, 40549, 62579, 49026, 20742, 107736, 198, 12, 3070, 8611, 3663, 5122, 105223, 105975, 92374, 3837, 105653, 33108, 17177, 28291, 23404, 62262, 198, 12, 3070, 50395, 3663, 5122, 47, 17, 47, 10236, 121, 239, 68065, 102020, 31548, 3837, 39352, 32664, 49567, 92374, 64064, 198, 12, 3070, 22803, 3663, 5122, 52526, 101259, 3837, 54542, 100811, 65101, 83751, 105971, 105004, 6238, 198, 12, 3070, 5834, 21492, 3663, 5122, 105151, 26939, 106208, 9370, 100261, 99759, 47874, 3837, 100143, 99960, 103414, 105173, 271, 820, 80090, 107, 68878, 44956, 106130, 198, 12, 3070, 2153, 3663, 5122, 100185, 20074, 100166, 9909, 45217, 5373, 1731, 6370, 5373, 30888, 1731, 5373, 12175, 1731, 23083, 12, 3070, 2740, 3663, 5122, 101203, 98380, 44956, 9909, 33447, 78882, 105653, 5373, 47, 17, 47, 8908, 108, 225, 26381, 5373, 99722, 101071, 49567, 23083, 12, 3070, 6031, 3663, 5122, 105600, 102011, 32804, 9909, 71356, 5373, 8903, 77128, 5373, 85767, 49567, 27866, 820, 18137, 225, 101, 100463, 33108, 102011, 106130, 198, 12, 3070, 1676, 3663, 5122, 99200, 110195, 85767, 108421, 198, 12, 3070, 28648, 3663, 5122, 109300, 32108, 102121, 26898, 198, 12, 3070, 51899, 3663, 5122, 42, 
29827, 18137, 225, 101, 100463, 116996, 198, 12, 3070, 51668, 3663, 5122, 100013, 99719, 33108, 102121, 19793, 26355, 198, 12, 3070, 15918, 3663, 5122, 104650, 102011, 9909, 118417, 5373, 102111, 81705, 49567, 27866, 565, 220, 19, 13, 51461, 116, 63109, 98380, 106510, 102450, 271, 14374, 89982, 30534, 98380, 105539, 198, 16, 13, 3070, 47, 17, 47, 18137, 243, 250, 65101, 17177, 28291, 334, 5122, 104210, 101624, 107898, 36556, 46448, 28029, 9370, 102202, 17177, 28291, 71356, 198, 17, 13, 3070, 42140, 33447, 78882, 105653, 100143, 334, 5122, 50, 18, 5373, 38, 6412, 5373, 39, 62266, 5373, 35, 13659, 32112, 10236, 255, 231, 198, 18, 13, 3070, 99960, 103414, 105173, 334, 5122, 100143, 104210, 104190, 9370, 62945, 64682, 105173, 198, 19, 13, 3070, 44636, 107769, 33071, 70500, 334, 5122, 35926, 100939, 98671, 99658, 86312, 5373, 42192, 23990, 27442, 105716, 198, 20, 13, 3070, 106875, 101048, 334, 5122, 45439, 80090, 107, 68878, 5373, 104925, 100359, 5373, 20074, 111293, 48927, 271, 14374, 6567, 248, 112, 99760, 9370, 5333, 58143, 102705, 107736, 198, 12, 3070, 35, 13659, 32112, 5333, 334, 5122, 114288, 100142, 40549, 62579, 49026, 20742, 107736, 198, 12, 3070, 9230, 25414, 5333, 334, 5122, 99200, 110195, 17881, 104516, 107736, 198, 12, 3070, 47, 17, 47, 66521, 237, 96422, 334, 5122, 104210, 6495, 67892, 43589, 35926, 91282, 101136, 198, 12, 3070, 99722, 101071, 107736, 334, 5122, 104814, 33108, 113308, 107736, 271, 565, 220, 20, 13, 220, 46100, 44956, 100403, 101042, 271, 14374, 91417, 60949, 46100, 106393, 33108, 104559, 271, 820, 51461, 116, 63109, 20074, 100166, 106393, 320, 2153, 53560, 12, 3070, 36339, 18002, 334, 5122, 33145, 17, 20, 21, 6567, 239, 246, 30534, 54542, 3837, 100143, 74393, 105653, 33108, 116951, 32108, 198, 12, 3070, 4059, 466, 824, 18002, 334, 5122, 67892, 34369, 225, 27369, 39352, 3837, 102298, 17177, 34718, 27369, 33108, 48927, 20074, 198, 12, 3070, 16537, 62, 19922, 3346, 334, 5122, 32664, 49567, 92374, 27369, 33108, 102285, 16744, 39352, 198, 12, 
3070, 2733, 8296, 18002, 334, 5122, 67892, 220, 27369, 98671, 99658, 43959, 33108, 48927, 271, 820, 393, 17, 47, 8908, 108, 225, 26381, 31548, 106393, 320, 2740, 5523, 48709, 2687, 15222, 53560, 12, 3070, 63122, 18002, 334, 5122, 47, 17, 47, 10236, 121, 239, 68065, 109894, 44091, 39352, 198, 12, 3070, 12389, 18002, 334, 5122, 57621, 102474, 106293, 105359, 100674, 198, 12, 3070, 18274, 3663, 5122, 64064, 107415, 33108, 20074, 107468, 39352, 198, 12, 3070, 5148, 3663, 5122, 32664, 49567, 92374, 64064, 33108, 118376, 54542, 271, 820, 38433, 236, 78882, 105653, 106393, 320, 2740, 70020, 53560, 12, 3070, 13297, 18002, 334, 5122, 103967, 33447, 78882, 105653, 39352, 31548, 198, 12, 3070, 82, 18, 20942, 3663, 5122, 36136, 328, 18, 53497, 246, 99871, 33447, 78882, 198, 12, 3070, 70, 4837, 20942, 3663, 5122, 14444, 14817, 14693, 38433, 236, 78882, 198, 12, 3070, 71, 34378, 20942, 3663, 5122, 39, 25268, 58657, 51827, 28330, 26898, 72448, 33447, 78882, 198, 12, 3070, 29172, 20942, 3663, 5122, 35, 13659, 62579, 49026, 20742, 33447, 78882, 271, 820, 6567, 234, 223, 99379, 32108, 29258, 41321, 106393, 320, 2740, 4322, 4975, 291, 44848, 53560, 12, 3070, 13297, 18002, 334, 5122, 62945, 64682, 88802, 108069, 29258, 41321, 100674, 198, 12, 3070, 4578, 9995, 1693, 3663, 5122, 105151, 105173, 88802, 75117, 31548, 198, 12, 3070, 4934, 1419, 3663, 5122, 20074, 18397, 61443, 88802, 75117, 31548, 271, 14374, 93920, 114, 77835, 100144, 33108, 70500, 99453, 28330, 198, 16, 13, 3070, 57621, 102474, 106379, 334, 5122, 107415, 31548, 37029, 57621, 101353, 54542, 44091, 105359, 198, 17, 13, 3070, 101255, 14224, 32108, 70500, 334, 5122, 33447, 78882, 105653, 101910, 104285, 100144, 33108, 107736, 111372, 198, 18, 13, 3070, 48934, 47874, 106379, 334, 5122, 99200, 110195, 102024, 102121, 3837, 67338, 10130, 16341, 17, 47, 220, 104516, 198, 19, 13, 3070, 35926, 100939, 70500, 334, 5122, 98671, 99658, 86312, 33108, 99722, 101071, 101884, 105716, 100756, 102005, 198, 20, 13, 3070, 17177, 99371, 
106379, 334, 5122, 104542, 9370, 111372, 100920, 3837, 45181, 100185, 20074, 100166, 26939, 103923, 104913, 271, 565, 220, 21, 13, 54599, 96808, 100920, 14, 100261, 99759, 271, 14374, 220, 105072, 98380, 198, 16, 13, 3070, 47, 17, 47, 18137, 243, 250, 65101, 17177, 28291, 72448, 1019, 17, 13, 3070, 42140, 33447, 78882, 105653, 100143, 1019, 18, 13, 3070, 99960, 103414, 105173, 39352, 1019, 19, 13, 3070, 104814, 33108, 113308, 72448, 56177, 14374, 220, 106587, 98380, 271, 820, 393, 17, 47, 18137, 243, 250, 65101, 17177, 28291, 72448, 198, 12, 3070, 16810, 220, 47874, 334, 5122, 35, 13659, 62579, 49026, 20742, 107736, 101884, 198, 12, 3070, 13298, 220, 47874, 334, 5122, 105975, 92374, 33108, 20074, 105653, 198, 12, 3070, 31133, 220, 47874, 334, 5122, 32664, 49567, 92374, 102020, 198, 12, 3070, 47, 17, 47, 8908, 108, 225, 26381, 31548, 334, 5122, 64064, 108069, 20074, 107468, 271, 820, 40666, 248, 33447, 78882, 105653, 100143, 198, 12, 3070, 50, 18, 38433, 236, 78882, 334, 5122, 36136, 328, 18, 18137, 249, 228, 12857, 198, 12, 3070, 38, 6412, 38433, 236, 78882, 334, 5122, 14444, 14817, 14693, 18137, 249, 228, 12857, 198, 12, 3070, 39, 62266, 38433, 236, 78882, 334, 5122, 39, 25268, 58657, 51827, 28330, 26898, 72448, 102705, 198, 12, 3070, 15603, 38433, 236, 78882, 334, 5122, 35, 13659, 62579, 49026, 20742, 102705, 198, 12, 3070, 105653, 39352, 31548, 334, 5122, 103967, 105653, 107736, 271, 820, 8908, 71933, 103414, 105173, 39352, 198, 12, 3070, 11066, 12, 1552, 220, 47874, 334, 5122, 105151, 100261, 99759, 39352, 198, 12, 3070, 16219, 220, 47874, 334, 5122, 52526, 101259, 33108, 116817, 198, 12, 3070, 105151, 105173, 334, 5122, 62945, 64682, 105173, 88802, 198, 12, 3070, 20074, 18397, 61443, 334, 5122, 99982, 24360, 20074, 108418, 32108, 271, 820, 74866, 239, 99332, 33108, 113308, 72448, 198, 12, 3070, 99722, 101071, 334, 5122, 110195, 44091, 104814, 198, 12, 3070, 104118, 104412, 334, 5122, 102111, 33108, 37029, 100787, 198, 12, 3070, 8903, 77128, 72448, 334, 5122, 
100166, 32108, 8903, 77128, 65577, 198, 12, 3070, 118417, 102011, 334, 5122, 71356, 44091, 118417, 271, 14374, 220, 107049, 98380, 271, 820, 393, 17, 47, 8908, 108, 225, 26381, 31548, 110837, 198, 12, 3070, 64064, 44091, 39352, 334, 5122, 32664, 49567, 92374, 64064, 113509, 198, 12, 3070, 20074, 17177, 28291, 104238, 334, 5122, 101474, 18830, 104747, 5373, 47363, 104238, 198, 12, 3070, 71356, 57621, 54542, 334, 5122, 57621, 100394, 33108, 100030, 198, 12, 3070, 118376, 101136, 334, 5122, 32664, 49567, 92374, 101294, 48927, 271, 820, 53497, 246, 99871, 72448, 110837, 198, 12, 3070, 43815, 100246, 40623, 105653, 334, 5122, 104210, 106208, 105918, 105653, 198, 12, 3070, 99982, 24360, 39352, 334, 5122, 104603, 99982, 24360, 104238, 33108, 104886, 198, 12, 3070, 52526, 105653, 334, 5122, 104875, 52526, 26898, 39352, 198, 12, 3070, 23305, 20074, 39352, 334, 5122, 26898, 23305, 27369, 108418, 32108, 271, 565, 220, 22, 13, 19468, 251, 102569, 106098, 101086, 271, 14374, 18137, 250, 222, 30534, 100700, 104136, 107402, 198, 16, 13, 3070, 47, 17, 47, 10236, 121, 239, 68065, 105318, 334, 5122, 8344, 67892, 66521, 237, 96422, 33108, 101624, 107898, 36556, 46448, 28029, 198, 17, 13, 3070, 35, 13659, 62579, 49026, 20742, 101136, 334, 5122, 100811, 65101, 83751, 72225, 100674, 33108, 5333, 54955, 226, 99453, 198, 18, 13, 3070, 112896, 98671, 99658, 86312, 334, 5122, 118661, 98671, 99658, 33108, 118878, 107101, 198, 19, 13, 3070, 43815, 100246, 40623, 105653, 334, 5122, 104210, 106208, 105918, 99877, 75768, 198, 20, 13, 3070, 57621, 102474, 110569, 334, 5122, 62945, 64682, 57621, 54542, 100144, 271, 14374, 4891, 231, 235, 21596, 100032, 101882, 198, 16, 13, 3070, 10850, 220, 102064, 99896, 334, 5122, 117206, 5373, 115668, 5373, 107736, 70500, 198, 17, 13, 3070, 35, 13659, 4891, 253, 118, 99806, 334, 5122, 109300, 5373, 100811, 65101, 5373, 61689, 20742, 101290, 198, 18, 13, 3070, 71356, 110569, 334, 5122, 9230, 5373, 49896, 5373, 47, 17, 47, 10236, 121, 239, 68065, 198, 19, 13, 
3070, 112896, 72448, 334, 5122, 31400, 10236, 238, 228, 67831, 5373, 118661, 5373, 107769, 33071, 198, 20, 13, 3070, 46324, 10236, 36097, 54658, 39352, 334, 5122, 26898, 72448, 5373, 101556, 39352, 5373, 71356, 85767, 271, 565, 220, 23, 13, 220, 46100, 100166, 101042, 271, 14374, 93920, 114, 77835, 104040, 198, 12, 3070, 48934, 47874, 106379, 334, 5122, 102024, 105646, 110195, 198, 12, 3070, 57621, 102474, 106379, 334, 5122, 62945, 64682, 57621, 54542, 198, 12, 3070, 17177, 99371, 106379, 334, 5122, 104542, 9370, 111372, 100920, 271, 14374, 51461, 116, 63109, 110195, 100145, 198, 13874, 3989, 16810, 47464, 51018, 40179, 47464, 51018, 17116, 198, 220, 77854, 286, 77854, 286, 77854, 198, 38878, 11397, 393, 17, 47, 8141, 198, 220, 77854, 198, 5793, 55260, 198, 13874, 19324, 14374, 89982, 30534, 21515, 33108, 106393, 198, 12, 3070, 38878, 334, 5122, 47, 17, 47, 10236, 121, 239, 68065, 107415, 100185, 198, 12, 3070, 21839, 334, 5122, 20074, 107468, 39352, 198, 12, 3070, 29699, 58298, 334, 5122, 105653, 33447, 78882, 39352, 198, 12, 3070, 67892, 42502, 334, 5122, 67892, 69594, 39352, 198, 12, 3070, 30888, 1731, 14, 30888, 1972, 334, 5122, 32664, 49567, 92374, 27369, 271, 565, 220, 24, 13, 62262, 88653, 101042, 271, 14374, 62262, 76837, 105946, 271, 820, 18137, 243, 250, 65101, 62189, 102054, 198, 13874, 3989, 35, 13659, 8423, 11397, 20713, 11397, 40179, 320, 45912, 32664, 49567, 92374, 340, 1797, 77854, 198, 16810, 11397, 17116, 14, 10197, 388, 320, 62189, 20074, 17177, 34718, 340, 1797, 77854, 198, 16810, 11397, 8774, 14693, 320, 99982, 24360, 340, 1797, 77854, 198, 35, 13659, 8423, 320, 31526, 100811, 65101, 20074, 340, 13874, 19324, 820, 18137, 243, 250, 65101, 52526, 102054, 198, 13874, 3989, 35, 13659, 8423, 11397, 32778, 11397, 17116, 320, 105653, 23404, 340, 503, 77854, 198, 394, 7854, 12, 1552, 320, 105653, 105151, 100261, 99759, 340, 503, 77854, 198, 394, 55260, 14693, 320, 108418, 32108, 340, 13874, 19324, 14374, 91417, 60949, 20074, 104949, 198, 12, 3070, 
45217, 334, 5122, 33145, 17, 20, 21, 6567, 239, 246, 30534, 3837, 43815, 100246, 40623, 105549, 198, 12, 3070, 12175, 1731, 334, 5122, 67892, 34369, 225, 27369, 3837, 102298, 17177, 34718, 33108, 48927, 27369, 198, 12, 3070, 30888, 1731, 334, 5122, 32664, 49567, 92374, 27369, 3837, 102298, 6790, 5373, 78882, 39426, 5373, 44091, 198, 12, 3070, 1731, 6370, 334, 5122, 67892, 220, 27369, 98671, 99658, 3837, 47, 17, 47, 10236, 121, 239, 68065, 106918, 271, 565, 220, 16, 15, 13, 18137, 249, 228, 12857, 33108, 106375, 27442, 102450, 271, 14374, 55059, 240, 14224, 72448, 198, 12, 3070, 33447, 78882, 105653, 101255, 14224, 334, 5122, 67338, 104285, 100144, 61689, 100676, 105653, 33447, 78882, 198, 12, 3070, 109300, 104001, 13343, 101255, 14224, 334, 5122, 100143, 40549, 58143, 9678, 67, 198, 12, 3070, 71356, 57621, 101255, 14224, 334, 5122, 30440, 106375, 9370, 57621, 54542, 100674, 271, 14374, 220, 106961, 72448, 102705, 198, 12, 3070, 42, 29827, 18137, 249, 228, 12857, 334, 5122, 67338, 62042, 90867, 20742, 102121, 198, 12, 3070, 104814, 72448, 102705, 334, 5122, 100143, 386, 18, 5373, 16635, 35, 6567, 234, 229, 30844, 198, 12, 3070, 8903, 77128, 72448, 102705, 334, 5122, 100166, 32108, 8903, 77128, 66017, 198, 12, 3070, 11237, 14, 6484, 18137, 249, 228, 12857, 334, 5122, 35, 13659, 18137, 243, 250, 65101, 104004, 33108, 90447, 271, 565, 220, 16, 16, 13, 220, 105537, 101042, 271, 14374, 40666, 244, 32948, 105537, 198, 12, 3070, 105653, 47874, 334, 5122, 36136, 328, 18, 5373, 14444, 14817, 14693, 5373, 39, 62266, 198, 12, 3070, 74393, 334, 5122, 81772, 9909, 104603, 108418, 32108, 23083, 12, 3070, 109300, 104001, 13343, 334, 5122, 35, 13659, 5373, 4502, 67, 198, 12, 3070, 104814, 72448, 334, 5122, 44, 18, 5373, 16635, 35, 198, 12, 3070, 71356, 44956, 334, 5122, 100142, 5994, 10236, 121, 239, 68065, 44956, 271, 14374, 68739, 227, 32948, 110195, 105537, 198, 12, 3070, 5386, 11397, 5688, 334, 5122, 100185, 20074, 100166, 99250, 55338, 44956, 37029, 198, 12, 3070, 9194, 11397, 
17954, 334, 5122, 44956, 105537, 105600, 102011, 32804, 198, 12, 3070, 10443, 11397, 5688, 334, 5122, 99200, 110195, 105537, 101203, 44956, 198, 12, 3070, 18200, 11397, 14563, 82, 334, 5122, 81705, 105537, 105717, 64429, 271, 565, 220, 16, 17, 13, 50042, 99257, 88653, 100261, 99759, 271, 14374, 4891, 116, 116, 88970, 20002, 102122, 271, 820, 81947, 28291, 28946, 102122, 198, 16, 13, 3070, 104603, 100013, 334, 5122, 37029, 3483, 18855, 38433, 107, 27733, 104603, 99719, 198, 17, 13, 3070, 100811, 65101, 104004, 334, 5122, 110885, 100811, 65101, 26939, 16447, 3366, 18137, 249, 228, 99430, 198, 18, 13, 3070, 100811, 65101, 102121, 334, 5122, 45181, 16447, 3366, 6567, 233, 231, 18158, 100811, 65101, 102121, 99892, 271, 820, 32181, 238, 99479, 102122, 198, 16, 13, 3070, 103414, 102121, 334, 5122, 37029, 62042, 73562, 66374, 64118, 102121, 198, 17, 13, 3070, 104814, 113308, 334, 5122, 67338, 104118, 33108, 8903, 77128, 104814, 72448, 44091, 198, 18, 13, 3070, 105716, 105853, 334, 5122, 37029, 118417, 102011, 101042, 71356, 44091, 271, 14374, 91417, 60949, 105333, 27442, 198, 12, 3070, 16810, 334, 5122, 35, 13659, 41479, 95, 17523, 78882, 104396, 105333, 198, 12, 3070, 16219, 334, 5122, 100811, 65101, 110885, 9370, 105333, 198, 12, 3070, 39, 23162, 90867, 20742, 334, 5122, 42, 29827, 18137, 225, 101, 100463, 105333, 198, 12, 3070, 85767, 26898, 334, 5122, 72448, 85767, 9370, 105333, 271, 565, 220, 16, 18, 13, 53040, 100104, 100166, 100367, 271, 14374, 4891, 119, 118, 96422, 9370, 111116, 106379, 271, 820, 220, 16, 13, 220, 73345, 99706, 99778, 198, 12, 220, 73345, 100157, 33108, 100185, 100162, 198, 12, 93920, 114, 77835, 113608, 33108, 117200, 198, 12, 88940, 104, 94299, 55286, 105866, 198, 12, 220, 106961, 104520, 9370, 104877, 271, 820, 220, 17, 13, 96155, 222, 99216, 103642, 33108, 105537, 198, 12, 84238, 244, 38507, 102064, 33108, 102724, 50404, 198, 12, 40666, 244, 32948, 105537, 33108, 102705, 101882, 198, 12, 10236, 36097, 54658, 101882, 33108, 114288, 33071, 271, 
820, 220, 18, 13, 51461, 116, 63109, 106379, 70500, 198, 12, 43614, 112, 31914, 106379, 28029, 33108, 110195, 100145, 198, 12, 393, 17, 47, 10236, 121, 239, 68065, 70500, 105318, 198, 12, 62262, 88653, 33108, 100359, 88653, 101042, 198, 12, 18137, 40419, 107769, 105178, 36629, 28726, 70500, 271, 820, 220, 19, 13, 51461, 116, 63109, 110195, 118801, 271, 67331, 220, 19, 13, 16, 20713, 44054, 93437, 198, 12, 20713, 93920, 114, 77835, 33108, 104559, 198, 12, 40549, 62579, 49026, 20742, 107736, 101884, 198, 12, 393, 17, 47, 8908, 108, 225, 26381, 31548, 70500, 198, 12, 220, 104603, 105653, 39352, 198, 12, 18137, 44104, 21596, 32665, 118801, 271, 67331, 220, 19, 13, 17, 17116, 44054, 93437, 198, 12, 17116, 93920, 114, 77835, 33108, 105975, 92374, 100780, 198, 12, 4891, 241, 230, 99658, 86312, 33108, 118878, 107101, 198, 12, 38433, 236, 78882, 105653, 102705, 198, 12, 62262, 52526, 33108, 62189, 54542, 198, 12, 18137, 44104, 21596, 32665, 118801, 271, 67331, 220, 19, 13, 18, 40179, 44054, 93437, 198, 12, 40179, 93920, 114, 77835, 33108, 102020, 98380, 198, 12, 69162, 49567, 92374, 39352, 198, 12, 41479, 96, 51827, 101136, 101884, 198, 12, 34369, 225, 110042, 198, 12, 18137, 44104, 21596, 32665, 118801, 271, 67331, 220, 19, 13, 19, 32778, 44054, 93437, 198, 12, 32778, 93920, 114, 77835, 33108, 101259, 98380, 198, 12, 18137, 243, 250, 65101, 52526, 116817, 198, 12, 7854, 12, 1552, 18137, 249, 228, 12857, 198, 12, 18137, 95, 226, 99259, 33108, 98841, 18158, 100674, 198, 12, 18137, 44104, 21596, 32665, 118801, 271, 67331, 220, 19, 13, 20, 7854, 12, 1552, 44054, 93437, 198, 12, 7854, 12, 1552, 93920, 114, 77835, 33108, 105151, 39352, 198, 12, 51461, 229, 61755, 26939, 106208, 100261, 99759, 198, 12, 8908, 71933, 103414, 105173, 100674, 198, 12, 220, 105537, 106637, 31548, 198, 12, 18137, 44104, 21596, 32665, 118801, 271, 820, 220, 20, 13, 51461, 116, 63109, 44956, 33108, 102011, 271, 67331, 220, 20, 13, 16, 51461, 116, 63109, 20074, 100166, 320, 2153, 53560, 12, 53289, 6567, 
239, 246, 30534, 54542, 198, 12, 15819, 1731, 34369, 225, 27369, 39352, 198, 12, 45147, 1731, 69162, 49567, 92374, 27369, 198, 12, 13074, 6370, 220, 27369, 98671, 99658, 271, 67331, 220, 20, 13, 17, 393, 17, 47, 8908, 108, 225, 26381, 31548, 320, 2740, 5523, 48709, 2687, 15222, 53560, 12, 8908, 108, 225, 26381, 31548, 106379, 70500, 198, 12, 220, 57621, 102474, 104949, 198, 12, 32181, 252, 29077, 44091, 39352, 198, 12, 62262, 17177, 28291, 104238, 198, 12, 10236, 121, 239, 68065, 57621, 54542, 271, 67331, 220, 20, 13, 18, 38433, 236, 78882, 105653, 72448, 320, 2740, 70020, 53560, 12, 53497, 246, 99871, 111372, 33108, 107736, 70500, 198, 12, 328, 18, 38433, 236, 78882, 101884, 198, 12, 479, 6412, 38433, 236, 78882, 101884, 198, 12, 472, 62266, 38433, 236, 78882, 101884, 198, 12, 32112, 38433, 236, 78882, 101884, 198, 12, 53497, 246, 99871, 39352, 31548, 271, 67331, 220, 20, 13, 19, 6567, 234, 223, 99379, 32108, 29258, 41321, 72448, 320, 2740, 4322, 4975, 291, 44848, 53560, 12, 52506, 224, 64682, 88802, 39352, 198, 12, 51461, 229, 61755, 105173, 100674, 198, 12, 62262, 18397, 61443, 54542, 198, 12, 93178, 247, 29056, 54542, 33108, 29258, 41321, 104238, 271, 67331, 220, 20, 13, 20, 4891, 223, 98, 99446, 101071, 72448, 320, 2740, 14, 12120, 2028, 53560, 12, 4891, 223, 98, 99446, 101071, 106379, 198, 12, 89982, 27733, 33108, 107285, 101071, 198, 12, 32181, 229, 102980, 31548, 33108, 104814, 198, 12, 10236, 39366, 39352, 271, 67331, 220, 20, 13, 21, 53497, 246, 99871, 72448, 320, 2740, 31320, 53560, 12, 68739, 227, 36629, 100246, 40623, 105653, 198, 12, 84238, 241, 24360, 39352, 104238, 198, 12, 220, 52526, 104875, 105653, 198, 12, 34369, 225, 20074, 108418, 32108, 198, 12, 97259, 227, 21887, 33108, 101999, 271, 820, 220, 21, 13, 18137, 44104, 21596, 39352, 198, 12, 18137, 44104, 21596, 26898, 100166, 33108, 117206, 198, 12, 38433, 226, 110195, 85767, 118801, 198, 12, 10236, 236, 107, 99279, 105149, 85767, 198, 12, 18137, 44104, 21596, 48927, 33108, 47363, 25511, 198, 
12, 54599, 101, 35243, 85767, 50007, 271, 820, 220, 22, 13, 18137, 225, 101, 100463, 33108, 113308, 271, 67331, 220, 22, 13, 16, 220, 104603, 100013, 99719, 198, 12, 6040, 18855, 85658, 105866, 198, 12, 40549, 1198, 2900, 18137, 225, 101, 100463, 198, 12, 81947, 28291, 102011, 33108, 110760, 271, 67331, 220, 22, 13, 17, 58263, 51232, 99719, 102121, 198, 12, 66374, 18137, 225, 101, 100463, 105866, 198, 12, 62042, 90867, 20742, 85767, 198, 12, 8908, 113, 226, 37984, 100367, 33108, 47872, 90172, 198, 12, 41479, 231, 35987, 85767, 33108, 102179, 100419, 271, 67331, 220, 22, 13, 18, 74866, 239, 99332, 33108, 113308, 198, 12, 6567, 234, 229, 30844, 104412, 33108, 104814, 198, 12, 75402, 77128, 108069, 101042, 198, 12, 4891, 223, 98, 99446, 101071, 33108, 57555, 99511, 198, 12, 90476, 100, 26232, 47872, 90172, 105866, 198, 12, 43614, 227, 99884, 105853, 108750, 271, 820, 220, 23, 13, 5333, 26853, 224, 77598, 111116, 198, 12, 40549, 32112, 5333, 34369, 120, 36629, 33071, 198, 12, 44054, 93437, 17881, 10130, 5333, 198, 12, 393, 17, 47, 66521, 237, 96422, 101931, 198, 12, 4891, 223, 98, 99446, 101071, 78882, 27442, 198, 12, 10236, 106, 94, 21887, 33108, 85767, 5333, 271, 820, 220, 24, 13, 62262, 104949, 33108, 105653, 198, 12, 51461, 116, 63109, 20074, 100166, 91282, 198, 12, 62262, 44956, 100144, 70500, 198, 12, 53497, 246, 99871, 68805, 101931, 198, 12, 62262, 113274, 33108, 71109, 39352, 271, 820, 220, 16, 15, 13, 41479, 231, 35987, 33071, 70500, 198, 12, 33424, 97, 33477, 33108, 102204, 100674, 198, 12, 41654, 18137, 44104, 21596, 33108, 104759, 39352, 198, 12, 10236, 121, 239, 68065, 99464, 33108, 104925, 100359, 198, 12, 62262, 111293, 100153, 198, 12, 41479, 231, 35987, 102179, 100419, 271, 820, 220, 16, 16, 13, 90476, 100, 26232, 33108, 106375, 33071, 198, 12, 90476, 100, 26232, 109371, 81705, 198, 12, 46750, 102, 76313, 33071, 101042, 198, 12, 41479, 117, 32757, 100367, 105866, 198, 12, 90476, 100, 26232, 47872, 90172, 101898, 198, 12, 8908, 112, 253, 27366, 81705, 
39907, 271, 820, 220, 16, 17, 13, 18137, 249, 228, 12857, 33108, 106375, 198, 12, 38433, 236, 78882, 105653, 101255, 14224, 100013, 198, 12, 41479, 117, 31548, 104001, 13343, 102705, 198, 12, 74866, 239, 99332, 72448, 102705, 198, 12, 20694, 14, 6484, 98313, 223, 52510, 43268, 102705, 198, 12, 50331, 113699, 102011, 102705, 271, 820, 220, 16, 18, 13, 98313, 66635, 104238, 198, 12, 66521, 243, 23305, 81705, 105866, 198, 12, 18137, 249, 228, 12857, 81705, 102724, 198, 12, 90476, 100, 26232, 81705, 39907, 198, 12, 10236, 104, 107, 26939, 78882, 81705, 198, 12, 98313, 66635, 20074, 39352, 271, 820, 220, 16, 19, 13, 83002, 98, 76813, 33108, 102556, 74220, 198, 12, 26853, 107, 57452, 32108, 102011, 37029, 198, 12, 90476, 100, 26232, 101042, 102011, 198, 12, 62262, 113274, 102011, 198, 12, 18137, 44104, 21596, 48927, 102011, 198, 12, 43614, 227, 99884, 105262, 102011, 271, 820, 220, 16, 20, 13, 81947, 28291, 105866, 198, 12, 220, 46100, 102007, 105866, 198, 12, 81947, 28291, 99719, 104870, 198, 12, 220, 46100, 101931, 33108, 102179, 100419, 198, 12, 8908, 108, 225, 41321, 33108, 81705, 39907, 198, 12, 69425, 51827, 102054, 271, 820, 220, 16, 21, 13, 43614, 227, 99884, 105853, 33108, 101536, 86119, 198, 12, 4891, 116, 116, 88970, 86119, 106185, 198, 12, 43614, 227, 99884, 105853, 102054, 198, 12, 93178, 247, 29056, 46100, 101275, 198, 12, 90476, 100, 26232, 86119, 105262, 198, 12, 10236, 121, 239, 68065, 86119, 105853, 271, 820, 220, 16, 22, 13, 64388, 230, 21894, 100022, 33108, 113274, 198, 12, 64388, 230, 21894, 90447, 66394, 198, 12, 38433, 239, 33447, 114288, 33071, 198, 12, 32181, 223, 59534, 105866, 198, 12, 52506, 225, 11622, 98380, 198, 12, 66521, 229, 52334, 101898, 271, 565, 220, 16, 19, 13, 6567, 118, 238, 46100, 9909, 105537, 26898, 7552, 102802, 271, 14374, 220, 73345, 99706, 99778, 198, 12, 508, 54675, 21324, 9533, 54675, 21324, 340, 12, 508, 8078, 1192, 9533, 8078, 1192, 692, 14374, 51461, 116, 63109, 110195, 271, 820, 20713, 44054, 93437, 198, 12, 508, 
8092, 15351, 18002, 9533, 8092, 15351, 18002, 340, 12, 508, 8092, 83033, 83033, 18002, 9533, 8092, 83033, 83033, 18002, 340, 12, 508, 8092, 14, 8092, 4030, 37255, 18002, 9533, 8092, 14, 8092, 4030, 37255, 18002, 340, 12, 508, 8092, 14, 8092, 2972, 25085, 18002, 9533, 8092, 14, 8092, 2972, 25085, 18002, 692, 820, 17116, 44054, 93437, 2303, 12, 508, 8611, 15351, 18002, 9533, 8611, 15351, 18002, 340, 12, 508, 8611, 83033, 83033, 18002, 9533, 8611, 83033, 83033, 18002, 340, 12, 508, 8611, 34827, 4030, 37255, 18002, 9533, 8611, 34827, 4030, 37255, 18002, 340, 12, 508, 8611, 34827, 2972, 25085, 18002, 9533, 8611, 34827, 2972, 25085, 18002, 692, 820, 40179, 44054, 93437, 198, 12, 508, 50395, 15351, 18002, 9533, 50395, 15351, 18002, 340, 12, 508, 50395, 83033, 83033, 18002, 9533, 50395, 83033, 83033, 18002, 340, 12, 508, 50395, 14, 13131, 388, 2836, 37255, 18002, 9533, 50395, 14, 13131, 388, 2836, 37255, 18002, 340, 12, 508, 50395, 14, 65512, 2972, 25085, 18002, 9533, 50395, 14, 65512, 2972, 25085, 18002, 692, 820, 32778, 44054, 93437, 198, 12, 508, 22803, 15351, 18002, 9533, 22803, 15351, 18002, 340, 12, 508, 22803, 83033, 83033, 18002, 9533, 22803, 83033, 83033, 18002, 340, 12, 508, 22803, 18008, 4130, 4030, 37255, 18002, 9533, 22803, 18008, 4130, 4030, 37255, 18002, 692, 820, 7854, 12, 1552, 44054, 93437, 198, 12, 508, 5834, 21492, 15351, 18002, 9533, 5834, 21492, 15351, 18002, 340, 12, 508, 5834, 21492, 83033, 83033, 18002, 9533, 5834, 21492, 83033, 83033, 18002, 340, 12, 508, 5834, 21492, 76196, 4030, 37255, 18002, 9533, 5834, 21492, 76196, 4030, 37255, 18002, 340, 12, 508, 5834, 21492, 76196, 4314, 31320, 18002, 9533, 5834, 21492, 76196, 4314, 31320, 18002, 692, 14374, 51461, 116, 63109, 20074, 100166, 198, 12, 508, 2153, 3446, 15153, 18002, 9533, 2153, 3446, 15153, 18002, 340, 12, 508, 2153, 90228, 466, 824, 18002, 9533, 2153, 90228, 466, 824, 18002, 340, 12, 508, 2153, 14, 16537, 3109, 18002, 9533, 2153, 14, 16537, 3109, 18002, 340, 12, 508, 2153, 54976, 8296, 
18002, 9533, 2153, 54976, 8296, 18002, 340, 12, 508, 2153, 14, 16537, 8467, 18002, 9533, 2153, 14, 16537, 8467, 18002, 692, 14374, 393, 17, 47, 8908, 108, 225, 26381, 31548, 198, 12, 508, 2740, 5523, 48709, 2687, 15222, 2687, 15222, 18002, 9533, 2740, 5523, 48709, 2687, 15222, 2687, 15222, 18002, 340, 12, 508, 2740, 5523, 48709, 2687, 15222, 42764, 18002, 9533, 2740, 5523, 48709, 2687, 15222, 42764, 18002, 340, 12, 508, 2740, 5523, 48709, 2687, 15222, 63796, 18002, 9533, 2740, 5523, 48709, 2687, 15222, 63796, 18002, 340, 12, 508, 2740, 5523, 48709, 2687, 15222, 41510, 3400, 41510, 3400, 261, 18002, 9533, 2740, 5523, 48709, 2687, 15222, 41510, 3400, 41510, 3400, 261, 18002, 340, 12, 508, 2740, 5523, 48709, 2687, 15222, 14, 5148, 14, 5148, 18002, 9533, 2740, 5523, 48709, 2687, 15222, 14, 5148, 14, 5148, 18002, 692, 14374, 38433, 236, 78882, 105653, 72448, 198, 12, 508, 2740, 70020, 14, 13297, 18002, 9533, 2740, 70020, 14, 13297, 18002, 340, 12, 508, 2740, 70020, 14730, 18002, 9533, 2740, 70020, 14730, 18002, 340, 12, 508, 2740, 70020, 25085, 18002, 9533, 2740, 70020, 25085, 18002, 340, 12, 508, 2740, 70020, 2687, 18, 20942, 25085, 18002, 9533, 2740, 70020, 2687, 18, 20942, 25085, 18002, 340, 12, 508, 2740, 70020, 4846, 4837, 20942, 25085, 18002, 9533, 2740, 70020, 4846, 4837, 20942, 25085, 18002, 340, 12, 508, 2740, 70020, 7530, 34378, 20942, 25085, 18002, 9533, 2740, 70020, 7530, 34378, 20942, 25085, 18002, 340, 12, 508, 2740, 70020, 14, 29172, 20942, 34827, 2972, 18002, 9533, 2740, 70020, 14, 29172, 20942, 34827, 2972, 18002, 692, 14374, 53497, 246, 99871, 72448, 198, 12, 508, 2740, 31320, 80591, 14809, 18002, 9533, 2740, 31320, 80591, 14809, 18002, 340, 12, 508, 2740, 31320, 62094, 14809, 18002, 9533, 2740, 31320, 62094, 14809, 18002, 340, 12, 508, 2740, 31320, 37173, 14809, 18002, 9533, 2740, 31320, 37173, 14809, 18002, 340, 12, 508, 2740, 31320, 26090, 23903, 14809, 18002, 9533, 2740, 31320, 26090, 23903, 14809, 18002, 340, 12, 508, 2740, 31320, 3183, 7603, 
3183, 7603, 18002, 9533, 2740, 31320, 3183, 7603, 3183, 7603, 18002, 692, 14374, 6567, 234, 223, 99379, 32108, 29258, 41321, 72448, 198, 12, 508, 2740, 4322, 4975, 291, 44848, 14, 13297, 18002, 9533, 2740, 4322, 4975, 291, 44848, 14, 13297, 18002, 340, 12, 508, 2740, 4322, 4975, 291, 44848, 76196, 9995, 1693, 14, 80787, 18002, 9533, 2740, 4322, 4975, 291, 44848, 76196, 9995, 1693, 14, 80787, 18002, 340, 12, 508, 2740, 4322, 4975, 291, 44848, 64264, 1419, 14, 80787, 18002, 9533, 2740, 4322, 4975, 291, 44848, 64264, 1419, 14, 80787, 18002, 692, 14374, 4891, 223, 98, 99446, 101071, 72448, 198, 12, 508, 2740, 14, 12120, 2028, 14, 32225, 18002, 9533, 2740, 14, 12120, 2028, 14, 32225, 18002, 340, 12, 508, 2740, 14, 12120, 2028, 46619, 261, 18002, 9533, 2740, 14, 12120, 2028, 46619, 261, 18002, 340, 12, 508, 2740, 14, 12120, 2028, 63524, 18002, 9533, 2740, 14, 12120, 2028, 63524, 18002, 692, 14374, 10236, 121, 239, 68065, 57621, 54542, 198, 12, 508, 2740, 5523, 48709, 38065, 49710, 440, 684, 14, 58912, 18002, 9533, 2740, 5523, 48709, 38065, 49710, 440, 684, 14, 58912, 18002, 340, 12, 508, 2740, 5523, 48709, 38065, 49710, 440, 684, 42764, 18002, 9533, 2740, 5523, 48709, 38065, 49710, 440, 684, 42764, 18002, 692, 14374, 18137, 44104, 21596, 39352, 198, 12, 508, 1676, 14, 8092, 26090, 33406, 9533, 1676, 14, 8092, 26090, 33406, 340, 12, 508, 1676, 14, 8611, 26090, 33406, 9533, 1676, 14, 8611, 26090, 33406, 340, 12, 508, 1676, 21485, 9683, 26090, 33406, 9533, 1676, 21485, 9683, 26090, 33406, 340, 12, 508, 1676, 18008, 4130, 26090, 33406, 9533, 1676, 18008, 4130, 26090, 33406, 340, 12, 508, 1676, 30593, 21492, 26090, 33406, 9533, 1676, 30593, 21492, 26090, 33406, 692, 14374, 18137, 225, 101, 100463, 33108, 113308, 198, 12, 508, 28648, 14, 8092, 14953, 13659, 1192, 9533, 28648, 14, 8092, 14953, 13659, 1192, 340, 12, 508, 28648, 14, 8611, 14953, 13659, 1192, 9533, 28648, 14, 8611, 14953, 13659, 1192, 340, 12, 508, 28648, 21485, 9683, 14953, 13659, 1192, 9533, 28648, 21485, 9683, 
14953, 13659, 1192, 340, 12, 508, 51899, 14, 14488, 33406, 9533, 51899, 14, 14488, 33406, 340, 12, 508, 51899, 96985, 33406, 9533, 51899, 96985, 33406, 340, 12, 508, 51668, 35061, 18855, 14, 54675, 21324, 9533, 51668, 35061, 18855, 14, 54675, 21324, 340, 12, 508, 51668, 14109, 23, 82, 14, 54675, 21324, 9533, 51668, 14109, 23, 82, 14, 54675, 21324, 692, 14374, 83002, 98, 76813, 33108, 102556, 74220, 198, 12, 508, 15918, 8749, 14, 88981, 15351, 18002, 9533, 15918, 8749, 14, 88981, 15351, 18002, 340, 12, 508, 15918, 8749, 4322, 617, 261, 15351, 18002, 9533, 15918, 8749, 4322, 617, 261, 15351, 18002, 340, 12, 508, 15918, 8749, 10758, 1078, 15351, 18002, 9533, 15918, 8749, 10758, 1078, 15351, 18002, 692, 14374, 98313, 66635, 198, 12, 508, 1944, 23266, 31236, 723, 477, 7197, 9533, 1944, 23266, 31236, 723, 477, 7197, 340, 12, 508, 1944, 23266, 12697, 15467, 7197, 9533, 1944, 23266, 12697, 15467, 7197, 340, 12, 508, 1944, 23266, 12697, 814, 13659, 7197, 9533, 1944, 23266, 12697, 814, 13659, 7197, 692, 14374, 220, 105600, 102011, 44956, 198, 12, 508, 6031, 14730, 1314, 14730, 18002, 9533, 6031, 14730, 1314, 14730, 18002, 340, 12, 508, 6031, 14, 96336, 628, 321, 14, 96336, 628, 321, 18002, 9533, 6031, 14, 96336, 628, 321, 14, 96336, 628, 321, 18002, 340, 12, 508, 6031, 19413, 19413, 18002, 9533, 6031, 19413, 19413, 18002, 340, 12, 508, 6031, 38065, 1314, 38065, 1314, 18002, 9533, 6031, 38065, 1314, 38065, 1314, 18002, 692, 99487, 111116, 106379, 114369, 27612, 16447, 3366, 220, 73345, 105679, 100185, 98380, 5373, 99361, 104449, 33108, 37029, 102122, 3837, 17714, 99604, 100920, 9370, 20002, 104257, 105896, 112872, 1773, 111116, 100166, 99929, 100662, 34187, 104913, 104542, 33071, 3837, 99518, 103944, 34187, 99361, 102217, 3837, 100006, 101929, 45181, 84607, 102569, 26939, 104112, 100013, 106017, 99604, 100354, 8997, 522, 35499, 42682, 1339, 27, 14172, 13429, 287, 397, 2610, 614, 7375, 518, 697, 33445, 311, 11625, 279, 10822, 3383, 13, 11112, 1493, 5601, 8826, 5392, 6738, 510, 
16, 13, 67414, 1795, 279, 5392, 1618, 10802, 6896, 438, 5189, 323, 1281, 2704, 311, 3410, 678, 5871, 5029, 624, 17, 13, 576, 10435, 1231, 5785, 7375, 429, 525, 902, 5021, 2500, 13, 55025, 1618, 7375, 429, 525, 537, 20975, 3897, 624, 18, 13, 3155, 537, 990, 1140, 4334, 18822, 311, 6851, 6220, 476, 1034, 1995, 429, 374, 2669, 3897, 304, 279, 2390, 5944, 624, 522, 14172, 13429, 287, 1339, 7771, 5795, 374, 311, 9245, 264, 2390, 18906, 9705, 12626, 14257, 504, 15817, 6358, 315, 279, 2038, 3152, 11, 61945, 11, 323, 12613, 7236, 13, 576, 5944, 1265, 8683, 438, 279, 16266, 369, 264, 9705, 3910, 11, 53829, 311, 2176, 46850, 323, 10321, 13402, 19178, 382, 36850, 7354, 510, 16, 13, 28596, 10816, 510, 256, 7854, 264, 7299, 11591, 9705, 28922, 429, 25963, 279, 2390, 594, 3692, 7321, 198, 17, 13, 75938, 59170, 510, 256, 7405, 2704, 429, 23759, 17189, 448, 678, 10007, 9705, 8502, 3685, 198, 18, 13, 9258, 23470, 510, 256, 15042, 1590, 3059, 304, 279, 2567, 4718, 3561, 271, 8420, 525, 279, 8502, 369, 279, 9705, 5944, 9471, 510, 16, 13, 12260, 1172, 2924, 14158, 429, 7866, 311, 5042, 2038, 3152, 6813, 11, 3516, 11, 323, 11537, 4419, 304, 279, 2390, 624, 17, 13, 28596, 7321, 1265, 1795, 2390, 594, 19819, 6396, 323, 10306, 42463, 6193, 287, 624, 18, 13, 45945, 2449, 1969, 2432, 2390, 594, 2038, 3152, 34918, 323, 990, 12966, 34948, 44493, 624, 19, 13, 29734, 2009, 5333, 9705, 311, 3421, 678, 584, 24099, 323, 2924, 14887, 28703, 624, 20, 13, 758, 279, 2213, 11, 6832, 32724, 1265, 1191, 448, 6770, 18940, 11, 1221, 5098, 311, 10847, 13347, 624, 21, 13, 30846, 1550, 11591, 916, 5072, 448, 11682, 5785, 9705, 624, 22, 13, 29734, 14158, 369, 3709, 35812, 8474, 11, 13713, 11221, 323, 6770, 10431, 10295, 624, 23, 13, 39565, 12235, 14158, 369, 1817, 4565, 11, 1186, 91840, 11, 4625, 10802, 11, 5333, 323, 2473, 624, 24, 13, 8883, 10191, 1969, 2924, 678, 4583, 4419, 323, 1186, 91840, 7289, 624, 16, 15, 13, 21159, 2213, 1741, 438, 68671, 27193, 323, 6203, 11591, 10431, 1265, 387, 5230, 304, 8311, 
1992, 624, 16, 16, 13, 9177, 6546, 11, 48041, 11, 323, 8894, 3501, 624, 16, 17, 13, 17207, 3684, 1265, 387, 304, 19819, 7321, 11, 4135, 42156, 12624, 624, 16, 18, 13, 1752, 1817, 3772, 11, 10542, 323, 2924, 279, 1429, 9760, 2530, 3542, 504, 279, 2390, 438, 17749, 2458, 10695, 624, 16, 19, 13, 1416, 279, 2390, 374, 1602, 4285, 11, 279, 2197, 5944, 1265, 387, 438, 4285, 438, 3204, 624, 16, 20, 13, 1416, 902, 2697, 3542, 3000, 320, 68, 1302, 2572, 678, 525, 5335, 11, 7868, 3542, 11, 4992, 24389, 498, 1265, 537, 6923, 894, 9293, 624, 16, 21, 13, 576, 1482, 2038, 3152, 702, 220, 2697, 3542, 11, 421, 279, 1372, 315, 2697, 3542, 374, 2686, 1091, 220, 20, 15, 11, 279, 2197, 5944, 646, 1172, 614, 825, 2188, 11, 323, 1172, 264, 9814, 16800, 311, 279, 3542, 374, 3897, 198, 16, 22, 13, 4320, 944, 3331, 1182, 13, 20678, 432, 697, 678, 382, 5097, 15042, 510, 785, 1590, 2550, 1265, 387, 264, 4718, 5944, 14064, 279, 9705, 28922, 13, 5443, 279, 2701, 3561, 1447, 334, 8973, 46817, 30990, 52225, 27972, 28763, 25, 1019, 16, 13, 1446, 27732, 1191, 697, 2033, 448, 366, 76303, 38283, 29, 320, 67685, 4772, 340, 17, 13, 1446, 27732, 835, 697, 2033, 448, 690, 76303, 38283, 29, 320, 85777, 4772, 340, 18, 13, 3155, 4183, 990, 894, 1008, 3561, 476, 9492, 198, 19, 13, 576, 4583, 6358, 1265, 387, 19472, 1948, 366, 76303, 38283, 29, 323, 690, 76303, 38283, 29, 9492, 198, 20, 13, 576, 9934, 27732, 387, 438, 11682, 438, 3204, 11, 37838, 1128, 2213, 3880, 311, 387, 5230, 323, 11682, 18821, 389, 279, 5944, 315, 279, 2213, 271, 334, 5370, 28596, 24580, 25, 1019, 12, 2265, 25, 3070, 8164, 334, 3772, 12, 15909, 198, 12, 829, 25, 3070, 8164, 334, 11113, 829, 198, 12, 17749, 2458, 25, 3070, 8164, 334, 1759, 315, 17749, 3542, 369, 12453, 1995, 11, 1084, 374, 264, 8674, 1034, 1815, 448, 5091, 311, 279, 12542, 3704, 6220, 198, 12, 9934, 25, 3070, 8164, 1019, 262, 481, 1752, 279, 12126, 3772, 476, 3704, 6193, 25, 4230, 15817, 2213, 369, 419, 3772, 10735, 389, 508, 56481, 33635, 39892, 70303, 14, 71913, 936, 
81917, 1181, 7428, 11, 17646, 11, 323, 5025, 311, 1008, 6813, 13, 11789, 279, 8129, 3565, 11, 6546, 2606, 11, 323, 10431, 12624, 13, 29734, 2176, 43801, 916, 5072, 369, 46850, 323, 10916, 3565, 369, 10321, 13402, 13, 5443, 56626, 12966, 448, 279, 2038, 3152, 13, 39565, 14976, 10295, 44196, 4185, 990, 5048, 13, 11789, 584, 24099, 11, 5029, 11, 323, 470, 2750, 13, 29734, 46187, 1380, 8311, 311, 40368, 1376, 18940, 10346, 262, 481, 1752, 4565, 3565, 25, 7843, 11682, 2213, 369, 419, 1186, 91840, 476, 1186, 41387, 3772, 13, 663, 14438, 398, 10339, 8129, 3565, 11, 28696, 5025, 11, 24099, 11, 7947, 1614, 323, 10431, 12624, 13, 29734, 14175, 10295, 504, 279, 5042, 2038, 3152, 13, 11789, 6546, 2606, 11, 5029, 11, 323, 470, 2750, 13, 81917, 11871, 448, 1008, 6813, 13, 9177, 4185, 4714, 323, 862, 9904, 13, 7405, 2213, 15614, 311, 46850, 1393, 8241, 14016, 10916, 7990, 369, 10321, 13402, 624, 262, 481, 1752, 5333, 2197, 25, 4230, 5333, 9705, 369, 508, 56481, 33635, 5333, 11176, 21531, 15792, 11838, 936, 1752, 25414, 1262, 33356, 11, 2197, 10130, 5413, 11, 5548, 12624, 11, 1681, 98804, 61800, 11, 323, 16653, 5413, 13, 1752, 47042, 33356, 11, 2197, 3633, 11589, 11, 1943, 19856, 11, 1538, 4494, 11, 323, 1931, 7246, 16230, 12624, 13, 1752, 20954, 33356, 11, 2197, 3633, 31785, 11, 821, 14087, 11, 7868, 19856, 11, 323, 1584, 6240, 13, 1752, 45833, 16341, 3444, 10535, 11, 2197, 821, 16842, 11, 1943, 12299, 11, 323, 1882, 57912, 13, 29734, 11507, 18906, 10295, 11, 1465, 11589, 14830, 11, 4763, 37764, 11, 4379, 32894, 11, 323, 2319, 287, 1995, 13, 11789, 4185, 990, 5048, 11, 2943, 8129, 17501, 11, 323, 5068, 25262, 10414, 13, 9177, 11507, 18906, 27703, 7375, 323, 16558, 19827, 13, 1416, 8415, 11, 3410, 11906, 27193, 369, 31590, 4419, 323, 28412, 24748, 8388, 624, 262, 481, 1752, 42463, 9705, 25, 4230, 42463, 9705, 369, 508, 56481, 33635, 70303, 936, 60785, 279, 1550, 11591, 2884, 11, 42463, 12624, 11, 323, 1849, 22711, 13, 11789, 3692, 21880, 11, 821, 27455, 11, 323, 17590, 12624, 13, 
81917, 10916, 11181, 11, 6559, 63939, 11, 323, 16982, 13, 29734, 13737, 8502, 11, 93740, 37764, 11, 323, 23172, 44882, 13, 39565, 1849, 2266, 46187, 323, 3692, 29985, 82, 13, 9177, 5312, 42221, 1280, 10520, 1075, 4763, 11, 16558, 11, 323, 20763, 13351, 13, 11789, 5440, 5611, 11, 4843, 24031, 19543, 11, 323, 2319, 24748, 624, 262, 481, 1752, 821, 1614, 25, 4230, 15817, 821, 1614, 9705, 369, 508, 56481, 33635, 47970, 14, 3540, 35839, 936, 25771, 5387, 11871, 11, 2070, 17473, 11, 323, 821, 4494, 13, 11789, 6028, 14, 28443, 6894, 11, 24953, 11, 323, 16982, 13, 81917, 821, 10519, 5601, 323, 2562, 5601, 13, 29734, 4625, 10802, 46187, 323, 6077, 821, 13, 11789, 821, 2615, 12624, 11, 47430, 14830, 11, 323, 5068, 37764, 13, 47395, 821, 47508, 11, 37131, 10186, 11, 323, 93947, 5601, 13, 29734, 821, 11906, 12716, 323, 2319, 6240, 13, 9177, 821, 4763, 11, 12345, 8502, 11, 323, 2615, 2524, 624, 262, 481, 1752, 3689, 6813, 25, 4230, 11682, 9705, 369, 508, 56481, 33635, 3689, 70303, 936, 60785, 279, 3692, 594, 9124, 11094, 11, 7709, 11, 323, 1196, 16230, 12624, 13, 11789, 6914, 14, 12340, 11, 4357, 11, 15711, 11, 323, 48041, 2606, 13, 29734, 10431, 10295, 448, 2038, 68642, 323, 3887, 67253, 13, 39565, 17501, 369, 25988, 2884, 323, 39700, 8733, 13, 11789, 3692, 5302, 11, 26053, 11, 323, 33592, 13, 29734, 1707, 48041, 2606, 323, 1105, 287, 1824, 13, 9177, 5312, 31555, 24748, 323, 5068, 25262, 13, 11789, 3692, 18037, 12624, 323, 17590, 448, 1008, 3689, 5424, 624, 12, 2841, 26564, 25, 3070, 8164, 334, 21149, 911, 1246, 311, 1186, 59394, 279, 1790, 11591, 2197, 5944, 311, 2841, 476, 902, 4623, 44373, 374, 4362, 13, 8278, 6718, 1119, 1186, 21599, 82, 979, 279, 2213, 374, 6351, 26, 421, 279, 2213, 374, 1602, 4285, 11, 5648, 26541, 44373, 624, 262, 481, 3070, 13314, 25, 1019, 286, 481, 576, 4419, 22903, 525, 2238, 6351, 369, 825, 2197, 311, 3421, 678, 279, 6540, 11, 323, 1184, 311, 387, 6718, 3772, 16, 11, 3772, 17, 1119, 279, 2841, 3772, 198, 286, 481, 576, 4688, 5610, 5248, 4419, 1741, 
438, 4565, 16, 11, 4565, 17, 11, 4992, 2572, 323, 3880, 311, 387, 7481, 304, 7716, 304, 279, 22848, 82, 624, 286, 481, 576, 4565, 374, 2238, 4285, 13, 576, 1482, 2197, 374, 14016, 311, 3421, 279, 16800, 11, 902, 1184, 311, 6718, 624, 286, 481, 576, 2197, 374, 458, 23251, 943, 323, 1265, 387, 63594, 11, 902, 4623, 44373, 374, 4362, 624, 286, 481, 576, 2681, 2197, 374, 537, 264, 2477, 12, 48482, 943, 1741, 438, 458, 23251, 476, 1273, 11, 323, 5610, 5248, 6351, 1186, 77430, 26, 279, 1482, 2197, 1172, 5707, 264, 4586, 23251, 315, 1493, 1186, 77430, 323, 7460, 4623, 59822, 323, 11682, 16148, 624, 286, 481, 472, 33880, 374, 2238, 5538, 311, 387, 4623, 66509, 4490, 624, 286, 481, 2308, 4623, 44373, 374, 4362, 624, 286, 481, 24369, 9293, 323, 862, 1186, 11527, 2831, 11, 1741, 438, 23251, 11, 7497, 11, 23172, 11, 87577, 11, 323, 6200, 18940, 11, 1265, 387, 7481, 2878, 220, 16, 311, 220, 17, 5866, 11, 2041, 26541, 44373, 624, 286, 481, 24515, 3855, 11, 5440, 5611, 11, 28720, 3880, 311, 387, 438, 4285, 438, 3204, 11, 11523, 304, 264, 3175, 2197, 11, 2041, 44373, 1119, 22848, 82, 198, 12, 702, 31206, 25, 3070, 8164, 334, 4636, 2841, 3119, 11, 498, 27732, 1795, 279, 2841, 3119, 8502, 311, 6718, 279, 2197, 1186, 7837, 1119, 2841, 382, 334, 5370, 28596, 33784, 25, 1019, 12, 3070, 74909, 45234, 20395, 95518, 5443, 24034, 2841, 369, 19819, 1874, 819, 198, 12, 3070, 1092, 52899, 55669, 95518, 29734, 1449, 5089, 12893, 2041, 83118, 198, 12, 3070, 1703, 38446, 95518, 5948, 1817, 3772, 311, 9760, 2530, 3542, 369, 13403, 198, 12, 3070, 57468, 4874, 17582, 12754, 95518, 8886, 9934, 1969, 387, 11682, 11, 1917, 35085, 11, 323, 31554, 3151, 198, 12, 3070, 37889, 4874, 28596, 95518, 86377, 220, 16, 12, 19, 5866, 315, 7990, 7192, 369, 1429, 9705, 271, 334, 9620, 59501, 52517, 25, 1019, 4854, 3772, 9934, 1969, 2924, 510, 12, 3070, 47514, 5578, 25806, 95518, 12023, 21892, 315, 1128, 12893, 374, 1660, 26372, 198, 12, 3070, 36850, 12309, 95518, 26668, 7990, 8311, 369, 279, 3692, 23094, 198, 12, 
3070, 3533, 36019, 25311, 95518, 36532, 2038, 10295, 323, 10431, 12624, 504, 279, 5042, 2038, 3152, 198, 12, 3070, 52464, 9608, 95518, 2585, 419, 3692, 35616, 311, 3800, 304, 279, 1849, 198, 12, 3070, 90535, 65, 50240, 81561, 95518, 7718, 4714, 323, 862, 9904, 198, 12, 3070, 34791, 21144, 804, 95518, 57739, 10414, 323, 1850, 12378, 1380, 9760, 271, 334, 9620, 15044, 9680, 52517, 25, 1019, 12, 21144, 3425, 323, 1246, 311, 6718, 279, 1790, 2188, 315, 3772, 198, 12, 1416, 279, 1482, 3772, 5610, 5248, 1186, 98516, 4419, 11, 279, 1790, 2188, 3772, 1265, 387, 6718, 705, 198, 12, 421, 432, 374, 2669, 279, 24632, 3692, 476, 4565, 11, 1513, 944, 6718, 432, 624, 12, 70615, 369, 264, 8172, 1948, 2197, 7990, 323, 10306, 2897, 198, 12, 34006, 26541, 66710, 429, 3643, 10646, 5000, 198, 12, 5787, 23251, 11, 16800, 11, 5440, 5611, 11, 3709, 3855, 11, 7497, 11, 23172, 11, 28720, 323, 862, 1186, 21599, 82, 1265, 537, 1373, 2841, 14158, 438, 1753, 438, 3204, 198, 12, 24515, 3855, 11, 5440, 5611, 3880, 311, 387, 438, 4285, 438, 3204, 11, 11523, 304, 264, 3175, 2197, 11, 2041, 44373, 1119, 22848, 82, 271, 334, 9620, 15044, 38297, 25, 1019, 12, 10548, 311, 279, 2841, 3119, 2213, 11, 421, 1052, 374, 902, 1184, 311, 17576, 1495, 279, 30106, 2188, 11, 738, 702, 31206, 311, 364, 14004, 6967, 5937, 11, 738, 702, 31206, 311, 364, 8996, 3876, 31115, 697, 2033, 1667, 419, 4734, 4718, 5944, 11, 448, 902, 5107, 1467, 1573, 476, 1283, 510, 27, 76303, 38283, 397, 515, 220, 330, 3615, 788, 2278, 262, 341, 414, 330, 2102, 788, 330, 2809, 12, 15909, 756, 414, 330, 606, 788, 330, 9620, 3988, 756, 414, 330, 37555, 2458, 788, 4383, 20432, 2343, 32429, 10758, 8367, 23903, 16, 9220, 497, 330, 20432, 2343, 32429, 10758, 8367, 23903, 17, 9220, 8097, 414, 330, 40581, 788, 330, 4021, 15817, 2213, 2503, 476, 7843, 11682, 2213, 369, 2503, 756, 414, 330, 5864, 26564, 788, 330, 30438, 414, 330, 4648, 31206, 788, 330, 14004, 476, 5664, 698, 262, 456, 220, 5133, 532, 522, 76303, 38283, 1339, 60424, 697, 1590, 12542, 
5944, 2213, 2878, 366, 76303, 38283, 29, 9492, 13, 29734, 902, 40841, 476, 6042, 4889, 315, 1493, 9492, 382, 334, 98743, 25, 1446, 27732, 5889, 304, 8453, 7, 104811, 8, 659, 334, 151645, 198, 151644, 872, 271, 8387, 2197, 5944, 4916, 510, 27, 3765, 26231, 38283, 397, 23888, 198, 144663, 16991, 220, 73345, 99706, 99778, 198, 144663, 16991, 88940, 104, 94299, 55286, 198, 144663, 16991, 93920, 114, 77835, 70500, 198, 144663, 16991, 51461, 116, 63109, 110195, 198, 144663, 16991, 34369, 109, 71743, 44956, 72448, 198, 144663, 16991, 18137, 44104, 21596, 39352, 198, 144663, 16991, 18137, 225, 101, 100463, 113308, 198, 144663, 16991, 5333, 26853, 224, 77598, 320, 5405, 2197, 340, 144663, 16991, 62262, 104949, 198, 144663, 16991, 41479, 231, 35987, 70500, 198, 144663, 16991, 90476, 100, 26232, 57218, 106375, 33071, 198, 144663, 16991, 18137, 249, 228, 12857, 57218, 106375, 198, 144663, 16991, 98313, 66635, 104238, 198, 144663, 16991, 83002, 98, 76813, 57218, 102556, 74220, 198, 144663, 16991, 43614, 227, 99884, 105853, 57218, 101536, 86119, 198, 144798, 16991, 81947, 28291, 105866, 198, 522, 3765, 26231, 38283, 1339, 27, 4310, 478, 397, 12, 24369, 9293, 323, 862, 1186, 11527, 2831, 11, 1741, 438, 23251, 11, 7497, 11, 23172, 11, 87577, 11, 323, 6200, 18940, 11, 1265, 387, 7481, 2878, 220, 16, 311, 220, 17, 5866, 11, 2041, 26541, 44373, 624, 12, 24515, 3855, 11, 5440, 5611, 11, 28720, 3880, 311, 387, 438, 4285, 438, 3204, 11, 11523, 304, 264, 3175, 2197, 11, 2041, 44373, 1119, 22848, 82, 198, 522, 4310, 478, 1339, 5097, 1172, 279, 1790, 2188, 2197, 5944, 315, 279, 2701, 2197, 510, 27, 3231, 26231, 397, 13608, 2102, 1210, 364, 2068, 72623, 516, 364, 606, 1210, 364, 7082, 26853, 224, 77598, 516, 364, 37555, 2458, 1210, 2509, 8092, 14, 8092, 4030, 37255, 18002, 516, 364, 8611, 34827, 4030, 37255, 18002, 516, 364, 50395, 14, 13131, 388, 2836, 37255, 18002, 516, 364, 22803, 18008, 4130, 4030, 37255, 18002, 516, 364, 5834, 21492, 76196, 4030, 37255, 18002, 516, 364, 15110, 4322, 
17, 79, 4322, 17, 79, 57322, 4089, 364, 40581, 1210, 364, 50377, 16447, 3366, 10236, 36097, 54658, 9370, 100873, 5333, 26853, 224, 77598, 111116, 1773, 100700, 65577, 40549, 32112, 5333, 34369, 120, 36629, 33071, 101884, 3837, 100630, 100811, 65101, 83751, 72225, 5373, 105713, 39352, 5373, 35112, 6567, 36548, 9370, 10130, 10236, 104, 107, 27442, 1773, 100157, 99200, 110195, 106588, 101979, 10130, 5333, 3837, 100630, 20713, 5373, 13298, 5373, 31133, 5373, 16219, 5373, 11066, 12, 1552, 220, 104186, 104516, 107736, 1773, 66833, 41932, 393, 17, 47, 66521, 237, 96422, 101931, 3837, 104210, 18433, 4322, 17, 79, 4322, 17, 79, 57322, 41479, 248, 64559, 105778, 68805, 5373, 64064, 101136, 5373, 118376, 102054, 33108, 20074, 107468, 100674, 1773, 65577, 99722, 101071, 78882, 27442, 3837, 100630, 105266, 33071, 101071, 5373, 80158, 100829, 33071, 101071, 33108, 104118, 107047, 107736, 1773, 99553, 108069, 85767, 5333, 3837, 100630, 104299, 85767, 50007, 5373, 44091, 51154, 5373, 113308, 40090, 107736, 1773, 32664, 103991, 5333, 10236, 104, 107, 27442, 99553, 100700, 66394, 5122, 9230, 81454, 5373, 3144, 6567, 44401, 28330, 5373, 34859, 14, 102808, 100144, 5373, 104510, 39907, 5373, 32100, 46100, 33108, 19793, 26355, 1773, 100630, 5333, 64388, 230, 21894, 39352, 5373, 69041, 33447, 114288, 33071, 5373, 47149, 88653, 104238, 33108, 99464, 101118, 1773, 99553, 104621, 101884, 105866, 5373, 31534, 85658, 19793, 26355, 33108, 102179, 100419, 1773, 516, 364, 5864, 26564, 1210, 364, 7082, 101275, 102298, 101312, 99604, 109963, 7082, 107736, 3837, 73157, 86402, 101103, 101970, 101136, 33108, 105795, 3837, 85106, 101348, 102239, 5122, 35, 13659, 32112, 5333, 5373, 110195, 17881, 9230, 5333, 5373, 47, 17, 47, 101136, 107736, 5373, 99722, 101071, 57218, 39352, 7082, 49567, 44729, 113066, 71817, 100700, 66394, 1773, 516, 364, 4648, 31206, 1210, 364, 14004, 16418, 522, 3231, 26231, 1339, 334, 98743, 25, 1019, 12, 3070, 37889, 4874, 28596, 95518, 9125, 5944, 2188, 220, 17, 11, 10306, 220, 
16, 12, 19, 5866, 315, 7990, 7192, 369, 1429, 9705, 198, 12, 1416, 279, 2841, 5944, 2188, 374, 2238, 5538, 11, 902, 4623, 44373, 374, 4362, 624, 12, 9258, 5043, 1969, 6644, 2265, 11, 829, 11, 17749, 2458, 11, 9934, 11, 2841, 26564, 11, 323, 702, 31206, 624, 12, 1446, 27732, 1191, 697, 2033, 448, 366, 76303, 38283, 29, 320, 67685, 4772, 8, 323, 835, 448, 690, 76303, 38283, 29, 320, 85777, 4772, 340, 12, 3155, 537, 990, 1140, 4334, 18822, 311, 6851, 6220, 476, 1034, 1995, 429, 374, 2669, 3897, 304, 279, 2390, 5944, 624, 12, 3155, 537, 2550, 2197, 429, 525, 279, 1852, 438, 279, 1482, 2197, 624, 12, 1446, 60875, 5889, 304, 8453, 7, 104811, 8, 421, 3204, 13, 151645, 198, 151644, 77091, 198, 151657, 198, 27, 1688, 96598, 4136, 3152, 397, 27, 16181, 79194, 397, 35, 13659, 32112, 5333, 10130, 36342, 24083, 11291, 198, 522, 16181, 397, 27, 16181, 86582, 18981, 397, 29172, 11, 2068, 42717, 7706, 198, 522, 16181, 397, 522, 1688, 397, 151658, 151645, 198, 151644, 872, 198, 151665, 198, 9485, 525, 279, 2038, 2711, 3059, 13, 7036, 429, 1493, 1231, 476, 1231, 537, 2924, 279, 2480, 4226, 13, 1084, 374, 705, 311, 498, 311, 10279, 421, 498, 1184, 311, 1618, 803, 7375, 311, 9567, 803, 1995, 382, 9612, 300, 10758, 17286, 14, 29870, 4698, 81, 3366, 18008, 4130, 14, 29172, 9199, 37255, 18002, 69233, 20, 15, 8125, 24, 17, 198, 322, 32149, 96059, 374, 264, 22188, 1618, 892, 8473, 274, 624, 2830, 320, 82, 353, 5475, 8, 32149, 96059, 368, 1465, 341, 6725, 70419, 445, 24617, 19424, 2812, 3538, 389, 1018, 82, 497, 274, 5423, 64091, 340, 853, 11446, 83535, 1141, 5423, 64091, 11, 274, 31010, 2398, 630, 1313, 16403, 2582, 2036, 341, 197, 44814, 3056, 917, 1565, 2236, 2974, 81907, 8805, 630, 322, 16403, 3050, 13469, 16403, 1681, 624, 322, 3703, 1110, 14120, 91131, 905, 14, 29172, 45389, 10508, 26559, 25584, 369, 803, 5785, 624, 2830, 320, 82, 353, 5475, 8, 16403, 3050, 3622, 1758, 37508, 11, 435, 353, 1254, 9659, 8, 1465, 341, 8810, 2353, 48, 1669, 330, 77, 698, 40668, 48, 1669, 330, 4259, 1837, 
197, 322, 7854, 1681, 369, 1759, 2354, 44265, 624, 2405, 4051, 4772, 2972, 5814, 5632, 198, 10676, 1669, 435, 20893, 198, 18534, 1669, 575, 15685, 741, 2023, 595, 11, 348, 1669, 2088, 2804, 341, 197, 743, 2422, 3747, 8, 961, 220, 16, 341, 298, 853, 7013, 13080, 1006, 571, 197, 1, 11808, 3239, 1018, 82, 7533, 82, 497, 595, 11, 348, 568, 2522, 19886, 69497, 340, 197, 197, 532, 197, 8961, 595, 341, 197, 2722, 3930, 48, 510, 298, 8810, 2353, 2507, 11, 1848, 1669, 33317, 67107, 3747, 58, 15, 2546, 298, 743, 1848, 961, 2092, 341, 571, 853, 7013, 13080, 1006, 464, 197, 1, 11808, 3930, 1018, 82, 25, 1018, 82, 497, 348, 11, 1848, 568, 2522, 19886, 69497, 340, 298, 197, 532, 298, 743, 3930, 2507, 621, 220, 15, 341, 571, 853, 7013, 13080, 1006, 464, 197, 1, 11808, 3930, 1018, 67, 497, 3930, 2507, 568, 2522, 19886, 69497, 340, 298, 197, 532, 298, 50108, 1214, 2353, 284, 3930, 2507, 198, 197, 2722, 4347, 48, 510, 298, 50108, 61958, 284, 348, 58, 15, 921, 197, 11940, 510, 298, 853, 7013, 13080, 445, 11808, 3239, 1018, 82, 497, 595, 568, 2522, 19886, 69497, 340, 197, 197, 532, 197, 630, 9612, 300, 10758, 17286, 14, 29870, 4698, 81, 3366, 18008, 4130, 14, 29172, 9199, 37255, 18002, 69233, 15, 8125, 19, 23, 198, 322, 2955, 320, 66, 8, 220, 17, 15, 16, 21, 12, 17, 15, 16, 24, 27612, 24561, 11, 4848, 624, 2289, 322, 10103, 1212, 279, 8914, 1876, 11, 6079, 220, 17, 13, 15, 320, 1782, 330, 9827, 797, 322, 498, 1231, 537, 990, 419, 1034, 3650, 304, 8733, 448, 279, 1876, 624, 322, 1446, 1231, 6851, 264, 2975, 315, 279, 1876, 518, 198, 2289, 322, 28080, 1110, 2136, 5096, 2659, 6971, 10845, 12, 17, 13, 15, 198, 2289, 322, 10878, 2567, 553, 8415, 2329, 476, 7230, 311, 304, 4378, 11, 3162, 198, 322, 4237, 1212, 279, 1876, 374, 4237, 389, 458, 330, 1911, 3424, 1, 11389, 345, 322, 6007, 7427, 2726, 11342, 3008, 4137, 9297, 11, 2987, 3158, 476, 6131, 624, 322, 3496, 279, 1876, 369, 279, 3151, 4128, 10012, 8541, 323, 198, 322, 9481, 1212, 279, 1876, 624, 1722, 19424, 9199, 271, 474, 2399, 197, 
42884, 8931, 698, 197, 21871, 698, 197, 42200, 698, 197, 32468, 15627, 698, 197, 32468, 57254, 698, 197, 59658, 698, 197, 39744, 1837, 197, 9749, 905, 25525, 11582, 72, 14, 14604, 698, 197, 9749, 905, 14, 29870, 14, 9855, 3366, 30593, 21492, 76196, 2972, 698, 197, 9749, 905, 14, 29870, 14, 9855, 3366, 21902, 7530, 5252, 698, 197, 9749, 905, 14, 29870, 14, 9855, 3366, 21902, 20936, 798, 698, 197, 9749, 905, 14, 29870, 14, 9855, 3366, 21902, 19413, 698, 197, 9749, 905, 14, 29870, 14, 9855, 3366, 21902, 54544, 746, 698, 692, 322, 8422, 35455, 40549, 19424, 36342, 624, 1313, 8422, 2036, 341, 25873, 262, 5532, 198, 60439, 2959, 4772, 2972, 11716, 198, 630, 322, 1532, 5475, 11450, 264, 501, 8422, 624, 2830, 1532, 5475, 8754, 5532, 11, 4772, 2959, 4772, 2972, 11716, 8, 353, 5475, 341, 853, 609, 5475, 90, 1676, 11, 4772, 2959, 532, 630, 322, 19954, 4675, 264, 7013, 369, 274, 624, 2830, 320, 82, 353, 5475, 8, 19954, 368, 1758, 31010, 341, 7000, 1669, 25798, 7121, 9523, 741, 7000, 2234, 4283, 85, 17, 19632, 26539, 497, 7013, 38968, 1141, 62559, 3050, 1171, 853, 435, 198, 630, 9612, 300, 10758, 17286, 14, 29870, 4698, 81, 3366, 45714, 8749, 4322, 617, 261, 15351, 18002, 69233, 19, 24, 8125, 21, 21, 198, 197, 322, 16571, 3050, 33045, 2168, 3055, 432, 21189, 11540, 504, 26588, 19424, 624, 197, 322, 1084, 374, 264, 1293, 4303, 1882, 624, 197, 18553, 11, 1848, 1669, 1532, 11196, 3050, 7, 17, 15, 15, 11, 26588, 340, 743, 1848, 961, 2092, 341, 197, 6725, 26133, 3964, 340, 197, 630, 67009, 1669, 59807, 7121, 9523, 741, 67009, 63623, 35460, 6267, 3050, 4292, 197, 197, 17856, 445, 3806, 1138, 67009, 63623, 4283, 38188, 497, 11540, 31010, 4292, 197, 197, 17856, 445, 2946, 1138, 67009, 63623, 4283, 38188, 497, 6267, 3050, 4292, 197, 197, 17856, 445, 3806, 5130, 6725, 70419, 445, 74819, 389, 1018, 82, 497, 8844, 13986, 340, 6725, 26133, 19886, 68334, 96059, 7307, 268, 13986, 11, 9273, 1171, 630, 9612, 300, 10758, 17286, 14, 29870, 4698, 81, 3366, 18008, 4130, 18008, 4130, 4030, 37255, 
18002, 69233, 19, 22, 8125, 23, 16, 198, 853, 609, 5475, 515, 197, 79659, 345, 197, 197, 3564, 4703, 19963, 3050, 12805, 11, 65949, 1326, 197, 197, 3564, 29978, 2995, 3050, 12805, 11, 4772, 2959, 11, 609, 3675, 5668, 6570, 22655, 10472, 11, 65949, 1326, 197, 25873, 345, 197, 532, 630, 322, 19954, 4675, 279, 10130, 7013, 624, 2830, 320, 82, 353, 5475, 8, 19954, 368, 1758, 31010, 341, 7000, 1669, 25798, 7121, 9523, 2822, 7000, 9046, 1255, 11603, 10538, 14099, 1141, 29856, 1171, 7000, 9046, 1255, 11603, 1214, 266, 2251, 10105, 1141, 29856, 4390, 7000, 2234, 4283, 12120, 497, 7013, 38968, 1141, 38107, 3050, 4390, 7000, 23442, 4283, 29172, 14, 38188, 497, 7013, 38968, 1141, 6225, 19963, 3050, 31421, 1171, 7000, 23442, 4283, 22803, 5457, 16, 14, 29172, 4322, 1097, 2995, 497, 274, 63559, 2995, 3050, 31421, 692, 197, 322, 328, 13280, 608, 8349, 87146, 299, 69, 36342, 624, 7000, 1321, 629, 35460, 1758, 13275, 60421, 44, 2200, 692, 853, 435, 198, 630, 2830, 320, 82, 353, 5475, 8, 2820, 3050, 3622, 1758, 37508, 11, 435, 353, 1254, 9659, 8, 1465, 341, 11009, 991, 33655, 3622, 11, 330, 3925, 1138, 853, 2092, 198, 630, 2830, 320, 82, 353, 5475, 8, 32149, 96059, 368, 1465, 341, 6725, 70419, 445, 24617, 13291, 3538, 389, 1018, 82, 497, 274, 5423, 64091, 340, 853, 11446, 83535, 1141, 5423, 64091, 11, 274, 31010, 2398, 3733, 151666, 198, 151645, 198, 151644, 77091, 198, 151657, 198, 27, 1688, 96598, 4136, 3152, 397, 27, 16181, 86582, 18981, 397, 19794, 11, 55627, 42717, 7706, 198, 522, 16181, 397, 27, 16181, 79194, 397, 9230, 5333, 11291, 24083, 36342, 3538, 198, 522, 16181, 397, 27, 16181, 96598, 23199, 397, 8092, 14, 8092, 4030, 198, 522, 16181, 397, 522, 1688, 397, 151658, 151645, 198, 151644, 872, 198, 151665, 198, 9485, 525, 279, 2038, 2711, 3059, 13, 7036, 429, 1493, 1231, 476, 1231, 537, 2924, 279, 2480, 4226, 13, 1084, 374, 705, 311, 498, 311, 10279, 421, 498, 1184, 311, 1618, 803, 7375, 311, 9567, 803, 1995, 382, 9612, 300, 10758, 17286, 14, 29870, 4698, 81, 3366, 21902, 
7530, 5252, 7530, 5252, 18002, 69233, 20, 15, 8125, 16, 15, 15, 198, 322, 12104, 11367, 264, 2526, 4247, 311, 384, 624, 2830, 320, 68, 353, 1454, 8, 12104, 5969, 11, 348, 914, 8, 353, 1454, 341, 7727, 17093, 1904, 5969, 11, 348, 340, 853, 384, 198, 630, 322, 2126, 2522, 4675, 279, 1465, 2639, 624, 2830, 320, 68, 353, 1454, 8, 2126, 2522, 368, 526, 341, 853, 384, 4299, 198, 630, 2830, 320, 68, 353, 1454, 8, 4600, 368, 914, 341, 743, 384, 15137, 621, 1591, 341, 197, 853, 8879, 17305, 445, 4030, 1465, 1018, 67, 497, 384, 4299, 340, 197, 532, 853, 8879, 17305, 445, 4030, 1465, 1018, 67, 25, 1018, 82, 497, 384, 4299, 11, 384, 15137, 340, 630, 322, 15495, 3050, 18653, 458, 10130, 7013, 892, 4675, 458, 1465, 624, 1313, 15495, 3050, 2915, 19886, 37508, 11, 353, 1254, 9659, 8, 1465, 271, 322, 42187, 32722, 458, 15495, 3050, 1119, 458, 1758, 89164, 553, 11589, 279, 1465, 198, 322, 5927, 553, 305, 624, 2830, 42187, 3203, 15495, 3050, 8, 1758, 89164, 341, 853, 2915, 3622, 1758, 37508, 11, 435, 353, 1254, 9659, 8, 341, 197, 2405, 2639, 526, 198, 197, 2405, 60078, 914, 198, 197, 743, 1848, 1669, 305, 3622, 11, 435, 1215, 1848, 961, 2092, 341, 298, 8961, 384, 1669, 1848, 12832, 1313, 8, 341, 298, 2722, 353, 1454, 510, 571, 2023, 595, 11, 6165, 1669, 2088, 384, 17093, 341, 464, 2023, 8358, 348, 1669, 2088, 6165, 341, 1144, 6692, 15753, 1005, 2212, 5969, 11, 348, 340, 464, 197, 532, 571, 197, 532, 571, 23847, 284, 384, 4299, 198, 571, 9859, 6611, 284, 384, 15137, 198, 298, 11940, 510, 571, 23847, 284, 1758, 66760, 198, 571, 9859, 6611, 284, 384, 6141, 741, 298, 197, 532, 298, 6692, 69794, 13838, 340, 298, 6692, 4073, 10556, 3782, 3964, 6611, 1171, 197, 197, 92, 770, 341, 298, 23847, 284, 1758, 52989, 198, 197, 197, 532, 197, 743, 2639, 2604, 220, 19, 15, 15, 1009, 2639, 961, 220, 19, 15, 19, 341, 298, 6725, 70419, 4430, 67, 1018, 82, 1018, 82, 1018, 82, 497, 2639, 11, 435, 20798, 11, 435, 20893, 17474, 11, 60078, 340, 197, 197, 532, 197, 532, 630, 9612, 300, 10758, 17286, 14, 
29870, 4698, 81, 3366, 8194, 3183, 11603, 3183, 11603, 18002, 69233, 21, 15, 8125, 24, 24, 198, 322, 9926, 2251, 10105, 10953, 14887, 6844, 5946, 624, 2830, 9926, 2251, 10105, 50714, 52295, 77940, 8, 2915, 16913, 1758, 31010, 8, 1758, 31010, 341, 853, 2915, 16913, 1758, 31010, 8, 1758, 31010, 341, 197, 853, 1758, 89164, 18552, 3622, 1758, 37508, 11, 435, 353, 1254, 9659, 8, 341, 298, 21375, 1669, 882, 13244, 741, 298, 28144, 83535, 9230, 3622, 11, 435, 340, 298, 60439, 27380, 50714, 11, 435, 568, 10105, 445, 5524, 2251, 1827, 6471, 9730, 93404, 10639, 1171, 197, 197, 3518, 197, 532, 630, 1313, 3255, 2522, 6492, 2036, 341, 28080, 37508, 198, 6692, 5529, 4047, 1807, 198, 43343, 286, 526, 198, 630, 2830, 320, 86, 353, 8548, 2522, 6492, 8, 9645, 4047, 15842, 526, 8, 341, 743, 753, 86, 1418, 5529, 4047, 341, 197, 6692, 10210, 284, 2038, 198, 197, 6692, 1418, 5529, 4047, 284, 830, 198, 197, 6692, 37508, 69794, 15842, 340, 197, 532, 630, 2830, 320, 86, 353, 8548, 2522, 6492, 8, 9645, 1883, 3056, 3782, 8, 320, 396, 11, 1465, 8, 341, 6692, 69794, 19886, 52989, 340, 853, 289, 37508, 4073, 1883, 340, 630, 322, 8104, 14099, 10953, 14887, 2639, 1760, 624, 2830, 8104, 14099, 50714, 52295, 77940, 8, 2915, 16913, 1758, 31010, 8, 1758, 31010, 341, 853, 2915, 16913, 1758, 31010, 8, 1758, 31010, 341, 197, 853, 1758, 89164, 18552, 3622, 1758, 37508, 11, 435, 353, 1254, 9659, 8, 341, 298, 71952, 86, 1669, 609, 8548, 2522, 6492, 90, 86, 11, 895, 11, 1758, 52989, 532, 298, 28144, 83535, 9230, 23794, 86, 11, 435, 340, 298, 60439, 27380, 50714, 11, 435, 568, 14099, 4199, 12027, 64109, 23794, 86, 10210, 4579, 39245, 7, 16, 340, 197, 197, 3518, 197, 532, 630, 9612, 300, 10758, 17286, 14, 29870, 4698, 81, 3366, 18008, 4130, 18008, 4130, 4030, 37255, 18002, 69233, 19, 22, 8125, 23, 16, 198, 853, 609, 5475, 515, 197, 79659, 345, 197, 197, 3564, 4703, 19963, 3050, 12805, 11, 65949, 1326, 197, 197, 3564, 29978, 2995, 3050, 12805, 11, 4772, 2959, 11, 609, 3675, 5668, 6570, 22655, 10472, 11, 65949, 
1326, 197, 25873, 345, 197, 532, 630, 322, 19954, 4675, 279, 10130, 7013, 624, 2830, 320, 82, 353, 5475, 8, 19954, 368, 1758, 31010, 341, 7000, 1669, 25798, 7121, 9523, 2822, 7000, 9046, 1255, 11603, 10538, 14099, 1141, 29856, 1171, 7000, 9046, 1255, 11603, 1214, 266, 2251, 10105, 1141, 29856, 4390, 7000, 2234, 4283, 12120, 497, 7013, 38968, 1141, 38107, 3050, 4390, 7000, 23442, 4283, 29172, 14, 38188, 497, 7013, 38968, 1141, 6225, 19963, 3050, 31421, 1171, 7000, 23442, 4283, 22803, 5457, 16, 14, 29172, 4322, 1097, 2995, 497, 274, 63559, 2995, 3050, 31421, 692, 197, 322, 328, 13280, 608, 8349, 87146, 299, 69, 36342, 624, 7000, 1321, 629, 35460, 1758, 13275, 60421, 44, 2200, 692, 853, 435, 198, 630, 2830, 320, 82, 353, 5475, 8, 2820, 3050, 3622, 1758, 37508, 11, 435, 353, 1254, 9659, 8, 1465, 341, 11009, 991, 33655, 3622, 11, 330, 3925, 1138, 853, 2092, 198, 630, 2830, 320, 82, 353, 5475, 8, 32149, 96059, 368, 1465, 341, 6725, 70419, 445, 24617, 13291, 3538, 389, 1018, 82, 497, 274, 5423, 64091, 340, 853, 11446, 83535, 1141, 5423, 64091, 11, 274, 31010, 2398, 630, 9612, 300, 10758, 17286, 14, 29870, 4698, 81, 3366, 30593, 21492, 76196, 4030, 37255, 18002, 69233, 23, 16, 8125, 16, 17, 21, 198, 25873, 284, 2193, 13045, 16273, 2822, 79659, 284, 10472, 23676, 3556, 9147, 14032, 30953, 515, 197, 197, 1, 4352, 788, 330, 4578, 4030, 756, 197, 8824, 853, 609, 5475, 515, 197, 25873, 25, 394, 2193, 345, 197, 79659, 25, 338, 10472, 345, 197, 77446, 1412, 25, 1060, 1182, 1412, 345, 197, 8854, 13298, 61088, 25, 286, 2205, 13298, 61088, 345, 197, 8854, 13298, 2959, 25, 257, 2205, 13298, 2959, 345, 197, 197, 79488, 25, 1797, 18709, 345, 197, 57279, 25, 338, 3553, 345, 197, 197, 1826, 6295, 25, 2290, 1299, 6295, 345, 197, 60439, 18327, 1693, 2043, 25, 4772, 18327, 1693, 2043, 345, 197, 197, 19979, 25, 1060, 9109, 345, 197, 197, 14891, 18190, 25, 1843, 2170, 18190, 345, 197, 532, 630, 322, 19954, 4675, 458, 1758, 31010, 369, 274, 624, 2830, 320, 82, 353, 5475, 8, 19954, 368, 1758, 
31010, 341, 7000, 1669, 25798, 7121, 9523, 2822, 7000, 9046, 1255, 11603, 10538, 14099, 1141, 29856, 1171, 7000, 9046, 1255, 11603, 1214, 266, 2251, 10105, 1141, 29856, 4390, 7000, 2234, 4283, 12120, 497, 7013, 38968, 1141, 38107, 3050, 1171, 7000, 2234, 4283, 878, 1880, 497, 7013, 38968, 1141, 4125, 1880, 3973, 3050, 4390, 7000, 39825, 4283, 14082, 9388, 4578, 4472, 36339, 9388, 36339, 9545, 7013, 38968, 1141, 3597, 5668, 3050, 1171, 7000, 90478, 4283, 14082, 9388, 4578, 9545, 7013, 38968, 1141, 6858, 5668, 3050, 1171, 7000, 2234, 4283, 14082, 9388, 4578, 9545, 7013, 38968, 1141, 59279, 3050, 4390, 7000, 2234, 4283, 81907, 9388, 23476, 4472, 14082, 497, 7013, 38968, 1141, 6420, 4624, 3050, 4390, 7000, 2234, 4283, 1607, 1057, 497, 7013, 38968, 1141, 6420, 3050, 4390, 7000, 23442, 4283, 1826, 6295, 84460, 9388, 4578, 9545, 7013, 38968, 1141, 68225, 48795, 5668, 3050, 4390, 7000, 2234, 4283, 8611, 497, 7013, 38968, 1141, 670, 13298, 3050, 4390, 7000, 23442, 1006, 197, 197, 3115, 10481, 3446, 14070, 81771, 6295, 84460, 9388, 4578, 4472, 36339, 9388, 36339, 24375, 197, 53326, 38968, 1141, 950, 14070, 18327, 48795, 5668, 3050, 4390, 9612, 300, 10758, 17286, 14, 29870, 4698, 81, 3366, 8194, 3183, 11603, 3183, 11603, 4452, 18002, 69233, 16, 15, 18, 8125, 16, 20, 21, 198, 2830, 3393, 2522, 14099, 1155, 353, 8840, 836, 8, 341, 78216, 1669, 3056, 1235, 341, 197, 41653, 1843, 914, 198, 197, 53326, 286, 2915, 19886, 37508, 11, 353, 1254, 9659, 340, 197, 42400, 2522, 914, 198, 197, 59403, 197, 197, 515, 298, 197, 1, 3194, 7013, 14579, 220, 17, 15, 15, 756, 298, 29244, 19886, 37508, 11, 353, 1254, 9659, 8, 14573, 298, 197, 1, 17, 15, 15, 756, 197, 197, 2137, 341, 298, 197, 1, 59079, 1760, 220, 17, 15, 15, 756, 298, 29244, 3622, 1758, 37508, 11, 716, 353, 1254, 9659, 8, 314, 6399, 44747, 3622, 11, 330, 3925, 899, 1153, 298, 197, 1, 17, 15, 15, 756, 197, 197, 2137, 341, 298, 197, 1, 4934, 4247, 756, 298, 29244, 3622, 1758, 37508, 11, 716, 353, 1254, 9659, 8, 314, 289, 69794, 7, 
20, 15, 15, 8, 1153, 298, 197, 1, 20, 15, 15, 756, 197, 197, 2137, 341, 298, 197, 1, 35673, 3270, 4247, 6738, 1172, 10953, 1156, 1618, 756, 298, 29244, 3622, 1758, 37508, 11, 716, 353, 1254, 9659, 8, 314, 289, 69794, 7, 19, 15, 15, 1215, 289, 69794, 7, 20, 15, 15, 8, 1153, 298, 197, 1, 19, 15, 15, 756, 197, 197, 1583, 197, 532, 2023, 8358, 1273, 1669, 2088, 7032, 341, 197, 3244, 16708, 8623, 30514, 11, 2915, 1155, 353, 8840, 836, 8, 341, 298, 17957, 1669, 1373, 7121, 1155, 692, 298, 79659, 1669, 52295, 7121, 2271, 10803, 19814, 2092, 692, 298, 7000, 1669, 25798, 7121, 9523, 741, 298, 7000, 9046, 38866, 14099, 50714, 1171, 298, 7000, 2234, 4283, 7975, 9388, 7975, 9545, 1273, 31171, 692, 298, 53183, 11, 2936, 1669, 1273, 1314, 12101, 5475, 2601, 340, 298, 16867, 2936, 2822, 298, 2023, 600, 1669, 220, 15, 26, 600, 366, 220, 20, 26, 600, 1027, 341, 571, 197, 6878, 1848, 1669, 1758, 2234, 28197, 17305, 445, 1254, 86791, 82, 60555, 10776, 497, 10789, 1171, 571, 17957, 35699, 3964, 340, 298, 197, 630, 298, 17957, 12808, 7, 16, 11, 2422, 50714, 808, 9601, 1005, 2507, 388, 12145, 298, 2023, 8358, 348, 1669, 2088, 10472, 808, 9601, 1005, 2507, 388, 368, 341, 571, 17957, 12808, 8623, 56835, 2522, 11, 348, 2967, 2398, 571, 17957, 12808, 1548, 21, 19, 7, 20, 701, 348, 6167, 2398, 571, 17957, 12808, 9147, 14032, 30953, 515, 464, 197, 1, 32540, 788, 330, 7975, 756, 464, 197, 1, 4393, 788, 256, 330, 3806, 756, 571, 197, 2137, 348, 73522, 2398, 298, 197, 532, 197, 197, 3518, 197, 532, 630, 9612, 300, 10758, 17286, 14, 29870, 4698, 81, 3366, 21902, 7530, 5252, 7530, 5252, 18002, 69233, 15, 8125, 20, 19, 198, 322, 2955, 320, 66, 8, 220, 17, 15, 16, 21, 12, 17, 15, 16, 24, 27612, 24561, 11, 4848, 624, 2289, 322, 10103, 1212, 279, 8914, 1876, 11, 6079, 220, 17, 13, 15, 320, 1782, 330, 9827, 797, 322, 498, 1231, 537, 990, 419, 1034, 3650, 304, 8733, 448, 279, 1876, 624, 322, 1446, 1231, 6851, 264, 2975, 315, 279, 1876, 518, 198, 2289, 322, 28080, 1110, 2136, 5096, 2659, 6971, 10845, 12, 
17, 13, 15, 198, 2289, 322, 10878, 2567, 553, 8415, 2329, 476, 7230, 311, 304, 4378, 11, 3162, 198, 322, 4237, 1212, 279, 1876, 374, 4237, 389, 458, 330, 1911, 3424, 1, 11389, 345, 322, 6007, 7427, 2726, 11342, 3008, 4137, 9297, 11, 2987, 3158, 476, 6131, 624, 322, 3496, 279, 1876, 369, 279, 3151, 4128, 10012, 8541, 323, 198, 322, 9481, 1212, 279, 1876, 624, 1722, 7013, 271, 474, 2399, 197, 21871, 698, 197, 32468, 15627, 1837, 197, 9749, 905, 14, 29870, 14, 9855, 3366, 21902, 19413, 698, 692, 322, 4600, 18653, 458, 10130, 7013, 1465, 892, 42569, 23156, 2639, 323, 7102, 198, 322, 311, 387, 738, 304, 279, 10130, 2033, 624, 1313, 4600, 2036, 341, 23847, 526, 198, 20883, 1758, 15753, 198, 21169, 262, 914, 198, 630, 322, 4600, 69, 11450, 264, 501, 4600, 448, 90908, 11297, 36566, 13, 35990, 311, 220, 20, 15, 15, 1465, 624, 2830, 4600, 69, 20698, 914, 11, 2827, 2503, 4970, 28875, 353, 1454, 341, 853, 609, 1454, 515, 197, 23847, 25, 1758, 66760, 345, 197, 20883, 25, 1758, 15753, 38837, 197, 21169, 25, 262, 8879, 17305, 20698, 11, 2827, 1112, 1326, 197, 532, 630, 322, 4600, 2522, 11450, 458, 4287, 1943, 1465, 448, 2639, 274, 624, 2830, 4600, 2522, 1141, 526, 8, 353, 1454, 341, 853, 4600, 69, 80821, 2522, 1141, 340, 630, 322, 8104, 7289, 264, 2526, 2639, 389, 384, 624, 2830, 320, 68, 353, 1454, 8, 8104, 1141, 526, 8, 353, 1454, 341, 7727, 4299, 284, 274, 198, 853, 384, 198, 630, 322, 12104, 11367, 264, 2526, 4247, 311, 384, 624, 2830, 320, 68, 353, 1454, 8, 12104, 5969, 11, 348, 914, 8, 353, 1454, 341, 7727, 17093, 1904, 5969, 11, 348, 340, 853, 384, 198, 630, 9612, 300, 10758, 17286, 14, 29870, 4698, 81, 3366, 18008, 4130, 14, 29172, 9199, 37255, 18002, 69233, 20, 15, 8125, 24, 17, 198, 322, 32149, 96059, 374, 264, 22188, 1618, 892, 8473, 274, 624, 2830, 320, 82, 353, 5475, 8, 32149, 96059, 368, 1465, 341, 6725, 70419, 445, 24617, 19424, 2812, 3538, 389, 1018, 82, 497, 274, 5423, 64091, 340, 853, 11446, 83535, 1141, 5423, 64091, 11, 274, 31010, 2398, 630, 1313, 16403, 2582, 
2036, 341, 197, 44814, 3056, 917, 1565, 2236, 2974, 81907, 8805, 630, 322, 16403, 3050, 13469, 16403, 1681, 624, 322, 3703, 1110, 14120, 91131, 905, 14, 29172, 45389, 10508, 26559, 25584, 369, 803, 5785, 624, 2830, 320, 82, 353, 5475, 8, 16403, 3050, 3622, 1758, 37508, 11, 435, 353, 1254, 9659, 8, 1465, 341, 8810, 2353, 48, 1669, 330, 77, 698, 40668, 48, 1669, 330, 4259, 1837, 197, 322, 7854, 1681, 369, 1759, 2354, 44265, 624, 2405, 4051, 4772, 2972, 5814, 5632, 198, 10676, 1669, 435, 20893, 198, 18534, 1669, 575, 15685, 741, 2023, 595, 11, 348, 1669, 2088, 2804, 341, 197, 743, 2422, 3747, 8, 961, 220, 16, 341, 298, 853, 7013, 13080, 1006, 571, 197, 1, 11808, 3239, 1018, 82, 7533, 82, 497, 595, 11, 348, 568, 2522, 19886, 69497, 340, 197, 197, 532, 197, 8961, 595, 341, 197, 2722, 3930, 48, 510, 298, 8810, 2353, 2507, 11, 1848, 1669, 33317, 67107, 3747, 58, 15, 2546, 298, 743, 1848, 961, 2092, 341, 571, 853, 7013, 13080, 1006, 464, 197, 1, 11808, 3930, 1018, 82, 25, 1018, 82, 497, 348, 11, 1848, 568, 2522, 19886, 69497, 340, 298, 197, 532, 298, 743, 3930, 2507, 621, 220, 15, 341, 571, 853, 7013, 13080, 1006, 464, 197, 1, 11808, 3930, 1018, 67, 497, 3930, 2507, 568, 2522, 19886, 69497, 340, 298, 197, 532, 298, 50108, 1214, 2353, 284, 3930, 2507, 198, 197, 2722, 4347, 48, 510, 298, 50108, 61958, 284, 348, 58, 15, 921, 197, 11940, 510, 298, 853, 7013, 13080, 445, 11808, 3239, 1018, 82, 497, 595, 568, 2522, 19886, 69497, 340, 197, 197, 532, 197, 3733, 151666, 198, 151645, 198, 151644, 872, 271, 7039, 11, 3118, 389, 279, 3881, 2266, 1995, 11, 2550, 279, 1102, 5961, 438, 2567, 11, 25470, 2701, 279, 2550, 3561, 8502, 2041, 894, 5107, 2550, 13, 9258, 1969, 1191, 448, 366, 76303, 38283, 29, 323, 835, 448, 690, 76303, 38283, 14276, 3155, 537, 1618, 894, 5392, 624, 334, 98743, 1019, 12, 3155, 537, 2550, 2197, 429, 525, 279, 1852, 438, 279, 1482, 2197, 624, 12, 1416, 279, 2390, 374, 1602, 4285, 11, 279, 2197, 5944, 1265, 387, 438, 4285, 438, 3204, 624, 12, 1446, 27732, 5889, 
304, 8453, 7, 104811, 8, 151645, 198, 151644, 77091, 198, 27, 76303, 38283, 397, 515, 220, 330, 3615, 788, 2278, 262, 341, 414, 330, 2102, 788, 330, 28648, 12, 29172, 23904, 756, 414, 330, 606, 788, 330, 35, 13659, 32112, 5333, 756, 414, 330, 37555, 2458, 788, 4383, 8092, 14, 8092, 4030, 37255, 18002, 497, 330, 22803, 14, 29172, 9199, 37255, 18002, 497, 330, 2740, 61764, 29172, 14, 42315, 82, 18002, 497, 330, 2740, 61764, 29172, 3470, 68164, 18002, 497, 330, 2740, 61764, 29172, 29581, 18002, 8097, 414, 330, 40581, 788, 330, 50377, 40549, 32112, 5333, 43589, 100700, 111116, 1773, 65577, 16447, 3366, 69162, 40549, 32112, 10130, 5333, 348, 17, 43589, 100873, 114288, 33071, 101884, 3837, 100630, 100811, 65101, 83751, 72225, 5373, 105713, 39352, 5373, 35112, 6567, 36548, 105679, 10130, 10236, 104, 107, 27442, 1773, 66833, 41932, 608, 85, 17, 14, 51461, 117, 78882, 27442, 5373, 14, 85, 17, 19632, 26539, 74866, 106, 23656, 78882, 27442, 5373, 14, 85, 17, 9388, 606, 4472, 42315, 82, 9388, 16291, 92, 97259, 227, 23990, 78882, 27442, 5373, 14, 85, 17, 9388, 606, 4472, 38145, 1279, 9388, 36339, 92, 23404, 10236, 104, 107, 27442, 5373, 14, 85, 17, 9388, 606, 4472, 38145, 1279, 29581, 14, 220, 52526, 78882, 27442, 9370, 101884, 1773, 66394, 103991, 78882, 27442, 9370, 10130, 81454, 9909, 3806, 5373, 6221, 5373, 2946, 5373, 14424, 5373, 35433, 64359, 3144, 6567, 44401, 28330, 5373, 34859, 14, 102808, 68805, 5373, 44091, 16476, 33108, 32100, 54542, 1773, 100630, 104510, 100674, 5373, 43815, 31905, 105493, 5373, 17177, 99922, 52526, 5373, 63789, 27442, 99448, 41683, 49567, 104112, 105539, 1773, 99553, 104621, 37029, 19793, 26355, 3837, 101987, 100007, 67338, 100142, 40549, 41479, 95, 17523, 78882, 57218, 16447, 3366, 220, 108221, 63276, 414, 330, 5864, 26564, 788, 330, 35, 13659, 32112, 5333, 54851, 105168, 9370, 107736, 101931, 3837, 43815, 101162, 102024, 100136, 100166, 104542, 3837, 102298, 101213, 108247, 78882, 27442, 101884, 3837, 77288, 71268, 106466, 103967, 9370, 25414, 
18137, 96, 236, 33983, 3837, 104964, 46944, 111116, 15946, 100873, 53481, 55338, 78882, 27442, 3837, 106431, 100642, 101348, 17177, 63276, 414, 330, 4648, 31206, 788, 330, 8996, 698, 262, 1153, 262, 341, 414, 330, 2102, 788, 330, 2245, 41387, 71312, 23904, 756, 414, 330, 606, 788, 330, 110195, 17881, 10130, 5333, 756, 414, 330, 37555, 2458, 788, 4383, 8092, 14, 8092, 4030, 37255, 18002, 497, 330, 8611, 34827, 4030, 37255, 18002, 497, 330, 50395, 14, 13131, 388, 2836, 37255, 18002, 497, 330, 5834, 21492, 76196, 4030, 37255, 18002, 497, 330, 22803, 18008, 4130, 4030, 37255, 18002, 8097, 414, 330, 40581, 788, 330, 50377, 16447, 3366, 38433, 226, 110195, 17881, 10130, 5333, 43589, 100873, 111116, 1773, 100700, 65577, 20713, 5373, 13298, 5373, 31133, 5373, 16219, 5373, 11066, 12, 1552, 44054, 93437, 104186, 101979, 104516, 107736, 1773, 100630, 20713, 43589, 608, 12120, 5373, 14, 65512, 5373, 14, 12885, 10236, 104, 107, 27442, 24968, 13298, 43589, 608, 12120, 5373, 14, 38145, 1279, 5373, 14, 6120, 10236, 104, 107, 27442, 24968, 31133, 43589, 608, 65512, 5373, 14, 2388, 19842, 5373, 14, 4059, 466, 824, 10236, 104, 107, 27442, 24968, 11066, 12, 1552, 43589, 608, 14082, 5373, 14, 81907, 5373, 14, 1826, 6295, 10236, 104, 107, 27442, 24968, 16219, 43589, 608, 29172, 14, 38188, 5373, 14, 22803, 5457, 16, 14, 29172, 4322, 1097, 2995, 10236, 104, 107, 27442, 1773, 32664, 103991, 5333, 220, 66833, 41932, 10130, 81454, 5373, 3144, 8908, 115, 107, 66569, 5373, 34859, 32665, 5373, 102808, 68805, 5373, 32100, 16476, 33108, 37029, 102122, 1773, 66394, 110195, 106588, 47872, 11622, 100145, 5373, 20074, 114442, 33108, 32100, 54542, 100674, 1773, 99553, 5333, 8908, 108, 225, 11622, 19793, 26355, 33108, 102705, 105866, 63276, 414, 330, 5864, 26564, 788, 330, 110195, 17881, 10130, 5333, 6567, 114, 231, 81217, 101213, 101970, 110195, 3837, 103991, 110195, 118755, 105071, 5333, 10236, 104, 107, 27442, 33108, 104559, 3837, 85106, 59879, 110195, 71817, 101348, 17177, 5122, 16810, 5333, 5373, 
13298, 5333, 5373, 31133, 5333, 5373, 11066, 12, 1552, 5333, 5373, 16219, 5333, 3837, 105920, 100700, 66394, 103991, 110195, 106708, 107736, 63276, 414, 330, 4648, 31206, 788, 330, 14004, 756, 414, 330, 5864, 788, 2278, 286, 341, 688, 330, 2102, 788, 330, 8092, 71312, 23904, 756, 688, 330, 606, 788, 330, 16810, 10130, 5333, 756, 688, 330, 37555, 2458, 788, 4383, 8092, 14, 8092, 4030, 37255, 18002, 497, 330, 8092, 14, 8092, 2972, 25085, 18002, 8097, 688, 330, 40581, 788, 330, 100700, 65577, 20713, 44054, 93437, 9370, 10130, 5333, 46602, 98, 39426, 1773, 100630, 99722, 101071, 78882, 27442, 608, 12120, 5373, 47, 17, 47, 41479, 96, 51827, 78882, 27442, 608, 65512, 5373, 20074, 62189, 78882, 27442, 608, 12885, 5373, 44091, 51154, 78882, 27442, 49567, 1773, 66394, 103991, 78882, 27442, 106708, 101884, 3837, 100630, 10130, 81454, 5373, 34859, 32665, 5373, 102808, 68805, 5373, 32100, 54542, 1773, 66833, 41932, 20713, 69372, 98749, 67338, 100001, 5333, 220, 106961, 110195, 108221, 3837, 100630, 69041, 40179, 62579, 49026, 5373, 45181, 17116, 58143, 92894, 20713, 39095, 27366, 20074, 1773, 99553, 5333, 8908, 108, 225, 11622, 19793, 26355, 3837, 101987, 104621, 100007, 57218, 20713, 220, 108221, 1773, 65577, 20713, 64388, 117, 99996, 32665, 3837, 29524, 14397, 3034, 5373, 78882, 39426, 85767, 5373, 62189, 104238, 49567, 63276, 688, 330, 5864, 26564, 788, 330, 16810, 10130, 5333, 54851, 106215, 110195, 9370, 107736, 3837, 98380, 101162, 101096, 3837, 99558, 102074, 393, 17, 47, 39095, 27366, 33108, 99722, 101071, 3837, 43815, 16530, 102181, 3837, 46944, 111116, 106131, 102994, 55338, 78882, 27442, 3837, 106431, 100642, 101348, 17177, 63276, 688, 330, 4648, 31206, 788, 330, 8996, 698, 286, 1153, 286, 341, 688, 330, 2102, 788, 330, 8611, 71312, 23904, 756, 688, 330, 606, 788, 330, 13298, 10130, 5333, 756, 688, 330, 37555, 2458, 788, 4383, 8611, 34827, 4030, 37255, 18002, 497, 330, 8611, 34827, 2972, 25085, 18002, 8097, 688, 330, 40581, 788, 330, 100700, 65577, 17116, 44054, 
93437, 9370, 10130, 5333, 46602, 98, 39426, 1773, 100630, 99722, 101071, 78882, 27442, 608, 12120, 5373, 35112, 53497, 246, 99871, 78882, 27442, 608, 38145, 1279, 9388, 36339, 92, 5373, 52526, 54542, 78882, 27442, 608, 6120, 5373, 103414, 39352, 78882, 27442, 49567, 1773, 66394, 103991, 78882, 27442, 106708, 101884, 3837, 100630, 10130, 81454, 5373, 34859, 32665, 5373, 102808, 68805, 5373, 32100, 54542, 1773, 66833, 41932, 17116, 69372, 98749, 100622, 105975, 92374, 99553, 20074, 47874, 3837, 100630, 23404, 53497, 246, 99871, 5373, 98671, 99658, 86312, 39352, 5373, 118878, 107101, 1773, 99553, 5333, 8908, 108, 225, 11622, 19793, 26355, 3837, 101987, 100007, 52526, 33108, 62189, 23404, 62262, 1773, 65577, 17116, 64388, 117, 99996, 32665, 3837, 29524, 105653, 33447, 78882, 85767, 5373, 103414, 100786, 102498, 5373, 105173, 104238, 49567, 63276, 688, 330, 5864, 26564, 788, 330, 13298, 10130, 5333, 54851, 106215, 110195, 9370, 107736, 3837, 99558, 102074, 23404, 53497, 246, 99871, 33108, 103414, 39352, 3837, 98380, 101162, 102024, 3837, 46944, 111116, 73670, 100873, 53481, 55338, 78556, 78882, 27442, 3837, 106431, 100642, 101348, 17177, 63276, 688, 330, 4648, 31206, 788, 330, 8996, 698, 286, 1153, 286, 341, 688, 330, 2102, 788, 330, 50395, 71312, 23904, 756, 688, 330, 606, 788, 330, 31133, 10130, 5333, 756, 688, 330, 37555, 2458, 788, 4383, 50395, 14, 13131, 388, 2836, 37255, 18002, 497, 330, 50395, 14, 65512, 2972, 25085, 18002, 497, 330, 50395, 90228, 466, 69, 509, 1451, 25085, 18002, 8097, 688, 330, 40581, 788, 330, 100700, 65577, 40179, 44054, 93437, 9370, 10130, 5333, 46602, 98, 39426, 1773, 100630, 99722, 101071, 78882, 27442, 608, 12120, 5373, 47, 17, 47, 41479, 96, 51827, 78882, 27442, 608, 65512, 5373, 32664, 49567, 92374, 99879, 78882, 27442, 608, 2388, 19842, 5373, 23305, 110042, 78882, 27442, 608, 4059, 466, 824, 5373, 100787, 27369, 78882, 27442, 49567, 1773, 66394, 103991, 78882, 27442, 106708, 101884, 3837, 100630, 10130, 81454, 5373, 34859, 32665, 5373, 
102808, 68805, 5373, 32100, 54542, 1773, 66833, 41932, 40179, 69372, 98749, 102020, 393, 17, 47, 10236, 121, 239, 68065, 3837, 100630, 32664, 49567, 92374, 39352, 5373, 64064, 17177, 28291, 5373, 71356, 100786, 102498, 101999, 1773, 99553, 5333, 8908, 108, 225, 11622, 19793, 26355, 3837, 101987, 20713, 69372, 98749, 67338, 40179, 69425, 46451, 92894, 32664, 49567, 92374, 1773, 65577, 40179, 64388, 117, 99996, 32665, 3837, 29524, 32664, 49567, 92374, 104238, 5373, 71356, 85767, 5373, 102111, 104118, 49567, 63276, 688, 330, 5864, 26564, 788, 330, 31133, 10130, 5333, 54851, 106215, 110195, 9370, 107736, 3837, 99558, 102074, 393, 17, 47, 66521, 237, 47872, 33108, 32664, 49567, 92374, 39352, 3837, 98380, 101162, 101096, 3837, 46944, 111116, 73670, 102994, 55338, 78556, 78882, 27442, 3837, 106431, 100642, 101348, 17177, 63276, 688, 330, 4648, 31206, 788, 330, 8996, 698, 286, 1153, 286, 341, 688, 330, 2102, 788, 330, 5834, 21492, 71312, 23904, 756, 688, 330, 606, 788, 330, 11066, 12, 1552, 10130, 5333, 756, 688, 330, 37555, 2458, 788, 4383, 5834, 21492, 76196, 4030, 37255, 18002, 497, 330, 5834, 21492, 76196, 2972, 25085, 18002, 8097, 688, 330, 40581, 788, 330, 100700, 65577, 7854, 12, 1552, 44054, 93437, 9370, 10130, 5333, 46602, 98, 39426, 1773, 100630, 99722, 101071, 78882, 27442, 608, 12120, 5373, 80158, 100829, 101071, 78882, 27442, 608, 878, 1880, 5373, 105151, 39352, 78882, 27442, 608, 14082, 9388, 4578, 92, 5373, 106871, 44177, 78882, 27442, 608, 81907, 9388, 23476, 4472, 14082, 5373, 105710, 105173, 78882, 27442, 608, 1826, 6295, 84460, 9388, 4578, 92, 5373, 13298, 80528, 78882, 27442, 608, 8611, 10236, 255, 231, 1773, 66394, 103991, 78882, 27442, 106708, 101884, 3837, 100630, 10130, 81454, 5373, 34859, 32665, 5373, 102808, 68805, 5373, 32100, 54542, 1773, 66833, 41932, 7854, 12, 1552, 69372, 98749, 39352, 105151, 26939, 106208, 9370, 100261, 99759, 3837, 100630, 99960, 103414, 105173, 5373, 105537, 106637, 5373, 105151, 104738, 1773, 99553, 5333, 8908, 108, 225, 
11622, 19793, 26355, 3837, 101987, 100007, 51154, 33108, 50007, 105151, 27369, 1773, 65577, 7854, 12, 1552, 64388, 117, 99996, 32665, 3837, 29524, 105173, 104190, 5373, 105537, 104238, 5373, 105653, 85767, 49567, 63276, 688, 330, 5864, 26564, 788, 330, 11066, 12, 1552, 10130, 5333, 54851, 106215, 110195, 9370, 107736, 3837, 99558, 102074, 105151, 108069, 99960, 103414, 105173, 3837, 98380, 101162, 102024, 3837, 46944, 111116, 73670, 100873, 53481, 55338, 78556, 78882, 27442, 3837, 106431, 100642, 101348, 17177, 63276, 688, 330, 4648, 31206, 788, 330, 8996, 698, 286, 1153, 286, 341, 688, 330, 2102, 788, 330, 22803, 71312, 23904, 756, 688, 330, 606, 788, 330, 16219, 10130, 5333, 756, 688, 330, 37555, 2458, 788, 4383, 22803, 18008, 4130, 4030, 37255, 18002, 497, 330, 22803, 14, 29172, 9199, 37255, 18002, 8097, 688, 330, 40581, 788, 330, 100700, 65577, 32778, 44054, 93437, 9370, 10130, 5333, 46602, 98, 39426, 1773, 100630, 99722, 101071, 78882, 27442, 608, 12120, 5373, 61689, 20742, 101151, 78882, 27442, 608, 29172, 14, 38188, 5373, 98841, 18158, 78882, 27442, 608, 22803, 5457, 16, 14, 29172, 4322, 1097, 2995, 5373, 106130, 100795, 78882, 27442, 608, 85, 17, 19632, 26539, 10236, 255, 231, 1773, 66394, 103991, 78882, 27442, 106708, 101884, 3837, 100630, 10130, 81454, 5373, 34859, 32665, 5373, 102808, 68805, 5373, 32100, 54542, 1773, 66833, 41932, 32778, 69372, 98749, 54542, 100811, 65101, 52526, 33108, 98841, 99259, 3837, 100630, 57621, 54542, 5373, 98841, 18158, 104238, 5373, 116817, 104190, 1773, 99553, 5333, 8908, 108, 225, 11622, 19793, 26355, 3837, 101987, 100007, 85767, 33108, 37029, 32778, 220, 47874, 1773, 65577, 32778, 64388, 117, 99996, 32665, 3837, 29524, 52526, 104238, 5373, 98841, 99259, 85767, 5373, 57621, 108459, 49567, 63276, 688, 330, 5864, 26564, 788, 330, 16219, 10130, 5333, 54851, 106215, 110195, 9370, 107736, 3837, 99558, 102074, 100811, 65101, 52526, 101259, 33108, 98841, 99259, 98380, 3837, 98380, 101162, 101096, 3837, 46944, 111116, 73670, 
102994, 55338, 78556, 78882, 27442, 3837, 106431, 100642, 101348, 17177, 63276, 688, 330, 4648, 31206, 788, 330, 8996, 698, 286, 456, 414, 5133, 262, 1153, 262, 341, 414, 330, 2102, 788, 330, 79, 17, 79, 12, 17014, 23904, 756, 414, 330, 606, 788, 330, 47, 17, 47, 66521, 237, 96422, 107736, 756, 414, 330, 37555, 2458, 788, 4383, 15110, 4322, 17, 79, 4322, 17, 79, 57322, 497, 330, 2740, 5523, 48709, 2687, 15222, 14, 5148, 14, 5148, 18002, 497, 330, 2740, 5523, 48709, 2687, 15222, 14, 5148, 7530, 437, 927, 4407, 18002, 497, 330, 2740, 5523, 48709, 2687, 15222, 14, 5148, 50624, 18002, 8097, 414, 330, 40581, 788, 330, 50377, 393, 17, 47, 66521, 237, 96422, 107736, 9370, 100700, 111116, 1773, 104210, 18433, 4322, 17, 79, 4322, 17, 79, 57322, 41479, 248, 64559, 3837, 66833, 41932, 16447, 3366, 393, 17, 47, 10236, 121, 239, 68065, 105778, 68805, 5373, 64064, 101136, 5373, 118376, 102054, 33108, 20074, 107468, 100674, 1773, 100630, 101136, 118098, 23836, 64205, 91282, 5373, 40820, 41299, 43316, 20074, 116226, 68805, 5373, 64064, 100641, 33108, 101999, 102054, 1773, 66394, 8536, 29661, 5373, 8344, 2566, 5373, 12116, 5373, 1900, 5373, 31209, 5373, 9269, 10236, 255, 231, 64205, 109963, 100166, 33108, 105795, 1773, 66833, 41932, 32664, 49567, 92374, 106588, 104516, 101136, 3837, 100630, 64064, 44091, 39352, 5373, 20074, 34859, 104238, 5373, 104242, 100359, 100674, 1773, 99553, 101136, 101884, 19793, 26355, 3837, 101987, 100007, 100641, 393, 17, 47, 32181, 252, 29077, 33108, 107468, 20074, 1773, 65577, 101136, 71109, 39352, 5373, 114288, 33071, 101882, 33108, 99464, 101118, 63276, 414, 330, 5864, 26564, 788, 330, 47, 17, 47, 66521, 237, 96422, 107736, 20412, 104210, 69634, 41479, 248, 64559, 9370, 40820, 41299, 43316, 101136, 3837, 43815, 101162, 102024, 100136, 100166, 32108, 3837, 99558, 102074, 64205, 68805, 33108, 104516, 102054, 3837, 46944, 111116, 73670, 100873, 53481, 101136, 101931, 3837, 106431, 100642, 101348, 17177, 63276, 414, 330, 4648, 31206, 788, 330, 8996, 698, 
262, 1153, 262, 341, 414, 330, 2102, 788, 330, 12120, 54785, 23904, 756, 414, 330, 606, 788, 330, 99722, 101071, 57218, 39352, 5333, 756, 414, 330, 37555, 2458, 788, 4383, 2740, 14, 12120, 2028, 14, 32225, 18002, 497, 330, 2740, 14, 12120, 2028, 46619, 261, 18002, 497, 330, 6031, 7530, 5252, 7530, 5252, 18002, 497, 330, 2740, 3183, 11603, 3183, 11603, 18002, 8097, 414, 330, 40581, 788, 330, 50377, 99722, 101071, 57218, 39352, 5333, 43589, 100700, 111116, 1773, 65577, 55338, 110195, 9370, 99722, 101071, 78882, 27442, 3837, 100630, 105266, 33071, 101071, 608, 12120, 5373, 80158, 100829, 33071, 101071, 608, 878, 1880, 5373, 104118, 107047, 107736, 608, 43262, 5373, 110760, 78882, 27442, 608, 8349, 87146, 299, 69, 10236, 255, 231, 1773, 66394, 99722, 101071, 9370, 101884, 100674, 3837, 100630, 101966, 101071, 5373, 107285, 101071, 5373, 108459, 31548, 33108, 104814, 104238, 1773, 66833, 41932, 39352, 5333, 43589, 98380, 3837, 100630, 104299, 85767, 50007, 5373, 44091, 51154, 5373, 113308, 40090, 107736, 1773, 99553, 104814, 102705, 105866, 3837, 101987, 100007, 57218, 97137, 5373, 44, 18, 10236, 255, 231, 104814, 72448, 102705, 1773, 65577, 32100, 54542, 5373, 71304, 13343, 85767, 5373, 29258, 41321, 104238, 49567, 113308, 78556, 32665, 1773, 100630, 105716, 105853, 33108, 102111, 47872, 90172, 9370, 5333, 85658, 39907, 63276, 414, 330, 5864, 26564, 788, 330, 99722, 101071, 57218, 39352, 5333, 6567, 114, 113, 99758, 34187, 105743, 104814, 5373, 113308, 33108, 39352, 98380, 3837, 43815, 101162, 103967, 3837, 99558, 102074, 99722, 44091, 33108, 72448, 39352, 3837, 46944, 111116, 73670, 100873, 53481, 55338, 78556, 107736, 3837, 106431, 100642, 101348, 17177, 63276, 414, 330, 4648, 31206, 788, 330, 8996, 698, 262, 456, 220, 5133, 532, 522, 76303, 38283, 29, 151645, 198], "loss_mask": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]} diff --git a/SpecForge-ext/tests/test_layers/__init__.py b/SpecForge-ext/tests/test_layers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/SpecForge-ext/tests/test_layers/test_decoder.py b/SpecForge-ext/tests/test_layers/test_decoder.py new file mode 100644 index 0000000000000000000000000000000000000000..aca4f3c2444da6df02f9bbc49568c2d8dc048858 --- /dev/null +++ b/SpecForge-ext/tests/test_layers/test_decoder.py @@ -0,0 +1,251 @@ +import os +import unittest + +import torch +import torch.multiprocessing as mp +from accelerate.utils import set_seed +from torch import nn +from transformers import PretrainedConfig +from yunchang import EXTRACT_FUNC_DICT + +# Project-specific imports +from specforge.distributed import destroy_distributed, init_distributed +from specforge.modeling.draft.llama3_eagle import LlamaDecoderLayer +from specforge.utils import padding +from tests.utils import get_available_port + + +def get_model_config(): + """Create and return the model configuration.""" + config_dict = { + "architectures": ["LlamaForCausalLMEagle3"], + "eagle_config": { + "eagle_aux_hidden_state_layer_ids": [1, 29, 57], + "use_aux_hidden_state": True, + }, + "bos_token_id": 128000, + "eos_token_id": 128001, + "hidden_act": "silu", + "hidden_size": 7168, + "initializer_range": 0.02, 
+ "intermediate_size": 29568, + "max_position_embeddings": 32768, + "model_type": "llama", + "num_attention_heads": 32, + "num_key_value_heads": 8, + "num_hidden_layers": 1, + "pad_token_id": 0, + "rms_norm_eps": 1e-05, + "tie_word_embeddings": False, + "torch_dtype": "float16", + "transformers_version": "4.28.1", + "use_cache": True, + "rope_scaling": None, + "vocab_size": 129280, + "draft_vocab_size": 32000, + "pretraining_tp": 1, + } + return PretrainedConfig.from_dict(config_dict) + + +def setup_env(rank, world_size, port): + """Set up distributed environment variables.""" + os.environ["RANK"] = str(rank) + os.environ["WORLD_SIZE"] = str(world_size) + os.environ["MASTER_ADDR"] = "localhost" + os.environ["MASTER_PORT"] = str(port) + torch.cuda.set_device(rank) + + +def run_iterative_pass( + decoder_layer, + embed_tokens, + input_ids, + hidden_states, + attention_mask, + position_ids, + ttt_length, +): + """ + Core loop: execute the forward pass `ttt_length` times. + Used for both Golden (SDPA) and Distributed (USP) runs to ensure logic consistency. + """ + # Clone to avoid side effects on original tensors + curr_input_ids = input_ids.clone() + curr_hidden_states = hidden_states.clone() + + # Init cache + cache_hidden = [[], []] + past_key_values = None + final_output = None + + for idx in range(ttt_length): + is_last = idx == ttt_length - 1 + + # 1. Embed inputs + inputs_embeds = embed_tokens(curr_input_ids).to(curr_hidden_states.dtype) + + # 2. Forward pass + output_hidden_states = decoder_layer( + input_emb=inputs_embeds, + hidden_states=curr_hidden_states, + cache_hidden=cache_hidden, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + output_attentions=False, + use_cache=False, + ) + + # Update states for next iteration + curr_hidden_states = output_hidden_states + final_output = output_hidden_states + + # 3. 
Simulate TTT padding/shift + if not is_last: + curr_input_ids = padding(curr_input_ids, left=False) + + return final_output + + +def run_test_case(rank, world_size, port): + """Worker function executed in each process.""" + setup_env(rank, world_size, port) + device = torch.device(f"cuda:{rank}") + set_seed(42) + + # --- Data & Config Preparation --- + config = get_model_config() + seq_len = 1560 + batch_size = 1 + ttt_length = 3 + + # Generate dummy data on GPU + data_input_ids = torch.randint(0, 10000, (batch_size, seq_len), device=device) + data_hidden_states = torch.randn( + batch_size, seq_len, config.hidden_size, device=device, dtype=torch.bfloat16 + ) + attention_mask = torch.tril(torch.ones(seq_len, seq_len, device=device)).view( + 1, 1, seq_len, seq_len + ) + position_ids = torch.arange(seq_len, device=device).unsqueeze(0) + + # Shared embedding layer + embed_tokens = nn.Embedding( + config.vocab_size, config.hidden_size, config.pad_token_id + ).to(device) + + # --- Phase 1: Golden Run (SDPA) --- + # Init dist briefly for internal checks, even if running single-device logic + init_distributed(tp_size=1, sp_ulysses_size=1, sp_ring_size=1) + + sdpa_decoder = ( + LlamaDecoderLayer(config, attention_backend="fa").to(device).to(torch.bfloat16) + ) + + with torch.no_grad(): + sdpa_output = run_iterative_pass( + decoder_layer=sdpa_decoder, + embed_tokens=embed_tokens, + input_ids=data_input_ids, + hidden_states=data_hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + ttt_length=ttt_length, + ) + + # Save weights for alignment and cleanup SDPA model + state_dict = sdpa_decoder.state_dict() + del sdpa_decoder + destroy_distributed() + + # --- Phase 2: Distributed Run (USP) --- + def subtest_usp(sp_ulysses_degree, sp_ring_degree): + """Run USP with specific topology and compare against Golden.""" + try: + init_distributed( + tp_size=1, + sp_ulysses_size=sp_ulysses_degree, + sp_ring_size=sp_ring_degree, + ) + + # Init USP model and load 
golden weights + usp_decoder = ( + LlamaDecoderLayer(config, attention_backend="usp") + .to(device) + .to(torch.bfloat16) + ) + usp_decoder.load_state_dict(state_dict) + + # Shard data (Split Input) + extract_func = EXTRACT_FUNC_DICT["basic"] + + local_input_ids = ( + extract_func( + data_input_ids, + rank, + world_size=world_size, + rd=sp_ring_degree, + ud=sp_ulysses_degree, + ) + .detach() + .clone() + ) + + local_hidden_states = ( + extract_func( + data_hidden_states, + rank, + world_size=world_size, + rd=sp_ring_degree, + ud=sp_ulysses_degree, + ) + .detach() + .clone() + ) + + # Run USP forward + with torch.no_grad(): + usp_output = run_iterative_pass( + decoder_layer=usp_decoder, + embed_tokens=embed_tokens, + input_ids=local_input_ids, + hidden_states=local_hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + ttt_length=ttt_length, + ) + + # Verify results + # Slice the golden output to match the current rank's chunk + total_degree = sp_ring_degree * sp_ulysses_degree + chunk_size = sdpa_output.shape[1] // total_degree + start_idx = (rank % total_degree) * chunk_size + end_idx = start_idx + chunk_size + + golden_chunk = sdpa_output[:, start_idx:end_idx, :] + + assert torch.allclose(usp_output, golden_chunk, rtol=2e-2, atol=2e-2), ( + f"[Rank {rank}] USP (U{sp_ulysses_degree}R{sp_ring_degree}) mismatch!\n" + f"Max Diff: {(usp_output - golden_chunk).abs().max().item()}" + ) + + finally: + destroy_distributed() + + # Case 1: Hybrid (Ulysses=2, Ring=1) + subtest_usp(sp_ulysses_degree=2, sp_ring_degree=1) + + # Case 2: Hybrid (Ulysses=1, Ring=2) + subtest_usp(sp_ulysses_degree=1, sp_ring_degree=2) + + +class TestTTTDistributed(unittest.TestCase): + def test_llama_usp_decoder(self): + world_size = 2 + port = get_available_port() + mp.spawn(run_test_case, nprocs=world_size, args=(world_size, port)) + + +if __name__ == "__main__": + unittest.main() diff --git a/SpecForge-ext/tests/test_layers/test_embedding.py 
b/SpecForge-ext/tests/test_layers/test_embedding.py new file mode 100644 index 0000000000000000000000000000000000000000..fc16cbcbbc95cda35d68be50797e12c5b3a4c9de --- /dev/null +++ b/SpecForge-ext/tests/test_layers/test_embedding.py @@ -0,0 +1,75 @@ +import os +import unittest + +import torch +import torch.distributed as dist +import torch.multiprocessing as mp +from accelerate.utils import set_seed + +from specforge.distributed import init_distributed +from specforge.layers import VocabParallelEmbedding +from tests.utils import get_available_port + + +def run_embedding(rank, world_size, port): + os.environ["RANK"] = str(rank) + os.environ["WORLD_SIZE"] = str(world_size) + os.environ["MASTER_ADDR"] = "localhost" + os.environ["MASTER_PORT"] = str(port) + init_distributed(tp_size=world_size) + set_seed(42) + + # =============================== + # Case 1: vocab size is divisible by the TP size + # =============================== + # create layers + data = torch.randint(0, 512, (1, 128)).long().cuda() + native_embedding = torch.nn.Embedding(512, 256, padding_idx=314).cuda() + sf_embedding = VocabParallelEmbedding(512, 256, padding_idx=314).cuda() + sf_embedding.load_state_dict(native_embedding.state_dict()) + + # forward + native_output = native_embedding(data) + sf_output = sf_embedding(data) + + # check + assert torch.allclose( + native_output, sf_output, rtol=1e-5, atol=1e-5 + ), f"native_output: \n{native_output}, \nsf_output: \n{sf_output}" + + # =============================== + # Case 2: vocab size is NOT divisible by the TP size + # =============================== + # create layers + data = torch.randint(0, 355, (1, 128)).long().cuda() + native_embedding = torch.nn.Embedding(355, 256, padding_idx=314).cuda() + sf_embedding = VocabParallelEmbedding(355, 256, padding_idx=314).cuda() + sf_embedding.load_state_dict(native_embedding.state_dict()) + + # forward + native_output = native_embedding(data) + sf_output = sf_embedding(data) + + # check + assert 
torch.allclose( + native_output, sf_output, rtol=1e-5, atol=1e-5 + ), f"native_output: \n{native_output}, \nsf_output: \n{sf_output}" + + dist.destroy_process_group() + + +class TestEmbedding(unittest.TestCase): + + def test_embedding(self): + port = get_available_port() + mp.spawn(run_embedding, nprocs=2, args=(2, port)) + + port = get_available_port() + mp.spawn(run_embedding, nprocs=1, args=(1, port)) + + +if __name__ == "__main__": + suite = unittest.TestSuite() + suite.addTest(unittest.makeSuite(TestEmbedding)) + runner = unittest.TextTestRunner(verbosity=2) + runner.run(suite) diff --git a/SpecForge-ext/tests/test_layers/test_linear.py b/SpecForge-ext/tests/test_layers/test_linear.py new file mode 100644 index 0000000000000000000000000000000000000000..5ec5706958c6a03fe341f142f9139084583a1b66 --- /dev/null +++ b/SpecForge-ext/tests/test_layers/test_linear.py @@ -0,0 +1,147 @@ +import os +import unittest + +import torch +import torch.distributed as dist +import torch.multiprocessing as mp +from accelerate.utils import set_seed + +from specforge.distributed import gather_tensor, get_tp_group, init_distributed +from specforge.layers import ColumnParallelLinear, RowParallelLinear +from tests.utils import get_available_port + + +def run_column_parallel_linear(rank, world_size, port): + os.environ["RANK"] = str(rank) + os.environ["WORLD_SIZE"] = str(world_size) + os.environ["MASTER_ADDR"] = "localhost" + os.environ["MASTER_PORT"] = str(port) + init_distributed(tp_size=world_size) + set_seed(42) + + # =============================== + # Case 1: normal layout + # =============================== + # create data + data = torch.rand(1, 256).cuda() + + # create layers + native_linear = torch.nn.Linear(256, 512).cuda() + sf_linear = ColumnParallelLinear(256, 512, layout_type="normal").cuda() + sf_linear.load_state_dict(native_linear.state_dict()) + + # forward + native_output = native_linear(data) + sf_output = sf_linear(data) + full_sf_output = gather_tensor(sf_output, 
get_tp_group()) + + # check + assert torch.allclose( + native_output, full_sf_output, rtol=1e-5, atol=1e-5 + ), f"native_output: \n{native_output}, \nsf_output: \n{sf_output}" + + # =============================== + # Case 2: merged QKV layout + # =============================== + # create data + data = torch.rand(1, 256 * 3).cuda() + + # create layers + native_linear = torch.nn.Linear(256 * 3, 512).cuda() + sf_linear = ColumnParallelLinear(256 * 3, 512, layout_type="merged_qkv").cuda() + sf_linear.load_state_dict(native_linear.state_dict()) + + # forward + q, k, v = native_linear(data).chunk(3, dim=1) + sf_q, sf_k, sf_v = sf_linear(data).chunk(3, dim=1) + full_sf_q = gather_tensor(sf_q, get_tp_group()) + full_sf_k = gather_tensor(sf_k, get_tp_group()) + full_sf_v = gather_tensor(sf_v, get_tp_group()) + + # check + assert torch.allclose( + q, full_sf_q, rtol=1e-5, atol=1e-5 + ), f"q: \n{q}, \nfull_sf_q: \n{full_sf_q}" + assert torch.allclose( + k, full_sf_k, rtol=1e-5, atol=1e-5 + ), f"k: \n{k}, \nfull_sf_k: \n{full_sf_k}" + assert torch.allclose( + v, full_sf_v, rtol=1e-5, atol=1e-5 + ), f"v: \n{v}, \nfull_sf_v: \n{full_sf_v}" + + # =============================== + # Case 3: gate_up layout + # =============================== + # create data + data = torch.rand(1, 256 * 2).cuda() + + # create layers + native_linear = torch.nn.Linear(256 * 2, 512).cuda() + sf_linear = ColumnParallelLinear(256 * 2, 512, layout_type="gate_up").cuda() + sf_linear.load_state_dict(native_linear.state_dict()) + + # forward + gate, up = native_linear(data).chunk(2, dim=1) + sf_gate, sf_up = sf_linear(data).chunk(2, dim=1) + full_sf_gate = gather_tensor(sf_gate, get_tp_group()) + full_sf_up = gather_tensor(sf_up, get_tp_group()) + + # check + assert torch.allclose( + gate, full_sf_gate, rtol=1e-5, atol=1e-5 + ), f"gate: \n{gate}, \nfull_sf_gate: \n{full_sf_gate}" + assert torch.allclose( + up, full_sf_up, rtol=1e-5, atol=1e-5 + ), f"up: \n{up}, \nfull_sf_up: \n{full_sf_up}" + + 
dist.destroy_process_group() + + +def run_row_parallel_linear(rank, world_size, port): + os.environ["RANK"] = str(rank) + os.environ["WORLD_SIZE"] = str(world_size) + os.environ["MASTER_ADDR"] = "localhost" + os.environ["MASTER_PORT"] = str(port) + init_distributed(tp_size=world_size) + set_seed(42) + + # =============================== + # Case 1: normal layout + # the data in an parallel input, i.g. + # Y = AllReduce(X_i * W_i) + # =============================== + # create data + data = torch.rand(1, 256).cuda() + + # create layers + native_linear = torch.nn.Linear(256, 512).cuda() + sf_linear = RowParallelLinear(256, 512, layout_type="normal").cuda() + sf_linear.load_state_dict(native_linear.state_dict()) + + # forward + native_output = native_linear(data) + sf_output = sf_linear(data.chunk(world_size, dim=0)[rank]) + dist.all_reduce(sf_output, op=dist.ReduceOp.SUM, group=get_tp_group()) + + # check + assert torch.allclose( + native_output, sf_output, rtol=1e-5, atol=1e-5 + ), f"native_output: \n{native_output}, \nfull_sf_output: \n{full_sf_output}" + + +class TestLinear(unittest.TestCase): + + def test_column_parallel_linear(self): + port = get_available_port() + mp.spawn(run_column_parallel_linear, nprocs=2, args=(2, port)) + + def test_column_parallel_linear(self): + port = get_available_port() + mp.spawn(run_column_parallel_linear, nprocs=1, args=(1, port)) + + +if __name__ == "__main__": + suite = unittest.TestSuite() + suite.addTest(unittest.makeSuite(TestLinear)) + runner = unittest.TextTestRunner(verbosity=2) + runner.run(suite) diff --git a/SpecForge-ext/tests/test_layers/test_lm_head.py b/SpecForge-ext/tests/test_layers/test_lm_head.py new file mode 100644 index 0000000000000000000000000000000000000000..948ae5fdcc74108575c64a1a4393a330429dbb77 --- /dev/null +++ b/SpecForge-ext/tests/test_layers/test_lm_head.py @@ -0,0 +1,108 @@ +import os +import unittest + +import torch +import torch.distributed as dist +import torch.multiprocessing as mp +from 
def run_lm_head(rank, world_size, port):
    """Check ``ParallelLMHead`` against ``torch.nn.Linear`` on one rank.

    Spawn target for ``mp.spawn``. For each bias setting it compares the
    gathered parallel output with the native output for a vocab size of 512,
    a vocab size of 377, and (bias-free only) weights tied to an embedding.

    Args:
        rank: Index of this process; exported as ``RANK``.
        world_size: Number of spawned processes; also used as the TP size.
        port: Rendezvous port; exported as ``MASTER_PORT``.

    Raises:
        AssertionError: If any pair of outputs differs beyond
            ``rtol=atol=1e-5``.
    """

    def _expect_close(native_output, sf_output, bias):
        # Shared comparison used by all three cases below.
        assert torch.allclose(
            native_output, sf_output, rtol=1e-5, atol=1e-5
        ), f"bias: {bias}, native_output: \n{native_output}, \nsf_output: \n{sf_output}"

    os.environ.update(
        {
            "RANK": str(rank),
            "WORLD_SIZE": str(world_size),
            "MASTER_ADDR": "localhost",
            "MASTER_PORT": str(port),
        }
    )
    init_distributed(tp_size=world_size)
    set_seed(42)

    # Input for the first pass of case 1; later passes reuse whatever case 2
    # assigned most recently.
    data = torch.rand(1, 128, 256).cuda()

    for bias in (True, False):
        # --- Case 1: the output vocab size is divisible by the TP size ---
        reference_head = torch.nn.Linear(256, 512, bias=bias).cuda()
        parallel_head = ParallelLMHead(256, 512, bias=bias).cuda()
        parallel_head.load_state_dict(reference_head.state_dict())
        _expect_close(
            reference_head(data),
            parallel_head(data, gather_output=True),
            bias,
        )

        # --- Case 2: the output vocab size is not divisible by the TP size ---
        data = torch.rand(1, 128, 256).cuda()
        reference_head = torch.nn.Linear(256, 377, bias=bias).cuda()
        parallel_head = ParallelLMHead(256, 377, bias=bias).cuda()
        parallel_head.load_state_dict(reference_head.state_dict())
        _expect_close(
            reference_head(data),
            parallel_head(data, gather_output=True),
            bias,
        )

        # --- Case 3: tie word embedding ---
        # The embedding layer has no bias, so the tied case only runs when
        # bias is False.
        if bias:
            continue

        data = torch.rand(128, 256).cuda()

        # native layers sharing one weight
        reference_embedding = torch.nn.Embedding(512, 256).cuda()
        reference_head = torch.nn.Linear(256, 512, bias=bias).cuda()
        reference_head.weight = reference_embedding.weight

        # specforge layers sharing one weight
        parallel_embedding = VocabParallelEmbedding(512, 256).cuda()
        parallel_embedding.load_state_dict(reference_embedding.state_dict())
        parallel_head = ParallelLMHead(256, 512, bias=bias).cuda()
        parallel_head.weight = parallel_embedding.weight

        _expect_close(
            reference_head(data),
            parallel_head(data, gather_output=True),
            bias,
        )

    dist.destroy_process_group()
class TestLlamaForCausalLMEagle3Loading(unittest.TestCase):
    """Construction, persistence and forward checks for ``LlamaForCausalLMEagle3``."""

    def setUp(self):
        """Set up the test environment before each test."""
        self.temp_dir = tempfile.mkdtemp()

        config_dict = {
            "architectures": ["LlamaForCausalLM"],
            "bos_token_id": 128000,
            "eos_token_id": 128001,
            "hidden_act": "silu",
            "hidden_size": 4096,
            "initializer_range": 0.02,
            "intermediate_size": 14336,
            "max_position_embeddings": 2048,
            "model_type": "llama",
            "num_attention_heads": 32,
            "num_key_value_heads": 8,
            "num_hidden_layers": 1,
            "pad_token_id": 0,
            "rms_norm_eps": 1e-05,
            "tie_word_embeddings": False,
            "torch_dtype": "float16",
            "transformers_version": "4.28.1",
            "use_cache": True,
            "vocab_size": 128256,
            "draft_vocab_size": 32000,
        }

        self.config = LlamaConfig(**config_dict)

    def tearDown(self):
        """Remove the scratch directory created in ``setUp``."""
        shutil.rmtree(self.temp_dir)

    def test_model_initialization(self):
        """The mid layer exposes the expected sub-module types and hidden size."""
        model = LlamaForCausalLMEagle3(self.config)

        self.assertIsInstance(model.midlayer.self_attn, LlamaAttention)
        self.assertIsInstance(model.midlayer.mlp, LlamaMLP)
        self.assertIsInstance(model.midlayer.hidden_norm, LlamaRMSNorm)
        self.assertIsInstance(model.midlayer.input_layernorm, LlamaRMSNorm)
        self.assertIsInstance(model.midlayer.post_attention_layernorm, LlamaRMSNorm)
        self.assertEqual(model.midlayer.hidden_size, self.config.hidden_size)

    def test_save_pretrained(self):
        """Write the config and a ``torch.save`` state dict; both files must exist.

        Only the config goes through ``save_pretrained``; the weights are
        written with ``torch.save`` to ``pytorch_model.bin``.
        """
        model = LlamaForCausalLMEagle3(self.config)

        self.config.save_pretrained(self.temp_dir)

        model_path = os.path.join(self.temp_dir, "pytorch_model.bin")
        torch.save(model.state_dict(), model_path)

        self.assertTrue(os.path.exists(os.path.join(self.temp_dir, "config.json")))
        self.assertTrue(os.path.exists(model_path))

    @patch("transformers.modeling_utils.PreTrainedModel.from_pretrained")
    def test_from_pretrained_mock(self, mock_from_pretrained):
        """``from_pretrained`` forwards the path and returns the mocked model."""
        mock_model = LlamaForCausalLMEagle3(self.config)
        mock_from_pretrained.return_value = mock_model

        loaded_model = LlamaForCausalLMEagle3.from_pretrained(self.temp_dir)
        mock_from_pretrained.assert_called_once_with(self.temp_dir)
        self.assertIsInstance(loaded_model, LlamaForCausalLMEagle3)

    def test_model_forward_pass(self):
        """A forward pass returns a ``(batch, seq, hidden_size)`` tensor."""
        model = LlamaForCausalLMEagle3(self.config)
        model.eval()

        batch_size = 2
        seq_len = 10

        input_emb = torch.randn(batch_size, seq_len, self.config.hidden_size)
        # The model is fed hidden states three times as wide as hidden_size.
        hidden_states = torch.randn(batch_size, seq_len, self.config.hidden_size * 3)
        attention_mask = torch.ones(batch_size, seq_len)

        with torch.no_grad():
            outputs = model(
                inputs_embeds=input_emb,
                hidden_states=hidden_states,
                attention_mask=attention_mask,
            )

        self.assertEqual(outputs.shape, (batch_size, seq_len, self.config.hidden_size))

    def test_state_dict_compatibility(self):
        """Loading one model's state dict into another makes all parameters equal."""
        model1 = LlamaForCausalLMEagle3(self.config)
        model2 = LlamaForCausalLMEagle3(self.config)

        state_dict = model1.state_dict()

        model2.load_state_dict(state_dict)

        # Build the lookup once instead of once per parameter.
        model2_params = dict(model2.named_parameters())
        for name, param1 in model1.named_parameters():
            self.assertTrue(
                torch.equal(param1, model2_params[name]),
                f"parameter {name} differs after load_state_dict",
            )

    def test_config_validation(self):
        """Building the model from this reduced config raises ``AttributeError``."""
        invalid_config = LlamaConfig(
            vocab_size=1000,
            hidden_size=127,
            num_attention_heads=4,
            num_key_value_heads=2,
        )

        with self.assertRaises(AttributeError):
            LlamaForCausalLMEagle3(invalid_config)
def test_gpt_oss_tp(rank, world_size, temp_dir, port):
    """Compare the tensor-parallel GPT-OSS model with the transformers one.

    Spawn target for ``mp.spawn``. For tied and untied word embeddings, rank 0
    saves a freshly initialised transformers model to ``temp_dir``, every rank
    loads it into the distributed implementation, and the logits of both
    models on the same random input are compared.

    Args:
        rank: Index of this process; exported as ``RANK``.
        world_size: Number of spawned processes; also used as the TP size.
        temp_dir: Directory path used for ``save_pretrained`` /
            ``from_pretrained``.
        port: Rendezvous port; exported as ``MASTER_PORT``.

    Raises:
        AssertionError: If tied weights are not identical or the logits differ
            beyond ``rtol=atol=1e-5``.
    """
    os.environ["RANK"] = str(rank)
    os.environ["WORLD_SIZE"] = str(world_size)
    os.environ["MASTER_ADDR"] = "localhost"
    os.environ["MASTER_PORT"] = str(port)

    # Follow the spawned world size instead of a hard-coded 2 so the worker
    # stays correct if the caller changes ``nprocs``.
    init_distributed(tp_size=world_size)
    set_seed(42)

    for tie_word_embeddings in [True, False]:
        config = GptOssConfig(
            vocab_size=1000,
            hidden_size=384,
            intermediate_size=512,
            intermediate_size_mlp=512,
            num_hidden_layers=2,
            max_position_embeddings=1024,
            num_attention_heads=8,
            num_key_value_heads=2,
            head_dim=64,
            num_local_experts=4,
            tie_word_embeddings=tie_word_embeddings,
            initializer_range=0.02,
            hidden_act="silu",
            layer_types=[
                "sliding_attention",
                "full_attention",
            ],
        )

        # create the single-gpu reference model
        model = GptOssForCausalLM(config).cuda().eval()

        # only rank 0 writes the checkpoint; the barrier makes the other
        # ranks wait until it has been saved
        if dist.get_rank() == 0:
            model.save_pretrained(temp_dir)
            print(f"Saved model to {temp_dir}")
        dist.barrier()

        # load the model weights to the distributed model
        print(f"Loading model from {temp_dir}")
        dist_model = DistGptOssForCausalLM.from_pretrained(temp_dir).cuda()
        dist.barrier()

        if tie_word_embeddings:
            assert torch.equal(
                model.get_input_embeddings().weight, model.lm_head.weight
            )
            assert torch.equal(
                dist_model.get_input_embeddings().weight, dist_model.lm_head.weight
            )

        # create data
        input_ids = torch.randint(0, 1000, (1, 256)).cuda()
        attention_mask = torch.ones_like(input_ids).cuda()

        expected_logits = model(
            input_ids=input_ids, attention_mask=attention_mask
        ).logits
        dist_logits = dist_model(
            input_ids=input_ids, attention_mask=attention_mask
        ).logits

        assert torch.allclose(
            expected_logits,
            dist_logits,
            rtol=1e-5,
            atol=1e-5,
        ), f"Logits are not close, {expected_logits} vs {dist_logits}"

    dist.destroy_process_group()
def test_llama4_tp(rank, world_size, temp_dir, port):
    """Compare the tensor-parallel Llama4 model with the transformers one.

    Spawn target for ``mp.spawn``. Runs once with tied and once with untied
    word embeddings: rank 0 saves a fresh transformers model to ``temp_dir``,
    all ranks load it into the distributed model, and both sets of logits on
    the same random input must agree.

    Args:
        rank: Index of this process; exported as ``RANK``.
        world_size: Number of spawned processes; also used as the TP size.
        temp_dir: Directory path used for ``save_pretrained`` /
            ``from_pretrained``.
        port: Rendezvous port; exported as ``MASTER_PORT``.

    Raises:
        AssertionError: If tied weights are not identical or the logits differ
            beyond ``rtol=atol=1e-5``.
    """
    os.environ.update(
        {
            "RANK": str(rank),
            "WORLD_SIZE": str(world_size),
            "MASTER_ADDR": "localhost",
            "MASTER_PORT": str(port),
        }
    )

    init_distributed(tp_size=world_size)
    set_seed(42)

    for tie_word_embeddings in (True, False):
        config_kwargs = {
            "vocab_size": 1000,
            "hidden_size": 384,
            "intermediate_size": 512,
            "intermediate_size_mlp": 512,
            "num_hidden_layers": 2,
            "max_position_embeddings": 1024,
            "num_attention_heads": 10,
            "num_key_value_heads": 2,
            "head_dim": 64,
            "num_local_experts": 4,
            "tie_word_embeddings": tie_word_embeddings,
            "initializer_range": 0.02,
            "hidden_act": "silu",
        }
        config = Llama4TextConfig(**config_kwargs)

        # single-GPU reference model
        model = HFLlama4ForCausalLM(config).cuda()

        # rank 0 writes the checkpoint, everyone waits for it
        is_writer = dist.get_rank() == 0
        if is_writer:
            model.save_pretrained(temp_dir)
            print(f"Saved model to {temp_dir}")
        dist.barrier()

        # every rank loads the checkpoint into the distributed model
        dist_model = SFLlama4ForCausalLM.from_pretrained(temp_dir).cuda()
        dist.barrier()

        if tie_word_embeddings:
            for candidate in (model, dist_model):
                assert torch.equal(
                    candidate.get_input_embeddings().weight,
                    candidate.lm_head.weight,
                )

        # identical random batch for both models
        input_ids = torch.randint(0, 1000, (1, 256)).cuda()
        batch = {
            "input_ids": input_ids,
            "attention_mask": torch.ones_like(input_ids).cuda(),
        }

        expected_logits = model(**batch).logits
        dist_logits = dist_model(**batch).logits

        logits_match = torch.allclose(
            expected_logits, dist_logits, rtol=1e-5, atol=1e-5
        )
        assert (
            logits_match
        ), f"Logits are not close, {expected_logits} vs {dist_logits}"

    dist.destroy_process_group()
def test_llama3_tp(rank, world_size, temp_dir, port):
    """Compare the tensor-parallel Llama model with the transformers one.

    Spawn target for ``mp.spawn``. For tied and untied word embeddings, rank 0
    saves a freshly initialised transformers model to ``temp_dir``, every rank
    loads it into the distributed implementation, and the logits of both
    models on the same random input are compared.

    Args:
        rank: Index of this process; exported as ``RANK``.
        world_size: Number of spawned processes; also used as the TP size.
        temp_dir: Directory path used for ``save_pretrained`` /
            ``from_pretrained``.
        port: Rendezvous port; exported as ``MASTER_PORT``.

    Raises:
        AssertionError: If tied weights are not identical or the logits differ
            beyond ``rtol=atol=1e-5``.
    """
    os.environ["RANK"] = str(rank)
    os.environ["WORLD_SIZE"] = str(world_size)
    os.environ["MASTER_ADDR"] = "localhost"
    os.environ["MASTER_PORT"] = str(port)

    # Follow the spawned world size instead of a hard-coded 2 so the worker
    # stays correct if the caller changes ``nprocs``.
    init_distributed(tp_size=world_size)
    set_seed(42)

    for tie_word_embeddings in [True, False]:
        config = LlamaConfig(
            vocab_size=1000,
            hidden_size=384,
            intermediate_size=512,
            num_hidden_layers=2,
            max_position_embeddings=1024,
            num_attention_heads=10,
            num_key_value_heads=2,
            initializer_range=0.02,
            hidden_act="silu",
            rms_norm_eps=1e-6,
            tie_word_embeddings=tie_word_embeddings,
        )

        # create the single-gpu reference model
        model = HFLLamaForCausalLM(config).cuda()

        # only rank 0 writes the checkpoint; the barrier makes the other
        # ranks wait until it has been saved
        if dist.get_rank() == 0:
            model.save_pretrained(temp_dir)
            print(f"Saved model to {temp_dir}")
        dist.barrier()

        # load the model weights to the distributed model
        dist_model = SFLlamaForCausalLM.from_pretrained(temp_dir).cuda()
        dist.barrier()

        if tie_word_embeddings:
            assert torch.equal(
                model.get_input_embeddings().weight, model.lm_head.weight
            )
            assert torch.equal(
                dist_model.get_input_embeddings().weight, dist_model.lm_head.weight
            )

        # create data
        input_ids = torch.randint(0, 1000, (1, 256)).cuda()
        attention_mask = torch.ones_like(input_ids).cuda()

        expected_logits = model(
            input_ids=input_ids, attention_mask=attention_mask
        ).logits
        dist_logits = dist_model(
            input_ids=input_ids, attention_mask=attention_mask
        ).logits

        assert torch.allclose(
            expected_logits,
            dist_logits,
            rtol=1e-5,
            atol=1e-5,
        ), f"Logits are not close, {expected_logits} vs {dist_logits}"

    dist.destroy_process_group()
class TestPhi3TP(unittest.TestCase):
    """Runs the Phi-3 tensor-parallel comparison in two spawned processes."""

    def setUp(self):
        """Create a fresh temporary directory for the checkpoint."""
        scratch = tempfile.TemporaryDirectory()
        self.temp_dir = scratch

    def tearDown(self):
        """Delete the temporary directory created in ``setUp``."""
        self.temp_dir.cleanup()

    def test_phi3_tp(self):
        """Spawn two workers that run the module-level ``test_phi3_tp``."""
        free_port = get_available_port()
        worker_args = (2, self.temp_dir.name, free_port)
        mp.spawn(test_phi3_tp, args=worker_args, nprocs=2)
def test_qwen2_tp(rank, world_size, temp_dir, port):
    """Compare the tensor-parallel Qwen2 model with the transformers one.

    Spawn target for ``mp.spawn``. For tied and untied word embeddings, rank 0
    saves a freshly initialised transformers model to ``temp_dir``, every rank
    loads it into the distributed implementation, and the logits of both
    models on the same random input are compared.

    Args:
        rank: Index of this process; exported as ``RANK``.
        world_size: Number of spawned processes; also used as the TP size.
        temp_dir: Directory path used for ``save_pretrained`` /
            ``from_pretrained``.
        port: Rendezvous port; exported as ``MASTER_PORT``.

    Raises:
        AssertionError: If tied weights are not identical or the logits differ
            beyond ``rtol=atol=1e-5``.
    """
    os.environ["RANK"] = str(rank)
    os.environ["WORLD_SIZE"] = str(world_size)
    os.environ["MASTER_ADDR"] = "localhost"
    os.environ["MASTER_PORT"] = str(port)

    # Follow the spawned world size instead of a hard-coded 2 so the worker
    # stays correct if the caller changes ``nprocs``.
    init_distributed(tp_size=world_size)
    set_seed(42)

    for tie_word_embeddings in [True, False]:
        # NOTE(review): intermediate_size_mlp and num_local_experts look
        # carried over from the Llama4 test -- confirm Qwen2Config uses them.
        config = Qwen2Config(
            vocab_size=1000,
            hidden_size=384,
            intermediate_size=512,
            intermediate_size_mlp=512,
            num_hidden_layers=2,
            max_position_embeddings=1024,
            num_attention_heads=10,
            num_key_value_heads=2,
            head_dim=64,
            num_local_experts=4,
            tie_word_embeddings=tie_word_embeddings,
            initializer_range=0.02,
            hidden_act="silu",
        )

        # create the single-gpu reference model
        model = HFWen2ForCausalLM(config).cuda()

        # only rank 0 writes the checkpoint; the barrier makes the other
        # ranks wait until it has been saved
        if dist.get_rank() == 0:
            model.save_pretrained(temp_dir)
            print(f"Saved model to {temp_dir}")
        dist.barrier()

        # load the model weights to the distributed model
        print(f"Loading model from {temp_dir}")
        dist_model = SFLQwen2ForCausalLM.from_pretrained(temp_dir).cuda()
        dist.barrier()

        if tie_word_embeddings:
            assert torch.equal(
                model.get_input_embeddings().weight, model.lm_head.weight
            )
            assert torch.equal(
                dist_model.get_input_embeddings().weight, dist_model.lm_head.weight
            )

        # create data
        input_ids = torch.randint(0, 1000, (1, 256)).cuda()
        attention_mask = torch.ones_like(input_ids).cuda()

        expected_logits = model(
            input_ids=input_ids, attention_mask=attention_mask
        ).logits
        dist_logits = dist_model(
            input_ids=input_ids, attention_mask=attention_mask
        ).logits

        assert torch.allclose(
            expected_logits,
            dist_logits,
            rtol=1e-5,
            atol=1e-5,
        ), f"Logits are not close, {expected_logits} vs {dist_logits}"

    dist.destroy_process_group()
def test_qwen3_moe_tp(rank, world_size, temp_dir, port, num_heads, num_kv_heads):
    """Compare the tensor-parallel Qwen3-MoE model with the transformers one.

    Spawn target for ``mp.spawn``. For tied and untied word embeddings, rank 0
    saves a freshly initialised transformers model to ``temp_dir``, every rank
    loads it into the distributed implementation, and the logits of both
    models on the same random input are compared.

    Args:
        rank: Index of this process; exported as ``RANK``.
        world_size: Number of spawned processes; also used as the TP size.
        temp_dir: Directory path used for ``save_pretrained`` /
            ``from_pretrained``.
        port: Rendezvous port; exported as ``MASTER_PORT``.
        num_heads: Value passed as ``num_attention_heads`` in the config.
        num_kv_heads: Value passed as ``num_key_value_heads`` in the config.

    Raises:
        AssertionError: If tied weights are not identical or the logits differ
            beyond ``rtol=atol=1e-5``.
    """
    os.environ["RANK"] = str(rank)
    os.environ["WORLD_SIZE"] = str(world_size)
    os.environ["MASTER_ADDR"] = "localhost"
    os.environ["MASTER_PORT"] = str(port)

    # Follow the spawned world size instead of a hard-coded 2 so the worker
    # stays correct if the caller changes ``nprocs``.
    init_distributed(tp_size=world_size)
    set_seed(42)

    for tie_word_embeddings in [True, False]:
        config = Qwen3MoeConfig(
            vocab_size=1000,
            hidden_size=384,
            intermediate_size=512,
            moe_intermediate_size=512,
            num_hidden_layers=2,
            max_position_embeddings=1024,
            num_attention_heads=num_heads,
            num_key_value_heads=num_kv_heads,
            num_experts=64,
            num_experts_per_tok=8,
            hidden_act="silu",
            rms_norm_eps=1e-6,
            tie_word_embeddings=tie_word_embeddings,
        )

        # create a simple single-gpu model
        model = HFWen3MoeForCausalLM(config).cuda()

        # only rank 0 writes the checkpoint; the barrier makes the other
        # ranks wait until it has been saved
        if dist.get_rank() == 0:
            model.save_pretrained(temp_dir)
            print(f"Saved model to {temp_dir}")
        dist.barrier()

        # load the model weights to the distributed model
        print(f"Loading model from {temp_dir}")
        dist_model = SFLQwen3MoeForCausalLM.from_pretrained(temp_dir).cuda()
        dist.barrier()

        if tie_word_embeddings:
            assert torch.equal(
                model.get_input_embeddings().weight, model.lm_head.weight
            )
            assert torch.equal(
                dist_model.get_input_embeddings().weight, dist_model.lm_head.weight
            )

        # create data
        input_ids = torch.randint(0, 1000, (1, 256)).cuda()
        attention_mask = torch.ones_like(input_ids).cuda()

        expected_logits = model(
            input_ids=input_ids, attention_mask=attention_mask
        ).logits
        dist_logits = dist_model(
            input_ids=input_ids, attention_mask=attention_mask
        ).logits

        assert torch.allclose(
            expected_logits,
            dist_logits,
            rtol=1e-5,
            atol=1e-5,
        ), f"Logits are not close, {expected_logits} vs {dist_logits}"

    dist.destroy_process_group()
def test_qwen3_tp(rank, world_size, temp_dir, port):
    """Per-rank worker comparing the tensor-parallel Qwen3 model to the HF one.

    Launched through ``mp.spawn`` by ``TestQwen3TP``. For both values of
    ``tie_word_embeddings`` a small random Hugging Face Qwen3 model is built,
    saved by rank 0 into ``temp_dir``, reloaded through the custom
    tensor-parallel implementation, and both models must produce matching
    logits on the same random input.

    Args:
        rank: Index of this process; exported as ``RANK``.
        world_size: Number of spawned processes; exported as ``WORLD_SIZE``.
        temp_dir: Directory used to hand the checkpoint from rank 0 to all ranks.
        port: Rendezvous port; exported as ``MASTER_PORT``.

    Raises:
        AssertionError: If tied weights are not shared or the logits diverge
            beyond ``rtol=atol=1e-5``.
    """
    os.environ["RANK"] = str(rank)
    os.environ["WORLD_SIZE"] = str(world_size)
    os.environ["MASTER_ADDR"] = "localhost"
    os.environ["MASTER_PORT"] = str(port)

    init_distributed(tp_size=2)
    set_seed(42)

    for tie_word_embeddings in [True, False]:
        # Dense model: no MoE-specific options belong in this config.
        config = Qwen3Config(
            vocab_size=1000,
            hidden_size=384,
            intermediate_size=512,
            num_hidden_layers=2,
            max_position_embeddings=1024,
            num_attention_heads=8,
            num_key_value_heads=4,
            hidden_act="silu",
            rms_norm_eps=1e-6,
            tie_word_embeddings=tie_word_embeddings,
        )

        # create a simple single-gpu model
        model = HFQwen3ForCausalLM(config).cuda()

        # save the model weights to a temp directory; only rank 0 writes and
        # the barrier keeps other ranks from reading a partial checkpoint
        if dist.get_rank() == 0:
            model.save_pretrained(temp_dir)
            print(f"Saved model to {temp_dir}")
        dist.barrier()

        # load the model weights to the distributed model; the barrier stops
        # rank 0 from overwriting temp_dir on the next iteration while another
        # rank is still loading
        print(f"Loading model from {temp_dir}")
        dist_model = SFLQwen3ForCausalLM.from_pretrained(temp_dir).cuda()
        dist.barrier()

        if tie_word_embeddings:
            assert torch.equal(
                model.get_input_embeddings().weight, model.lm_head.weight
            )
            assert torch.equal(
                dist_model.get_input_embeddings().weight, dist_model.lm_head.weight
            )

        # create data
        input_ids = torch.randint(0, 1000, (1, 256)).cuda()
        attention_mask = torch.ones_like(input_ids).cuda()

        # Inference only: skip building autograd graphs for both forwards.
        with torch.no_grad():
            expected_logits = model(
                input_ids=input_ids, attention_mask=attention_mask
            ).logits
            dist_logits = dist_model(
                input_ids=input_ids, attention_mask=attention_mask
            ).logits

        assert torch.allclose(
            expected_logits,
            dist_logits,
            rtol=1e-5,
            atol=1e-5,
        ), f"Logits are not close, {expected_logits} vs {dist_logits}"

    dist.destroy_process_group()
@torch.no_grad()
def test_dense(rank, world_size, port, tp_size):
    """Smoke-test the SGLang target backend on a dense Llama checkpoint.

    Runs as a per-rank worker under ``mp.spawn``. The test passes when the
    model loads and ``generate_eagle3_data`` completes without raising; the
    returned data is not inspected.

    Args:
        rank: Index of this process; exported as ``RANK`` and ``LOCAL_RANK``.
        world_size: Number of spawned processes; exported as ``WORLD_SIZE``.
        port: Rendezvous port; exported as ``MASTER_PORT``.
        tp_size: Tensor-parallel size handed to ``init_distributed``.
    """
    os.environ.update(
        {
            "RANK": str(rank),
            "LOCAL_RANK": str(rank),
            "WORLD_SIZE": str(world_size),
            "MASTER_ADDR": "localhost",
            "MASTER_PORT": str(port),
        }
    )

    init_distributed(tp_size=tp_size)
    set_seed(42)

    # Two random sequences of 256 token ids; both masks are all ones.
    tokens = torch.randint(0, 1000, (2, 256)).cuda()
    batch = {
        "input_ids": tokens,
        "attention_mask": torch.ones_like(tokens),
        "loss_mask": torch.ones_like(tokens),
    }

    # Dense configuration: no data-parallel attention / LM head, one expert group.
    engine_options = dict(
        torch_dtype=torch.float16,
        device="cuda",
        attention_backend="fa3",
        mem_fraction_static=0.4,
        enable_torch_compile=True,
        enable_nccl_nvls=True,
        enable_symm_mem=True,
        enable_dp_attention=False,
        enable_dp_lm_head=False,
        enable_piecewise_cuda_graph=True,
        ep_size=1,
        context_length=256,
    )
    target = SGLangEagle3TargetModel.from_pretrained(
        "unsloth/Llama-3.2-1B", **engine_options
    )
    target.set_aux_hidden_states_layers()
    target.generate_eagle3_data(**batch)
"image": image_path}, + {"type": "text", "text": "Describe this image."}, + ], + } + ] + + # Sample 2: single image (can use same or different image) + messages_2 = [ + { + "role": "user", + "content": [ + {"type": "image", "image": image_path}, + {"type": "text", "text": "What do you see in this picture?"}, + ], + } + ] + + # Process each sample separately to get correct format + batch_input_ids = [] + batch_attention_mask = [] + batch_pixel_values = [] + batch_image_grid_thw = [] + + for messages in [messages_1, messages_2]: + # Apply chat template + text = processor.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) + # Process vision info to get actual image data + image_inputs, video_inputs = process_vision_info(messages) + + # Process with processor + inputs = processor( + text=[text], + images=image_inputs, + videos=video_inputs, + padding=True, + return_tensors="pt", + ) + + batch_input_ids.append(inputs["input_ids"]) + batch_attention_mask.append(inputs["attention_mask"]) + batch_pixel_values.append(inputs["pixel_values"]) + batch_image_grid_thw.append(inputs["image_grid_thw"]) + + # Debug: print shapes + if rank == 0: + print(f"[Debug] batch_input_ids shapes: {[x.shape for x in batch_input_ids]}") + print( + f"[Debug] batch_pixel_values shapes: {[x.shape for x in batch_pixel_values]}" + ) + print( + f"[Debug] batch_image_grid_thw shapes: {[x.shape for x in batch_image_grid_thw]}" + ) + print(f"[Debug] batch_image_grid_thw values: {batch_image_grid_thw}") + # Count image tokens in input_ids + image_token_id = processor.tokenizer.convert_tokens_to_ids("<|image_pad|>") + for i, ids in enumerate(batch_input_ids): + num_img_tokens = (ids == image_token_id).sum().item() + print(f"[Debug] Sample {i}: {num_img_tokens} image tokens in input_ids") + + # Pad input_ids and attention_mask to same length + max_len = max(ids.shape[1] for ids in batch_input_ids) + padded_input_ids = [] + padded_attention_mask = [] + padded_loss_mask = [] + + 
@torch.no_grad()
def test_vlm_multi_batch(rank, world_size, port, tp_size):
    """Test the SGLang VLM target model on a batch of four single-image prompts.

    Runs as a per-rank worker under ``mp.spawn``. Every sample uses the same
    demo image with a different text prompt, so the samples differ in token
    length and exercise the padding path. Rank 0 prints the output shapes and
    checks that the batch dimension of the returned ``input_ids`` is 4.

    Args:
        rank: Index of this process; exported as ``RANK`` and ``LOCAL_RANK``.
        world_size: Number of spawned processes; exported as ``WORLD_SIZE``.
        port: Rendezvous port; exported as ``MASTER_PORT``.
        tp_size: Tensor-parallel size handed to ``init_distributed``.

    Raises:
        AssertionError: On rank 0, if the output batch size is not 4.
    """
    os.environ["RANK"] = str(rank)
    os.environ["LOCAL_RANK"] = str(rank)
    os.environ["WORLD_SIZE"] = str(world_size)
    os.environ["MASTER_ADDR"] = "localhost"
    os.environ["MASTER_PORT"] = str(port)

    init_distributed(tp_size=tp_size)
    set_seed(42)

    model_path = "Qwen/Qwen2.5-VL-32B-Instruct"

    # NOTE(review): kept function-local as in the original, presumably so the
    # module can be imported without the VLM extras installed; confirm.
    from qwen_vl_utils import process_vision_info
    from transformers import Qwen2_5_VLProcessor

    processor = Qwen2_5_VLProcessor.from_pretrained(model_path)

    image_path = os.path.join(os.path.dirname(__file__), "images", "demo.jpeg")

    # One prompt per sample; lengths vary on purpose so padding is needed.
    prompts = [
        "Describe this image in detail.",
        "What objects can you see in this picture?",
        "Please analyze this image and describe the main subject, background, colors, and any notable details you observe.",
        "What is this?",
    ]
    all_messages = [
        [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": image_path},
                    {"type": "text", "text": prompt},
                ],
            }
        ]
        for prompt in prompts
    ]
    batch_size = len(all_messages)

    # Process each sample separately to get correct format
    batch_input_ids = []
    batch_attention_mask = []
    batch_pixel_values = []
    batch_image_grid_thw = []

    for messages in all_messages:
        # Apply chat template
        text = processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        # Process vision info to get actual image data
        image_inputs, video_inputs = process_vision_info(messages)

        # Process with processor
        inputs = processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        )

        batch_input_ids.append(inputs["input_ids"])
        batch_attention_mask.append(inputs["attention_mask"])
        batch_pixel_values.append(inputs["pixel_values"])
        batch_image_grid_thw.append(inputs["image_grid_thw"])

    # Right-pad input_ids and attention_mask to the longest sample
    max_len = max(ids.shape[1] for ids in batch_input_ids)
    padded_input_ids = []
    padded_attention_mask = []
    padded_loss_mask = []

    for input_ids, attention_mask in zip(batch_input_ids, batch_attention_mask):
        pad_len = max_len - input_ids.shape[1]
        if pad_len > 0:
            input_ids = torch.nn.functional.pad(
                input_ids, (0, pad_len), value=processor.tokenizer.pad_token_id
            )
            attention_mask = torch.nn.functional.pad(
                attention_mask, (0, pad_len), value=0
            )
        padded_input_ids.append(input_ids)
        padded_attention_mask.append(attention_mask)
        # loss_mask same as attention_mask
        padded_loss_mask.append(attention_mask.clone())

    # Concatenate the per-sample rows into batches
    input_ids = torch.cat(padded_input_ids, dim=0).cuda()
    attention_mask = torch.cat(padded_attention_mask, dim=0).cuda()
    loss_mask = torch.cat(padded_loss_mask, dim=0).cuda()

    # pixel_values is concatenated into one tensor; image_grid_thw stays a
    # list with one entry per sample
    pixel_values = torch.cat(batch_pixel_values, dim=0).cuda()
    image_grid_thw = [thw.cuda() for thw in batch_image_grid_thw]
    sgl_target_model = SGLangEagle3TargetModel.from_pretrained(
        model_path,
        torch_dtype=torch.float16,
        device="cuda",
        attention_backend="fa3",
        mem_fraction_static=0.4,
        enable_torch_compile=True,
        enable_nccl_nvls=False,
        enable_symm_mem=False,
        enable_dp_attention=True,
        enable_dp_lm_head=True,
        enable_piecewise_cuda_graph=True,
        context_length=4096,
    )
    sgl_target_model.set_aux_hidden_states_layers()
    sgl_out = sgl_target_model.generate_eagle3_data(
        input_ids=input_ids,
        attention_mask=attention_mask,
        loss_mask=loss_mask,
        pixel_values=pixel_values,
        image_grid_thw=image_grid_thw,
        is_vlm=True,
    )

    if rank == 0:
        # Verify output shapes
        print(f"\n{'='*60}")
        print("[test_vlm_multi_batch] Results:")
        print(f"[Rank {rank}] hidden_states shape: {sgl_out.hidden_states.shape}")
        print(f"[Rank {rank}] target shape: {sgl_out.target.shape}")
        print(f"[Rank {rank}] input_ids shape: {sgl_out.input_ids.shape}")

        # Verify batch dimension matches
        assert (
            sgl_out.input_ids.shape[0] == batch_size
        ), f"Expected batch_size={batch_size}, got {sgl_out.input_ids.shape[0]}"
        print(f"[Rank {rank}] Batch size verification: PASSED")
        print(f"{'='*60}\n")
@torch.no_grad()
def test_target_model_backend(rank, world_size, port, tp_size):
    """Per-rank worker cross-checking the HF, custom and SGLang target backends.

    Launched through ``mp.spawn`` by ``TestTargetModelBackend`` with
    ``tp_size`` 1 and 2. The same random batch is fed to each backend's
    ``generate_eagle3_data``. The custom backend must match HF within
    ``1e-5``. SGLang must match HF exactly on ``loss_mask`` and ``input_ids``,
    and within a loose tolerance on ``hidden_states`` and ``target``.

    Args:
        rank: Index of this process; exported as ``RANK`` and ``LOCAL_RANK``.
        world_size: Number of spawned processes; exported as ``WORLD_SIZE``.
        port: Rendezvous port; exported as ``MASTER_PORT``.
        tp_size: Tensor-parallel size handed to ``init_distributed``.

    Raises:
        AssertionError: If any compared output field differs beyond tolerance;
            the message names the field and reports the maximum absolute
            difference.
    """
    os.environ["RANK"] = str(rank)
    os.environ["LOCAL_RANK"] = str(rank)
    os.environ["WORLD_SIZE"] = str(world_size)
    os.environ["MASTER_ADDR"] = "localhost"
    os.environ["MASTER_PORT"] = str(port)

    init_distributed(tp_size=tp_size)
    set_seed(42)

    input_ids = torch.randint(0, 1000, (2, 256)).cuda()
    attention_mask = torch.ones_like(input_ids)
    loss_mask = torch.ones_like(input_ids)

    hf_target_model = HFEagle3TargetModel.from_pretrained(
        "unsloth/Llama-3.2-1B", torch_dtype=torch.float16, device="cuda"
    )
    hf_target_model.set_aux_hidden_states_layers()
    hf_out = hf_target_model.generate_eagle3_data(
        input_ids=input_ids,
        attention_mask=attention_mask,
        loss_mask=loss_mask,
    )
    # Drop the model reference before loading the next backend.
    del hf_target_model

    custom_target_model = CustomEagle3TargetModel.from_pretrained(
        "unsloth/Llama-3.2-1B", torch_dtype=torch.float16, device="cuda"
    )
    custom_target_model.set_aux_hidden_states_layers()
    custom_out = custom_target_model.generate_eagle3_data(
        input_ids=input_ids,
        attention_mask=attention_mask,
        loss_mask=loss_mask,
    )
    del custom_target_model

    # Compare the HF and custom outputs field by field. Each failure message
    # names the field that was actually compared; the diff is taken in float32
    # so it also works for the integer mask / id tensors.
    for field in ("target", "loss_mask", "input_ids", "hidden_states"):
        hf_value = getattr(hf_out, field)
        custom_value = getattr(custom_out, field)
        assert torch.allclose(hf_value, custom_value, atol=1e-5, rtol=1e-5), (
            f"HF and custom {field} are not close, max abs diff: \n"
            f"{(hf_value.float() - custom_value.float()).abs().max()}"
        )

    sgl_target_model = SGLangEagle3TargetModel.from_pretrained(
        "unsloth/Llama-3.2-1B", torch_dtype=torch.float16, device="cuda"
    )
    sgl_target_model.set_aux_hidden_states_layers()
    sgl_out = sgl_target_model.generate_eagle3_data(
        input_ids=input_ids, attention_mask=attention_mask, loss_mask=loss_mask
    )
    del sgl_target_model

    assert torch.equal(
        hf_out.loss_mask, sgl_out.loss_mask
    ), "HF and SGLang loss_mask differ"
    assert torch.equal(
        hf_out.input_ids, sgl_out.input_ids
    ), "HF and SGLang input_ids differ"
    assert torch.allclose(
        hf_out.hidden_states, sgl_out.hidden_states, atol=1e-1, rtol=1e-2
    ), f"Hidden states are not close, diff: \n{(hf_out.hidden_states - sgl_out.hidden_states).abs().max()}"
    # The SGLang target is cast to half for the comparison, so the reported
    # diff uses the same cast tensor.
    sgl_target = sgl_out.target.half()
    assert torch.allclose(
        hf_out.target, sgl_target, atol=1e-1, rtol=1e-2
    ), f"Target are not close, diff: \n{(hf_out.target - sgl_target).abs().max()}"
class TestPrepareData(unittest.TestCase):
    """End-to-end check of ``scripts/prepare_data.py`` for the ShareGPT dataset."""

    def test_prepare_sharegpt(self):
        """The script must exit with status 0 and create the training JSONL file."""
        output_file = CACHE_DIR / "dataset" / "sharegpt_train.jsonl"

        # Start from a clean slate so the existence check below proves that
        # this run produced the file.
        if output_file.exists():
            output_file.unlink()

        command = "python scripts/prepare_data.py --dataset sharegpt"
        process = execute_shell_command(command)
        process.wait()

        self.assertEqual(process.returncode, 0)
        self.assertTrue(output_file.exists())
def replace_in_script(script_path: Path, pattern: str, replacement: str):
    """Rewrite the file at ``script_path`` in place, substituting ``pattern``.

    Every literal occurrence of ``pattern`` is replaced by ``replacement``.
    The substitution is applied to each line separately, and the whole file is
    read before it is reopened for writing.

    Args:
        script_path: File to modify.
        pattern: Literal text to search for (not a regular expression).
        replacement: Text written in place of each occurrence.
    """
    with open(script_path, "r") as source:
        rewritten = [text.replace(pattern, replacement) for text in source]

    with open(script_path, "w") as sink:
        sink.writelines(rewritten)
"run_llama3.1_8b_eagle3_online.sh" + ) + with open(script_path, "r") as f: + script = f.readlines() + + # remove empty lines + script = [line for line in script if line.strip()] + script[-1] = script[-1].rstrip() + " --max-num-steps 10" + + # replace meta-llama/Llama-3.1-8B-Instruct with unsloth/Llama-3.2-1B-Instruct + # so that we don't need HF token for gated repo + script = [ + line.replace( + "meta-llama/Llama-3.1-8B-Instruct", "nreHieW/Llama-3.1-8B-Instruct" + ) + for line in script + ] + + # write the script back to the file + with open(script_path, "w") as f: + for line in script: + f.write(line) + + def test_online_train_eagle3_with_sglang_backend(self): + # run training + train_process = execute_shell_command( + "bash examples/run_llama3.1_8b_eagle3_online.sh 2" + ) + train_process.wait() + self.assertEqual(train_process.returncode, 0) + + def test_online_train_eagle3_with_hf_backend(self): + # replace --target-model-backend sglang with --target-model-backend hf + script_path = Path(__file__).parent.parent.parent.joinpath( + "examples", "run_llama3.1_8b_eagle3_online.sh" + ) + replace_in_script( + script_path, "--target-model-backend sglang", "--target-model-backend hf" + ) + + # run training + train_process = execute_shell_command( + "bash examples/run_llama3.1_8b_eagle3_online.sh 2" + ) + train_process.wait() + self.assertEqual(train_process.returncode, 0) + + def test_online_train_eagle3_with_custom_backend(self): + # replace --target-model-backend sglang with --target-model-backend custom + script_path = Path(__file__).parent.parent.parent.joinpath( + "examples", "run_llama3.1_8b_eagle3_online.sh" + ) + replace_in_script( + script_path, + "--target-model-backend sglang", + "--target-model-backend custom", + ) + + # run training + train_process = execute_shell_command( + "bash examples/run_llama3.1_8b_eagle3_online.sh 2" + ) + train_process.wait() + self.assertEqual(train_process.returncode, 0) + + def test_offline_train_eagle3(self): + # remove the 
def assert_similar(ref, out):
    """Assert that ``out`` approximates ``ref`` in both direction and magnitude.

    Both tensors are compared in float32. The check is intentionally loose
    because the compared backends run in bf16: the cosine similarity of the
    flattened tensors must be at least 0.975 and the ratio of their norms must
    be within 0.025 of 1.

    Raises:
        AssertionError: If either criterion is violated; the message carries
            the offending value.
    """
    ref = ref.float()
    out = out.float()

    ref_vector = ref.reshape(-1)
    out_vector = out.reshape(-1)
    similarity = F.cosine_similarity(ref_vector, out_vector, dim=0)

    ref_norm = torch.linalg.norm(ref)
    out_norm = torch.linalg.norm(out)
    norm_ratio = ref_norm / out_norm

    assert similarity >= 0.975, f"{similarity=}"
    assert abs(1 - norm_ratio) <= 0.025, f"{norm_ratio=}"
torch.arange(seq_len).unsqueeze(0).repeat(batch_size, 1).to("cuda") + ) + cache_hidden = [[], []] # [cache_k, cache_v] + flash_cache_hidden = [[], []] # [cache_k, cache_v] + attention_mask = torch.ones(batch_size, seq_len, dtype=self.dtype).to("cuda") + # Simulate one item in the batch is masked and not taking a full block. + padding_start_index = seq_len - min( + 200, seq_len // 3 + ) # Adjust padding based on seq_len + attention_mask[1, padding_start_index:] = False + input_embeds = norm_tensor( + (batch_size, seq_len, self.config.hidden_size), + device="cuda", + dtype=self.dtype, + ) + decoder_attention_mask = prepare_decoder_attention_mask( + attention_mask=attention_mask, + input_shape=(batch_size, seq_len), + inputs_embeds=input_embeds, + past_key_values_length=0, + ) + hidden_states_list = [] + flash_hidden_states_list = [] + for idx in range(TTT_LENGTH): + hidden_states = norm_tensor( + (batch_size, seq_len, hidden_size), device="cuda", dtype=self.dtype + ) + flash_hidden_states = hidden_states.clone().detach() + hidden_states_list.append(hidden_states) + flash_hidden_states_list.append(flash_hidden_states) + + ############### Flash Attention Inputs ############## + flash_position_ids = position_ids.clone() + for idx in range(TTT_LENGTH): + with torch.no_grad(): + output = attention( + hidden_states=hidden_states_list[idx], + attention_mask=decoder_attention_mask, + position_ids=position_ids, + cache_hidden=cache_hidden, + output_attentions=False, + use_cache=True, + ) + with torch.no_grad(): + output_flash = flash_attention( + hidden_states=flash_hidden_states_list[idx], + position_ids=flash_position_ids, + cache_hidden=flash_cache_hidden, + ) + assert_similar(output[0][: -1 - idx], output_flash[0][: -1 - idx]) + assert_similar( + output[1][: padding_start_index - idx], + output_flash[1][: padding_start_index - idx], + ) + # Check output shape + expected_output_shape = (batch_size, seq_len, self.config.hidden_size) + self.assertEqual(output_flash.shape, 
expected_output_shape) + # Check output is not NaN or Inf + self.assertFalse(torch.isnan(output_flash).any()) + self.assertFalse(torch.isinf(output_flash).any()) + + def test_backward_pass_gradient_comparison(self): + """Test backward pass comparing gradients between LlamaAttention and LlamaFlashAttention.""" + for seq_len in self.seq_lengths: + with self.subTest(seq_len=seq_len): + self._test_backward_pass_gradient_comparison_for_seq_len(seq_len) + + def _test_backward_pass_gradient_comparison_for_seq_len(self, seq_len): + """Helper method to test backward pass gradient comparison for a specific sequence length.""" + attention = LlamaAttention(self.config).to("cuda").to(self.dtype) + flash_attention = LlamaFlashAttention(self.config).to("cuda").to(self.dtype) + + # Ensure same weights + with torch.no_grad(): + flash_attention.q_proj.weight.copy_(attention.q_proj.weight) + flash_attention.k_proj.weight.copy_(attention.k_proj.weight) + flash_attention.v_proj.weight.copy_(attention.v_proj.weight) + flash_attention.o_proj.weight.copy_(attention.o_proj.weight) + + batch_size = 2 + hidden_size = self.config.hidden_size * 2 + + ############### Attention Inputs ############## + position_ids = ( + torch.arange(seq_len).unsqueeze(0).repeat(batch_size, 1).to("cuda") + ) + cache_hidden = [[], []] # [cache_k, cache_v] + flash_cache_hidden = [[], []] # [cache_k, cache_v] + attention_mask = torch.ones(batch_size, seq_len, dtype=torch.bool).to("cuda") + # Simulate one item in the batch is masked and not taking a full block. 
+ # padding_start_index = seq_len - 50 + # attention_mask[1, padding_start_index:] = False + input_embeds = norm_tensor( + (batch_size, seq_len, self.config.hidden_size), + device="cuda", + dtype=self.dtype, + ) + decoder_attention_mask = prepare_decoder_attention_mask( + attention_mask=attention_mask, + input_shape=(batch_size, seq_len), + inputs_embeds=input_embeds, + past_key_values_length=0, + ) + + ############### Flash Attention Inputs ############## + flash_position_ids = position_ids.clone() + loss_mask = torch.ones( + batch_size, seq_len, dtype=self.dtype, requires_grad=False + ).to("cuda") + + # Create input tensors that require gradients + loss_list = [] + loss_flash_list = [] + hidden_states_list = [] + flash_hidden_states_list = [] + for idx in range(TTT_LENGTH): + hidden_states = norm_tensor( + (batch_size, seq_len, hidden_size), device="cuda", dtype=self.dtype + ) + flash_hidden_states = hidden_states.clone().detach() + hidden_states_list.append(hidden_states) + flash_hidden_states_list.append(flash_hidden_states) + + for idx in range(TTT_LENGTH): + is_last = idx == TTT_LENGTH - 1 + output = attention( + hidden_states=hidden_states_list[idx], + attention_mask=decoder_attention_mask, + position_ids=position_ids, + cache_hidden=cache_hidden, + output_attentions=False, + use_cache=True, + ) + output_flash = flash_attention( + hidden_states=flash_hidden_states_list[idx], + position_ids=flash_position_ids, + cache_hidden=flash_cache_hidden, + ) + # Apply loss mask on calculation over batch + loss = (output * loss_mask[..., None]).sum().mean() + loss_flash = (output_flash * loss_mask[..., None]).sum().mean() + loss_list.append(loss) + loss_flash_list.append(loss_flash) + # Compare gradients + + if not is_last: + # Step 5.7: we need to update the loss mask + loss_mask = padding(loss_mask, left=False) + mean_loss = sum(loss_list) / len(loss_list) + mean_loss_flash = sum(loss_flash_list) / len(loss_flash_list) + mean_loss.backward() + 
mean_loss_flash.backward() + projections = ["q_proj", "k_proj", "v_proj", "o_proj"] + for proj_name in projections: + assert_similar( + getattr(attention, proj_name).weight.grad, + getattr(flash_attention, proj_name).weight.grad, + ) + + +if __name__ == "__main__": + unittest.main(verbosity=2) diff --git a/SpecForge-ext/tests/test_utils/test_flex_attention.py b/SpecForge-ext/tests/test_utils/test_flex_attention.py new file mode 100644 index 0000000000000000000000000000000000000000..f7bc7ce3b809d1f93abe5788519982383b71dd48 --- /dev/null +++ b/SpecForge-ext/tests/test_utils/test_flex_attention.py @@ -0,0 +1,292 @@ +import unittest + +import torch +import torch._dynamo as dynamo +from transformers import LlamaConfig +from transformers.cache_utils import DynamicCache + +from specforge.modeling.draft.flex_attention import ( + compile_friendly_create_block_mask, + compile_friendly_flex_attention, + generate_eagle3_mask, +) +from specforge.modeling.draft.llama3_eagle import ( + LlamaAttention, + LlamaFlexAttention, + prepare_decoder_attention_mask, +) +from specforge.utils import padding + +from .utils import norm_tensor + +dynamo.config.recompile_limit = 64 +TTT_LENGTH = 7 +torch.manual_seed(0) + + +class TestFlexAttention(unittest.TestCase): + + def setUp(self): + torch.manual_seed(0) + self.config_dict = { + "hidden_size": 128, + "num_attention_heads": 8, + "num_key_value_heads": 2, + "max_position_embeddings": 4096, + "rms_norm_eps": 1e-05, + "vocab_size": 32000, + "intermediate_size": 688, + "hidden_act": "silu", + "num_hidden_layers": 1, + "torch_dtype": "float32", + } + self.config = LlamaConfig(**self.config_dict) + + self.seq_lengths = [128, 200, 256, 300, 512, 800, 1024, 2048] + self.dtype = torch.float32 + + def test_forward_pass_comparison(self): + """Test forward pass comparison between LlamaAttention and LlamaFlexAttention.""" + for seq_len in self.seq_lengths: + with self.subTest(seq_len=seq_len): + self._test_forward_pass_comparison_for_seq_len(seq_len) + 
+ def _test_forward_pass_comparison_for_seq_len(self, seq_len): + """Helper method to test forward pass comparison for a specific sequence length.""" + attention = LlamaAttention(self.config).to("cuda").to(self.dtype) + flex_attention = LlamaFlexAttention(self.config).to("cuda").to(self.dtype) + + # Ensure same weights + with torch.no_grad(): + flex_attention.q_proj.weight.copy_(attention.q_proj.weight) + flex_attention.k_proj.weight.copy_(attention.k_proj.weight) + flex_attention.v_proj.weight.copy_(attention.v_proj.weight) + flex_attention.o_proj.weight.copy_(attention.o_proj.weight) + + attention.eval() + flex_attention.eval() + batch_size = 2 + hidden_size = self.config.hidden_size * 2 + + ############### Attention Inputs ############## + + position_ids = ( + torch.arange(seq_len).unsqueeze(0).repeat(batch_size, 1).to("cuda") + ) + cache_hidden = [[], []] # [cache_k, cache_v] + attention_mask = torch.ones(batch_size, seq_len, dtype=self.dtype).to("cuda") + # Simulate one item in the batch is masked and not taking a full block. 
+ padding_start_index = seq_len - min( + 200, seq_len // 3 + ) # Adjust padding based on seq_len + attention_mask[1, padding_start_index:] = False + input_embeds = norm_tensor( + (batch_size, seq_len, self.config.hidden_size), + device="cuda", + dtype=self.dtype, + ) + decoder_attention_mask = prepare_decoder_attention_mask( + attention_mask=attention_mask, + input_shape=(batch_size, seq_len), + inputs_embeds=input_embeds, + past_key_values_length=0, + ) + hidden_states_list = [] + flex_hidden_states_list = [] + for idx in range(TTT_LENGTH): + hidden_states = norm_tensor( + (batch_size, seq_len, hidden_size), device="cuda", dtype=self.dtype + ) + flex_hidden_states = hidden_states.clone().detach() + hidden_states_list.append(hidden_states) + flex_hidden_states_list.append(flex_hidden_states) + + ############### Flex Attention Inputs ############## + flex_position_ids = position_ids.clone() + past_key_values = DynamicCache() + for idx in range(TTT_LENGTH): + is_last = idx == TTT_LENGTH - 1 + with torch.no_grad(): + output = attention( + hidden_states=hidden_states_list[idx], + attention_mask=decoder_attention_mask, + position_ids=position_ids, + cache_hidden=cache_hidden, + output_attentions=False, + use_cache=True, + ) + with torch.no_grad(): + output_flex = flex_attention( + hidden_states=flex_hidden_states_list[idx], + attention_mask=attention_mask, + position_ids=flex_position_ids, + past_key_values=past_key_values, + ) + torch.testing.assert_close( + output[0][: -1 - idx], output_flex[0][: -1 - idx], atol=1e-2, rtol=1e-2 + ) + torch.testing.assert_close( + output[1][: padding_start_index - idx], + output_flex[1][: padding_start_index - idx], + atol=1e-2, + rtol=1e-2, + ) + + # Check output shape + expected_output_shape = (batch_size, seq_len, self.config.hidden_size) + self.assertEqual(output_flex.shape, expected_output_shape) + # Check output is not NaN or Inf + self.assertFalse(torch.isnan(output_flex).any()) + self.assertFalse(torch.isinf(output_flex).any()) 
+ + def test_backward_pass_gradient_comparison(self): + """Test backward pass comparing gradients between LlamaAttention and LlamaFlexAttention.""" + for seq_len in self.seq_lengths: + with self.subTest(seq_len=seq_len): + self._test_backward_pass_gradient_comparison_for_seq_len(seq_len) + + def _test_backward_pass_gradient_comparison_for_seq_len(self, seq_len): + """Helper method to test backward pass gradient comparison for a specific sequence length.""" + attention = LlamaAttention(self.config).to("cuda").to(self.dtype) + flex_attention = LlamaFlexAttention(self.config).to("cuda").to(self.dtype) + + # Ensure same weights + with torch.no_grad(): + flex_attention.q_proj.weight.copy_(attention.q_proj.weight) + flex_attention.k_proj.weight.copy_(attention.k_proj.weight) + flex_attention.v_proj.weight.copy_(attention.v_proj.weight) + flex_attention.o_proj.weight.copy_(attention.o_proj.weight) + + batch_size = 2 + hidden_size = self.config.hidden_size * 2 + + ############### Attention Inputs ############## + position_ids = ( + torch.arange(seq_len).unsqueeze(0).repeat(batch_size, 1).to("cuda") + ) + cache_hidden = [[], []] # [cache_k, cache_v] + attention_mask = torch.ones(batch_size, seq_len, dtype=torch.bool).to("cuda") + # Simulate one item in the batch is masked and not taking a full block. 
+ # padding_start_index = seq_len - 50 + # attention_mask[1, padding_start_index:] = False + input_embeds = norm_tensor( + (batch_size, seq_len, self.config.hidden_size), + device="cuda", + dtype=self.dtype, + ) + decoder_attention_mask = prepare_decoder_attention_mask( + attention_mask=attention_mask, + input_shape=(batch_size, seq_len), + inputs_embeds=input_embeds, + past_key_values_length=0, + ) + + ############### Flex Attention Inputs ############## + flex_position_ids = position_ids.clone() + ttt_length = TTT_LENGTH + past_key_values = DynamicCache() + loss_mask = torch.ones( + batch_size, seq_len, dtype=self.dtype, requires_grad=False + ).to("cuda") + + # Create input tensors that require gradients + loss_list = [] + loss_flex_list = [] + hidden_states_list = [] + flex_hidden_states_list = [] + for idx in range(TTT_LENGTH): + hidden_states = norm_tensor( + (batch_size, seq_len, hidden_size), device="cuda", dtype=self.dtype + ) + flex_hidden_states = hidden_states.clone().detach() + hidden_states_list.append(hidden_states) + flex_hidden_states_list.append(flex_hidden_states) + + for idx in range(TTT_LENGTH): + is_last = idx == TTT_LENGTH - 1 + output = attention( + hidden_states=hidden_states_list[idx], + attention_mask=decoder_attention_mask, + position_ids=position_ids, + cache_hidden=cache_hidden, + output_attentions=False, + use_cache=True, + ) + output_flex = flex_attention( + hidden_states=flex_hidden_states_list[idx], + attention_mask=attention_mask, + position_ids=flex_position_ids, + past_key_values=past_key_values, + ) + # Apply loss mask on calculation over batch + loss = (output * loss_mask[..., None]).sum().mean() + loss_flex = (output_flex * loss_mask[..., None]).sum().mean() + torch.testing.assert_close(loss, loss_flex, atol=1e-2, rtol=1e-2) + loss_list.append(loss) + loss_flex_list.append(loss_flex) + # Compare gradients + + if not is_last: + # Step 5.7: we need to update the loss mask + loss_mask = padding(loss_mask, left=False) + mean_loss 
= sum(loss_list) / len(loss_list) + mean_loss_flex = sum(loss_flex_list) / len(loss_flex_list) + mean_loss.backward() + mean_loss_flex.backward() + projections = ["q_proj", "k_proj", "v_proj", "o_proj"] + for proj_name in projections: + torch.testing.assert_close( + getattr(attention, proj_name).weight.grad, + getattr(flex_attention, proj_name).weight.grad, + atol=1e-2, + rtol=1e-2, + ) + + +class TestEagle3FlexMask(unittest.TestCase): + + def test_eagle3_flex_mask(self): + B = 1 + H = 1 + S = 128 * 8 + D = 128 + Q_LEN = S + KV_LEN = S * 3 + lck = 128 * 2 + data_type = torch.bfloat16 + query = norm_tensor((B, H, S, D), device="cuda", dtype=data_type) + key_cache = norm_tensor((B, H, KV_LEN, D), device="cuda", dtype=data_type) + value_cache = norm_tensor((B, H, KV_LEN, D), device="cuda", dtype=data_type) + seq_lengths = torch.tensor([S], device="cuda", dtype=torch.int32) + seq_lengths -= lck + block_mask = compile_friendly_create_block_mask( + mask_mod=generate_eagle3_mask( + seq_lengths=seq_lengths, Q_LEN=Q_LEN, KV_LEN=KV_LEN, lck=lck + ), + B=1, + H=1, + Q_LEN=Q_LEN, + KV_LEN=KV_LEN, + device=query.device, + ) + # fmt: off + expected_mask = torch.tensor([[[ + [1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0], + [1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0], + [1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], + [1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0], + [1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0], + [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + ]]], dtype=torch.int32).to(query.device) + # fmt: on + dense_mask = block_mask.to_dense() + assert torch.allclose(dense_mask, expected_mask) + output = compile_friendly_flex_attention( + query, key_cache, 
value_cache, block_mask=block_mask + ) + + +if __name__ == "__main__": + unittest.main(verbosity=2) diff --git a/SpecForge-ext/tests/test_utils/test_loss.py b/SpecForge-ext/tests/test_utils/test_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..56dc47ca513fa205f335bbe74739a25f2dac2c14 --- /dev/null +++ b/SpecForge-ext/tests/test_utils/test_loss.py @@ -0,0 +1,87 @@ +import unittest + +import torch + +from specforge.core.loss import LogSoftmaxLoss, _compute_loss + +from .utils import norm_tensor + + +class TestLogSoftmaxLoss(unittest.TestCase): + + TTT_LENGTH = 7 + + def _test_loss_and_gradient_calculation(self, B, T, V): + if not torch.cuda.is_available(): + device = "cpu" + else: + device = "cuda" + + logits = norm_tensor((B, T, V), device, torch.float32) + logits2 = logits.clone().detach().requires_grad_(True) + target = norm_tensor((B, T, V), device, torch.float32) + position_mask = torch.randint(0, 2, (B, T, 1), dtype=torch.bool, device=device) + + output1 = LogSoftmaxLoss.apply(logits, target, position_mask) + output2 = _compute_loss(logits2, target, position_mask) + torch.testing.assert_close(output1, output2, rtol=1e-4, atol=1e-4) + + output1.backward() + output2.backward() + torch.testing.assert_close(logits.grad, logits2.grad, rtol=1e-4, atol=1e-4) + + def test_loss(self): + B = [1, 2, 4] + T = [1024, 2048, 4096, 6000] + V = [4096, 8192, 10000] + for b in B: + for t in T: + for v in V: + self._test_loss_and_gradient_calculation(b, t, v) + + def test_ttt_loss_accumulation(self): + if not torch.cuda.is_available(): + device = "cpu" + else: + device = "cuda" + + B, T, V = 1, 1024, 3200 + plosses = [] + plosses_compare = [] + logits_list = [ + norm_tensor((B, T, V), device, torch.float32) + for _ in range(self.TTT_LENGTH) + ] + logits_list_copy = [ + logits.clone().detach().requires_grad_(True) for logits in logits_list + ] + for i in range(self.TTT_LENGTH): + logits = logits_list[i] + logits2 = logits_list_copy[i] + target = 
def norm_tensor(shape, device, dtype, std=0.02):
    """Return a gradient-tracking leaf tensor drawn from a truncated normal.

    The tensor is allocated with ``shape``/``device``/``dtype``, filled in
    place by ``torch.nn.init.trunc_normal_`` with mean ``0.0`` and standard
    deviation ``std``, and returned with ``requires_grad`` enabled.
    """
    tensor = torch.empty(shape, device=device, dtype=dtype)
    torch.nn.init.trunc_normal_(tensor, mean=0.0, std=std)
    # Enable autograd only after initialisation; the result is still a leaf.
    return tensor.requires_grad_(True)
def test_preprocess_vlm_conversations():
    """Preprocess one VLM conversation and print its tokens coloured by loss mask.

    Runs ``preprocess_vlm_conversations`` on a single user/assistant exchange
    with one image, then decodes the resulting ``input_ids`` in runs of equal
    loss-mask value: runs with mask ``0`` are printed in red, all other runs
    in green.  This is a visual smoke test; it only asserts that ids and mask
    have the same length.
    """
    conversations = [
        {"role": "user", "content": "what is in the image?"},
        {"role": "assistant", "content": "This is an image of a cat."},
    ]

    examples = {
        "id": ["example1"],
        "image": [
            "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
        ],
        "conversations": [conversations],
    }
    chat_template = TEMPLATE_REGISTRY.get("qwen2-vl")
    processed = preprocess_vlm_conversations(processor, examples, chat_template, 4096)

    print(f"Loss mask sum: {processed['loss_mask'][0].sum()}")
    input_ids = processed["input_ids"][0].squeeze(0)
    loss_mask = processed["loss_mask"][0].squeeze(0).tolist()
    assert len(input_ids) == len(loss_mask), "input_ids and loss_mask differ in length"

    def print_run(token_ids, mask_value, end=""):
        """Decode one run of tokens and print it in the colour of its mask value."""
        color = RED if mask_value == 0 else GREEN
        text = processor.tokenizer.decode(token_ids, skip_special_tokens=False)
        print(f"{color}{text}{RESET}", end=end)

    # Seed the run tracker from the loss mask (not from the token ids) so the
    # first run is coloured correctly and no empty run is flushed.
    current_mask = loss_mask[0] if loss_mask else 0
    current_ids = []
    for token_id, mask_value in zip(input_ids, loss_mask):
        if mask_value != current_mask:
            print_run(current_ids, current_mask)
            current_ids = []
            current_mask = mask_value
        current_ids.append(token_id)

    # Flush the trailing run in the colour matching its own mask value.
    print_run(current_ids, current_mask, end="\n")

    print()
    print(f"input_ids shape: {processed['input_ids'][0].shape}")
    print(f"loss_mask shape: {processed['loss_mask'][0].shape}")
b/SpecForge/.github/CODEOWNERS new file mode 100644 index 0000000000000000000000000000000000000000..e4dbc44f0f9b24da1ad6a96eff14abe45f184255 --- /dev/null +++ b/SpecForge/.github/CODEOWNERS @@ -0,0 +1,11 @@ +.github @FrankLeeeee +/specforge/core @FrankLeeeee +/specforge/data @zyksir @sleepcoo @shuaills +/specforge/layers @FrankLeeeee @FlamingoPg @sleepcoo @shuaills +/specforge/modeling @FlamingoPg @sleepcoo @shuaills @FrankLeeeee +/tests @FrankLeeeee +/assets @FrankLeeeee @zhyncs +/examples @shuaills @sleepcoo @FlamingoPg +/configs @FrankLeeeee @FlamingoPg +/benchmarks @FrankLeeeee +/scripts @shuaills @sleepcoo @FlamingoPg diff --git a/SpecForge/.github/pull_request_template.md b/SpecForge/.github/pull_request_template.md new file mode 100644 index 0000000000000000000000000000000000000000..296468dfb8c84c38784759283db598959572a91f --- /dev/null +++ b/SpecForge/.github/pull_request_template.md @@ -0,0 +1,30 @@ + + +## Motivation + + + +## Modifications + + + +## Related Issues + + + +## Accuracy Test + + + +## Benchmark & Profiling + + + +## Checklist + +- [ ] Format your code according to the [Code Formatting with Pre-Commit](https://docs.sglang.ai/references/contribution_guide.html#code-formatting-with-pre-commit). +- [ ] Add unit tests as outlined in the [Running Unit Tests](https://docs.sglang.ai/references/contribution_guide.html#running-unit-tests-adding-to-ci). +- [ ] Update documentation / docstrings / example tutorials as needed, according to [Writing Documentation](https://docs.sglang.ai/references/contribution_guide.html#writing-documentation-running-docs-ci). +- [ ] Provide throughput / latency benchmark results and accuracy evaluation results as needed, according to [Benchmark and Profiling](https://docs.sglang.ai/references/benchmark_and_profiling.html) and [Accuracy Results](https://docs.sglang.ai/references/accuracy_evaluation.html). 
+- [ ] For reviewers: If you haven't made any contributions to this PR and are only assisting with merging the main branch, please remove yourself as a co-author when merging the PR. +- [ ] Please feel free to join our Slack channel at https://sgl-fru7574.slack.com/archives/C09784E3EN6 to discuss your PR. diff --git a/SpecForge/configs/deepseek-v2-lite-eagle3.json b/SpecForge/configs/deepseek-v2-lite-eagle3.json new file mode 100644 index 0000000000000000000000000000000000000000..da12c0fb4444a55773ac0f84f4360f3476a39d09 --- /dev/null +++ b/SpecForge/configs/deepseek-v2-lite-eagle3.json @@ -0,0 +1,39 @@ +{ + "architectures": [ + "LlamaForCausalLMEagle3" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 100000, + "eos_token_id": 100001, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 10944, + "max_position_embeddings": 163840, + "max_window_layers": 64, + "model_type": "llama", + "num_attention_heads": 16, + "num_hidden_layers": 1, + "num_key_value_heads": 16, + "rms_norm_eps": 1e-06, + "rope_scaling": { + "beta_fast": 32.0, + "beta_slow": 1.0, + "factor": 40.0, + "mscale": 0.707, + "mscale_all_dim": 0.707, + "original_max_position_embeddings": 4096, + "rope_type": "yarn" + }, + "rope_theta": 10000, + "sliding_window": null, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.33.1", + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 102400, + "draft_vocab_size": 32000 +} diff --git a/SpecForge/configs/deepseek-v3-671b-eagle3.json b/SpecForge/configs/deepseek-v3-671b-eagle3.json new file mode 100644 index 0000000000000000000000000000000000000000..147a5fdcd32c7ccd83248eec16dc709ed34e8bce --- /dev/null +++ b/SpecForge/configs/deepseek-v3-671b-eagle3.json @@ -0,0 +1,32 @@ +{ + "architectures": [ + "LlamaForCausalLMEagle3" + ], + "eagle_config": { + "eagle_aux_hidden_state_layer_ids": [ + 1, + 29, + 57 + ], + 
"use_aux_hidden_state": true + }, + "bos_token_id": 151643, + "eos_token_id": 151645, + "hidden_act": "silu", + "hidden_size": 7168, + "initializer_range": 0.02, + "intermediate_size": 40960, + "max_position_embeddings": 163840, + "model_type": "llama", + "num_attention_heads": 56, + "num_key_value_heads": 8, + "num_hidden_layers": 1, + "pad_token_id": 0, + "rms_norm_eps": 1e-05, + "tie_word_embeddings": false, + "torch_dtype": "float16", + "transformers_version": "4.51.0", + "use_cache": true, + "vocab_size": 129280, + "draft_vocab_size": 32000 +} diff --git a/SpecForge/configs/gemma3-1b-eagle3.json b/SpecForge/configs/gemma3-1b-eagle3.json new file mode 100644 index 0000000000000000000000000000000000000000..e5e74eb16a3e47ac9ff4357106ff7c2afe4186da --- /dev/null +++ b/SpecForge/configs/gemma3-1b-eagle3.json @@ -0,0 +1,32 @@ +{ + "architectures": [ + "LlamaForCausalLMEagle3" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 2, + "eos_token_id": 1, + "pad_token_id": 0, + "head_dim": 256, + "hidden_act": "silu", + "hidden_size": 1152, + "initializer_range": 0.02, + "intermediate_size": 6912, + "max_position_embeddings": 32768, + "model_type": "llama", + "num_attention_heads": 4, + "num_hidden_layers": 1, + "num_key_value_heads": 1, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000, + "sliding_window": 512, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.50.0", + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 262145, + "draft_vocab_size": 32000, + "target_model_type": "gemma3_text" +} diff --git a/SpecForge/configs/gpt-oss-120B-eagle3.json b/SpecForge/configs/gpt-oss-120B-eagle3.json new file mode 100644 index 0000000000000000000000000000000000000000..f4b36c7687620c95e90b4ec43ee8a53763826954 --- /dev/null +++ b/SpecForge/configs/gpt-oss-120B-eagle3.json @@ -0,0 +1,30 @@ +{ + "architectures": [ + "LlamaForCausalLMEagle3" + ], + "eagle_config": { + 
"eagle_aux_hidden_state_layer_ids": [ + 1, + 17, + 33 + ] + }, + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2880, + "initializer_range": 0.02, + "intermediate_size": 17280, + "max_position_embeddings": 4096, + "model_type": "llama", + "num_attention_heads": 64, + "num_key_value_heads": 8, + "num_hidden_layers": 1, + "pad_token_id": 0, + "rms_norm_eps": 1e-05, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.52.3", + "use_cache": true, + "vocab_size": 201088, + "draft_vocab_size": 32000 +} diff --git a/SpecForge/configs/gpt-oss-20B-eagle3.json b/SpecForge/configs/gpt-oss-20B-eagle3.json new file mode 100644 index 0000000000000000000000000000000000000000..e1d4b257d9644032488a31a67aca8719ffdbe33e --- /dev/null +++ b/SpecForge/configs/gpt-oss-20B-eagle3.json @@ -0,0 +1,30 @@ +{ + "architectures": [ + "LlamaForCausalLMEagle3" + ], + "eagle_config": { + "eagle_aux_hidden_state_layer_ids": [ + 1, + 11, + 21 + ] + }, + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2880, + "initializer_range": 0.02, + "intermediate_size": 17280, + "max_position_embeddings": 4096, + "model_type": "llama", + "num_attention_heads": 64, + "num_key_value_heads": 8, + "num_hidden_layers": 1, + "pad_token_id": 0, + "rms_norm_eps": 1e-05, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.52.3", + "use_cache": true, + "vocab_size": 201088, + "draft_vocab_size": 32000 +} diff --git a/SpecForge/configs/ling-flash-2.0-eagle3.json b/SpecForge/configs/ling-flash-2.0-eagle3.json new file mode 100644 index 0000000000000000000000000000000000000000..0a9bea37c06ae29010eade7cd4b70cdf4e9e0316 --- /dev/null +++ b/SpecForge/configs/ling-flash-2.0-eagle3.json @@ -0,0 +1,24 @@ +{ + "architectures": [ + "LlamaForCausalLMEagle3" + ], + "bos_token_id": 163584, + "eos_token_id": 163585, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + 
"max_position_embeddings": 32768, + "model_type": "llama", + "num_attention_heads": 32, + "num_key_value_heads": 8, + "num_hidden_layers": 1, + "pad_token_id": 0, + "rms_norm_eps": 1e-05, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.57.1", + "use_cache": true, + "vocab_size": 157184, + "draft_vocab_size": 32000 +} diff --git a/SpecForge/configs/llama3-70B-ealge3.json b/SpecForge/configs/llama3-70B-ealge3.json new file mode 100644 index 0000000000000000000000000000000000000000..20d04f4d0dc09fe2894a7a35673b3a8afdaa8e32 --- /dev/null +++ b/SpecForge/configs/llama3-70B-ealge3.json @@ -0,0 +1,37 @@ +{ + "architectures": [ + "LlamaForCausalLMEagle3" + ], + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 8192, + "initializer_range": 0.02, + "intermediate_size": 28672, + "max_position_embeddings": 4096, + "model_type": "llama", + "num_attention_heads": 64, + "num_key_value_heads": 8, + "num_hidden_layers": 1, + "pad_token_id": 0, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 8.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 4096, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "torch_dtype": "float16", + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 128256, + "draft_vocab_size": 32000 +} diff --git a/SpecForge/configs/llama3-8B-eagle3.json b/SpecForge/configs/llama3-8B-eagle3.json new file mode 100644 index 0000000000000000000000000000000000000000..775ad6afee3c43946742b823b8f4e3d48af68b3c --- /dev/null +++ b/SpecForge/configs/llama3-8B-eagle3.json @@ -0,0 +1,24 @@ +{ + "architectures": [ + "LlamaForCausalLMEagle3" + ], + "bos_token_id": 128000, + "eos_token_id": 128001, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 2048, + 
"model_type": "llama", + "num_attention_heads": 32, + "num_key_value_heads": 8, + "num_hidden_layers": 1, + "pad_token_id": 0, + "rms_norm_eps": 1e-05, + "tie_word_embeddings": false, + "torch_dtype": "float16", + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 128256, + "draft_vocab_size": 32000 +} diff --git a/SpecForge/configs/longcat-flash-dflash.json b/SpecForge/configs/longcat-flash-dflash.json new file mode 100644 index 0000000000000000000000000000000000000000..66e9b33a614a15dc3c5df35d9f6cb8aabe818d61 --- /dev/null +++ b/SpecForge/configs/longcat-flash-dflash.json @@ -0,0 +1,45 @@ +{ + "architectures": [ + "DFlashDraftModel" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoModel": "dflash.DFlashDraftModel" + }, + "block_size": 16, + "bos_token_id": 1, + "dflash_config": { + "mask_token_id": 2, + "target_layer_ids": [1, 7, 13, 19, 25] + }, + "dtype": "bfloat16", + "eos_token_id": 2, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 6144, + "initializer_range": 0.02, + "intermediate_size": 12288, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 40960, + "max_window_layers": 5, + "model_type": "qwen3", + "num_attention_heads": 32, + "num_hidden_layers": 5, + "num_key_value_heads": 8, + "num_target_layers": 28, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000, + "sliding_window": null, + "tie_word_embeddings": false, + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 131072 + } diff --git a/SpecForge/configs/longcat-flash-eagle3.json b/SpecForge/configs/longcat-flash-eagle3.json new file mode 100644 index 0000000000000000000000000000000000000000..7b3b921a22378353f010d1ee1ba03ec44610eb75 --- /dev/null +++ b/SpecForge/configs/longcat-flash-eagle3.json @@ -0,0 +1,31 @@ +{ + "architectures": [ + "LlamaForCausalLMEagle3" + ], + "attention_bias": false, + 
"attention_dropout": 0.0, + "bos_token_id": 1, + "eos_token_id": 2, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 6144, + "initializer_range": 0.02, + "intermediate_size": 12288, + "max_position_embeddings": 131072, + "max_window_layers": 48, + "model_type": "llama", + "num_attention_heads": 64, + "num_hidden_layers": 1, + "num_key_value_heads":16, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 10000000.0, + "sliding_window": null, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.53.2", + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 131072, + "draft_vocab_size": 131072 + } diff --git a/SpecForge/configs/phi4-eagle3.json b/SpecForge/configs/phi4-eagle3.json new file mode 100644 index 0000000000000000000000000000000000000000..05456a0d239653cdc898413860c6822d8a7cdec5 --- /dev/null +++ b/SpecForge/configs/phi4-eagle3.json @@ -0,0 +1,27 @@ +{ + "architectures": [ + "LlamaForCausalLMEagle3" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 100257, + "eos_token_id": 100257, + "pad_token_id": 100257, + "hidden_act": "silu", + "hidden_size": 5120, + "initializer_range": 0.02, + "intermediate_size": 17920, + "max_position_embeddings": 16384, + "model_type": "phi3", + "num_attention_heads": 40, + "num_hidden_layers": 1, + "num_key_value_heads": 10, + "rms_norm_eps": 1e-05, + "rope_theta": 250000.0, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.47.0", + "use_cache": true, + "vocab_size": 100352, + "draft_vocab_size": 32000 +} diff --git a/SpecForge/configs/qwen2-5-vl-7b-eagle3.json b/SpecForge/configs/qwen2-5-vl-7b-eagle3.json new file mode 100644 index 0000000000000000000000000000000000000000..672193e3b1284badcb747356f1cbfcd402e19ccf --- /dev/null +++ b/SpecForge/configs/qwen2-5-vl-7b-eagle3.json @@ -0,0 +1,40 @@ +{ + "architectures": [ + "LlamaForCausalLMEagle3" + ], + "attention_bias": false, + 
"attention_dropout": 0.0, + "bos_token_id": 151643, + "eos_token_id": 151645, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 3584, + "initializer_range": 0.02, + "intermediate_size": 18944, + "max_position_embeddings": 8192, + "max_window_layers": 28, + "model_type": "llama", + "target_model_type": "qwen2_5_vl", + "num_attention_heads": 28, + "num_hidden_layers": 1, + "num_key_value_heads": 4, + "rms_norm_eps": 1e-06, + "pretraining_tp": 1, + "rope_scaling": { + "type": "mrope", + "mrope_section": [ + 16, + 24, + 24 + ] + }, + "rope_theta": 1000000, + "sliding_window": 32768, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.51.0", + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 152064, + "draft_vocab_size": 32000 + } diff --git a/SpecForge/configs/qwen2.5-7b-eagle3.json b/SpecForge/configs/qwen2.5-7b-eagle3.json new file mode 100644 index 0000000000000000000000000000000000000000..f16f6b8d07b120734f1eafd8c2e7881e424a57a1 --- /dev/null +++ b/SpecForge/configs/qwen2.5-7b-eagle3.json @@ -0,0 +1,30 @@ +{ + "architectures": [ + "LlamaForCausalLMEagle3" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 151643, + "eos_token_id": 151645, + "hidden_act": "silu", + "hidden_size": 3584, + "initializer_range": 0.02, + "intermediate_size": 18944, + "max_position_embeddings": 32768, + "max_window_layers": 28, + "model_type": "llama", + "num_attention_heads": 28, + "num_hidden_layers": 1, + "num_key_value_heads": 4, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000.0, + "sliding_window": 131072, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.51.0", + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 152064, + "draft_vocab_size": 16000 +} diff --git a/SpecForge/configs/qwen2.5-vl-32b-eagle3.json b/SpecForge/configs/qwen2.5-vl-32b-eagle3.json new file mode 100644 index 
0000000000000000000000000000000000000000..76aa04cdf7cdf706443308f72f5e487cf6f510ff --- /dev/null +++ b/SpecForge/configs/qwen2.5-vl-32b-eagle3.json @@ -0,0 +1,40 @@ +{ + "architectures": [ + "LlamaForCausalLMEagle3" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 151643, + "eos_token_id": 151645, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 5120, + "initializer_range": 0.02, + "intermediate_size": 18944, + "max_position_embeddings": 8192, + "max_window_layers": 28, + "model_type": "llama", + "target_model_type": "qwen2_5_vl", + "num_attention_heads": 28, + "num_hidden_layers": 1, + "num_key_value_heads": 4, + "rms_norm_eps": 1e-06, + "pretraining_tp": 1, + "rope_scaling": { + "type": "mrope", + "mrope_section": [ + 16, + 24, + 24 + ] + }, + "rope_theta": 1000000, + "sliding_window": 32768, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.51.0", + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 152064, + "draft_vocab_size": 32000 + } diff --git a/SpecForge/configs/qwen3-235B-A22B-eagle3.json b/SpecForge/configs/qwen3-235B-A22B-eagle3.json new file mode 100644 index 0000000000000000000000000000000000000000..8e28c04a18a851c968252b1691b89dcdcff598b9 --- /dev/null +++ b/SpecForge/configs/qwen3-235B-A22B-eagle3.json @@ -0,0 +1,36 @@ +{ + "architectures": [ + "LlamaForCausalLMEagle3" + ], + "eagle_config": { + "eagle_aux_hidden_state_layer_ids": [ + 1, + 46, + 90 + ], + "use_aux_hidden_state": true + }, + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 151643, + "draft_vocab_size": 32000, + "eos_token_id": 151645, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 24576, + "max_position_embeddings": 40960, + "model_type": "llama", + "num_attention_heads": 64, + "num_hidden_layers": 1, + "num_key_value_heads": 4, + "rms_norm_eps": 1e-06, + "rope_theta": 1000000.0, + "rope_scaling": 
null, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.51.0", + "use_cache": true, + "vocab_size": 151936 +} diff --git a/SpecForge/configs/qwen3-30B-A3B-eagle3.json b/SpecForge/configs/qwen3-30B-A3B-eagle3.json new file mode 100644 index 0000000000000000000000000000000000000000..558cb18043a5bd182497536203de90a4a7672f35 --- /dev/null +++ b/SpecForge/configs/qwen3-30B-A3B-eagle3.json @@ -0,0 +1,31 @@ +{ + "architectures": [ + "LlamaForCausalLMEagle3" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 151643, + "eos_token_id": 151645, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 12288, + "max_position_embeddings": 2048, + "max_window_layers": 48, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 1, + "num_key_value_heads":4, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000.0, + "sliding_window": null, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.53.2", + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151936, + "draft_vocab_size": 32000 +} diff --git a/SpecForge/configs/qwen3-32b-eagle3.json b/SpecForge/configs/qwen3-32b-eagle3.json new file mode 100644 index 0000000000000000000000000000000000000000..cf128d9fb451833207c0a4293554357f324aea8c --- /dev/null +++ b/SpecForge/configs/qwen3-32b-eagle3.json @@ -0,0 +1,31 @@ +{ + "architectures": [ + "LlamaForCausalLMEagle3" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 151643, + "eos_token_id": 151645, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 5120, + "initializer_range": 0.02, + "intermediate_size": 25600, + "max_position_embeddings": 40960, + "max_window_layers": 64, + "model_type": "llama", + "num_attention_heads": 64, + "num_hidden_layers": 1, + "num_key_value_heads": 8, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + 
"rope_theta": 1000000, + "sliding_window": null, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.51.0", + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151936, + "draft_vocab_size": 32000 +} diff --git a/SpecForge/configs/qwen3-4b-eagle3.json b/SpecForge/configs/qwen3-4b-eagle3.json new file mode 100644 index 0000000000000000000000000000000000000000..41ae128fdcd532f1e31c6251819d29aedfa9d3e6 --- /dev/null +++ b/SpecForge/configs/qwen3-4b-eagle3.json @@ -0,0 +1,31 @@ +{ + "architectures": [ + "LlamaForCausalLMEagle3" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 151643, + "eos_token_id": 151645, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 2560, + "initializer_range": 0.02, + "intermediate_size": 9728, + "max_position_embeddings": 40960, + "max_window_layers": 36, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 1, + "num_key_value_heads": 8, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000, + "sliding_window": null, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.51.0", + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151936, + "draft_vocab_size": 32000 +} diff --git a/SpecForge/configs/qwen3-8b-dflash.json b/SpecForge/configs/qwen3-8b-dflash.json new file mode 100644 index 0000000000000000000000000000000000000000..518860725a65bae6674c0af60643394ef174f2d9 --- /dev/null +++ b/SpecForge/configs/qwen3-8b-dflash.json @@ -0,0 +1,45 @@ +{ + "architectures": [ + "DFlashDraftModel" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoModel": "dflash.DFlashDraftModel" + }, + "block_size": 16, + "bos_token_id": 151643, + "dflash_config": { + "mask_token_id": 151669, + "target_layer_ids": [1, 9, 17, 25, 33] + }, + "dtype": "bfloat16", + "eos_token_id": 151645, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + 
"initializer_range": 0.02, + "intermediate_size": 12288, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 40960, + "max_window_layers": 5, + "model_type": "qwen3", + "num_attention_heads": 32, + "num_hidden_layers": 5, + "num_key_value_heads": 8, + "num_target_layers": 36, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000, + "sliding_window": null, + "tie_word_embeddings": false, + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151936 +} diff --git a/SpecForge/configs/qwen3-8b-eagle3.json b/SpecForge/configs/qwen3-8b-eagle3.json new file mode 100644 index 0000000000000000000000000000000000000000..b1fa44906d6decad8ccee5c8296699b1db5750f1 --- /dev/null +++ b/SpecForge/configs/qwen3-8b-eagle3.json @@ -0,0 +1,31 @@ +{ + "architectures": [ + "LlamaForCausalLMEagle3" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 151643, + "eos_token_id": 151645, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 12288, + "max_position_embeddings": 40960, + "max_window_layers": 36, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 1, + "num_key_value_heads":8 , + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000, + "sliding_window": null, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.51.0", + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151936, + "draft_vocab_size": 32000 +} diff --git a/SpecForge/configs/qwen3-coder-30B-A3B-instruct-eagle3.json b/SpecForge/configs/qwen3-coder-30B-A3B-instruct-eagle3.json new file mode 100644 index 0000000000000000000000000000000000000000..f296c237973a83f40f4540a97bbc193e2593bb44 --- /dev/null +++ b/SpecForge/configs/qwen3-coder-30B-A3B-instruct-eagle3.json @@ -0,0 +1,31 @@ +{ + "architectures": [ + 
"LlamaForCausalLMEagle3" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 151643, + "eos_token_id": 151645, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 12288, + "max_position_embeddings": 2048, + "max_window_layers": 48, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 1, + "num_key_value_heads": 4, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000.0, + "sliding_window": null, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.53.2", + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151936, + "draft_vocab_size": 32000 +} diff --git a/SpecForge/configs/qwen3-coder-480B-A35B-instruct-eagle3.json b/SpecForge/configs/qwen3-coder-480B-A35B-instruct-eagle3.json new file mode 100644 index 0000000000000000000000000000000000000000..2f27c80cc017e811f8846f2161a977725e669086 --- /dev/null +++ b/SpecForge/configs/qwen3-coder-480B-A35B-instruct-eagle3.json @@ -0,0 +1,31 @@ +{ + "architectures": [ + "LlamaForCausalLMEagle3" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 151643, + "eos_token_id": 151645, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 6144, + "initializer_range": 0.02, + "intermediate_size": 16384, + "max_position_embeddings": 262144, + "max_window_layers": 62, + "model_type": "llama", + "num_attention_heads": 96, + "num_hidden_layers": 1, + "num_key_value_heads":8, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000, + "sliding_window": null, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.51.0", + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151936, + "draft_vocab_size": 32000 +} diff --git a/SpecForge/configs/qwen3-next-80b-a3b-eagle3.json b/SpecForge/configs/qwen3-next-80b-a3b-eagle3.json new file mode 100644 index 
0000000000000000000000000000000000000000..e94a2ea3407d784ee9fbd4b6a15b96cd7cadfec8 --- /dev/null +++ b/SpecForge/configs/qwen3-next-80b-a3b-eagle3.json @@ -0,0 +1,29 @@ +{ + "architectures": [ + "LlamaForCausalLMEagle3" + ], + "attention_dropout": 0.0, + "bos_token_id": 151643, + "decoder_sparse_step": 1, + "eos_token_id": 151645, + "head_dim": 256, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 16384, + "max_position_embeddings": 262144, + "model_type": "llama", + "num_attention_heads": 16, + "num_hidden_layers": 1, + "num_key_value_heads": 2, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000000, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.57.0.dev0", + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151936, + "draft_vocab_size": 32000 + } diff --git a/SpecForge/configs/qwen3.5-35b-a3b-eagle3.json b/SpecForge/configs/qwen3.5-35b-a3b-eagle3.json new file mode 100644 index 0000000000000000000000000000000000000000..803962836d1145d40350b4e3c552446e5f3e81c6 --- /dev/null +++ b/SpecForge/configs/qwen3.5-35b-a3b-eagle3.json @@ -0,0 +1,29 @@ +{ + "architectures": [ + "LlamaForCausalLMEagle3" + ], + "attention_dropout": 0.0, + "bos_token_id": 151643, + "decoder_sparse_step": 1, + "eos_token_id": 248044, + "head_dim": 256, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 16384, + "max_position_embeddings": 262144, + "model_type": "llama", + "num_attention_heads": 16, + "num_hidden_layers": 1, + "num_key_value_heads": 2, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000000, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.57.0.dev0", + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 248320, + "draft_vocab_size": 32000 + } diff --git a/SpecForge/configs/qwq-32B-eagle3.json b/SpecForge/configs/qwq-32B-eagle3.json 
new file mode 100644 index 0000000000000000000000000000000000000000..8f7d7908d5433c886a1725c1ec456f032ba80202 --- /dev/null +++ b/SpecForge/configs/qwq-32B-eagle3.json @@ -0,0 +1,28 @@ +{ + "architectures": [ + "LlamaForCausalLMEagle3" + ], + "attention_dropout": 0.0, + "bos_token_id": 151643, + "eos_token_id": 151645, + "hidden_act": "silu", + "hidden_size": 5120, + "initializer_range": 0.02, + "intermediate_size": 27648, + "max_position_embeddings": 40960, + "max_window_layers": 64, + "model_type": "qwen2", + "num_attention_heads": 40, + "num_hidden_layers": 1, + "num_key_value_heads": 8, + "rms_norm_eps": 1e-05, + "rope_theta": 1000000.0, + "sliding_window": 32768, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.43.1", + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 152064, + "draft_vocab_size": 32000 +} diff --git a/SpecForge/datasets/README.md b/SpecForge/datasets/README.md new file mode 100644 index 0000000000000000000000000000000000000000..8ddbef6d72d759dc06d8e59c15ef73c0ec29c204 --- /dev/null +++ b/SpecForge/datasets/README.md @@ -0,0 +1,5 @@ +## Store Comprehensive Datasets Download Scripts + +| DatasetName | Github | Huggingface | command | +| -------- | -------- | -------- | -------- | +| ALLaVA-4V | [link](https://github.com/FreedomIntelligence/ALLaVA) | [link](https://huggingface.co/datasets/FreedomIntelligence/ALLaVA-4V) | download_laion.sh | diff --git a/SpecForge/datasets/download_laion.sh b/SpecForge/datasets/download_laion.sh new file mode 100644 index 0000000000000000000000000000000000000000..a64d061ebb5de06b2e87cfc3bcd2b38508b7009e --- /dev/null +++ b/SpecForge/datasets/download_laion.sh @@ -0,0 +1,36 @@ + + +laion_root="allava_laion" + +mkdir $laion_root +cd $laion_root + + +# 1. 
download annotation files +## 1.1 caption +wget -c -O ALLaVA-Caption-LAION-4V.json https://huggingface.co/datasets/FreedomIntelligence/ALLaVA-4V/resolve/main/allava_laion/ALLaVA-Caption-LAION-4V.json?download=true + +## 1.2 instruction +wget -c -O ALLaVA-Instruct-LAION-4V.json https://huggingface.co/datasets/FreedomIntelligence/ALLaVA-4V/resolve/main/allava_laion/ALLaVA-Instruct-LAION-4V.json?download=true + + +# 2. download and unzip images +mkdir image_chunks + +## 2.1 download +for ((i=0; i<10; i++)) +do + wget -c -O image_chunks/images_$i.zip https://huggingface.co/datasets/FreedomIntelligence/ALLaVA-4V/resolve/main/allava_laion/image_chunks/images_$i.zip?download=true & +done + +mkdir -p images/ +wait + +## 2.2 unzip +for ((i=0; i<10; i++)) +do + unzip -j -o image_chunks/images_$i.zip -d images/ & # wait patiently, it takes a while... +done + +wait +echo "All done!" diff --git a/SpecForge/docs/Makefile b/SpecForge/docs/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..6b8792c428564ace773add1f751f7c2471a8fe83 --- /dev/null +++ b/SpecForge/docs/Makefile @@ -0,0 +1,58 @@ +# Minimal Makefile for Sphinx documentation +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SPHINXAUTOBUILD ?= sphinx-autobuild +SOURCEDIR = . 
+BUILDDIR = _build +PORT ?= 8003 + +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + @echo "" + @echo "Additional targets:" + @echo " serve to build and serve documentation with auto-build and live reload" + +# Compile Notebook files and record execution time +compile: + @set -e; \ + echo "Starting Notebook compilation..."; \ + mkdir -p logs; \ + echo "Notebook execution timings:" > logs/timing.log; \ + START_TOTAL=$$(date +%s); \ + find $(SOURCEDIR) -path "*/_build/*" -prune -o -name "*.ipynb" -print0 | \ + parallel -0 -j3 --halt soon,fail=1 ' \ + NB_NAME=$$(basename {}); \ + START_TIME=$$(date +%s); \ + retry --delay=0 --times=2 -- \ + jupyter nbconvert --to notebook --execute --inplace "{}" \ + --ExecutePreprocessor.timeout=600 \ + --ExecutePreprocessor.kernel_name=python3; \ + RET_CODE=$$?; \ + END_TIME=$$(date +%s); \ + ELAPSED_TIME=$$((END_TIME - START_TIME)); \ + echo "$${NB_NAME}: $${ELAPSED_TIME}s" >> logs/timing.log; \ + exit $$RET_CODE' || exit 1; \ + END_TOTAL=$$(date +%s); \ + TOTAL_ELAPSED=$$((END_TOTAL - START_TOTAL)); \ + echo "---------------------------------" >> logs/timing.log; \ + echo "Total execution time: $${TOTAL_ELAPSED}s" >> logs/timing.log; \ + echo "All Notebook execution timings:" && cat logs/timing.log + +# Serve documentation with auto-build and live reload +serve: + @echo "Starting auto-build server at http://0.0.0.0:$(PORT)" + @$(SPHINXAUTOBUILD) "$(SOURCEDIR)" "$(BUILDDIR)/html" \ + --host 0.0.0.0 \ + --port $(PORT) \ + --watch $(SOURCEDIR) \ + --re-ignore ".*\.(ipynb_checkpoints|pyc|pyo|pyd|git)" + +.PHONY: help Makefile compile clean serve + +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +clean: + find . 
-name "*.ipynb" -exec nbstripout {} \; + rm -rf $(BUILDDIR) + rm -rf logs diff --git a/SpecForge/docs/README.md b/SpecForge/docs/README.md new file mode 100644 index 0000000000000000000000000000000000000000..592f0e51a0f9be1b4aa959867fb526ed4003c149 --- /dev/null +++ b/SpecForge/docs/README.md @@ -0,0 +1,55 @@ +# SpecForge Documentation + +We recommend that new contributors start by writing documentation, which helps you quickly understand the SpecForge codebase. +Most documentation files are located under the `docs/` folder. + +## Docs Workflow + +### Install Dependencies + +```bash +apt-get update && apt-get install -y pandoc parallel retry +pip install -r requirements.txt +``` + +### Update Documentation + +Update your Jupyter notebooks in the appropriate subdirectories under `docs/`. If you add new files, remember to update `index.rst` (or relevant `.rst` files) accordingly. + +- **`pre-commit run --all-files`** manually runs all configured checks, applying fixes if possible. If it fails the first time, re-run it to ensure lint errors are fully resolved. Make sure your code passes all checks **before** creating a Pull Request. + +```bash +# 1) Compile all Jupyter notebooks +make compile # This step can take a long time (10+ mins). You can consider skipping this step if you can make sure your added files are correct. +make html + +# 2) Compile and Preview documentation locally with auto-build +# This will automatically rebuild docs when files change +# Open your browser at the displayed port to view the docs +bash serve.sh + +# 2a) Alternative ways to serve documentation +# Directly use make serve +make serve +# With custom port +PORT=8080 make serve + +# 3) Clean notebook outputs +# nbstripout removes notebook outputs so your PR stays clean +pip install nbstripout +find . 
-name '*.ipynb' -exec nbstripout {} \; + +# 4) Pre-commit checks and create a PR +# After these checks pass, push your changes and open a PR on your branch +pre-commit run --all-files +``` +--- + +## Documentation Style Guidelines + +- For common functionalities, we prefer **Jupyter Notebooks** over Markdown so that all examples can be executed and validated by our docs CI pipeline. For complex features (e.g., distributed serving), Markdown is preferred. +- Keep in mind the documentation execution time when writing interactive Jupyter notebooks. Each interactive notebook will be run and compiled against every commit to ensure they are runnable, so it is important to apply some tips to reduce the documentation compilation time: + - Use small models (e.g., `qwen/qwen2.5-0.5b-instruct`) for most cases to reduce server launch time. + - Reuse the launched server as much as possible to reduce server launch time. +- Do not use absolute links (e.g., `https://docs.sglang.ai/get_started/install.html`). Always prefer relative links (e.g., `../get_started/install.md`). +- Follow the existing examples to learn how to launch a server, send a query and other common styles. 
diff --git a/SpecForge/docs/conf.py b/SpecForge/docs/conf.py new file mode 100644 index 0000000000000000000000000000000000000000..f1fef2396e931693259e82aee2e78cdb77d6c256 --- /dev/null +++ b/SpecForge/docs/conf.py @@ -0,0 +1,188 @@ +import os +import sys +from datetime import datetime +from pathlib import Path + +sys.path.insert(0, os.path.abspath("../..")) + +DOCS_PATH = Path(__file__).parent +ROOT_PATH = DOCS_PATH.parent + +version_file = ROOT_PATH.joinpath("version.txt") +with open(version_file, "r") as f: + __version__ = f.read().strip() + +project = "SGLang" +copyright = f"2025-{datetime.now().year}, SpecForge" +author = "SpecForge Team" + +version = __version__ +release = __version__ + +extensions = [ + "sphinx.ext.autodoc", + "sphinx.ext.autosummary", + "sphinx.ext.napoleon", + "sphinx.ext.viewcode", + "sphinx.ext.autosectionlabel", + "sphinx.ext.intersphinx", + "sphinx_tabs.tabs", + "myst_parser", + "sphinx_copybutton", + "sphinxcontrib.mermaid", + "nbsphinx", + "sphinx.ext.mathjax", +] + +nbsphinx_allow_errors = True +nbsphinx_execute = "never" + +autosectionlabel_prefix_document = True +nbsphinx_allow_directives = True + + +myst_enable_extensions = [ + "dollarmath", + "amsmath", + "deflist", + "colon_fence", + "html_image", + "substitution", +] + +myst_heading_anchors = 5 + +nbsphinx_kernel_name = "python3" +nbsphinx_execute_arguments = [ + "--InlineBackend.figure_formats={'svg', 'pdf'}", + "--InlineBackend.rc={'figure.dpi': 96}", +] + + +nb_render_priority = { + "html": ( + "application/vnd.jupyter.widget-view+json", + "application/javascript", + "text/html", + "image/svg+xml", + "image/png", + "image/jpeg", + "text/markdown", + "text/latex", + "text/plain", + ) +} + +myst_ref_domains = ["std", "py"] + +templates_path = ["_templates"] + +source_suffix = { + ".rst": "restructuredtext", + ".md": "markdown", +} + +master_doc = "index" + +language = "en" + +exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] + +pygments_style = "sphinx" + +html_theme = 
"sphinx_book_theme" +html_logo = ROOT_PATH.joinpath("assets/logo.png").as_posix() +html_favicon = ROOT_PATH.joinpath("assets/logo.ico").as_posix() +html_title = project +html_copy_source = True +html_last_updated_fmt = "" + +html_theme_options = { + "repository_url": "https://github.com/sgl-project/sgl-project.github.io", + "repository_branch": "main", + "show_navbar_depth": 3, + "max_navbar_depth": 4, + "collapse_navbar": True, + "use_edit_page_button": True, + "use_source_button": True, + "use_issues_button": True, + "use_repository_button": True, + "use_download_button": True, + "use_sidenotes": True, + "show_toc_level": 2, +} + +html_context = { + "display_github": True, + "github_user": "sgl-project", + "github_repo": "sgl-project.github.io", + "github_version": "main", + "conf_py_path": "/docs/", +} + +html_static_path = ["_static", "spec_bundle/public"] +html_css_files = ["css/custom_log.css"] + + +def setup(app): + app.add_css_file("css/custom_log.css") + + +htmlhelp_basename = "sglangdoc" + +latex_elements = {} + +latex_documents = [ + (master_doc, "sglang.tex", "sglang Documentation", "SGLang Team", "manual"), +] + +man_pages = [(master_doc, "sglang", "sglang Documentation", [author], 1)] + +texinfo_documents = [ + ( + master_doc, + "sglang", + "sglang Documentation", + author, + "sglang", + "One line description of project.", + "Miscellaneous", + ), +] + +epub_title = project + +epub_exclude_files = ["search.html"] + +copybutton_prompt_text = r">>> |\.\.\. 
" +copybutton_prompt_is_regexp = True + +autodoc_preserve_defaults = True +navigation_with_keys = False + +autodoc_mock_imports = [ + "torch", + "transformers", + "triton", +] + +intersphinx_mapping = { + "python": ("https://docs.python.org/3.12", None), + "typing_extensions": ("https://typing-extensions.readthedocs.io/en/latest", None), + "pillow": ("https://pillow.readthedocs.io/en/stable", None), + "numpy": ("https://numpy.org/doc/stable", None), + "torch": ("https://pytorch.org/docs/stable", None), +} + +html_theme = "sphinx_book_theme" + + +nbsphinx_prolog = """ +.. raw:: html + + +""" diff --git a/SpecForge/docs/deploy.py b/SpecForge/docs/deploy.py new file mode 100644 index 0000000000000000000000000000000000000000..75b7ea7f23dce0a5deb17c28d78b5cc59833a4d6 --- /dev/null +++ b/SpecForge/docs/deploy.py @@ -0,0 +1,22 @@ +# Deploy the documents + +import os +from datetime import datetime + + +def run_cmd(cmd): + print(cmd) + os.system(cmd) + + +run_cmd("cd $DOC_SITE_PATH; git pull") + +# (Optional) Remove old files +# run_cmd("rm -rf $ALPA_SITE_PATH/*") + +run_cmd("cp -r _build/html/* $DOC_SITE_PATH") + +cmd_message = f"Update {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}" +run_cmd( + f"cd $DOC_SITE_PATH; git add .; git commit -m '{cmd_message}'; git push origin main" +) diff --git a/SpecForge/docs/index.rst b/SpecForge/docs/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..bc2c694798793eddd894f5bd94fde539b9fb06b8 --- /dev/null +++ b/SpecForge/docs/index.rst @@ -0,0 +1,53 @@ +SpecForge Documentation +======================= + +SpecForge is an ecosystem project developed by the SGLang team. It is a framework for training speculative decoding models so that you can smoothly port them over to the SGLang serving framework to speed up your inference. + + +.. toctree:: + :maxdepth: 1 + :caption: Get Started + + get_started/installation.md + get_started/about.md + +.. 
toctree:: + :maxdepth: 1 + :caption: Concepts + + concepts/speculative_decoding.md + concepts/EAGLE3.md + + +.. toctree:: + :maxdepth: 1 + :caption: Basic Usage + + basic_usage/data_preparation.md + basic_usage/training.md + +.. toctree:: + :maxdepth: 1 + :caption: Advanced Features + + advanced_features/customization.md + +.. toctree:: + :maxdepth: 1 + :caption: Community Resources + + community_resources/specbundle.md + community_resources/dashboard.md + +.. toctree:: + :maxdepth: 1 + :caption: Examples + + examples/llama3-eagle3-online.md + examples/llama3-eagle3-offline.md + +.. toctree:: + :maxdepth: 1 + :caption: Benchmarks + + benchmarks/benchmark.md diff --git a/SpecForge/docs/requirements.txt b/SpecForge/docs/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..1a7e5d4eba2f265cb2dce4eff31d770eb71125f3 --- /dev/null +++ b/SpecForge/docs/requirements.txt @@ -0,0 +1,20 @@ +ipykernel +ipywidgets +jupyter_client +markdown>=3.4.0 +matplotlib +myst-parser +nbconvert +nbsphinx +pandoc +pillow +pydantic +sphinx +sphinx-book-theme +sphinx-copybutton +sphinx-tabs +nbstripout +sphinxcontrib-mermaid +urllib3<2.0.0 +gguf>=0.10.0 +sphinx-autobuild diff --git a/SpecForge/docs/serve.sh b/SpecForge/docs/serve.sh new file mode 100644 index 0000000000000000000000000000000000000000..049f767cf497a5fd92b1dac0af2fc13fdcf3fa69 --- /dev/null +++ b/SpecForge/docs/serve.sh @@ -0,0 +1,3 @@ +# Clean and serve documentation with auto-build +make clean +make serve diff --git a/SpecForge/specforge/__init__.py b/SpecForge/specforge/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b07280a0d9e106da207bd1b75de4a22a2de215b1 --- /dev/null +++ b/SpecForge/specforge/__init__.py @@ -0,0 +1,4 @@ +from .core import * # noqa +from .modeling import * # noqa + +__all__ = ["modeling", "core"] diff --git a/SpecForge/specforge/args.py b/SpecForge/specforge/args.py new file mode 100644 index 
0000000000000000000000000000000000000000..2cd5efc30f5500b8df7f93fe2963de0dd7e38162 --- /dev/null +++ b/SpecForge/specforge/args.py @@ -0,0 +1,219 @@ +import argparse +from dataclasses import dataclass +from typing import Any, Dict, List + +from sglang.srt.server_args import ATTENTION_BACKEND_CHOICES + + +@dataclass +class TrackerArgs: + report_to: str = "none" + wandb_project: str = None + wandb_name: str = None + wandb_key: str = None + wandb_offline: bool = False + wandb_dir: str = None + swanlab_project: str = None + swanlab_name: str = None + swanlab_key: str = None + mlflow_experiment_id: str = None + mlflow_run_name: str = None + mlflow_run_id: str = None + mlflow_tracking_uri: str = None + mlflow_registry_uri: str = None + + @staticmethod + def add_args(parser: argparse.ArgumentParser) -> None: + parser.add_argument( + "--report-to", + type=str, + default="none", + choices=["wandb", "tensorboard", "swanlab", "mlflow", "none"], + help="The integration to report results and logs to.", + ) + # wandb-specific args + parser.add_argument("--wandb-project", type=str, default=None) + parser.add_argument("--wandb-name", type=str, default=None) + parser.add_argument("--wandb-key", type=str, default=None, help="W&B API key.") + parser.add_argument( + "--wandb-offline", + action="store_true", + help="Enable W&B offline mode and store logs locally.", + ) + parser.add_argument( + "--wandb-dir", + type=str, + default=None, + help="Directory to store W&B files. 
@dataclass
class SGLangBackendArgs:
    """Options that are forwarded to the SGLang backend.

    ``add_args`` registers the ``--sglang-*`` command-line flags,
    ``from_args`` builds an instance from a parsed namespace, and
    ``to_kwargs`` drops the ``sglang_`` prefix so the values can be passed on
    as keyword arguments.
    """

    # NOTE(review): some dataclass defaults differ from the CLI defaults
    # registered in ``add_args`` ("fa3" vs "flashinfer", torch.compile True vs
    # a ``store_true`` flag that defaults to False) -- confirm which is intended.
    sglang_attention_backend: str = "fa3"
    sglang_mem_fraction_static: float = 0.4
    sglang_context_length: int = None
    sglang_enable_nccl_nvls: bool = False
    sglang_enable_symm_mem: bool = False
    sglang_enable_torch_compile: bool = True
    sglang_enable_dp_attention: bool = False
    sglang_enable_dp_lm_head: bool = False
    sglang_enable_piecewise_cuda_graph: bool = False
    sglang_piecewise_cuda_graph_max_tokens: int = 4096
    sglang_piecewise_cuda_graph_tokens: List[int] = None
    sglang_ep_size: int = 1
    sglang_max_running_requests: int = None  # assign based on batch size
    sglang_max_total_tokens: int = None  # assign based on batch size and seq length

    @staticmethod
    def add_args(parser: argparse.ArgumentParser) -> None:
        """Register every ``--sglang-*`` option on ``parser``."""
        # sglang arguments
        parser.add_argument(
            "--sglang-attention-backend",
            type=str,
            default="flashinfer",
            choices=ATTENTION_BACKEND_CHOICES,
            help="The attention backend of SGLang backend",
        )
        parser.add_argument(
            "--sglang-mem-fraction-static",
            type=float,
            default=0.4,
            help="The fraction of the memory used for static allocation (model weights and KV cache memory pool). Use a smaller value if you see out-of-memory errors.",
        )
        parser.add_argument(
            "--sglang-context-length",
            type=int,
            default=None,
            help="The context length of the SGLang backend",
        )
        parser.add_argument(
            "--sglang-enable-nccl-nvls",
            action="store_true",
            help="Enable NCCL NVLS for prefill heavy requests when available for SGLang backend",
        )
        parser.add_argument(
            "--sglang-enable-symm-mem",
            action="store_true",
            help="Enable NCCL symmetric memory for fast collectives for SGLang backend",
        )
        parser.add_argument(
            "--sglang-enable-torch-compile",
            action="store_true",
            help="Optimize the model with torch.compile for SGLang backend",
        )
        parser.add_argument(
            "--sglang-enable-dp-attention",
            action="store_true",
            help="Enable DP attention for SGLang backend",
        )
        parser.add_argument(
            "--sglang-enable-dp-lm-head",
            action="store_true",
            # The previous help text was a copy of the piecewise CUDA graph one.
            help="Enable DP LM head for SGLang backend",
        )
        parser.add_argument(
            "--sglang-enable-piecewise-cuda-graph",
            action="store_true",
            help="Enable piecewise CUDA graph for SGLang backend's prefill",
        )
        parser.add_argument(
            "--sglang-piecewise-cuda-graph-max-tokens",
            type=int,
            default=4096,
            help="Set the max tokens for piecewise CUDA graph for SGLang backend",
        )
        parser.add_argument(
            "--sglang-piecewise-cuda-graph-tokens",
            type=int,
            nargs="+",
            default=None,
            help="Set the list of tokens when using piecewise cuda graph for SGLang backend",
        )
        parser.add_argument(
            "--sglang-ep-size",
            type=int,
            default=1,
            help="The ep size of the SGLang backend",
        )

    @staticmethod
    def from_args(args: argparse.Namespace) -> "SGLangBackendArgs":
        """Build an instance from a parsed namespace.

        ``sglang_max_running_requests`` is taken from ``args.target_batch_size``
        and ``sglang_max_total_tokens`` from
        ``args.target_batch_size * args.max_length``; each stays ``None`` when
        the attributes it needs are absent from ``args``.
        """
        has_batch_size = hasattr(args, "target_batch_size")
        has_max_length = hasattr(args, "max_length")
        return SGLangBackendArgs(
            sglang_attention_backend=args.sglang_attention_backend,
            sglang_mem_fraction_static=args.sglang_mem_fraction_static,
            sglang_context_length=args.sglang_context_length,
            sglang_enable_nccl_nvls=args.sglang_enable_nccl_nvls,
            sglang_enable_symm_mem=args.sglang_enable_symm_mem,
            sglang_enable_torch_compile=args.sglang_enable_torch_compile,
            sglang_enable_dp_attention=args.sglang_enable_dp_attention,
            sglang_enable_dp_lm_head=args.sglang_enable_dp_lm_head,
            sglang_enable_piecewise_cuda_graph=args.sglang_enable_piecewise_cuda_graph,
            sglang_piecewise_cuda_graph_max_tokens=args.sglang_piecewise_cuda_graph_max_tokens,
            sglang_piecewise_cuda_graph_tokens=args.sglang_piecewise_cuda_graph_tokens,
            sglang_ep_size=args.sglang_ep_size,
            sglang_max_running_requests=(
                args.target_batch_size if has_batch_size else None
            ),
            sglang_max_total_tokens=(
                args.target_batch_size * args.max_length
                if has_batch_size and has_max_length
                else None
            ),
        )

    def to_kwargs(self) -> Dict[str, Any]:
        """Return the fields as a dict keyed without the ``sglang_`` prefix."""
        return dict(
            attention_backend=self.sglang_attention_backend,
            mem_fraction_static=self.sglang_mem_fraction_static,
            context_length=self.sglang_context_length,
            enable_nccl_nvls=self.sglang_enable_nccl_nvls,
            enable_symm_mem=self.sglang_enable_symm_mem,
            enable_torch_compile=self.sglang_enable_torch_compile,
            enable_dp_attention=self.sglang_enable_dp_attention,
            enable_dp_lm_head=self.sglang_enable_dp_lm_head,
            enable_piecewise_cuda_graph=self.sglang_enable_piecewise_cuda_graph,
            piecewise_cuda_graph_max_tokens=self.sglang_piecewise_cuda_graph_max_tokens,
            piecewise_cuda_graph_tokens=self.sglang_piecewise_cuda_graph_tokens,
            ep_size=self.sglang_ep_size,
            max_running_requests=self.sglang_max_running_requests,
            max_total_tokens=self.sglang_max_total_tokens,
        )
+++ b/SpecForge/specforge/distributed.py @@ -0,0 +1,245 @@ +from datetime import timedelta +from typing import Any, Optional + +import torch +import torch.distributed as dist +from yunchang.globals import PROCESS_GROUP, set_seq_parallel_pg + +from specforge.utils import print_with_rank + +_DEVICE_MESH = None +_TP_DEVICE_MESH = None +_TP_GROUP = None +_DP_DEVICE_MESH = None +_DP_GROUP = None +_DRAFT_DP_GROUP = None +_DRAFT_SP_GROUP = None +_SP_ULYSSES_GROUP = None +_SP_RING_GROUP = None + + +def get_tp_group(): + global _TP_GROUP + return _TP_GROUP + + +def get_dp_group(): + global _DP_GROUP + return _DP_GROUP + + +def get_draft_dp_group(): + global _DRAFT_DP_GROUP + return _DRAFT_DP_GROUP + + +def get_draft_sp_group(): + global _DRAFT_SP_GROUP + return _DRAFT_SP_GROUP + + +def get_device_mesh(): + global _DEVICE_MESH + return _DEVICE_MESH + + +def get_tp_device_mesh(): + global _TP_DEVICE_MESH + return _TP_DEVICE_MESH + + +def get_dp_device_mesh(): + global _DP_DEVICE_MESH + return _DP_DEVICE_MESH + + +def get_sp_ulysses_group(): + global _SP_ULYSSES_GROUP + return _SP_ULYSSES_GROUP + + +def get_sp_ring_group(): + global _SP_RING_GROUP + return _SP_RING_GROUP + + +def init_distributed( + timeout: int = 10, tp_size: int = 1, sp_ulysses_size: int = 1, sp_ring_size: int = 1 +): + """Initialize distributed training. 
+ + Args: + timeout(int): Timeout for collective communication in minutes + tp_size(int): The degree of tensor parallelism + """ + dist.init_process_group(backend="nccl", timeout=timedelta(minutes=timeout)) + local_rank = dist.get_rank() % torch.cuda.device_count() + torch.cuda.set_device(local_rank) + print_with_rank(f"bind to device {local_rank}") + + world_size = dist.get_world_size() + dp_size = world_size // tp_size + assert ( + world_size == tp_size * dp_size + ), f"world size must be divisible by tp size, now {world_size=}, {(tp_size * dp_size)=} " + + device_mesh = dist.device_mesh.init_device_mesh( + "cuda", (dp_size, tp_size), mesh_dim_names=("dp", "tp") + ) + + assert ( + world_size % (sp_ulysses_size * sp_ring_size) == 0 + ), f"World size ({world_size}) cannot be evenly divided by total SP size ({sp_ulysses_size*sp_ring_size})" + + draft_dp_size = world_size // (sp_ulysses_size * sp_ring_size) + draft_device_mesh = dist.device_mesh.init_device_mesh( + "cuda", + (draft_dp_size, sp_ulysses_size * sp_ring_size), + mesh_dim_names=("draft_dp", "sp"), + ) + set_seq_parallel_pg(sp_ulysses_size, sp_ring_size, dist.get_rank(), world_size) + + print_with_rank(f"device mesh: {device_mesh}") + tp_group = device_mesh.get_group("tp") + dp_group = device_mesh.get_group("dp") + + sp_ulysses_group = PROCESS_GROUP.ULYSSES_PG + sp_ring_group = PROCESS_GROUP.RING_PG + # we need to create a 1D submesh + tp_device_mesh = dist.DeviceMesh.from_group(tp_group, device_type="cuda") + + global _TP_GROUP, _DP_GROUP, _DEVICE_MESH, _TP_DEVICE_MESH, _DP_DEVICE_MESH, _SP_RING_GROUP, _SP_ULYSSES_GROUP, _DRAFT_DP_GROUP, _DRAFT_SP_GROUP + _DEVICE_MESH = device_mesh + _TP_GROUP = tp_group + _TP_DEVICE_MESH = tp_device_mesh + _SP_ULYSSES_GROUP = sp_ulysses_group + _SP_RING_GROUP = sp_ring_group + _DP_GROUP = dp_group + _DRAFT_DP_GROUP = draft_device_mesh.get_group("draft_dp") + _DRAFT_SP_GROUP = draft_device_mesh.get_group("sp") + _DP_DEVICE_MESH = dist.DeviceMesh.from_group(dp_group, 
def destroy_distributed():
    """Destroy the process groups recorded in this module, then the default one.

    Handles that were never set (still ``None``) are skipped: passing ``None``
    to ``dist.destroy_process_group`` destroys the default group, which would
    make the remaining calls fail. The module-level handles are cleared
    afterwards so the getters do not hand out destroyed groups.
    """
    global _TP_GROUP, _DP_GROUP, _SP_ULYSSES_GROUP, _SP_RING_GROUP
    global _DRAFT_DP_GROUP, _DRAFT_SP_GROUP

    sub_groups = (
        _TP_GROUP,
        _DP_GROUP,
        _SP_ULYSSES_GROUP,
        _SP_RING_GROUP,
        _DRAFT_DP_GROUP,
        _DRAFT_SP_GROUP,
    )
    for group in sub_groups:
        if group is not None:
            dist.destroy_process_group(group)
    dist.destroy_process_group()

    _TP_GROUP = None
    _DP_GROUP = None
    _SP_ULYSSES_GROUP = None
    _SP_RING_GROUP = None
    _DRAFT_DP_GROUP = None
    _DRAFT_SP_GROUP = None
sp_world_size = dist.get_world_size(group=group) + ctx.sp_world_size = sp_world_size + + sp_rank = dist.get_rank(group=group) + ctx.sp_rank = sp_rank + + local_shape = list(local_tensor.size()) + split_size = local_shape[0] + part_size = local_shape[gather_dim] # store original size + ctx.part_size = part_size + + output = all_gather_tensor(local_tensor, group, async_op) + return torch.cat(output.split(split_size, dim=0), dim=gather_dim) + + @staticmethod + def backward(ctx: Any, grad_output: torch.Tensor) -> Any: + if ctx.grad_scaler: + grad_output = grad_output * ctx.sp_world_size + return ( + None, + grad_output.split(ctx.part_size, dim=ctx.gather_dim)[ + ctx.sp_rank + ].contiguous(), + None, + None, + None, + None, + ) + + +def gather_outputs_and_unpad( + x: torch.Tensor, + gather_dim: int, + grad_scaler: bool = True, + group: Optional[dist.ProcessGroup] = None, +): + """ + Gather a tensor across a process group and optionally unpad its padded elements. + + Args: + x (Tensor): Input tensor to gather. + gather_dim (int): Dimension along which to gather across ranks. + grad_scaler (bool): Whether to apply gradient scaling during gather. Defaults to True. + group (ProcessGroup, optional): Process group for gathering. If None, uses + `get_ulysses_sequence_parallel_group()`. If still None, returns `x` unchanged. + + Returns: + Tensor: The gathered tensor, with padding removed if requested. 
class TwoStageScheduler(_LRScheduler):
    """Base class for schedulers that hand over to a wrapped ``after_scheduler``.

    Subclasses implement ``get_lr``/``step`` and set ``self.finished`` once the
    wrapped scheduler has taken over. This class stores the wrapped scheduler
    and (de)serialises its state alongside this scheduler's own state.

    Args:
        optimizer: Wrapped optimizer.
        after_scheduler: Scheduler used for the second stage.
        last_epoch: Passed through to the base scheduler, defaults to -1.
    """

    def __init__(self, optimizer, after_scheduler: _LRScheduler, last_epoch=-1):
        # Both attributes must exist before the base constructor runs, because
        # it performs an initial ``step()`` that calls ``get_lr()``.
        self.after_scheduler = after_scheduler
        self.finished = False
        super().__init__(optimizer, last_epoch)

    def state_dict(self):
        """Return this scheduler's state with the wrapped scheduler's nested in.

        The wrapped scheduler object is replaced by ``after_scheduler_type``
        (its class name) and ``after_scheduler_dict`` (its own state dict).

        Raises:
            NotImplementedError: If ``after_scheduler`` is not an LR scheduler.
        """
        # ``!=`` rather than ``not in "optimizer"``: the latter is a substring
        # test and would also drop attributes named e.g. "opt" or "timer".
        state_dict = {
            key: value for key, value in self.__dict__.items() if key != "optimizer"
        }
        after_scheduler = state_dict["after_scheduler"]
        if not isinstance(after_scheduler, _LRScheduler):
            raise NotImplementedError()
        state_dict["after_scheduler_type"] = type(after_scheduler).__name__
        state_dict["after_scheduler_dict"] = after_scheduler.state_dict()
        del state_dict["after_scheduler"]
        return state_dict

    def load_state_dict(self, state_dict):
        """Restore this scheduler, the wrapped scheduler and the optimizer lr."""
        # Save _last_lr before it gets filtered out
        last_lr = state_dict.get("_last_lr", None)

        if "after_scheduler_dict" not in state_dict:
            warn(
                "after_scheduler_dict is not found, skip loading after_scheduler. This may cause unexpected behavior."
            )
        else:
            self.after_scheduler.load_state_dict(state_dict["after_scheduler_dict"])
            state_dict = {
                key: value
                for key, value in state_dict.items()
                if key not in ("after_scheduler_type", "after_scheduler_dict")
            }
        super().load_state_dict(state_dict)

        # Restore optimizer's lr from _last_lr to ensure consistency
        # This is critical because PyTorch's CosineAnnealingLR.get_lr() uses
        # group["lr"] to compute the next lr, but load_state_dict doesn't
        # update the optimizer's lr automatically.
        if last_lr is not None:
            for param_group, lr in zip(self.optimizer.param_groups, last_lr):
                param_group["lr"] = lr
+ """ + + def __init__(self, optimizer, delay_epochs, after_scheduler, last_epoch=-1): + if delay_epochs < 0: + raise ValueError(f"delay_epochs must >= 0, got {delay_epochs}") + self.delay_epochs = delay_epochs + super().__init__(optimizer, after_scheduler, last_epoch) + + def get_lr(self): + if self.last_epoch >= self.delay_epochs: + if not self.finished: + self.after_scheduler.base_lrs = self.base_lrs + self.finished = True + with _enable_get_lr_call(self.after_scheduler): + return self.after_scheduler.get_lr() + + return self.base_lrs + + def step(self, epoch=None): + if self.finished: + if epoch is None: + self.after_scheduler.step(None) + self._last_lr = self.after_scheduler.get_last_lr() + else: + self.after_scheduler.step(epoch - self.delay_epochs) + self._last_lr = self.after_scheduler.get_last_lr() + else: + return super(DelayerScheduler, self).step(epoch) + + +class WarmupScheduler(TwoStageScheduler): + """Starts with a linear warmup lr schedule until it reaches N epochs then applies + the specific scheduler (For example: ReduceLROnPlateau). + + Args: + optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer. + warmup_epochs (int): Number of epochs to linearly warmup lr until starting applying the scheduler. + after_scheduler (:class:`torch.optim.lr_scheduler`): After target_epoch, use this scheduler. + last_epoch (int, optional): The index of last epoch, defaults to -1. When last_epoch=-1, + the schedule is started from the beginning or When last_epoch=-1, sets initial lr as lr. 
class WarmupDelayerScheduler(TwoStageScheduler):
    """Linear warmup, then a flat phase, then the wrapped scheduler.

    The lr grows linearly for ``warmup_epochs`` epochs, stays at the base lr
    for ``delay_epochs`` epochs, and is produced by ``after_scheduler`` from
    then on.

    Args:
        optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
        warmup_epochs (int): Number of epochs of linear warmup.
        delay_epochs (int): Number of epochs to hold the base lr after warmup.
        after_scheduler (:class:`torch.optim.lr_scheduler`): Scheduler used
            once ``warmup_epochs + delay_epochs`` epochs have passed.
        last_epoch (int, optional): The index of the last epoch, defaults to -1.

    Raises:
        ValueError: If ``delay_epochs`` or ``warmup_epochs`` is negative.
    """

    def __init__(
        self, optimizer, warmup_epochs, delay_epochs, after_scheduler, last_epoch=-1
    ):
        if delay_epochs < 0:
            raise ValueError(f"delay_epochs must >= 0, got {delay_epochs}")
        if warmup_epochs < 0:
            raise ValueError(f"warmup_epochs must >= 0, got {warmup_epochs}")
        self.warmup_epochs = warmup_epochs
        self.delay_epochs = delay_epochs
        super().__init__(optimizer, after_scheduler, last_epoch)

    def get_lr(self):
        """Return the lr list for the current ``last_epoch``."""
        if self.last_epoch >= self.warmup_epochs + self.delay_epochs:
            if not self.finished:
                # First call after the hand-over point: align the wrapped
                # scheduler with our base lrs and mark the switch.
                self.after_scheduler.base_lrs = self.base_lrs
                # reset lr to base_lr
                for group, base_lr in zip(self.optimizer.param_groups, self.base_lrs):
                    group["lr"] = base_lr
                self.finished = True
            with _enable_get_lr_call(self.after_scheduler):
                return self.after_scheduler.get_lr()
        elif self.last_epoch >= self.warmup_epochs:
            return self.base_lrs

        return [(self.last_epoch + 1) / self.warmup_epochs * lr for lr in self.base_lrs]

    def step(self, epoch=None):
        """Advance one step, delegating to ``after_scheduler`` once finished."""
        if self.finished:
            if epoch is None:
                self.after_scheduler.step(None)
            else:
                # The wrapped scheduler only starts after warmup AND delay, so
                # an explicit epoch must be shifted by both phases.
                self.after_scheduler.step(
                    epoch - self.warmup_epochs - self.delay_epochs
                )
            self._last_lr = self.after_scheduler.get_last_lr()
        else:
            return super().step(epoch)
Notice that because the schedule + is defined recursively, the learning rate can be simultaneously modified + outside this scheduler by other operators. If the learning rate is set + solely by this scheduler, the learning rate at each step becomes: + + .. math:: + \eta_t = \eta_{min} + \frac{1}{2}(\eta_{max} - \eta_{min})\left(1 + + \cos\left(\frac{T_{cur}}{T_{max}}\pi\right)\right) + + It has been proposed in + `SGDR: Stochastic Gradient Descent with Warm Restarts`_. Note that this only + implements the cosine annealing part of SGDR, and not the restarts. + + .. _SGDR\: Stochastic Gradient Descent with Warm Restarts: + https://arxiv.org/abs/1608.03983 + + Args: + optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer. + total_steps (int): Number of total training steps. + eta_min (int, optional): Minimum learning rate, defaults to 0. + last_epoch (int, optional): The index of last epoch, defaults to -1. When last_epoch=-1, + the schedule is started from the beginning or When last_epoch=-1, sets initial lr as lr. + """ + + def __init__( + self, + optimizer, + total_steps: int, + eta_min: int = 0, + last_epoch: int = -1, + **kwargs, + ): + super().__init__(optimizer, total_steps, eta_min=eta_min, last_epoch=last_epoch) + + +class CosineAnnealingWarmupLR(WarmupScheduler): + """Cosine annealing learning rate scheduler with learning rate warmup. A linear warmup schedule will be applied. + + Args: + optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer. + total_steps (int): Number of total training steps. + warmup_steps (int, optional): Number of warmup steps, defaults to 0. + eta_min (int, optional): Minimum learning rate, defaults to 0. + last_epoch (int, optional): The index of last epoch, defaults to -1. When last_epoch=-1, + the schedule is started from the beginning or When last_epoch=-1, sets initial lr as lr. 
class BF16Optimizer:
    """AdamW wrapper that updates float32 copies of the trainable parameters.

    The optimizer and the warmup + cosine lr schedule work on the float32
    copies; after every update the values are cast back into the model's own
    parameters.
    """

    def __init__(
        self,
        model,
        lr,
        weight_decay=0.0,
        max_grad_norm=0.5,
        total_steps=800_000,
        warmup_ratio=0.015,
    ):
        # TODO: For now, we only support cosine annealing warmup lr scheduler and AdamW optimizer
        # TODO: We should make these parameters configurable
        # These magic numbers: weight_decay=0.0, max_grad_norm=0.5, total_steps=800k, warmup_steps=12k are copied from
        # https://github.com/SafeAILab/EAGLE/blob/main/eagle/traineagle3/ds_config.json
        trainable = [param for param in model.parameters() if param.requires_grad]

        # One detached float32 copy per trainable parameter; these are what
        # the optimizer actually updates.
        master_copies = []
        for param in trainable:
            master = param.detach().clone().to(torch.float32)
            master.requires_grad = True
            master_copies.append(master)

        self.model = model
        self.model_params = trainable
        self.max_grad_norm = max_grad_norm
        self.fp32_params = master_copies
        self.optimizer = torch.optim.AdamW(
            master_copies, lr=lr, weight_decay=weight_decay
        )
        warmup_steps = int(warmup_ratio * total_steps)
        self.scheduler = CosineAnnealingWarmupLR(
            self.optimizer,
            total_steps=total_steps,
            warmup_steps=warmup_steps,
        )

    def step(self):
        """Run one clipped optimizer step and copy the result back to the model."""
        pairs = list(zip(self.model_params, self.fp32_params))

        # Hand the model gradients over to the float32 copies.
        with torch.no_grad():
            for param, master in pairs:
                if param.grad is None:
                    master.grad = None
                else:
                    master.grad = param.grad.detach().to(torch.float32)

        torch.nn.utils.clip_grad_norm_(self.fp32_params, self.max_grad_norm)
        self.optimizer.step()
        self.optimizer.zero_grad()
        self.scheduler.step()

        # Write the updated values back in the model's dtype and drop the
        # consumed gradients.
        with torch.no_grad():
            for param, master in pairs:
                param.data.copy_(master.data.to(param.dtype))
                param.grad = None

    def load_state_dict(self, state_dict):
        """Load optimizer and scheduler state from ``state_dict``."""
        optimizer_state = state_dict["optimizer_state_dict"]
        self.optimizer.load_state_dict(optimizer_state)
        print_on_rank0("Successfully loaded optimizer state_dict.")
        scheduler_state = state_dict["scheduler_state_dict"]
        self.scheduler.load_state_dict(scheduler_state)
        print_on_rank0("Successfully loaded scheduler state_dict.")

    def state_dict(self):
        """Return the optimizer and scheduler state in a single dict."""
        snapshot = {}
        snapshot["optimizer_state_dict"] = self.optimizer.state_dict()
        snapshot["scheduler_state_dict"] = self.scheduler.state_dict()
        return snapshot

    def get_learning_rate(self):
        """Return the current lr of the first optimizer parameter group."""
        first_group = self.optimizer.param_groups[0]
        return first_group["lr"]
class WandbTracker(Tracker):
    """Tracks experiments using Weights & Biases."""

    # Argument names that hold credentials; they are stripped from the config
    # uploaded to W&B so API keys are not stored with the run.
    _SECRET_ARG_NAMES = ("wandb_key", "swanlab_key")

    @staticmethod
    def _default_wandb_dir() -> str:
        # specforge/tracker.py -> project root is one level up
        return os.path.normpath(os.path.join(os.path.dirname(__file__), "..", "wandb"))

    @classmethod
    def validate_args(cls, parser, args):
        """Check that wandb is installed and resolve the directory and API key.

        The key is looked up in this order: ``--wandb-key``, the
        ``WANDB_API_KEY`` environment variable, then the ``api.wandb.ai`` entry
        of ``~/.netrc``. No key is needed in offline mode. ``parser.error`` is
        called when wandb is missing or no key can be found.
        """
        if wandb is None:
            parser.error(
                "To use --report-to wandb, you must install wandb: 'pip install wandb'"
            )

        if args.wandb_dir is None:
            args.wandb_dir = cls._default_wandb_dir()

        if args.wandb_offline:
            return

        if args.wandb_key is not None:
            return

        if "WANDB_API_KEY" in os.environ:
            args.wandb_key = os.environ["WANDB_API_KEY"]
            return

        try:
            netrc_path = os.path.expanduser("~/.netrc")
            if os.path.exists(netrc_path):
                netrc_file = netrc.netrc(netrc_path)
                if "api.wandb.ai" in netrc_file.hosts:
                    _, _, password = netrc_file.authenticators("api.wandb.ai")
                    if password:
                        args.wandb_key = password
                        return
        except (OSError, netrc.NetrcParseError):
            # Best-effort lookup: an unreadable or malformed ~/.netrc simply
            # means no key was found there (OSError covers FileNotFoundError).
            pass

        if args.wandb_key is None:
            parser.error(
                "When --report-to is 'wandb', you must provide a wandb API key via one of:\n"
                " 1. --wandb-key argument\n"
                " 2. WANDB_API_KEY environment variable\n"
                " 3. `wandb login` command"
            )

    def __init__(self, args, output_dir: str):
        """Log in (unless offline) and start a W&B run on rank 0 only."""
        super().__init__(args, output_dir)
        if self.rank == 0:
            if args.wandb_dir is None:
                args.wandb_dir = self._default_wandb_dir()
            os.makedirs(args.wandb_dir, exist_ok=True)

            if not args.wandb_offline:
                wandb.login(key=args.wandb_key)
            run_config = {
                name: value
                for name, value in vars(args).items()
                if name not in self._SECRET_ARG_NAMES
            }
            init_kwargs = {
                "project": args.wandb_project,
                "name": args.wandb_name,
                "config": run_config,
                "dir": args.wandb_dir,
            }
            if args.wandb_offline:
                init_kwargs["mode"] = "offline"
            wandb.init(**init_kwargs)
            self.is_initialized = True

    def log(self, log_dict: Dict[str, Any], step: Optional[int] = None):
        """Forward ``log_dict`` to ``wandb.log`` on rank 0."""
        if self.rank == 0 and self.is_initialized:
            wandb.log(log_dict, step=step)

    def close(self):
        """Finish the active W&B run on rank 0, if there is one."""
        if self.rank == 0 and self.is_initialized and wandb.run:
            wandb.finish()
            self.is_initialized = False
+ if ( + dist.is_initialized() + and dist.get_world_size() > 1 + and args.swanlab_key is None + ): + parser.error( + "In a distributed environment, when --report-to is 'swanlab', you must provide a swanlab API key via:\n" + " 1. --swanlab-key argument\n" + " 2. SWANLAB_API_KEY environment variable" + ) + + def __init__(self, args, output_dir: str): + super().__init__(args, output_dir) + if self.rank == 0: + if args.swanlab_key: + swanlab.login(api_key=args.swanlab_key) + + swanlog_dir = os.path.join(output_dir, "swanlog") + os.makedirs(swanlog_dir, exist_ok=True) + swanlab.init( + project=args.swanlab_project, + experiment_name=args.swanlab_name, + config=vars(args), + logdir=swanlog_dir, + ) + self.is_initialized = True + + def log(self, log_dict: Dict[str, Any], step: Optional[int] = None): + if self.rank == 0 and self.is_initialized: + swanlab.log(log_dict, step=step) + + def close(self): + if self.rank == 0 and self.is_initialized and swanlab.get_run() is not None: + swanlab.finish() + self.is_initialized = False + + +class TensorboardTracker(Tracker): + """Tracks experiments using TensorBoard.""" + + @classmethod + def validate_args(cls, parser, args): + if SummaryWriter is None: + parser.error( + "To use --report-to tensorboard, you must have tensorboard installed: 'pip install tensorboard'" + ) + + def __init__(self, args, output_dir: str): + super().__init__(args, output_dir) + if self.rank == 0: + log_dir = os.path.join(output_dir, "runs") + self.writer = SummaryWriter(log_dir=log_dir) + self.is_initialized = True + + def log(self, log_dict: Dict[str, Any], step: Optional[int] = None): + if self.rank == 0 and self.is_initialized: + for key, value in log_dict.items(): + if isinstance(value, (int, float)): + self.writer.add_scalar(key, value, global_step=step) + + def close(self): + if self.rank == 0 and self.is_initialized: + self.writer.close() + self.is_initialized = False + + +class MLflowTracker(Tracker): + """Tracks experiments using MLflow.""" + + 
def get_tracker_class(report_to: str) -> Optional["type[Tracker]"]:
    """Look up the tracker class registered under ``report_to``.

    Args:
        report_to: Registry key, e.g. ``"wandb"``, ``"tensorboard"`` or ``"none"``.

    Returns:
        The tracker *class* (not an instance) stored in ``TRACKER_REGISTRY``,
        or ``None`` when the name is not registered.
    """
    return TRACKER_REGISTRY.get(report_to)
@contextmanager
def default_torch_dtype(dtype: torch.dtype):
    """Temporarily replace torch's global default dtype.

    Args:
        dtype: dtype passed to ``torch.set_default_dtype`` for the duration of
            the ``with`` body.

    The previous default is restored when the body exits, including when it
    raises, so a failure inside the block cannot leave the process-wide
    default changed.
    """
    previous_dtype = torch.get_default_dtype()
    torch.set_default_dtype(dtype)
    try:
        yield
    finally:
        torch.set_default_dtype(previous_dtype)
def get_last_checkpoint(folder, prefix="epoch"):
    """Find the most recent checkpoint directory inside ``folder``.

    Checkpoint directories are named ``<prefix>_<epoch>`` or
    ``<prefix>_<epoch>_step_<step>``. Entries that do not match, or that are
    not directories, are ignored. "Most recent" means the largest
    ``(epoch, step)`` pair, with ``step`` treated as 0 when absent.

    Args:
        folder: The folder path containing checkpoints.
        prefix: The prefix for checkpoint directories, default is "epoch".

    Returns:
        tuple: ``(checkpoint_path, (epoch, step))``.
            - ``(None, (0, 0))`` if no checkpoint is found.
            - ``step`` is 0 if not present in the directory name.

    Raises:
        OSError: Propagated from ``os.listdir`` (e.g. ``folder`` is missing).
    """
    # Match: epoch_X or epoch_X_step_Y
    checkpoint_re = re.compile(rf"^{re.escape(prefix)}_(\d+)(?:_step_(\d+))?$")

    best_name = None
    best_key = None
    for name in os.listdir(folder):
        match = checkpoint_re.search(name)
        if match is None or not os.path.isdir(os.path.join(folder, name)):
            continue
        key = (int(match.group(1)), int(match.group(2) or 0))
        # Strict ">" keeps the first entry on ties, like max() does.
        if best_key is None or key > best_key:
            best_name, best_key = name, key

    if best_name is None:
        return None, (0, 0)
    return os.path.join(folder, best_name), best_key
AutoConfig.from_pretrained(target_model_path, cache_dir=cache_dir) + + # If no template specified, use default llama3-8B-eagle3.json + if template_config_path is None: + # Use the script execution directory as base + import sys + + script_dir = os.path.dirname(os.path.abspath(sys.argv[0])) + project_root = os.path.dirname(script_dir) # Go up one level from scripts/ + template_config_path = os.path.join( + project_root, "configs", "llama3-8B-eagle3.json" + ) + + # Read template config + with open(template_config_path, "r") as f: + draft_config = json.load(f) + + # Adjust architecture config based on target model type + if hasattr(target_config, "model_type"): + # Default to llama architecture + draft_config["model_type"] = "llama" + + # Align key parameters + param_mappings = { + "vocab_size": "vocab_size", + "hidden_size": "hidden_size", + "num_attention_heads": "num_attention_heads", + "num_key_value_heads": "num_key_value_heads", + "intermediate_size": "intermediate_size", + "max_position_embeddings": "max_position_embeddings", + "rms_norm_eps": "rms_norm_eps", + "hidden_act": "hidden_act", + "bos_token_id": "bos_token_id", + "eos_token_id": "eos_token_id", + "torch_dtype": "torch_dtype", + } + + # Copy parameters from target model to draft config + for target_param, draft_param in param_mappings.items(): + if hasattr(target_config, target_param): + value = getattr(target_config, target_param) + # Special handling for torch_dtype to make it JSON serializable + if target_param == "torch_dtype" and isinstance(value, torch.dtype): + value = str(value).replace("torch.", "") + draft_config[draft_param] = value + + # Special handling for some parameters + # Ensure num_hidden_layers is always 1 (EAGLE3 feature) + draft_config["num_hidden_layers"] = 1 + + # Keep some fixed draft model specific parameters + draft_config["tie_word_embeddings"] = False + draft_config["use_cache"] = True + + # If template doesn't have draft_vocab_size, set default + if "draft_vocab_size" not 
def save_draft_model_config(config_dict: dict, output_path: str):
    """
    Save draft model config to file as indented UTF-8 JSON.

    Missing parent directories of ``output_path`` are created. A bare
    filename (no directory component) is written to the current working
    directory.

    Args:
        config_dict (dict): Config dictionary
        output_path (str): Output file path
    """
    parent_dir = os.path.dirname(output_path)
    # dirname() is "" for a bare filename, and os.makedirs("") raises.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(config_dict, f, indent=2, ensure_ascii=False)

    print(f"Draft model config saved to: {output_path}")
def get_full_optimizer_state(optimizer_state_dict: dict):
    """
    Convert optimizer state dict with DTensor to full tensors for saving

    Every top-level entry other than ``"state"`` is carried over as-is. Inside
    ``"state"``, each per-parameter value that is a ``DTensor`` is replaced by
    ``value.full_tensor()``; all other values are kept unchanged. The input
    dict is not modified.

    Args:
        optimizer_state_dict (dict): Optimizer state dict possibly containing DTensors
    Returns:
        dict: Optimizer state dict with full tensors
    """
    full_optimizer_state_dict = {
        k: v for k, v in optimizer_state_dict.items() if k != "state"
    }
    if "state" in optimizer_state_dict:
        # Use the DTensor name imported at module level so this check agrees
        # with the rest of the file instead of relying on a second module path.
        full_optimizer_state_dict["state"] = {
            param_id: {
                state_key: (
                    state_tensor.full_tensor()
                    if isinstance(state_tensor, DTensor)
                    else state_tensor
                )
                for state_key, state_tensor in param_state.items()
            }
            for param_id, param_state in optimizer_state_dict["state"].items()
        }
    return full_optimizer_state_dict
def safe_conversations_generator(file_path):
    """
    Stream normalised records from a JSON-lines file.

    For every non-blank line this yields a dict with:
    - ``"conversations"``: the dict items of the row's ``conversations`` list,
      all fields preserved, with every list/dict field value serialised to a
      JSON string so the column has one consistent type (e.g. for Arrow).
      Non-dict items are dropped; a ``None`` value is treated as an empty list.
    - ``"tools"`` (only when the row has that key): ``[]`` for ``None``; a JSON
      string for list/dict values (string inputs are parsed first); the
      original value otherwise, including strings that are not valid JSON.

    Rows whose ``conversations`` is neither a list nor ``None``, and rows that
    raise while being processed, are logged as warnings and skipped.
    """
    with open(file_path, "r", encoding="utf-8") as handle:
        for lineno, raw_line in enumerate(handle, start=1):
            text = raw_line.strip()
            if not text:
                continue
            try:
                record = json.loads(text)
                conversations = record.get("conversations", [])

                if conversations is None:
                    conversations = []
                elif not isinstance(conversations, list):
                    # A plain string or other non-list value cannot be cleaned.
                    logger.warning(
                        f"Line {lineno}: 'conversations' is not a list. Please check!"
                    )
                    continue

                # Keep dict messages only; stringify nested containers.
                sample = {
                    "conversations": [
                        {
                            field: (
                                json.dumps(value, ensure_ascii=False)
                                if isinstance(value, (list, dict))
                                else value
                            )
                            for field, value in message.items()
                        }
                        for message in conversations
                        if isinstance(message, dict)
                    ]
                }

                if "tools" in record:
                    tool_spec = record["tools"]
                    if tool_spec is None:
                        sample["tools"] = []
                    else:
                        if isinstance(tool_spec, str):
                            try:
                                tool_spec = json.loads(tool_spec)
                            except json.JSONDecodeError:
                                # tool_spec is still the raw string and is
                                # stored unchanged below.
                                logger.warning(
                                    f"Line {lineno}: 'tools' is a string but not valid JSON, keeping as-is"
                                )
                        if isinstance(tool_spec, (list, dict)):
                            sample["tools"] = json.dumps(tool_spec, ensure_ascii=False)
                        else:
                            sample["tools"] = tool_spec

                yield sample

            except Exception as e:
                logger.warning(f"Skipping line {lineno}: {e}")
def wait_for_server(
    base_url: str, timeout: int = None, disable_proxy: bool = False
) -> None:
    """Wait for the server to be ready by polling the /v1/models endpoint.

    The endpoint is polled about once per second until it answers with HTTP
    200. Connection errors and non-200 responses both count as "not ready
    yet".

    Args:
        base_url: The base URL of the server
        timeout: Maximum time to wait in seconds. None means wait forever.
        disable_proxy: When True, the proxy environment variables (both lower-
            and upper-case spellings) are removed from ``os.environ`` while
            polling and put back afterwards, even if this function raises.

    Raises:
        TimeoutError: If ``timeout`` is set and the server is not ready in time.
    """
    start_time = time.perf_counter()

    proxy_vars = (
        "http_proxy",
        "https_proxy",
        "no_proxy",
        "HTTP_PROXY",
        "HTTPS_PROXY",
        "NO_PROXY",
    )
    saved_proxy_env = {}
    if disable_proxy:
        for name in proxy_vars:
            value = os.environ.pop(name, None)
            if value is not None:
                saved_proxy_env[name] = value

    try:
        while True:
            try:
                response = requests.get(
                    f"{base_url}/v1/models",
                    headers={"Authorization": "Bearer None"},
                )
                if response.status_code == 200:
                    time.sleep(5)
                    print_highlight(
                        """\n
                    NOTE: Typically, the server runs in a separate terminal.
                    In this notebook, we run the server and notebook code together, so their outputs are combined.
                    To improve clarity, the server logs are displayed in the original black color, while the notebook outputs are highlighted in blue.
                    To reduce the log length, we set the log level to warning for the server, the default log level is info.
                    We are running those notebooks in a CI environment, so the throughput is not representative of the actual performance.
                    """
                    )
                    return
            except requests.exceptions.RequestException:
                # Server is not accepting connections yet; retry below.
                pass

            # Checked on every iteration, so an unreachable server cannot make
            # a finite timeout wait forever.
            if timeout and time.perf_counter() - start_time > timeout:
                raise TimeoutError("Server did not become ready within timeout period")
            time.sleep(1)
    finally:
        # Restore exactly what was removed, on success and on error alike.
        os.environ.update(saved_proxy_env)
+ARG HOST_UID=1003 +ARG HOST_GID=1003 +RUN groupadd -g $HOST_GID devuser && \ + useradd -m -u $HOST_UID -g $HOST_GID -s /bin/zsh devuser + +# Give devuser sudo access +RUN apt-get update && apt-get install -y sudo && \ + echo "devuser ALL=(ALL) NOPASSWD:ALL" > /etc/sudoers.d/devuser && \ + rm -rf /var/lib/apt/lists/* && \ + apt-get clean + +# Set up oh-my-zsh for devuser +RUN cp -r /root/.oh-my-zsh /home/devuser/.oh-my-zsh && \ + cp /root/.zshrc /home/devuser/.zshrc && \ + cp /root/.vimrc /home/devuser/.vimrc && \ + cp /root/.tmux.conf /home/devuser/.tmux.conf && \ + sed -i 's|/root/.oh-my-zsh|/home/devuser/.oh-my-zsh|g' /home/devuser/.zshrc && \ + chown -R devuser:devuser /home/devuser/ + +# Set workspace directory and ownership +WORKDIR /sgl-workspace/sglang +RUN chown -R devuser:devuser /sgl-workspace + +# Switch to devuser +USER devuser + +# Install rust +RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y diff --git a/idea1/.devcontainer/devcontainer.json b/idea1/.devcontainer/devcontainer.json new file mode 100644 index 0000000000000000000000000000000000000000..b2dbad2a745763b273af79b742640461f18b7894 --- /dev/null +++ b/idea1/.devcontainer/devcontainer.json @@ -0,0 +1,30 @@ +{ + "name": "sglang", + "build": { + "dockerfile": "Dockerfile" + }, + "remoteUser": "devuser", + "customizations": { + "vscode": { + "extensions": [ + // Python development + "ms-python.python", + "charliermarsh.ruff", + // Rust development + "rust-lang.rust-analyzer", + "tamasfe.even-better-toml" + ] + } + }, + "forwardPorts": [], + "runArgs": [ + "--gpus", + "all" + ], + // The two lines below ensure that your local changes in the sglang + // repo are automatically synced to the sglang pip package installed + // in the dev docker container. You can remove / comment out these + // two lines if you prefer to sync code changes manually. 
+ "workspaceMount": "source=${localWorkspaceFolder},target=/sgl-workspace/specforge,type=bind", + "workspaceFolder": "/sgl-workspace/specforge" +} diff --git a/idea1/.github/CODEOWNERS b/idea1/.github/CODEOWNERS new file mode 100644 index 0000000000000000000000000000000000000000..e4dbc44f0f9b24da1ad6a96eff14abe45f184255 --- /dev/null +++ b/idea1/.github/CODEOWNERS @@ -0,0 +1,11 @@ +.github @FrankLeeeee +/specforge/core @FrankLeeeee +/specforge/data @zyksir @sleepcoo @shuaills +/specforge/layers @FrankLeeeee @FlamingoPg @sleepcoo @shuaills +/specforge/modeling @FlamingoPg @sleepcoo @shuaills @FrankLeeeee +/tests @FrankLeeeee +/assets @FrankLeeeee @zhyncs +/examples @shuaills @sleepcoo @FlamingoPg +/configs @FrankLeeeee @FlamingoPg +/benchmarks @FrankLeeeee +/scripts @shuaills @sleepcoo @FlamingoPg diff --git a/idea1/.github/ISSUE_TEMPLATE/1-bug-report.yaml b/idea1/.github/ISSUE_TEMPLATE/1-bug-report.yaml new file mode 100644 index 0000000000000000000000000000000000000000..41fa058c4aff03bdeb9e04b06e5d2129fd2e57f1 --- /dev/null +++ b/idea1/.github/ISSUE_TEMPLATE/1-bug-report.yaml @@ -0,0 +1,38 @@ +name: 🐞 Bug report +description: Create a report to help us reproduce and fix the bug +title: "[Bug] " +labels: ['Bug'] + +body: +- type: checkboxes + attributes: + label: Checklist + options: + - label: 1. I have searched related issues but cannot get the expected help. + - label: 2. The bug has not been fixed in the latest version. + - label: 3. Please note that if the bug-related issue you submitted lacks corresponding environment info and a minimal reproducible demo, it will be challenging for us to reproduce and resolve the issue, reducing the likelihood of receiving feedback. + - label: 4. If the issue you raised is not a bug but a question, please raise a discussion at https://github.com/sgl-project/SpecForge/discussions/new/choose Otherwise, it will be closed. + - label: 5. Please use English, otherwise it will be closed. 
+- type: textarea + attributes: + label: Describe the bug + description: A clear and concise description of what the bug is. + validations: + required: true +- type: textarea + attributes: + label: Reproduction + description: | + What command or script did you run? Which **model** are you using? + placeholder: | + A placeholder for the command. + validations: + required: true +- type: textarea + attributes: + label: Environment + description: | + Please provide necessary environment information here. Otherwise the issue will be closed. + placeholder: Environment here. + validations: + required: true diff --git a/idea1/.github/ISSUE_TEMPLATE/2-feature-request.yaml b/idea1/.github/ISSUE_TEMPLATE/2-feature-request.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a6fc81989429af5d3096a389db9adb6ec3993d60 --- /dev/null +++ b/idea1/.github/ISSUE_TEMPLATE/2-feature-request.yaml @@ -0,0 +1,23 @@ +name: 🚀 Feature request +description: Suggest an idea for this project +title: "[Feature] " + +body: +- type: checkboxes + attributes: + label: Checklist + options: + - label: 1. If the issue you raised is not a feature but a question, please raise a discussion at https://github.com/sgl-project/SpecForge/discussions/new/choose Otherwise, it will be closed. + - label: 2. Please use English, otherwise it will be closed. +- type: textarea + attributes: + label: Motivation + description: | + A clear and concise description of the motivation of the feature. + validations: + required: true +- type: textarea + attributes: + label: Related resources + description: | + If there is an official code release or third-party implementations, please also provide the information here, which would be very helpful. 
diff --git a/idea1/.github/pull_request_template.md b/idea1/.github/pull_request_template.md new file mode 100644 index 0000000000000000000000000000000000000000..296468dfb8c84c38784759283db598959572a91f --- /dev/null +++ b/idea1/.github/pull_request_template.md @@ -0,0 +1,30 @@ + + +## Motivation + + + +## Modifications + + + +## Related Issues + + + +## Accuracy Test + + + +## Benchmark & Profiling + + + +## Checklist + +- [ ] Format your code according to the [Code Formatting with Pre-Commit](https://docs.sglang.ai/references/contribution_guide.html#code-formatting-with-pre-commit). +- [ ] Add unit tests as outlined in the [Running Unit Tests](https://docs.sglang.ai/references/contribution_guide.html#running-unit-tests-adding-to-ci). +- [ ] Update documentation / docstrings / example tutorials as needed, according to [Writing Documentation](https://docs.sglang.ai/references/contribution_guide.html#writing-documentation-running-docs-ci). +- [ ] Provide throughput / latency benchmark results and accuracy evaluation results as needed, according to [Benchmark and Profiling](https://docs.sglang.ai/references/benchmark_and_profiling.html) and [Accuracy Results](https://docs.sglang.ai/references/accuracy_evaluation.html). +- [ ] For reviewers: If you haven't made any contributions to this PR and are only assisting with merging the main branch, please remove yourself as a co-author when merging the PR. +- [ ] Please feel free to join our Slack channel at https://sgl-fru7574.slack.com/archives/C09784E3EN6 to discuss your PR. 
diff --git a/idea1/.github/workflows/lint.yaml b/idea1/.github/workflows/lint.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3cf35a6be5986ecd8e9f90cef12a75438e8401d6 --- /dev/null +++ b/idea1/.github/workflows/lint.yaml @@ -0,0 +1,22 @@ +name: Lint + +on: [ pull_request ] + +jobs: + lint: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.11' + + - name: Install pre-commit hook + run: | + python -m pip install pre-commit + pre-commit install + + - name: Linting + run: pre-commit run --all-files --show-diff-on-failure diff --git a/idea1/.github/workflows/publish_docs.yaml b/idea1/.github/workflows/publish_docs.yaml new file mode 100644 index 0000000000000000000000000000000000000000..27f4639d2eb35474f4865f57f9031e18df722942 --- /dev/null +++ b/idea1/.github/workflows/publish_docs.yaml @@ -0,0 +1,72 @@ +name: Release Documentation + +on: + push: + branches: + - main + paths: + - "docs/**" + - "version.txt" + workflow_dispatch: + +concurrency: + group: release-docs-${{ github.ref }} + cancel-in-progress: true + +jobs: + deploy-github-pages: + runs-on: ubuntu-latest + if: github.repository == 'sgl-project/specforge' || github.repository == 'sleepcoo/SpecForge' + permissions: + contents: write + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: '3.13' + + - name: Setup Node.js + uses: actions/setup-node@v4 + with: + node-version: '20' + cache: 'npm' + cache-dependency-path: docs/spec_bundle/package-lock.json + + - name: Install dependencies + run: | + sudo apt-get update && sudo apt-get install -y pandoc parallel retry + pip install -r docs/requirements.txt + + - name: Build spec bundle dashboard + run: | + # Copy logos to public directory + cp assets/logo.png docs/spec_bundle/public/logo.png + cp docs/_static/imgs/specbundle-logo.png 
echo "=== Checking GPU Utilization ==="

# Walk every GPU reported by nvidia-smi (index + utilization) and force-kill
# the compute processes running on it, so CI starts from idle GPUs.
nvidia-smi --query-gpu=index,utilization.gpu --format=csv,noheader,nounits | while IFS=',' read -r gpu_index utilization; do
    gpu_index=$(echo "$gpu_index" | tr -d ' ')
    utilization=$(echo "$utilization" | tr -d ' ')
    echo "GPU $gpu_index: ${utilization}% utilization"

    # Get PIDs running on this GPU
    pids=$(nvidia-smi --query-compute-apps=pid --format=csv,noheader --id="$gpu_index")

    if [ -z "$pids" ]; then
        echo " No processes found on GPU $gpu_index."
    else
        echo " Killing processes on GPU $gpu_index: $pids"
        for pid in $pids; do
            pid=$(echo "$pid" | tr -d ' ')
            echo " Killing PID $pid..."
            # Quote the PID and report the outcome; a failed kill (e.g. a
            # process owned by another user) must not pass silently.
            kill -9 "$pid" && echo " PID $pid killed." || echo " Failed to kill PID $pid (may need sudo)."
        done
    fi
done

echo ""
-z "$(ls -A /github/home/sf/)" ]; then + cp -p -r /github/home/sf ./ + fi + + - name: Remove flashinfer # this is needed to avoid flashinfer jit compilation makes the program hang + run: | + rm -rf /github/home/.cache/flashinfer + + - name: Install dependencies + shell: bash + run: | + # if sf venv does not exist, create it + if [ ! -d sf ]; then + uv venv sf -p 3.11 + fi + source sf/bin/activate + uv pip install setuptools + MAX_JOBS=8 uv pip install -v ".[fa]" --prerelease=allow --no-build-isolation + + - name: Kill GPU processes + shell: bash + run: | + bash .github/workflows/scripts/delete_gpu_process.sh + + - name: Run test + timeout-minutes: 30 + shell: bash + run: | + source sf/bin/activate + export PYTHONPATH=$PWD + python -m unittest discover -s ./tests -p "test_*.py" -v + + - name: Save cache + run: | + cp -p -r sf /github/home/ + cp -p -r cache /github/home/ diff --git a/idea1/assets/logo.svg b/idea1/assets/logo.svg new file mode 100644 index 0000000000000000000000000000000000000000..7f619f50a0be61ade41e82599a40db2a45b3c376 --- /dev/null +++ b/idea1/assets/logo.svg @@ -0,0 +1,938 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/idea1/benchmarks/README.md b/idea1/benchmarks/README.md new file mode 100644 index 0000000000000000000000000000000000000000..678ce7a257b8bddb2a44373a4eba1ff77595813a --- /dev/null +++ 
b/idea1/benchmarks/README.md @@ -0,0 +1,67 @@ +# Benchmarking for Speculative Decoding + +## Overview + +We provide a unified script to test the performance of speculative decoding with the EAGLE3 algorithm on multiple datasets. You can follow the steps below to run the benchmarks. + +## Run Benchmarks + +### Launch SGLang and Benchmarker Concurrently + +`bench_eagle3.py` can launch an SGLang server process and a benchmarking process concurrently. This way you don't have to launch the SGLang server manually; the script automatically launches (and shuts down) the server for each speculative decoding configuration. Some important arguments are: +- `--model-path`: the path to the target model. +- `--speculative-draft-model-path`: the path to the draft model. +- `--port`: the port to launch the SGLang server. +- `--trust-remote-code`: trust the remote code. +- `--mem-fraction-static`: the memory fraction for the static memory. +- `--tp-size`: the tensor parallelism size. +- `--attention-backend`: the attention backend. +- `--config-list`: the list of speculative decoding configurations to test, the format is `<batch-size>,<num-steps>,<topk>,<num-draft-tokens>` (use `0` steps to run without speculative decoding). +- `--benchmark-list`: the list of benchmarks to test, the format is `<benchmark-name>:<num-prompts>:<subset>`, where `<num-prompts>` and `<subset>` are optional and `<subset>` may be a comma-separated list. + +```shell +python3 bench_eagle3.py \ + --model-path meta-llama/Llama-3.1-8B-Instruct \ + --speculative-draft-model-path lmsys/sglang-EAGLE3-LLaMA3.1-Instruct-8B \ + --port 30000 \ + --trust-remote-code \ + --mem-fraction-static 0.8 \ + --tp-size 1 \ + --attention-backend fa3 \ + --config-list 1,0,0,0 1,3,1,4 \ + --benchmark-list mtbench gsm8k:5 ceval:5:accountant \ + --dtype bfloat16 +``` + +### Launch Benchmarker Independently + +If you want to launch the SGLang server independently, you can use the following command. 
+ +```shell +# you can launch a server +python3 -m sglang.launch_server \ + --model meta-llama/Llama-3.1-8B-Instruct \ + --speculative-algorithm EAGLE3 \ + --speculative-draft-model-path lmsys/sglang-EAGLE3-LLaMA3.1-Instruct-8B \ + --speculative-num-steps 3 \ + --speculative-eagle-topk 1 \ + --speculative-num-draft-tokens 4 \ + --mem-fraction-static 0.75 \ + --cuda-graph-max-bs 1 \ + --tp 1 \ + --trust-remote-code \ + --host 0.0.0.0 \ + --port 30000 \ + --dtype bfloat16 +``` + +Then we can start benchmarking. Note that you should use the same host and port as the one used in the SGLang server. Note that `--skip-launch-server` is required to skip the launch of the SGLang server. + +```bash +python bench_eagle3.py \ + --model-path meta-llama/Llama-3.1-8B-Instruct \ + --port 30000 \ + --config-list 1,3,1,4 \ + --benchmark-list mtbench:5 ceval:5:accountant gsm8k:5 humaneval:5 math500:5 mtbench:5 aime:1 \ + --skip-launch-server +``` diff --git a/idea1/benchmarks/__init__.py b/idea1/benchmarks/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..dfec7eeb81271e0d07feafe767881dba5dcf4acd --- /dev/null +++ b/idea1/benchmarks/__init__.py @@ -0,0 +1,3 @@ +""" +Benchmark scripts for speculative decoding evaluation. +""" diff --git a/idea1/benchmarks/bench_eagle3.py b/idea1/benchmarks/bench_eagle3.py new file mode 100644 index 0000000000000000000000000000000000000000..988e108f5e1f8ce82e9ccbeaf1b77a5d741fa816 --- /dev/null +++ b/idea1/benchmarks/bench_eagle3.py @@ -0,0 +1,268 @@ +#!/usr/bin/env python3 +""" +Usage: + +# if you want to run benchmarks directly +# mtbench:20 means only run 20 samples in the dataset +python bench_eagle3.py \ + --model meta-llama/Llama-3.1-8B-Instruct \ + --speculative-algorithm EAGLE3 \ + --speculative-draft-model-path lmsys/sglang-EAGLE3-LLaMA3.1-Instruct-8B \ + --port 30000 \ + --config-list 1,0,0,0 1,3,1,4 \ + --benchmark-list mtbench:20 \ + --dtype bfloat16 + + +or if you want run sglang alone. 
+ +# launch sglang +python3 -m sglang.launch_server \ + --model meta-llama/Llama-3.1-8B-Instruct \ + --speculative-algorithm EAGLE3 \ + --speculative-draft-model-path lmsys/sglang-EAGLE3-LLaMA3.1-Instruct-8B \ + --speculative-num-steps 3 \ + --speculative-eagle-topk 1 \ + --speculative-num-draft-tokens 4 \ + --mem-fraction-static 0.75 \ + --cuda-graph-max-bs 1 \ + --tp 1 \ + --trust-remote-code \ + --host 0.0.0.0 \ + --port 30000 \ + --dtype bfloat16 + +# then run benchmarks +python bench_eagle3.py \ + --model-path meta-llama/Llama-3.1-8B-Instruct \ + --port 30000 \ + --config-list 1,0,0,0 \ + --benchmark-list mtbench:80 \ + --dtype bfloat16 \ + --skip-launch-server +""" +import argparse +import json +import os +import time +from dataclasses import asdict +from typing import List + +import requests +from benchmarker import BENCHMARKS +from sglang.srt.server_args import ServerArgs +from sglang.test.test_utils import kill_process_tree, popen_launch_server +from sglang.utils import wait_for_server + + +def parse_args(): + parser = argparse.ArgumentParser() + sglang_group = parser.add_argument_group("sglang") + ServerArgs.add_cli_args(sglang_group) + + # make the follow args a group + benchmark_group = parser.add_argument_group("benchmark") + benchmark_group.add_argument( + "--skip-launch-server", action="store_true", default=False + ) + benchmark_group.add_argument("--timeout-for-server-launch", type=int, default=600) + benchmark_group.add_argument("--num-prompts", type=int, default=80) + benchmark_group.add_argument("--output-dir", type=str, default="./results") + benchmark_group.add_argument( + "--config-list", type=str, nargs="+", default=["1,0,0,0", "1,3,1,4"] + ) + benchmark_group.add_argument( + "--name", + type=str, + default=None, + help="name of this benchmark run, if provided, will be added to the output file name", + ) + benchmark_group.add_argument( + "--benchmark-list", + type=str, + nargs="+", + default=[ + "mtbench:80", + "gsm8k:200", + "humaneval:200", 
def launch_sglang_server(
    server_args: ServerArgs,
    base_url: str,
    batch_size: int,
    steps: int,
    topk: int,
    num_draft_tokens: int,
    timeout: int,
):
    """
    Start an SGLang server subprocess for a single benchmark configuration.

    Args:
        server_args: Parsed SGLang server arguments; only the fields read
            below are forwarded to the server command line.
        base_url: URL handed to ``popen_launch_server``.
        batch_size: Used for both ``--cuda-graph-max-bs`` and
            ``--max-running-requests``.
        steps: Number of speculative steps; speculative decoding flags are
            only added when this is greater than zero.
        topk: Value for ``--speculative-eagle-topk``.
        num_draft_tokens: Value for ``--speculative-num-draft-tokens``.
        timeout: Launch timeout forwarded to ``popen_launch_server``.

    Returns:
        The process handle returned by ``popen_launch_server``.
    """
    cli_flags: List[str] = []

    # Speculative decoding (EAGLE3) is only enabled when draft steps are requested.
    if steps > 0:
        cli_flags += [
            "--speculative-algorithm",
            "EAGLE3",
            "--speculative-num-steps",
            str(steps),
            "--speculative-eagle-topk",
            str(topk),
            "--speculative-num-draft-tokens",
            str(num_draft_tokens),
            "--speculative-draft-model-path",
            server_args.speculative_draft_model_path,
        ]

    # Flags that are always present.
    cli_flags += [
        "--cuda-graph-max-bs",
        str(batch_size),
        "--mem-fraction-static",
        str(server_args.mem_fraction_static),
        "--tp-size",
        str(server_args.tp_size),
        "--max-running-requests",
        str(batch_size),
    ]

    # Boolean switches.
    if server_args.trust_remote_code:
        cli_flags.append("--trust-remote-code")
    if server_args.disable_radix_cache:
        cli_flags.append("--disable-radix-cache")

    # Optional valued flags, emitted only when the value is truthy.
    if server_args.ep_size:
        cli_flags += ["--ep-size", str(server_args.ep_size)]
    if server_args.attention_backend:
        cli_flags += ["--attention-backend", server_args.attention_backend]
    if server_args.quantization:
        cli_flags += ["--quantization", server_args.quantization]
    if server_args.dtype:
        cli_flags += ["--dtype", server_args.dtype]

    # Defaults first, then the caller's environment, so an exported variable
    # with the same name takes precedence over the defaults.
    server_env = {
        "SGLANG_RECORD_STEP_TIME": "1",
        "SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN": "1",
    }
    server_env.update(os.environ)

    return popen_launch_server(
        server_args.model_path,
        base_url,
        timeout=timeout,
        other_args=cli_flags,
        env=server_env,
    )
def main():
    """
    Entry point: run every requested benchmark under every requested config.

    ``--config-list`` items are ``batch_size,steps,topk,num_draft_tokens``.
    ``--benchmark-list`` items are ``name[:num_prompts[:subset1,subset2,...]]``.

    With ``--skip-launch-server`` the benchmarks run once against an already
    running server, using the batch size of the first config. Otherwise a
    server is launched per config and is always torn down again, even when a
    benchmark fails.

    Raises:
        ValueError: If a benchmark item has more than three ``:``-separated
            fields, a non-integer prompt count, or the list is empty.
    """
    args = parse_args()
    server_args: ServerArgs = ServerArgs.from_cli_args(args)
    configs = [tuple(map(int, config.split(","))) for config in args.config_list]

    # Parse each item into (bench_name, num_prompts, subset). The prompt count
    # is converted here so that a malformed value fails before any server is
    # launched rather than in the middle of a run.
    benchmark_list = []
    for item in args.benchmark_list:
        splits = item.split(":")
        if len(splits) > 3:
            raise ValueError(f"Invalid benchmark list format: {item}")
        bench_name = splits[0]
        num_prompts = int(splits[1]) if len(splits) >= 2 else None
        subset = splits[2].split(",") if len(splits) == 3 else None
        benchmark_list.append((bench_name, num_prompts, subset))
    if not benchmark_list:
        raise ValueError("the number of benchmark list is 0")

    base_url = f"http://localhost:{args.port}"

    results = {}
    results["model"] = server_args.speculative_draft_model_path

    def run_benchmarks(batch_size: int, steps: int, topk: int, num_draft_tokens: int):
        for benchmark_name, num_prompts, subset in benchmark_list:
            print(
                f"Running benchmark {benchmark_name} with {num_prompts} prompts, batch size {batch_size}, steps {steps}, topk {topk}, num_draft_tokens {num_draft_tokens}, subset {subset}"
            )
            benchmarker_cls = BENCHMARKS.get(benchmark_name)
            if subset is None:
                benchmarker = benchmarker_cls(num_samples=num_prompts)
            else:
                benchmarker = benchmarker_cls(num_samples=num_prompts, subset=subset)
            # A benchmarker may return None when it found no usable questions;
            # treat that as "no metrics" instead of crashing on iteration.
            metrics_list = (
                benchmarker.run(host=args.host, port=args.port, batch_size=batch_size)
                or []
            )
            send_flush_cache_request(base_url)
            results.setdefault(benchmark_name, []).append(
                dict(
                    batch_size=batch_size,
                    steps=steps,
                    topk=topk,
                    num_draft_tokens=num_draft_tokens,
                    metrics=[asdict(metric) for metric in metrics_list],
                    num_samples=num_prompts,
                )
            )

    if args.skip_launch_server:
        batch_size = configs[0][0] if len(configs) > 0 else 8
        run_benchmarks(batch_size, None, None, None)
    else:
        # Iterate over each config from args, one server per config.
        for batch_size, steps, topk, num_draft_tokens in configs:
            process = launch_sglang_server(
                server_args,
                base_url,
                batch_size,
                steps,
                topk,
                num_draft_tokens,
                args.timeout_for_server_launch,
            )
            try:
                wait_for_server(base_url)
                run_benchmarks(batch_size, steps, topk, num_draft_tokens)
            finally:
                # Always release the GPU-holding server, even if a benchmark
                # raised; otherwise the next launch would collide with it.
                kill_process_tree(process.pid)
                process.wait()

    os.makedirs(args.output_dir, exist_ok=True)
    timestamp = time.strftime("%Y%m%d_%H%M%S")
    # NOTE(review): the file holds one indented JSON document, not JSON Lines;
    # the ".jsonl" suffix is kept so existing consumers keep finding it.
    result_file = os.path.join(
        args.output_dir,
        f"{args.name + '_' if args.name else ''}results_{timestamp}.jsonl",
    )
    with open(result_file, "w") as f:
        json.dump(results, f, indent=4)
    print(f"Results saved to {result_file}")
def extract_aime_answer(output: str) -> Optional[str]:
    r"""Extract the final answer from an AIME problem solution.

    AIME answers are integers between 0 and 999 and are usually given in
    \boxed{} format. Strategies are tried in this order:

    1. The last ``\boxed{...}`` occurrence (its last integer, or its raw
       content when it contains no digits).
    2. The last ``\boxed <digits>`` occurrence (no braces).
    3. Phrases such as "Answer: 123" or a trailing "... is 42" / "= 42".
    4. The last stand-alone integer in the range 0-999.

    Args:
        output: Raw model output.

    Returns:
        The extracted answer as a string, or None when nothing was found.
    """
    # A solution may box intermediate results; the final box is the answer.
    boxed = re.findall(r"\\boxed\{([^}]+)\}", output)
    if boxed:
        answer = boxed[-1].strip()
        # Take the last number inside the box (e.g. "x = 42" -> "42").
        numbers = re.findall(r"\d+", answer)
        if numbers:
            return numbers[-1]
        return answer

    # \boxed without braces, e.g. "\boxed 42".
    bare_boxed = re.findall(r"\\boxed\s+(\d+)", output)
    if bare_boxed:
        return bare_boxed[-1].strip()

    # Matching is case-insensitive, so one spelling per phrase is enough.
    # "answer" also covers "final answer"; the second pattern only fires at
    # the very end of the output and tolerates whitespace before the number.
    answer_patterns = [
        r"answer[\s:]+(\d+)",
        r"(?:\bis|\bequals?|=)\s*(\d+)\s*$",
    ]
    for pattern in answer_patterns:
        matches = re.findall(pattern, output, re.IGNORECASE)
        if matches:
            return matches[-1].strip()

    # Fallback: the last integer in the text that is a plausible AIME answer.
    numbers = re.findall(r"\b(\d+)\b", output)
    valid_numbers = [n for n in numbers if 0 <= int(n) <= 999]
    if valid_numbers:
        return valid_numbers[-1]

    return None
def compute_accuracy(
    self, predictions: List[Any], labels: List[Any]
) -> Optional[float]:
    """Score AIME predictions against the labelled answers.

    Only positions whose label is not None are scored. A prediction counts
    as correct when it equals the label as a stripped string, or when both
    parse to the same integer. Returns None when there are no labels at all
    or every label is None.
    """
    if not labels or len(labels) == 0:
        return None
    if all(label is None for label in labels):
        return None

    def _is_match(pred: Any, label: Any) -> bool:
        # A missing prediction is simply wrong.
        if pred is None:
            return False
        pred_text = str(pred).strip()
        label_text = str(label).strip()
        if pred_text == label_text:
            return True
        # Fall back to numeric equality so "042" matches "42".
        try:
            return int(pred_text) == int(label_text)
        except ValueError:
            return False

    scored_pairs = [
        (pred, label) for pred, label in zip(predictions, labels) if label is not None
    ]
    if not scored_pairs:
        return 0.0
    hits = sum(1 for pred, label in scored_pairs if _is_match(pred, label))
    return hits / len(scored_pairs)
class Benchmarker(ABC):
    """
    Base class for benchmark implementations.

    Subclasses should implement:
    - load_data(): Load and preprocess dataset
    - create_sgl_function(): Create the SGL function for inference

    Optional overrides:
    - extract_answer(): Extract answer from model output (if needed)
    - compute_accuracy(): Compute accuracy metric (if applicable)
    - get_answer_keys(): Get list of answer keys for multi-turn conversations
    - get_max_new_tokens(): Change the default generation budget

    Args:
        num_samples: The number of samples to run the benchmark on. If not provided, all questions will be used.
        subset: The subset of the dataset to run the benchmark on. If not provided, all subsets will be used.
    """

    def __init__(
        self, num_samples: Optional[int] = None, subset: Optional[List[str]] = None
    ):
        self.num_samples = num_samples
        self.subset = subset

    @abstractmethod
    def load_data(self) -> Tuple[List[Dict[str, Any]], List[Any]]:
        """
        Load and preprocess the dataset.

        Returns:
            Tuple of (questions, labels) where:
            - questions: List of question dicts for SGL function
            - labels: List of ground truth labels (can be None if not applicable)
        """
        raise NotImplementedError

    @abstractmethod
    def create_sgl_function(self) -> Callable:
        """
        Create the SGL function for inference.

        Returns:
            SGL function decorated with @sgl.function
        """
        raise NotImplementedError

    def extract_answer(self, output: str, label: Optional[Any] = None) -> Optional[Any]:
        """
        Extract answer from model output.

        The default implementation returns the raw output unchanged.

        Args:
            output: Raw model output string
            label: Optional ground truth label for reference

        Returns:
            Extracted answer, or None if extraction fails
        """
        return output

    def compute_accuracy(
        self, predictions: List[Any], labels: List[Any]
    ) -> Optional[float]:
        """
        Compute accuracy metric.

        The default implementation reports no accuracy.

        Args:
            predictions: List of predicted answers
            labels: List of ground truth labels

        Returns:
            Accuracy score (0-1), or None if not applicable
        """
        return None

    def get_answer_keys(self) -> Optional[List[str]]:
        """
        Get list of answer keys for multi-turn conversations.

        Returns:
            List of answer keys (e.g., ["answer_1", "answer_2"]), or None for single-turn
        """
        return None

    def get_max_new_tokens(self) -> int:
        """
        Get maximum number of new tokens to generate.

        Returns:
            Maximum tokens (default: 2048)
        """
        return 2048

    def run(
        self,
        host: str,
        port: int,
        batch_size: int,
        max_new_tokens: Optional[int] = None,
        num_runs: int = 1,
    ) -> List[Any]:
        """
        Run the benchmark evaluation.

        This method handles the common workflow:
        1. Initialize backend
        2. Load data
        3. Create SGL function
        4. Run inference loops
        5. Compute metrics

        Args:
            host (str): The host of the SGLang server; "http://" is prepended when no scheme is given
            port (int): The port of the SGLang server
            batch_size (int): The number of prompts to process in parallel
            max_new_tokens (int): Maximum number of new tokens to generate; falls back to get_max_new_tokens() when not set
            num_runs (int): The number of times to run this benchmark, default is 1. You can set it to a larger number if you want to get more stable results.

        Returns:
            One metrics object per run. The list is empty when load_data()
            produced no questions, so callers can always iterate the result.
        """
        if not host.startswith(("http://", "https://")):
            host = f"http://{host}"
        # Initialize backend
        sglang_args = Namespace(host=host, port=port, backend="srt-no-parallel")
        set_default_backend(select_sglang_backend(sglang_args))

        # Load data
        questions, labels = self.load_data()
        if len(questions) == 0:
            print("No valid questions found. Please check the dataset format.")
            return []

        # Create SGL function
        sgl_function = self.create_sgl_function()

        # Run evaluation loops
        metrics_list = []
        answer_keys = self.get_answer_keys()
        max_new_tokens = max_new_tokens or self.get_max_new_tokens()
        primary_answer_key = answer_keys[0] if answer_keys else "answer"
        # Check if we have a labels list (even if all labels are None)
        has_labels_list = labels and len(labels) > 0

        for _ in range(num_runs):
            tic = time.perf_counter()
            states = sgl_function.run_batch(
                questions,
                temperature=0,
                max_new_tokens=max_new_tokens,
                num_threads=batch_size,
                progress_bar=True,
            )
            latency = time.perf_counter() - tic

            # Extract predictions
            predictions = []
            for i in range(len(states)):
                # Access answer from state object (states[i] supports dict-like access)
                output = states[i][primary_answer_key]
                if isinstance(output, str):
                    extracted = self.extract_answer(
                        output,
                        (labels[i] if labels and i < len(labels) else None),
                    )
                else:
                    extracted = output
                predictions.append(extracted)

            # Compute accuracy if applicable. compute_accuracy is always called
            # when a labels list exists, and may itself return None.
            accuracy = None
            num_valid = None
            if has_labels_list:
                accuracy = self.compute_accuracy(predictions, labels)
                if accuracy is not None:
                    num_valid = sum(1 for p in predictions if p is not None)
                    if num_valid < len(predictions):
                        print(
                            f"Warning: {len(predictions) - num_valid} predictions could not be extracted."
                        )

            # Compute performance metrics
            metrics = compute_metrics(
                states,
                latency,
                answer_key=primary_answer_key,
                additional_answer_keys=(
                    answer_keys[1:] if answer_keys and len(answer_keys) > 1 else None
                ),
            )
            # Always set accuracy if we have a labels list, even when it is None.
            if has_labels_list:
                metrics.accuracy = accuracy
                if accuracy is not None:
                    metrics.num_valid_predictions = num_valid

            metrics_list.append(metrics)
        return metrics_list
def extract_answer(answer_str: str) -> Optional[str]:
    """Extract the answer choice (A, B, C, D) from the model output.

    The output is upper-cased and then searched in decreasing order of
    reliability:

    1. An explicit marker, "Answer: X" or "答案:X" (ASCII or full-width
       colon); the last such marker wins.
    2. The first stand-alone letter A-D, which also covers "(A)" and "[A]".
    3. The first A-D character anywhere in the text (best-effort fallback).

    Args:
        answer_str: Raw model output.

    Returns:
        One of "A", "B", "C", "D", or None when no candidate letter exists.
    """
    text = answer_str.strip().upper()

    # Explicit markers must be checked first: otherwise a stray stand-alone
    # "A" (e.g. the English article) earlier in the text would win over the
    # stated answer. The lookahead rejects words such as "ANSWER: CAT".
    marked = re.findall(r"(?:答案|ANSWER)[::]\s*([ABCD])(?![A-Z])", text)
    if marked:
        return marked[-1]

    # Stand-alone letter; "(A)" and "[A]" also match here because brackets
    # are word boundaries.
    match = re.search(r"\b([ABCD])\b", text)
    if match:
        return match.group(1)

    # Last resort: the first occurrence of A, B, C, or D anywhere.
    match = re.search(r"([ABCD])", text)
    if match:
        return match.group(1)

    return None
@BENCHMARKS.register("ceval")
class CEvalBenchmarker(Benchmarker):
    """C-Eval benchmark implementation.

    Loads multiple-choice questions from the ``ceval/ceval-exam`` dataset
    (``test`` split), formats them with ``format_question`` and scores the
    extracted letter against the reference letter.
    """

    def __init__(
        self, num_samples: Optional[int] = None, subset: Optional[List[str]] = None
    ):
        # When no subset is requested the sentinel string "all" is stored in
        # place of a list; load_data() checks for it explicitly.
        if subset is None:
            subset = "all"
        super().__init__(num_samples, subset)

    def load_data(self) -> Tuple[List[Dict[str, Any]], List[str]]:
        """Load and preprocess C-Eval dataset.

        Returns:
            (questions, labels): question dicts with a "question" key and the
            matching answer letters. Both lists are empty when the dataset
            could not be loaded or no item passed validation.
        """
        all_configs = [
            "accountant",
            "advanced_mathematics",
            "art_studies",
            "basic_medicine",
            "business_administration",
            "chinese_language_and_literature",
            "civil_servant",
            "clinical_medicine",
            "college_chemistry",
            "college_economics",
            "college_physics",
            "college_programming",
            "computer_architecture",
            "computer_network",
            "discrete_mathematics",
            "education_science",
            "electrical_engineer",
            "environmental_impact_assessment_engineer",
            "fire_engineer",
            "high_school_biology",
            "high_school_chemistry",
            "high_school_chinese",
            "high_school_geography",
            "high_school_history",
            "high_school_mathematics",
            "high_school_physics",
            "high_school_politics",
            "ideological_and_moral_cultivation",
            "law",
            "legal_professional",
            "logic",
            "mao_zedong_thought",
            "marxism",
            "metrology_engineer",
            "middle_school_biology",
            "middle_school_chemistry",
            "middle_school_geography",
            "middle_school_history",
            "middle_school_mathematics",
            "middle_school_physics",
            "middle_school_politics",
            "modern_chinese_history",
            "operating_system",
            "physician",
            "plant_protection",
            "probability_and_statistics",
            "professional_tour_guide",
            "sports_science",
            "tax_accountant",
            "teacher_qualification",
            "urban_and_rural_planner",
            "veterinary_medicine",
        ]

        # Select configs to load
        if self.subset == "all":
            configs_to_load = all_configs
        else:
            # NOTE(review): validation relies on assert, which is skipped
            # when Python runs with -O.
            for subset in self.subset:
                assert (
                    subset in all_configs
                ), f"Subset {subset} not found in C-Eval dataset"
            configs_to_load = self.subset

        # Load datasets
        try:
            datasets = []
            for config in configs_to_load:
                # A config that fails to load is reported and skipped; only
                # the case where none load is treated as an error.
                try:
                    ds = load_dataset("ceval/ceval-exam", name=config, split="test")
                    datasets.append(ds)
                    print(f"Loaded config '{config}' with {len(ds)} samples")
                except Exception as e:
                    print(f"Warning: Failed to load config '{config}': {e}")
            if len(datasets) == 0:
                raise ValueError("No configs could be loaded")
            dataset = concatenate_datasets(datasets)
            print(
                f"Successfully loaded C-Eval dataset with all configs (total: {len(dataset)} samples)"
            )
        except Exception as e:
            # Best-effort: report the problem and return empty lists rather
            # than propagating the error.
            print(e)
            print(f"Failed to load C-Eval dataset from 'ceval/ceval-exam': {e}")
            print("Please ensure the dataset is available or install it manually.")
            print("You can try: pip install datasets")
            print("Or download from: https://huggingface.co/datasets/ceval/ceval-exam")
            return [], []

        # Process questions
        questions = []
        labels = []
        for idx, item in enumerate(dataset):
            # The limit is applied to the dataset index, so items skipped
            # below still count toward num_samples.
            if self.num_samples is not None and idx >= self.num_samples:
                break

            # Handle different dataset formats
            question_text = None
            if "question" in item:
                question_text = item["question"]
            elif "inputs" in item:
                question_text = item["inputs"]
            elif "problem" in item:
                question_text = item["problem"]
            elif "content" in item:
                question_text = item["content"]

            if not question_text:
                continue

            # Get options - C-Eval typically has options as a list or dict
            options = None
            if "options" in item:
                options = item["options"]
                if isinstance(options, dict):
                    # Convert dict to list in order A, B, C, D
                    options = [
                        options.get("A", ""),
                        options.get("B", ""),
                        options.get("C", ""),
                        options.get("D", ""),
                    ]
                elif isinstance(options, list):
                    # Ensure we have 4 options
                    while len(options) < 4:
                        options.append("")
            elif "choices" in item:
                options = item["choices"]
                if isinstance(options, dict):
                    options = [
                        options.get("A", ""),
                        options.get("B", ""),
                        options.get("C", ""),
                        options.get("D", ""),
                    ]
            else:
                # Try to construct options from A, B, C, D fields
                options = [
                    item.get("A", item.get("option_A", "")),
                    item.get("B", item.get("option_B", "")),
                    item.get("C", item.get("option_C", "")),
                    item.get("D", item.get("option_D", "")),
                ]

            # Filter out empty options
            # NOTE(review): dropping an empty option shifts the letters that
            # format_question assigns to the remaining ones, so the answer
            # letter may no longer point at the intended option — confirm
            # this cannot happen with the real data.
            if options:
                options = [str(opt).strip() for opt in options if opt]
                if len(options) < 2:  # Need at least 2 options
                    continue
            else:
                continue

            # Get answer
            answer = None
            if "answer" in item:
                answer = str(item["answer"]).upper().strip()
            elif "target" in item:
                answer = str(item["target"]).upper().strip()
            elif "label" in item:
                answer = str(item["label"]).upper().strip()
            elif "correct" in item:
                answer = str(item["correct"]).upper().strip()

            # Validate answer: items without an A-D answer are silently dropped
            if answer and answer in ["A", "B", "C", "D"]:
                # Format question
                formatted_question = format_question(question_text, options)
                questions.append({"question": formatted_question})
                labels.append(answer)

        if len(questions) == 0:
            print("No valid questions found. Please check the dataset format.")
            print(
                "Sample item keys:",
                list(dataset[0].keys()) if len(dataset) > 0 else "No items",
            )
            return [], []

        return questions, labels

    def create_sgl_function(self):
        """Create SGL function for C-Eval."""
        return create_simple_sgl_function(
            function_name="get_ceval_answer",
            answer_key="answer",
            max_tokens=self.get_max_new_tokens(),
        )

    def extract_answer(self, output: str, label: Any = None) -> Optional[str]:
        """Extract answer choice from model output via the module-level extract_answer; may be None."""
        return extract_answer(output)

    def compute_accuracy(self, predictions: List[str], labels: List[str]) -> float:
        """Compute accuracy over the predictions that could be extracted.

        Returns 0.0 when no prediction is valid.
        """
        # NOTE(review): the denominator is the number of non-None predictions,
        # not the number of questions, so unparseable outputs are excluded
        # rather than counted as wrong. The AIME and GPQA benchmarkers divide
        # by the number of labels — confirm which convention is intended.
        correct = 0
        valid_count = 0
        for i in range(len(predictions)):
            if predictions[i] is not None:  # Only count valid predictions
                valid_count += 1
                if predictions[i] == labels[i]:
                    correct += 1
        return correct / valid_count if valid_count > 0 else 0.0
def generate_question(row: Dict[str, Any]) -> str:
    """Build the prompt text for one FinanceQA row.

    Rows with a context are rendered through QUESTION_PROMPT; rows whose
    context is None yield just the stripped question.
    """
    context = row["context"]
    if context is not None:
        return QUESTION_PROMPT.format(
            context=context.strip(),
            question=row["question"].strip(),
        )
    # No supporting context: the question stands on its own.
    return row["question"].strip()
def extract_answer(self, output: str, label: Optional[Any] = None) -> Optional[str]:
    """Extract the final answer letter from a GPQA completion.

    The prompt asks the model to end with ``Answer: $LETTER``. The previous
    implementation returned everything following the first ``"Answer: "``
    marker, so any trailing explanation, punctuation or second marker made
    the prediction unequal to the label. This version returns only the
    letter of the last ``Answer:`` marker.

    Args:
        output: Raw text generated by the model.
        label: Unused; accepted for interface compatibility.

    Returns:
        One of ``"A"``, ``"B"``, ``"C"``, ``"D"``, or ``None`` when no
        well-formed answer marker is present.
    """
    # Local import: this module does not import ``re`` at file level.
    import re

    # Tolerate light decoration between the marker and the letter, e.g.
    # "Answer: (B)", "Answer:** B" or "Answer: $B$". The lookahead rejects
    # words that merely start with A-D ("Answer: Both").
    letters = re.findall(r"Answer:[\s*$(]*([A-D])(?![A-Za-z])", output)
    return letters[-1] if letters else None
# Sentinel returned when no usable number can be extracted from a string.
INVALID = -9999999


def get_one_example(lines: List[Dict], i: int, include_answer: bool) -> str:
    """Format record ``i`` of ``lines`` as a ``Question: ...\\nAnswer:`` prompt.

    Args:
        lines: Dataset records, each with ``"question"`` and ``"answer"`` keys.
        i: Index of the record to format.
        include_answer: When true, append the reference answer after
            ``Answer:`` (used for few-shot demonstrations).

    Returns:
        The formatted prompt text.
    """
    prompt = "Question: " + lines[i]["question"] + "\nAnswer:"
    if include_answer:
        prompt += " " + lines[i]["answer"]
    return prompt


def get_few_shot_examples(lines: List[Dict], k: int) -> str:
    """Concatenate the first ``k`` records as solved examples.

    Each example is followed by a blank line, so the result can be prefixed
    directly to a new question.
    """
    return "".join(get_one_example(lines, i, True) + "\n\n" for i in range(k))


def get_answer_value(answer_str: str) -> int:
    """Extract the last number in ``answer_str`` as an integer.

    Thousands separators (``,``) are removed first. Compared with the
    previous ``ast.literal_eval`` based version:

    * digit strings with leading zeros (``"007"``) are parsed instead of
      being rejected as invalid Python literals;
    * a trailing zero fraction (``"18.00"``) yields ``18`` rather than the
      fraction digits (``0``);
    * a non-integral final number (``"3.5"``) yields ``INVALID`` rather than
      its fraction digits, since it cannot equal an integer label;
    * conversion errors yield ``INVALID`` instead of propagating.

    Signs are ignored, as before.

    Args:
        answer_str: Reference answer or model output.

    Returns:
        The parsed integer, or ``INVALID`` when no integer can be extracted.
    """
    numbers = re.findall(r"\d+(?:\.\d+)?", answer_str.replace(",", ""))
    if not numbers:
        return INVALID

    whole, _, fraction = numbers[-1].partition(".")
    if fraction.strip("0"):
        # Non-zero fractional part: not an integer answer.
        return INVALID
    try:
        return int(whole)
    except ValueError:
        # e.g. a degenerate digit run longer than int's conversion limit.
        return INVALID
def extract_code_from_output(output: str) -> Optional[str]:
    """Extract Python source from a model completion.

    Strategies are tried in order:

    1. The first closed Markdown code fence. Any language tag is accepted
       (``python``, ``py``, ``Python3`` or none), not only ``python``.
    2. The first function definition, up to the next blank-line-separated
       ``def`` or the end of the text. Signatures carrying a return
       annotation (``def f(x) -> int:``) are recognised.
    3. The stripped output itself, which may already be plain code.

    Args:
        output: Raw text generated by the model.

    Returns:
        The extracted code, or ``None`` when ``output`` is empty or
        whitespace only.
    """
    fence = re.search(r"```[ \t]*[\w+#.-]*[ \t]*\r?\n(.*?)```", output, re.DOTALL)
    if fence:
        return fence.group(1).strip()

    definition = re.search(
        r"(def\s+\w+\([^)]*\)\s*(?:->[^:\n]*)?:.*?)(?=\n\ndef\s+|\Z)",
        output,
        re.DOTALL,
    )
    if definition:
        return definition.group(1).strip()

    stripped = output.strip()
    return stripped or None


def check_code_passes_tests(code: str, test_code: str, entry_point: str) -> bool:
    """Return whether ``code`` passes the assertions in ``test_code``.

    HumanEval test snippets define ``check(candidate)`` but do not call it.
    The previous implementation only exec'd the snippet, so the assertions
    never ran and any importable solution was counted as passing. This
    version calls ``check`` with the function named ``entry_point``. Test
    snippets made of top-level assertions (no ``check`` function) still work:
    they run when the snippet is exec'd.

    This is a simplified check; use the official HumanEval harness for
    pass@k numbers.

    Args:
        code: Candidate solution source.
        test_code: Test source, either top-level assertions or a
            ``check(candidate)`` function.
        entry_point: Name of the function under test.

    Returns:
        ``True`` when everything runs without raising, ``False`` otherwise
        (including when ``check`` exists but ``entry_point`` is not defined
        by ``code``).
    """
    # NOTE(security): this exec()s model-generated code in-process with no
    # sandbox and no timeout. Only run it on trusted hardware, or route
    # evaluation through an isolated worker.
    namespace: Dict[str, Any] = {}
    try:
        exec(code, namespace)
        exec(test_code, namespace)
        checker = namespace.get("check")
        if callable(checker):
            if entry_point not in namespace:
                return False
            checker(namespace[entry_point])
    except (Exception, SystemExit):
        # Assertion failures, syntax/runtime errors and stray exit() calls
        # in generated code all count as a failed test.
        return False
    return True
+ """ + if not labels or len(labels) == 0: + return None + if all(label is None for label in labels): + return None + + correct = 0 + valid_count = 0 + + for i, (pred, label) in enumerate(zip(predictions, labels)): + if label is not None and isinstance(label, dict): + valid_count += 1 + if pred is not None: + try: + # Get the prompt (function signature and docstring) + prompt = self.questions[i]["question"] + entry_point = label.get("entry_point", "") + + # The prompt contains the function signature (e.g., "def function_name(...):") + # The generated code might be: + # 1. Just the function body (what we want) - need to combine with prompt + # 2. The complete function including signature - use as-is + # 3. Code in markdown blocks - already extracted by extract_code_from_output + + pred_str = str(pred).strip() + + # Check if pred already contains a complete function definition + # (starts with "def " and contains the entry_point function name) + if pred_str.startswith("def ") and entry_point: + # Check if this is the same function (by name) + func_name_match = re.match(r"def\s+(\w+)\s*\(", pred_str) + if ( + func_name_match + and func_name_match.group(1) == entry_point + ): + # Generated code includes complete function, use it as-is + full_code = pred_str + else: + # Different function or no match, combine with prompt + full_code = prompt + "\n" + pred_str + elif pred_str.startswith("def "): + # Has function definition but we can't verify entry_point, use as-is + full_code = pred_str + else: + # Generated code is just the body, combine with prompt + full_code = prompt + "\n" + pred_str + + # Check if code passes tests + test_code = label.get("test", "") + + if test_code and check_code_passes_tests( + full_code, test_code, entry_point + ): + correct += 1 + except Exception as e: + # If evaluation fails, consider it incorrect + # Uncomment for debugging: print(f"Error evaluating code {i}: {e}") + pass + + return correct / valid_count if valid_count > 0 else 0.0 + + def 
create_sgl_function(self): + """Create SGL function for HumanEval.""" + return create_simple_sgl_function( + function_name="get_humaneval_answer", + answer_key="answer", + max_tokens=self.get_max_new_tokens(), + ) + + def get_max_new_tokens(self) -> int: + """HumanEval code generation requires more tokens.""" + return 1024 diff --git a/idea1/benchmarks/benchmarker/livecodebench.py b/idea1/benchmarks/benchmarker/livecodebench.py new file mode 100644 index 0000000000000000000000000000000000000000..490ba2b20349ecd68a3edc468d38ef377c6e8d05 --- /dev/null +++ b/idea1/benchmarks/benchmarker/livecodebench.py @@ -0,0 +1,46 @@ +""" +GSM8K benchmark evaluation script. +""" + +from typing import Any, Dict, List, Optional, Tuple + +from datasets import load_dataset + +from .base import Benchmarker +from .registry import BENCHMARKS +from .utils import create_simple_sgl_function + + +def generate_question(row: Dict[str, Any]) -> str: + question = row["question_content"].strip() + return question + + +@BENCHMARKS.register("livecodebench") +class LCBBenchmarker(Benchmarker): + """LiveCodeBench benchmark implementation.""" + + def __init__(self, num_samples: Optional[int] = None): + super().__init__(num_samples, None) + + def load_data(self) -> Tuple[List[Dict[str, Any]], List[int]]: + # Read data + ds = load_dataset("livecodebench/code_generation")["test"] + + questions = [] + labels = [] + for i in range((len(ds))): + if self.num_samples is not None and i >= self.num_samples: + break + + question_text = generate_question(ds[i]) + questions.append({"question": question_text}) + labels.append(None) + return questions, labels + + def create_sgl_function(self): + return create_simple_sgl_function( + function_name="get_livecodebench_answer", + answer_key="answer", + max_tokens=self.get_max_new_tokens(), + ) diff --git a/idea1/benchmarks/benchmarker/math500.py b/idea1/benchmarks/benchmarker/math500.py new file mode 100644 index 
def _last_boxed_content(text: str) -> Optional[str]:
    r"""Return the content of the last balanced, non-empty ``\boxed{...}``."""
    marker = "\\boxed{"
    start = text.rfind(marker)
    while start != -1:
        begin = start + len(marker)
        depth = 0
        # Scan from the opening brace, tracking nesting so that inner groups
        # such as \frac{1}{2} do not terminate the box early.
        for pos in range(begin - 1, len(text)):
            char = text[pos]
            if char == "{":
                depth += 1
            elif char == "}":
                depth -= 1
                if depth == 0:
                    content = text[begin:pos].strip()
                    if content:
                        return content
                    break
        # Unbalanced or empty box: fall back to an earlier occurrence.
        start = text.rfind(marker, 0, start)
    return None


def extract_math_answer(output: str) -> Optional[str]:
    r"""Extract the final answer from a math solution.

    Strategies are tried in order:

    1. The content of the last ``\boxed{...}``, with nested braces handled
       (``\boxed{\frac{1}{2}}`` yields ``\frac{1}{2}``). The previous regex
       stopped at the first ``}`` and used the first box in the text.
    2. ``\boxed`` followed by whitespace and a bare token.
    3. A number introduced by "answer", or by "is" / "equals" / "=" at the
       end of the text.
    4. The last number anywhere in the text.

    Args:
        output: Solution text (reference solution or model output).

    Returns:
        The extracted answer string, or ``None`` when nothing is found.
    """
    boxed = _last_boxed_content(output)
    if boxed is not None:
        return boxed

    bare_boxed = re.search(r"\\boxed\s+([^\s]+)", output)
    if bare_boxed:
        return bare_boxed.group(1).strip()

    answer_patterns = (
        r"(?:answer|Answer|ANSWER)[\s:]+([-+]?\d*\.?\d+)",
        r"(?:is|equals?|=\s*)([-+]?\d*\.?\d+)\s*$",
    )
    for pattern in answer_patterns:
        found = re.findall(pattern, output, re.IGNORECASE)
        if found:
            return found[-1].strip()

    numbers = re.findall(r"[-+]?\d*\.?\d+", output)
    return numbers[-1] if numbers else None
def compute_accuracy(
    self, predictions: List[Any], labels: List[Any]
) -> Optional[float]:
    """Compute accuracy for MATH-500 by comparing answers.

    Answers are compared after removing all whitespace and lower-casing, so
    ``"(3, 4)"`` matches ``"(3,4)"``; previously only leading and trailing
    whitespace was ignored although the code comment promised whitespace
    removal. If the normalized strings differ, both are compared as floats
    with an absolute tolerance of 1e-6.

    Args:
        predictions: Extracted answers, ``None`` where extraction failed.
        labels: Reference answers, ``None`` where no reference exists.

    Returns:
        ``None`` when there are no usable labels, otherwise the fraction of
        labelled items answered correctly. A missing prediction counts as
        wrong.
    """
    if not labels or all(label is None for label in labels):
        return None

    def normalize(value: Any) -> str:
        # str.split() with no argument splits on any whitespace run.
        return "".join(str(value).split()).lower()

    correct = 0
    valid_count = 0
    for pred, label in zip(predictions, labels):
        if label is None:
            continue
        valid_count += 1
        if pred is None:
            continue

        pred_normalized = normalize(pred)
        label_normalized = normalize(label)
        if pred_normalized == label_normalized:
            correct += 1
            continue
        try:
            if abs(float(pred_normalized) - float(label_normalized)) < 1e-6:
                correct += 1
        except ValueError:
            # At least one side is not numeric: the string mismatch stands.
            pass

    return correct / valid_count if valid_count > 0 else 0.0
# Prompt shared with the GPQA benchmark; the name is kept for compatibility.
GPQA_QUERY_TEMPLATE = """
Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering.

{Question}

A) {A}
B) {B}
C) {C}
D) {D}
""".strip()


def generate_question(row: Dict[str, Any]) -> Tuple[str, str]:
    """Build the MMLU prompt and gold answer letter for one dataset row.

    The leftover debug ``print`` of the gold answer has been removed, so
    loading the dataset no longer writes every label to stdout.

    Args:
        row: Record with ``"question"`` (str), ``"choices"`` (four strings)
            and ``"answer"`` (index 0-3 into ``choices``).

    Returns:
        ``(prompt, letter)`` where ``letter`` is ``"A"``-``"D"``.
    """
    choices = row["choices"]
    question = GPQA_QUERY_TEMPLATE.format(
        Question=row["question"].strip(),
        A=choices[0].strip(),
        B=choices[1].strip(),
        C=choices[2].strip(),
        D=choices[3].strip(),
    )

    # The dataset stores the gold answer as an index: 0 -> A, ..., 3 -> D.
    answer = "ABCD"[row["answer"]]
    return question, answer
def extract_mmstar_answer(
    output: str, options: Optional[List[str]] = None
) -> Optional[str]:
    """Extract the chosen option letter from an MMStar model output.

    The output is upper-cased and searched in this order, returning the
    first letter that is a valid option:

    1. explicit answer markers (``ANSWER: X``, ``ANSWER IS X``, ``答案:X``,
       ``选择:X``);
    2. a letter in parentheses or brackets (``(X)``, ``[X]``);
    3. any standalone letter.

    Previously the standalone-letter search ran first and looked only at the
    first standalone letter, so ``"A cat ... Answer: C"`` gave ``"A"`` and
    ``"I think it is B"`` gave ``None`` (``I`` is not an option). The
    ``Answer:`` pattern also could not match, because it was mixed-case
    while the text had already been upper-cased.

    Args:
        output: Raw text generated by the model.
        options: Option texts for the question. When given and non-empty,
            valid letters are ``A`` up to the letter for the last option;
            otherwise ``A``-``D`` is assumed.

    Returns:
        The upper-case option letter, or ``None`` when none is found.
    """
    text = output.strip().upper()
    # chr(64 + n) is the n-th upper-case letter: 1 -> "A", 4 -> "D".
    last_valid = chr(64 + len(options)) if options else "D"

    patterns = (
        r"ANSWER[::]\s*\(?([A-Z])\b",
        r"ANSWER\s+IS\s*[::]?\s*\(?([A-Z])\b",
        r"答案[::]\s*([A-Z])",
        r"选择[::]\s*([A-Z])",
        r"\(([A-Z])\)",
        r"\[([A-Z])\]",
        r"\b([A-Z])\b",
    )
    for pattern in patterns:
        for match in re.finditer(pattern, text):
            letter = match.group(1)
            if "A" <= letter <= last_valid:
                return letter

    return None
q["question"] + if "Options:" in question_full: + question_text, options_text = question_full.split("Options:", 1) + question_text = question_text.strip() + # Parse options (typically A. option1 B. option2 etc.) + options = [] + for line in options_text.strip().split("\n"): + line = line.strip() + if line and re.match(r"^[A-Z]\.", line): + option_text = re.sub(r"^[A-Z]\.\s*", "", line).strip() + options.append(option_text) + self.options_list.append(options) + else: + question_text = question_full.strip() + self.options_list.append([]) + + item = { + "image_path": image_path, + "question": question_text, + } + questions.append(item) + + # Extract ground truth answer + answer = None + if "answer" in q: + answer = str(q["answer"]).strip().upper() + elif "correct_answer" in q: + answer = str(q["correct_answer"]).strip().upper() + elif "ground_truth" in q: + answer = str(q["ground_truth"]).strip().upper() + + # Validate answer is a valid option letter + if answer and len(answer) == 1 and "A" <= answer <= "Z": + if self.options_list[-1]: + max_option = chr(64 + len(self.options_list[-1])) + if answer <= max_option: + labels.append(answer) + else: + labels.append(None) + else: + labels.append(answer) + else: + labels.append(None) + + return questions, labels + + def extract_answer(self, output: str, label: Optional[Any] = None) -> Optional[str]: + """Extract answer from model output.""" + # Use the options for the current question if available + # Note: We can't easily get the question index here, so we'll use a simpler approach + return extract_mmstar_answer(output) + + def compute_accuracy( + self, predictions: List[Any], labels: List[Any] + ) -> Optional[float]: + """Compute accuracy for MMStar by comparing answer choices.""" + if not labels or len(labels) == 0: + return None + if all(label is None for label in labels): + return None + + correct = 0 + valid_count = 0 + for pred, label in zip(predictions, labels): + if label is not None: + valid_count += 1 + if pred is 
not None: + # Normalize to uppercase for comparison + pred_normalized = str(pred).strip().upper() + label_normalized = str(label).strip().upper() + if pred_normalized == label_normalized: + correct += 1 + + return correct / valid_count if valid_count > 0 else 0.0 + + def create_sgl_function(self): + """Create SGL function for MMStar (image-based Q&A).""" + return create_image_sgl_function( + function_name="get_mmstar_answer", + answer_key="answer", + max_tokens=self.get_max_new_tokens(), + ) + + def run(self, *args, **kwargs): + """Run benchmark and clean up cache directory.""" + try: + return super().run(*args, **kwargs) + finally: + # Clean up cache directory + if self.cache_dir and os.path.exists(self.cache_dir): + shutil.rmtree(self.cache_dir) + print(f"Deleted temporary directory: {self.cache_dir}") diff --git a/idea1/benchmarks/benchmarker/mtbench.py b/idea1/benchmarks/benchmarker/mtbench.py new file mode 100644 index 0000000000000000000000000000000000000000..46f2d1d611c8065219d65b221c1c15ae9409e21f --- /dev/null +++ b/idea1/benchmarks/benchmarker/mtbench.py @@ -0,0 +1,59 @@ +""" +MT-Bench benchmark evaluation script. +Adapted from https://github.com/chromecast56/sglang/blob/6f145d2eadb93a116134f703358ce76f15381045/benchmark/mtbench/bench_sglang.py +""" + +from typing import Any, Dict, List, Optional, Tuple + +from sglang.utils import download_and_cache_file, read_jsonl + +from .base import Benchmarker +from .registry import BENCHMARKS +from .utils import create_multi_turn_sgl_function + +SYSTEM_PROMPT = "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\n\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. 
class BenchmarkRegistry:
    """Name-to-class registry for benchmark implementations.

    Usage:
    ```python
    BENCHMARKS = BenchmarkRegistry()

    @BENCHMARKS.register("aime")
    class AIMEBenchmarker(Benchmarker):
        ...

    benchmarker_cls = BENCHMARKS.get("aime")
    ```
    """

    def __init__(self):
        # Maps the registered name to the benchmark class.
        self.benchmarks = {}

    def register(self, name: str):
        """Return a class decorator that registers the class under ``name``.

        The decorated class is returned unchanged. Registering a second
        class under an existing name replaces the earlier entry.
        """

        def wrapper(cls):
            self.benchmarks[name] = cls
            return cls

        return wrapper

    def get(self, name: str) -> type:
        """Get the benchmark class registered under ``name``.

        Raises:
            KeyError: If ``name`` is not registered. The message lists the
                registered names to make typos easy to spot.
        """
        try:
            return self.benchmarks[name]
        except KeyError:
            available = ", ".join(sorted(self.benchmarks)) or "<none>"
            raise KeyError(
                f"Unknown benchmark {name!r}. Available benchmarks: {available}"
            ) from None


BENCHMARKS = BenchmarkRegistry()
@dataclass
class BenchmarkMetrics:
    """Container for benchmark performance metrics."""

    # Latency value passed to compute_metrics (documented there as seconds).
    latency: float
    # Completion tokens divided by latency (0.0 when latency is not positive).
    output_throughput: float
    # Completion tokens per "spec_verify_ct" count; 1.0 when unavailable.
    accept_length: float
    # Not populated by compute_metrics.
    accuracy: Optional[float] = None
    num_questions: int = 0
    # Not populated by compute_metrics.
    num_valid_predictions: int = 0
    # Presumably per-category metrics keyed by category name -- TODO confirm.
    categorical_performance: Optional[Dict[str, "BenchmarkMetrics"]] = None


def compute_metrics(
    states: List[Any],
    latency: float,
    answer_key: str = "answer",
    additional_answer_keys: Optional[List[str]] = None,
) -> BenchmarkMetrics:
    """
    Compute performance metrics from SGLang states.

    Args:
        states: List of SGLang state objects from run_batch; may be empty
        latency: Total latency in seconds
        answer_key: Primary key for answer in state meta info
        additional_answer_keys: Additional keys to include in token count (e.g., ["answer_1", "answer_2"])

    Returns:
        BenchmarkMetrics object with computed metrics. Only ``latency``,
        ``output_throughput``, ``accept_length`` and ``num_questions`` are
        set; the remaining fields keep their defaults.
    """
    keys = [answer_key, *(additional_answer_keys or [])]

    # Fetch every meta-info record exactly once, primary key first, so that
    # metas[0] is the record of states[0] under answer_key.
    metas = [state.get_meta_info(key) for key in keys for state in states]

    num_output_tokens = sum(meta["completion_tokens"] for meta in metas)
    output_throughput = num_output_tokens / latency if latency > 0 else 0.0

    # Compute accept length (speculative decoding metric). The presence of
    # "spec_verify_ct" is probed on the first record only; an empty batch
    # has nothing to probe and falls back to 1.0 instead of raising.
    accept_length = 1.0
    if metas and "spec_verify_ct" in metas[0]:
        num_verify_tokens = sum(meta.get("spec_verify_ct", 0) for meta in metas)
        if num_verify_tokens != 0:
            accept_length = num_output_tokens / num_verify_tokens

    return BenchmarkMetrics(
        latency=latency,
        output_throughput=output_throughput,
        accept_length=accept_length,
        num_questions=len(states),
    )
def create_multi_turn_sgl_function(
    function_name: str = "multi_turn_answer",
    system_prompt: Optional[str] = None,
    num_turns: int = 2,
    max_tokens: int = 2048,
) -> Callable:
    """
    Build an SGL function that plays out a multi-turn conversation.

    When ``num_turns`` is 2 the returned function takes ``question_1`` and
    ``question_2`` as named parameters. For any other value it takes keyword
    arguments named ``question_1`` .. ``question_N`` and silently skips a
    turn whose question was not supplied. Each generated reply is stored
    under the matching ``answer_K`` key.

    Args:
        function_name: Value assigned to the returned function's ``__name__``
        system_prompt: Optional system prompt emitted before the first turn
        num_turns: Number of conversation turns (default: 2)
        max_tokens: Maximum tokens to generate per turn

    Returns:
        SGL function decorated with @sgl.function
    """
    if num_turns != 2:
        # Generic case: the number of turns is only known at build time, so
        # the questions arrive as keyword arguments.
        @sgl.function
        def sgl_func(s, **kwargs):
            if system_prompt:
                s += sgl.system(system_prompt)
            for turn in range(1, num_turns + 1):
                question_key = f"question_{turn}"
                if question_key not in kwargs:
                    continue
                s += sgl.user(kwargs[question_key])
                s += sgl.assistant(
                    sgl.gen(f"answer_{turn}", max_tokens=max_tokens)
                )

    else:
        # Most common case: 2-turn conversation with an explicit signature.
        @sgl.function
        def sgl_func(s, question_1, question_2):
            if system_prompt:
                s += sgl.system(system_prompt)
            turns = ((question_1, "answer_1"), (question_2, "answer_2"))
            for prompt, answer_key in turns:
                s += sgl.user(prompt)
                s += sgl.assistant(sgl.gen(answer_key, max_tokens=max_tokens))

    sgl_func.__name__ = function_name
    return sgl_func
+ """ + # User input: Image + Text question + s += sgl.user(sgl.image(image_path) + question) + s += sgl.assistant(sgl.gen(answer_key, max_tokens=max_tokens)) + + sgl_func.__name__ = function_name + return sgl_func diff --git a/idea1/cache/processed_dataset/tmpx7bu2kuv b/idea1/cache/processed_dataset/tmpx7bu2kuv new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/idea1/configs/deepseek-v2-lite-eagle3.json b/idea1/configs/deepseek-v2-lite-eagle3.json new file mode 100644 index 0000000000000000000000000000000000000000..da12c0fb4444a55773ac0f84f4360f3476a39d09 --- /dev/null +++ b/idea1/configs/deepseek-v2-lite-eagle3.json @@ -0,0 +1,39 @@ +{ + "architectures": [ + "LlamaForCausalLMEagle3" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 100000, + "eos_token_id": 100001, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 10944, + "max_position_embeddings": 163840, + "max_window_layers": 64, + "model_type": "llama", + "num_attention_heads": 16, + "num_hidden_layers": 1, + "num_key_value_heads": 16, + "rms_norm_eps": 1e-06, + "rope_scaling": { + "beta_fast": 32.0, + "beta_slow": 1.0, + "factor": 40.0, + "mscale": 0.707, + "mscale_all_dim": 0.707, + "original_max_position_embeddings": 4096, + "rope_type": "yarn" + }, + "rope_theta": 10000, + "sliding_window": null, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.33.1", + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 102400, + "draft_vocab_size": 32000 +} diff --git a/idea1/configs/deepseek-v3-671b-eagle3.json b/idea1/configs/deepseek-v3-671b-eagle3.json new file mode 100644 index 0000000000000000000000000000000000000000..147a5fdcd32c7ccd83248eec16dc709ed34e8bce --- /dev/null +++ b/idea1/configs/deepseek-v3-671b-eagle3.json @@ -0,0 +1,32 @@ +{ + "architectures": [ + "LlamaForCausalLMEagle3" + ], + 
"eagle_config": { + "eagle_aux_hidden_state_layer_ids": [ + 1, + 29, + 57 + ], + "use_aux_hidden_state": true + }, + "bos_token_id": 151643, + "eos_token_id": 151645, + "hidden_act": "silu", + "hidden_size": 7168, + "initializer_range": 0.02, + "intermediate_size": 40960, + "max_position_embeddings": 163840, + "model_type": "llama", + "num_attention_heads": 56, + "num_key_value_heads": 8, + "num_hidden_layers": 1, + "pad_token_id": 0, + "rms_norm_eps": 1e-05, + "tie_word_embeddings": false, + "torch_dtype": "float16", + "transformers_version": "4.51.0", + "use_cache": true, + "vocab_size": 129280, + "draft_vocab_size": 32000 +} diff --git a/idea1/configs/gemma3-1b-eagle3.json b/idea1/configs/gemma3-1b-eagle3.json new file mode 100644 index 0000000000000000000000000000000000000000..e5e74eb16a3e47ac9ff4357106ff7c2afe4186da --- /dev/null +++ b/idea1/configs/gemma3-1b-eagle3.json @@ -0,0 +1,32 @@ +{ + "architectures": [ + "LlamaForCausalLMEagle3" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 2, + "eos_token_id": 1, + "pad_token_id": 0, + "head_dim": 256, + "hidden_act": "silu", + "hidden_size": 1152, + "initializer_range": 0.02, + "intermediate_size": 6912, + "max_position_embeddings": 32768, + "model_type": "llama", + "num_attention_heads": 4, + "num_hidden_layers": 1, + "num_key_value_heads": 1, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000, + "sliding_window": 512, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.50.0", + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 262145, + "draft_vocab_size": 32000, + "target_model_type": "gemma3_text" +} diff --git a/idea1/configs/gpt-oss-120B-eagle3.json b/idea1/configs/gpt-oss-120B-eagle3.json new file mode 100644 index 0000000000000000000000000000000000000000..f4b36c7687620c95e90b4ec43ee8a53763826954 --- /dev/null +++ b/idea1/configs/gpt-oss-120B-eagle3.json @@ -0,0 +1,30 @@ +{ + "architectures": [ + 
"LlamaForCausalLMEagle3" + ], + "eagle_config": { + "eagle_aux_hidden_state_layer_ids": [ + 1, + 17, + 33 + ] + }, + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2880, + "initializer_range": 0.02, + "intermediate_size": 17280, + "max_position_embeddings": 4096, + "model_type": "llama", + "num_attention_heads": 64, + "num_key_value_heads": 8, + "num_hidden_layers": 1, + "pad_token_id": 0, + "rms_norm_eps": 1e-05, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.52.3", + "use_cache": true, + "vocab_size": 201088, + "draft_vocab_size": 32000 +} diff --git a/idea1/configs/gpt-oss-20B-eagle3.json b/idea1/configs/gpt-oss-20B-eagle3.json new file mode 100644 index 0000000000000000000000000000000000000000..e1d4b257d9644032488a31a67aca8719ffdbe33e --- /dev/null +++ b/idea1/configs/gpt-oss-20B-eagle3.json @@ -0,0 +1,30 @@ +{ + "architectures": [ + "LlamaForCausalLMEagle3" + ], + "eagle_config": { + "eagle_aux_hidden_state_layer_ids": [ + 1, + 11, + 21 + ] + }, + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2880, + "initializer_range": 0.02, + "intermediate_size": 17280, + "max_position_embeddings": 4096, + "model_type": "llama", + "num_attention_heads": 64, + "num_key_value_heads": 8, + "num_hidden_layers": 1, + "pad_token_id": 0, + "rms_norm_eps": 1e-05, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.52.3", + "use_cache": true, + "vocab_size": 201088, + "draft_vocab_size": 32000 +} diff --git a/idea1/configs/ling-flash-2.0-eagle3.json b/idea1/configs/ling-flash-2.0-eagle3.json new file mode 100644 index 0000000000000000000000000000000000000000..0a9bea37c06ae29010eade7cd4b70cdf4e9e0316 --- /dev/null +++ b/idea1/configs/ling-flash-2.0-eagle3.json @@ -0,0 +1,24 @@ +{ + "architectures": [ + "LlamaForCausalLMEagle3" + ], + "bos_token_id": 163584, + "eos_token_id": 163585, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 
14336, + "max_position_embeddings": 32768, + "model_type": "llama", + "num_attention_heads": 32, + "num_key_value_heads": 8, + "num_hidden_layers": 1, + "pad_token_id": 0, + "rms_norm_eps": 1e-05, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.57.1", + "use_cache": true, + "vocab_size": 157184, + "draft_vocab_size": 32000 +} diff --git a/idea1/configs/llama3-70B-ealge3.json b/idea1/configs/llama3-70B-ealge3.json new file mode 100644 index 0000000000000000000000000000000000000000..20d04f4d0dc09fe2894a7a35673b3a8afdaa8e32 --- /dev/null +++ b/idea1/configs/llama3-70B-ealge3.json @@ -0,0 +1,37 @@ +{ + "architectures": [ + "LlamaForCausalLMEagle3" + ], + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 8192, + "initializer_range": 0.02, + "intermediate_size": 28672, + "max_position_embeddings": 4096, + "model_type": "llama", + "num_attention_heads": 64, + "num_key_value_heads": 8, + "num_hidden_layers": 1, + "pad_token_id": 0, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 8.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 4096, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "torch_dtype": "float16", + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 128256, + "draft_vocab_size": 32000 +} diff --git a/idea1/configs/llama3-8B-eagle3.json b/idea1/configs/llama3-8B-eagle3.json new file mode 100644 index 0000000000000000000000000000000000000000..775ad6afee3c43946742b823b8f4e3d48af68b3c --- /dev/null +++ b/idea1/configs/llama3-8B-eagle3.json @@ -0,0 +1,24 @@ +{ + "architectures": [ + "LlamaForCausalLMEagle3" + ], + "bos_token_id": 128000, + "eos_token_id": 128001, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 2048, + "model_type": "llama", + 
"num_attention_heads": 32, + "num_key_value_heads": 8, + "num_hidden_layers": 1, + "pad_token_id": 0, + "rms_norm_eps": 1e-05, + "tie_word_embeddings": false, + "torch_dtype": "float16", + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 128256, + "draft_vocab_size": 32000 +} diff --git a/idea1/configs/llama4-scout-17B-16E-eagle3.json b/idea1/configs/llama4-scout-17B-16E-eagle3.json new file mode 100644 index 0000000000000000000000000000000000000000..9c2bb5a81a3b5452836b0c6dcf1ba29e4ecc64e5 --- /dev/null +++ b/idea1/configs/llama4-scout-17B-16E-eagle3.json @@ -0,0 +1,22 @@ +{ + "architectures": [ + "LlamaForCausalLMEagle3" + ], + "hidden_act": "silu", + "hidden_size": 5120, + "initializer_range": 0.02, + "intermediate_size": 32768, + "max_position_embeddings": 2048, + "model_type": "llama", + "num_attention_heads": 40, + "num_key_value_heads": 8, + "num_hidden_layers": 1, + "pad_token_id": 0, + "rms_norm_eps": 1e-05, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.52.3", + "use_cache": true, + "vocab_size": 202048, + "draft_vocab_size": 32000 +} diff --git a/idea1/configs/longcat-flash-dflash.json b/idea1/configs/longcat-flash-dflash.json new file mode 100644 index 0000000000000000000000000000000000000000..66e9b33a614a15dc3c5df35d9f6cb8aabe818d61 --- /dev/null +++ b/idea1/configs/longcat-flash-dflash.json @@ -0,0 +1,45 @@ +{ + "architectures": [ + "DFlashDraftModel" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoModel": "dflash.DFlashDraftModel" + }, + "block_size": 16, + "bos_token_id": 1, + "dflash_config": { + "mask_token_id": 2, + "target_layer_ids": [1, 7, 13, 19, 25] + }, + "dtype": "bfloat16", + "eos_token_id": 2, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 6144, + "initializer_range": 0.02, + "intermediate_size": 12288, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + 
"max_position_embeddings": 40960, + "max_window_layers": 5, + "model_type": "qwen3", + "num_attention_heads": 32, + "num_hidden_layers": 5, + "num_key_value_heads": 8, + "num_target_layers": 28, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000, + "sliding_window": null, + "tie_word_embeddings": false, + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 131072 + } diff --git a/idea1/configs/longcat-flash-eagle3.json b/idea1/configs/longcat-flash-eagle3.json new file mode 100644 index 0000000000000000000000000000000000000000..7b3b921a22378353f010d1ee1ba03ec44610eb75 --- /dev/null +++ b/idea1/configs/longcat-flash-eagle3.json @@ -0,0 +1,31 @@ +{ + "architectures": [ + "LlamaForCausalLMEagle3" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 1, + "eos_token_id": 2, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 6144, + "initializer_range": 0.02, + "intermediate_size": 12288, + "max_position_embeddings": 131072, + "max_window_layers": 48, + "model_type": "llama", + "num_attention_heads": 64, + "num_hidden_layers": 1, + "num_key_value_heads":16, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 10000000.0, + "sliding_window": null, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.53.2", + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 131072, + "draft_vocab_size": 131072 + } diff --git a/idea1/configs/phi4-eagle3.json b/idea1/configs/phi4-eagle3.json new file mode 100644 index 0000000000000000000000000000000000000000..05456a0d239653cdc898413860c6822d8a7cdec5 --- /dev/null +++ b/idea1/configs/phi4-eagle3.json @@ -0,0 +1,27 @@ +{ + "architectures": [ + "LlamaForCausalLMEagle3" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 100257, + "eos_token_id": 100257, + "pad_token_id": 100257, + "hidden_act": "silu", + "hidden_size": 5120, + "initializer_range": 0.02, + "intermediate_size": 17920, + 
"max_position_embeddings": 16384, + "model_type": "phi3", + "num_attention_heads": 40, + "num_hidden_layers": 1, + "num_key_value_heads": 10, + "rms_norm_eps": 1e-05, + "rope_theta": 250000.0, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.47.0", + "use_cache": true, + "vocab_size": 100352, + "draft_vocab_size": 32000 +} diff --git a/idea1/configs/qwen2-5-vl-7b-eagle3.json b/idea1/configs/qwen2-5-vl-7b-eagle3.json new file mode 100644 index 0000000000000000000000000000000000000000..672193e3b1284badcb747356f1cbfcd402e19ccf --- /dev/null +++ b/idea1/configs/qwen2-5-vl-7b-eagle3.json @@ -0,0 +1,40 @@ +{ + "architectures": [ + "LlamaForCausalLMEagle3" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 151643, + "eos_token_id": 151645, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 3584, + "initializer_range": 0.02, + "intermediate_size": 18944, + "max_position_embeddings": 8192, + "max_window_layers": 28, + "model_type": "llama", + "target_model_type": "qwen2_5_vl", + "num_attention_heads": 28, + "num_hidden_layers": 1, + "num_key_value_heads": 4, + "rms_norm_eps": 1e-06, + "pretraining_tp": 1, + "rope_scaling": { + "type": "mrope", + "mrope_section": [ + 16, + 24, + 24 + ] + }, + "rope_theta": 1000000, + "sliding_window": 32768, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.51.0", + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 152064, + "draft_vocab_size": 32000 + } diff --git a/idea1/configs/qwen2.5-7b-eagle3.json b/idea1/configs/qwen2.5-7b-eagle3.json new file mode 100644 index 0000000000000000000000000000000000000000..f16f6b8d07b120734f1eafd8c2e7881e424a57a1 --- /dev/null +++ b/idea1/configs/qwen2.5-7b-eagle3.json @@ -0,0 +1,30 @@ +{ + "architectures": [ + "LlamaForCausalLMEagle3" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 151643, + "eos_token_id": 151645, + "hidden_act": "silu", + 
"hidden_size": 3584, + "initializer_range": 0.02, + "intermediate_size": 18944, + "max_position_embeddings": 32768, + "max_window_layers": 28, + "model_type": "llama", + "num_attention_heads": 28, + "num_hidden_layers": 1, + "num_key_value_heads": 4, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000.0, + "sliding_window": 131072, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.51.0", + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 152064, + "draft_vocab_size": 16000 +} diff --git a/idea1/configs/qwen2.5-vl-32b-eagle3.json b/idea1/configs/qwen2.5-vl-32b-eagle3.json new file mode 100644 index 0000000000000000000000000000000000000000..76aa04cdf7cdf706443308f72f5e487cf6f510ff --- /dev/null +++ b/idea1/configs/qwen2.5-vl-32b-eagle3.json @@ -0,0 +1,40 @@ +{ + "architectures": [ + "LlamaForCausalLMEagle3" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 151643, + "eos_token_id": 151645, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 5120, + "initializer_range": 0.02, + "intermediate_size": 18944, + "max_position_embeddings": 8192, + "max_window_layers": 28, + "model_type": "llama", + "target_model_type": "qwen2_5_vl", + "num_attention_heads": 28, + "num_hidden_layers": 1, + "num_key_value_heads": 4, + "rms_norm_eps": 1e-06, + "pretraining_tp": 1, + "rope_scaling": { + "type": "mrope", + "mrope_section": [ + 16, + 24, + 24 + ] + }, + "rope_theta": 1000000, + "sliding_window": 32768, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.51.0", + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 152064, + "draft_vocab_size": 32000 + } diff --git a/idea1/configs/qwen3-235B-A22B-eagle3.json b/idea1/configs/qwen3-235B-A22B-eagle3.json new file mode 100644 index 0000000000000000000000000000000000000000..8e28c04a18a851c968252b1691b89dcdcff598b9 --- /dev/null +++ 
b/idea1/configs/qwen3-235B-A22B-eagle3.json @@ -0,0 +1,36 @@ +{ + "architectures": [ + "LlamaForCausalLMEagle3" + ], + "eagle_config": { + "eagle_aux_hidden_state_layer_ids": [ + 1, + 46, + 90 + ], + "use_aux_hidden_state": true + }, + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 151643, + "draft_vocab_size": 32000, + "eos_token_id": 151645, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 24576, + "max_position_embeddings": 40960, + "model_type": "llama", + "num_attention_heads": 64, + "num_hidden_layers": 1, + "num_key_value_heads": 4, + "rms_norm_eps": 1e-06, + "rope_theta": 1000000.0, + "rope_scaling": null, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.51.0", + "use_cache": true, + "vocab_size": 151936 +} diff --git a/idea1/configs/qwen3-30B-A3B-eagle3.json b/idea1/configs/qwen3-30B-A3B-eagle3.json new file mode 100644 index 0000000000000000000000000000000000000000..558cb18043a5bd182497536203de90a4a7672f35 --- /dev/null +++ b/idea1/configs/qwen3-30B-A3B-eagle3.json @@ -0,0 +1,31 @@ +{ + "architectures": [ + "LlamaForCausalLMEagle3" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 151643, + "eos_token_id": 151645, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 12288, + "max_position_embeddings": 2048, + "max_window_layers": 48, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 1, + "num_key_value_heads":4, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000.0, + "sliding_window": null, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.53.2", + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151936, + "draft_vocab_size": 32000 +} diff --git a/idea1/configs/qwen3-32b-eagle3.json b/idea1/configs/qwen3-32b-eagle3.json new file 
mode 100644 index 0000000000000000000000000000000000000000..cf128d9fb451833207c0a4293554357f324aea8c --- /dev/null +++ b/idea1/configs/qwen3-32b-eagle3.json @@ -0,0 +1,31 @@ +{ + "architectures": [ + "LlamaForCausalLMEagle3" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 151643, + "eos_token_id": 151645, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 5120, + "initializer_range": 0.02, + "intermediate_size": 25600, + "max_position_embeddings": 40960, + "max_window_layers": 64, + "model_type": "llama", + "num_attention_heads": 64, + "num_hidden_layers": 1, + "num_key_value_heads": 8, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000, + "sliding_window": null, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.51.0", + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151936, + "draft_vocab_size": 32000 +} diff --git a/idea1/configs/qwen3-4b-eagle3.json b/idea1/configs/qwen3-4b-eagle3.json new file mode 100644 index 0000000000000000000000000000000000000000..41ae128fdcd532f1e31c6251819d29aedfa9d3e6 --- /dev/null +++ b/idea1/configs/qwen3-4b-eagle3.json @@ -0,0 +1,31 @@ +{ + "architectures": [ + "LlamaForCausalLMEagle3" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 151643, + "eos_token_id": 151645, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 2560, + "initializer_range": 0.02, + "intermediate_size": 9728, + "max_position_embeddings": 40960, + "max_window_layers": 36, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 1, + "num_key_value_heads": 8, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000, + "sliding_window": null, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.51.0", + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151936, + "draft_vocab_size": 32000 +} diff --git 
a/idea1/configs/qwen3-8b-dflash.json b/idea1/configs/qwen3-8b-dflash.json new file mode 100644 index 0000000000000000000000000000000000000000..518860725a65bae6674c0af60643394ef174f2d9 --- /dev/null +++ b/idea1/configs/qwen3-8b-dflash.json @@ -0,0 +1,45 @@ +{ + "architectures": [ + "DFlashDraftModel" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoModel": "dflash.DFlashDraftModel" + }, + "block_size": 16, + "bos_token_id": 151643, + "dflash_config": { + "mask_token_id": 151669, + "target_layer_ids": [1, 9, 17, 25, 33] + }, + "dtype": "bfloat16", + "eos_token_id": 151645, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 12288, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 40960, + "max_window_layers": 5, + "model_type": "qwen3", + "num_attention_heads": 32, + "num_hidden_layers": 5, + "num_key_value_heads": 8, + "num_target_layers": 36, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000, + "sliding_window": null, + "tie_word_embeddings": false, + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151936 +} diff --git a/idea1/configs/qwen3-8b-eagle3.json b/idea1/configs/qwen3-8b-eagle3.json new file mode 100644 index 0000000000000000000000000000000000000000..b1fa44906d6decad8ccee5c8296699b1db5750f1 --- /dev/null +++ b/idea1/configs/qwen3-8b-eagle3.json @@ -0,0 +1,31 @@ +{ + "architectures": [ + "LlamaForCausalLMEagle3" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 151643, + "eos_token_id": 151645, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 12288, + "max_position_embeddings": 40960, + "max_window_layers": 36, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 1, + "num_key_value_heads":8 , + 
"rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000, + "sliding_window": null, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.51.0", + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151936, + "draft_vocab_size": 32000 +} diff --git a/idea1/configs/qwen3-coder-30B-A3B-instruct-eagle3.json b/idea1/configs/qwen3-coder-30B-A3B-instruct-eagle3.json new file mode 100644 index 0000000000000000000000000000000000000000..f296c237973a83f40f4540a97bbc193e2593bb44 --- /dev/null +++ b/idea1/configs/qwen3-coder-30B-A3B-instruct-eagle3.json @@ -0,0 +1,31 @@ +{ + "architectures": [ + "LlamaForCausalLMEagle3" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 151643, + "eos_token_id": 151645, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 12288, + "max_position_embeddings": 2048, + "max_window_layers": 48, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 1, + "num_key_value_heads": 4, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000.0, + "sliding_window": null, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.53.2", + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151936, + "draft_vocab_size": 32000 +} diff --git a/idea1/configs/qwen3-coder-480B-A35B-instruct-eagle3.json b/idea1/configs/qwen3-coder-480B-A35B-instruct-eagle3.json new file mode 100644 index 0000000000000000000000000000000000000000..2f27c80cc017e811f8846f2161a977725e669086 --- /dev/null +++ b/idea1/configs/qwen3-coder-480B-A35B-instruct-eagle3.json @@ -0,0 +1,31 @@ +{ + "architectures": [ + "LlamaForCausalLMEagle3" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 151643, + "eos_token_id": 151645, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 6144, + "initializer_range": 0.02, + 
"intermediate_size": 16384, + "max_position_embeddings": 262144, + "max_window_layers": 62, + "model_type": "llama", + "num_attention_heads": 96, + "num_hidden_layers": 1, + "num_key_value_heads":8, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000, + "sliding_window": null, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.51.0", + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151936, + "draft_vocab_size": 32000 +} diff --git a/idea1/configs/qwen3-coder-next-eagle3.json b/idea1/configs/qwen3-coder-next-eagle3.json new file mode 100644 index 0000000000000000000000000000000000000000..32d0e7f04d13273a9f3e654b88998fb0060f1143 --- /dev/null +++ b/idea1/configs/qwen3-coder-next-eagle3.json @@ -0,0 +1,27 @@ +{ + "architectures": [ + "LlamaForCausalLMEagle3" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 151643, + "eos_token_id": 151645, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 4096, + "model_type": "llama", + "num_attention_heads": 16, + "num_hidden_layers": 1, + "num_key_value_heads": 4, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000.0, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "use_cache": true, + "vocab_size": 151936, + "draft_vocab_size": 32000 +} diff --git a/idea1/configs/qwen3-next-80b-a3b-eagle3.json b/idea1/configs/qwen3-next-80b-a3b-eagle3.json new file mode 100644 index 0000000000000000000000000000000000000000..e94a2ea3407d784ee9fbd4b6a15b96cd7cadfec8 --- /dev/null +++ b/idea1/configs/qwen3-next-80b-a3b-eagle3.json @@ -0,0 +1,29 @@ +{ + "architectures": [ + "LlamaForCausalLMEagle3" + ], + "attention_dropout": 0.0, + "bos_token_id": 151643, + "decoder_sparse_step": 1, + "eos_token_id": 151645, + "head_dim": 256, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + 
"intermediate_size": 16384, + "max_position_embeddings": 262144, + "model_type": "llama", + "num_attention_heads": 16, + "num_hidden_layers": 1, + "num_key_value_heads": 2, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000000, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.57.0.dev0", + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151936, + "draft_vocab_size": 32000 + } diff --git a/idea1/configs/qwen3.5-35b-a3b-dflash.json b/idea1/configs/qwen3.5-35b-a3b-dflash.json new file mode 100644 index 0000000000000000000000000000000000000000..853722ff3526de36ccacd515b40c6b645c2dff07 --- /dev/null +++ b/idea1/configs/qwen3.5-35b-a3b-dflash.json @@ -0,0 +1,48 @@ +{ + "architectures": [ + "DFlashDraftModel" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoModel": "dflash.DFlashDraftModel" + }, + "block_size": 16, + "bos_token_id": 151643, + "dflash_config": { + "mask_token_id": 248070, + "target_layer_ids": [1, 10, 19, 28, 37] + }, + "dtype": "bfloat16", + "eos_token_id": 248046, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 6144, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 262144, + "max_window_layers": 8, + "model_type": "qwen3", + "num_attention_heads": 32, + "num_hidden_layers": 8, + "num_key_value_heads": 4, + "num_target_layers": 40, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000000, + "sliding_window": null, + "tie_word_embeddings": false, + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 248320 +} diff --git a/idea1/configs/qwen3.5-35b-a3b-eagle3.json b/idea1/configs/qwen3.5-35b-a3b-eagle3.json new file mode 100644 index 
0000000000000000000000000000000000000000..803962836d1145d40350b4e3c552446e5f3e81c6 --- /dev/null +++ b/idea1/configs/qwen3.5-35b-a3b-eagle3.json @@ -0,0 +1,29 @@ +{ + "architectures": [ + "LlamaForCausalLMEagle3" + ], + "attention_dropout": 0.0, + "bos_token_id": 151643, + "decoder_sparse_step": 1, + "eos_token_id": 248044, + "head_dim": 256, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 16384, + "max_position_embeddings": 262144, + "model_type": "llama", + "num_attention_heads": 16, + "num_hidden_layers": 1, + "num_key_value_heads": 2, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000000, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.57.0.dev0", + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 248320, + "draft_vocab_size": 32000 + } diff --git a/idea1/configs/qwq-32B-eagle3.json b/idea1/configs/qwq-32B-eagle3.json new file mode 100644 index 0000000000000000000000000000000000000000..8f7d7908d5433c886a1725c1ec456f032ba80202 --- /dev/null +++ b/idea1/configs/qwq-32B-eagle3.json @@ -0,0 +1,28 @@ +{ + "architectures": [ + "LlamaForCausalLMEagle3" + ], + "attention_dropout": 0.0, + "bos_token_id": 151643, + "eos_token_id": 151645, + "hidden_act": "silu", + "hidden_size": 5120, + "initializer_range": 0.02, + "intermediate_size": 27648, + "max_position_embeddings": 40960, + "max_window_layers": 64, + "model_type": "qwen2", + "num_attention_heads": 40, + "num_hidden_layers": 1, + "num_key_value_heads": 8, + "rms_norm_eps": 1e-05, + "rope_theta": 1000000.0, + "sliding_window": 32768, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.43.1", + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 152064, + "draft_vocab_size": 32000 +} diff --git a/idea1/datasets/README.md b/idea1/datasets/README.md new file mode 100644 index 
0000000000000000000000000000000000000000..8ddbef6d72d759dc06d8e59c15ef73c0ec29c204 --- /dev/null +++ b/idea1/datasets/README.md @@ -0,0 +1,5 @@ +## Store Comprehensive Datasets Download Scripts + +| DatasetName | Github | Huggingface | command | +| -------- | -------- | -------- | -------- | +| ALLaVA-4V | [link](https://github.com/FreedomIntelligence/ALLaVA) | [link](https://huggingface.co/datasets/FreedomIntelligence/ALLaVA-4V) | download_laion.sh | diff --git a/idea1/datasets/download_laion.sh b/idea1/datasets/download_laion.sh new file mode 100644 index 0000000000000000000000000000000000000000..a64d061ebb5de06b2e87cfc3bcd2b38508b7009e --- /dev/null +++ b/idea1/datasets/download_laion.sh @@ -0,0 +1,36 @@ + + +laion_root="allava_laion" + +mkdir $laion_root +cd $laion_root + + +# 1. download annotation files +## 1.1 caption +wget -c -O ALLaVA-Caption-LAION-4V.json https://huggingface.co/datasets/FreedomIntelligence/ALLaVA-4V/resolve/main/allava_laion/ALLaVA-Caption-LAION-4V.json?download=true + +## 1.2 instruction +wget -c -O ALLaVA-Instruct-LAION-4V.json https://huggingface.co/datasets/FreedomIntelligence/ALLaVA-4V/resolve/main/allava_laion/ALLaVA-Instruct-LAION-4V.json?download=true + + +# 2. download and unzip images +mkdir image_chunks + +## 2.1 download +for ((i=0; i<10; i++)) +do + wget -c -O image_chunks/images_$i.zip https://huggingface.co/datasets/FreedomIntelligence/ALLaVA-4V/resolve/main/allava_laion/image_chunks/images_$i.zip?download=true & +done + +mkdir -p images/ +wait + +## 2.2 unzip +for ((i=0; i<10; i++)) +do + unzip -j -o image_chunks/images_$i.zip -d images/ & # wait patiently, it takes a while... +done + +wait +echo "All done!" 
diff --git a/idea1/docs/Makefile b/idea1/docs/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..6b8792c428564ace773add1f751f7c2471a8fe83 --- /dev/null +++ b/idea1/docs/Makefile @@ -0,0 +1,58 @@ +# Minimal Makefile for Sphinx documentation +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SPHINXAUTOBUILD ?= sphinx-autobuild +SOURCEDIR = . +BUILDDIR = _build +PORT ?= 8003 + +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + @echo "" + @echo "Additional targets:" + @echo " serve to build and serve documentation with auto-build and live reload" + +# Compile Notebook files and record execution time +compile: + @set -e; \ + echo "Starting Notebook compilation..."; \ + mkdir -p logs; \ + echo "Notebook execution timings:" > logs/timing.log; \ + START_TOTAL=$$(date +%s); \ + find $(SOURCEDIR) -path "*/_build/*" -prune -o -name "*.ipynb" -print0 | \ + parallel -0 -j3 --halt soon,fail=1 ' \ + NB_NAME=$$(basename {}); \ + START_TIME=$$(date +%s); \ + retry --delay=0 --times=2 -- \ + jupyter nbconvert --to notebook --execute --inplace "{}" \ + --ExecutePreprocessor.timeout=600 \ + --ExecutePreprocessor.kernel_name=python3; \ + RET_CODE=$$?; \ + END_TIME=$$(date +%s); \ + ELAPSED_TIME=$$((END_TIME - START_TIME)); \ + echo "$${NB_NAME}: $${ELAPSED_TIME}s" >> logs/timing.log; \ + exit $$RET_CODE' || exit 1; \ + END_TOTAL=$$(date +%s); \ + TOTAL_ELAPSED=$$((END_TOTAL - START_TOTAL)); \ + echo "---------------------------------" >> logs/timing.log; \ + echo "Total execution time: $${TOTAL_ELAPSED}s" >> logs/timing.log; \ + echo "All Notebook execution timings:" && cat logs/timing.log + +# Serve documentation with auto-build and live reload +serve: + @echo "Starting auto-build server at http://0.0.0.0:$(PORT)" + @$(SPHINXAUTOBUILD) "$(SOURCEDIR)" "$(BUILDDIR)/html" \ + --host 0.0.0.0 \ + --port $(PORT) \ + --watch $(SOURCEDIR) \ + --re-ignore ".*\.(ipynb_checkpoints|pyc|pyo|pyd|git)" + +.PHONY: help Makefile compile clean serve 
+ +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +clean: + find . -name "*.ipynb" -exec nbstripout {} \; + rm -rf $(BUILDDIR) + rm -rf logs diff --git a/idea1/docs/README.md b/idea1/docs/README.md new file mode 100644 index 0000000000000000000000000000000000000000..592f0e51a0f9be1b4aa959867fb526ed4003c149 --- /dev/null +++ b/idea1/docs/README.md @@ -0,0 +1,55 @@ +# SpecForge Documentation + +We recommend new contributors to start from writing documentation, which helps you quickly understand the SpecForge codebase. +Most documentation files are located under the `docs/` folder. + +## Docs Workflow + +### Install Dependency + +```bash +apt-get update && apt-get install -y pandoc parallel retry +pip install -r requirements.txt +``` + +### Update Documentation + +Update your Jupyter notebooks in the appropriate subdirectories under `docs/`. If you add new files, remember to update `index.rst` (or relevant `.rst` files) accordingly. + +- **`pre-commit run --all-files`** manually runs all configured checks, applying fixes if possible. If it fails the first time, re-run it to ensure lint errors are fully resolved. Make sure your code passes all checks **before** creating a Pull Request. + +```bash +# 1) Compile all Jupyter notebooks +make compile # This step can take a long time (10+ mins). You can consider skipping this step if you can make sure your added files are correct. +make html + +# 2) Compile and Preview documentation locally with auto-build +# This will automatically rebuild docs when files change +# Open your browser at the displayed port to view the docs +bash serve.sh + +# 2a) Alternative ways to serve documentation +# Directly use make serve +make serve +# With custom port +PORT=8080 make serve + +# 3) Clean notebook outputs +# nbstripout removes notebook outputs so your PR stays clean +pip install nbstripout +find . 
-name '*.ipynb' -exec nbstripout {} \; + +# 4) Pre-commit checks and create a PR +# After these checks pass, push your changes and open a PR on your branch +pre-commit run --all-files +``` +--- + +## Documentation Style Guidelines + +- For common functionalities, we prefer **Jupyter Notebooks** over Markdown so that all examples can be executed and validated by our docs CI pipeline. For complex features (e.g., distributed serving), Markdown is preferred. +- Keep in mind the documentation execution time when writing interactive Jupyter notebooks. Each interactive notebook will be run and compiled against every commit to ensure they are runnable, so it is important to apply some tips to reduce the documentation compilation time: + - Use small models (e.g., `qwen/qwen2.5-0.5b-instruct`) for most cases to reduce server launch time. + - Reuse the launched server as much as possible to reduce server launch time. +- Do not use absolute links (e.g., `https://docs.sglang.ai/get_started/install.html`). Always prefer relative links (e.g., `../get_started/install.md`). +- Follow the existing examples to learn how to launch a server, send a query and other common styles. 
diff --git a/idea1/docs/_static/css/custom_log.css b/idea1/docs/_static/css/custom_log.css new file mode 100644 index 0000000000000000000000000000000000000000..61f65d0199df9e97886560f7f97c6c9b026bd34e --- /dev/null +++ b/idea1/docs/_static/css/custom_log.css @@ -0,0 +1,29 @@ +.output_area { + color: #615656; +} + +table.autosummary td { + width: 50% + } + + img.align-center { + display: block; + margin-left: auto; + margin-right: auto; +} + +.output_area.stderr { + color: #d3d3d3 !important; +} + +.output_area.stdout { + color: #d3d3d3 !important; +} + +div.output_area.stderr { + color: #d3d3d3 !important; +} + +div.output_area.stdout { + color: #d3d3d3 !important; +} diff --git a/idea1/docs/_static/css/readthedocs.css b/idea1/docs/_static/css/readthedocs.css new file mode 100644 index 0000000000000000000000000000000000000000..aca6649b436a35cf39b2c924ce2f74ed2cdc8b90 --- /dev/null +++ b/idea1/docs/_static/css/readthedocs.css @@ -0,0 +1,9 @@ +table.autosummary td { + width: 50% +} + +img.align-center { + display: block; + margin-left: auto; + margin-right: auto; +} diff --git a/idea1/docs/advanced_features/customization.md b/idea1/docs/advanced_features/customization.md new file mode 100644 index 0000000000000000000000000000000000000000..47b624a9ce461b5f37aa6c159bcb306657c68ed4 --- /dev/null +++ b/idea1/docs/advanced_features/customization.md @@ -0,0 +1,118 @@ +# 💡 Customize Your Own Training + +## 🔧 Customize Training Args + +```bash +torchrun \ + --standalone \ + --nproc_per_node 8 \ + ./scripts/train_eagle3.py \ + --target-model-path meta-llama/Llama-3.1-8B-Instruct \ + --draft-model-config ./configs/llama3-8B-eagle3.json \ + --train-data-path ./cache/dataset/sharegpt.jsonl \ + --output-dir ./outputs/llama3-8b-eagle3 \ + --num-epochs 10 \ + --batch-size 1 \ + --learning-rate 1e-4 \ + --max-length 2048 \ + --chat-template llama3 \ + --cache-dir ./cache +``` + +If you wish to understand what each argument does, you can run `python scripts/train_eagle3.py --help` to 
see the full list of arguments. Particularly, we will discuss some important arguments below. +- `--chat-template`: This should be the chat template to use for the model, so please make sure you set it to the correct value. +- `--cache-dir`: This directory contains the dataset cache including the `input_ids`, `loss_mask`, `attention_mask` and `vocab_mapping`. These caches can make your data loading much faster once a cache is generated. The cache file has a name which is obtained by hashing the dataset path to avoid cache collision. + +## 💬 Customize Chat Template + +You can register a new chat template for your model by adding a new entry to the `TEMPLATE_REGISTRY` in the `specforge.data.template.py` file. + +```python +TEMPLATE_REGISTRY.register( + name="your-template-name", + template=ChatTemplate( + assistant_header="xxx", + user_header="xxx", + system_prompt="xxx", + end_of_turn_token="xxx", + ), +) +``` + +## 🪅 Customize Model + +### Customize Target Model + +If you wish to train Eagle3 for other models, you need to modify the `--target-model-path` value. We support loading these models directly from HuggingFace. + +However, if your model is too large and requires tensor parallelism, you can implement its tensor parallel version on your own in the `specforge.modeling.target` directory. The CausalLM model should inherit the `DistributedTargetModel` class in the `specforge.modeling.target.base.py` file and apply `ColumnParallelLinear` and `RowParallelLinear` to its submodules. + +```python +from .base import DistributedTargetModel +from specforge.layers.linear import ColumnParallelLinear, RowParallelLinear + + +class MyModelForCausalLM(MyModelPreTrainedModel, GenerationMixin, DistributedTargetModel): + ... + + def load_weights(self, state_dict: Dict[str, torch.Tensor]): + ... +``` + +Afterwards, you need to register this model to the `AutoDistributedTargetModel` class in the `specforge.modeling.auto.py` file. 
+ +```diff +class AutoDistributedTargetModel(AutoModelForCausalLMBase): + _model_mapping = { + Llama4TextConfig: [Llama4ForCausalLM], ++ MyModelConfig: [MyModelForCausalLM], + } +``` + +When `tp_size` is greater than 1, the script will automatically load the distributed version of the model for tensor parallelism. + +### Customize Draft Model + +If you want to change the draft model configuration, you can write your own configuration file and pass its path to the `--draft-model-config` argument. Or, if you do not provide the `--draft-model-config` argument, the script will automatically generate the draft model configuration based on the target model configuration. If you wish to serve your customized draft model with SGLang, make sure you implement the draft model in SGLang as well and the architecture name must match. To implement your own draft model, you can create a new class and inherit it from the `Eagle3DraftModel` class in the `specforge.modeling.draft.base.py` file. + + +```python +from .base import Eagle3DraftModel +from transformers import PretrainedConfig + + +class MyModelConfig(PretrainedConfig): + model_type = "mymodel" + + def __init__(self, **kwargs): + ... + + +class MyModelEagle3(Eagle3DraftModel): + + config_class = MyModelConfig + + def __init__(self, config, quant_config=None) -> None: + ... +``` + +You can then register these models to the `AutoEagle3DraftModel` and `AutoDraftModelConfig` classes in the `specforge.modeling.auto.py` file for the automatic model loading. 
+ +```diff +class AutoEagle3DraftModel(AutoModelForCausalLMBase): + # the model mapping is currently hardcoded, we should support lazy model mapping via registry + _model_mapping = { + LlamaConfig: [LlamaForCausalLMEagle3], ++ MyModelConfig: MyModelEagle3, + } + + +class AutoDraftModelConfig: + + _config_mapping = { + "LlamaForCausalLMEagle3": LlamaConfig, ++ "MyModelEagle3": MyModelConfig, + } +``` + +In this way, as long as your `config.json` specifies the correct architecture name, the script will automatically load the correct draft model for you. diff --git a/idea1/docs/basic_usage/data_preparation.md b/idea1/docs/basic_usage/data_preparation.md new file mode 100644 index 0000000000000000000000000000000000000000..e19938add815ada108d47a1d0c54d3545e63e7fc --- /dev/null +++ b/idea1/docs/basic_usage/data_preparation.md @@ -0,0 +1,128 @@ +# 📝 Data Preparation + +## 📍 Overview + +Data is an important aspect of speculative decoding as the quality of the dataset directly affects the acceptance rate of the draft model. In this section, we will introduce how to prepare the dataset for both online and offline training. + +## ☁️ Pre-supported Datasets + +We have provided a script to prepare some sample datasets out of the box, these datasets include: +1. [ultrachat](https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k) (200k) +2. [sharegpt](https://huggingface.co/datasets/Aeala/ShareGPT_Vicuna_unfiltered) (120k) +3. [perfectblend](https://huggingface.co/datasets/mlabonne/open-perfectblend) (1.4M) +4. and others (we continuously add support for more datasets) + +You can run the script below to prepare the corresponding dataset. + +```bash +# ultrachat +python scripts/prepare_data.py --dataset ultrachat + +# sharegpt +python scripts/prepare_data.py --dataset sharegpt +``` + +You can view the full list of pre-supported datasets using `python scripts/prepare_data.py --help`. 
The datasets are processed and saved as `jsonl` files in the `cache/dataset/` directory of the project path by default. + + +## ↩️ Regenerate Datasets + +When training speculative decoding draft models for a specific target model, instead of using the original dataset, we can regenerate the assistant responses using the target model to better align the draft model with the target model's output distribution. This will improve the acceptance rate of the draft model and the overall performance of the speculative decoding. According to the [EAGLE1 paper](https://arxiv.org/pdf/2401.15077), the EAGLE method is not very sensitive to the dataset quality, which means the performance is still good even if you use the original dataset. However, if you are looking for optimal performance in the production environment, it is recommended to regenerate the dataset using the target model. + +We can follow the following steps to regenerate the dataset. In the example below, we will use `meta-llama/Llama-3.1-8B-Instruct` as an example, you can replace it with your own target model. + +1. Start the SGLang server for the target model. + +```shell +python3 -m sglang.launch_server \ + --model meta-llama/Llama-3.1-8B-Instruct \ + --cuda-graph-bs 1 2 4 8 16 32 64 128 \ + --dtype bfloat16 \ + --mem-frac=0.8 \ + --port 30000 +``` + +2. Regenerate the dataset using the `regenerate_train_data.py` script. + +```shell +python scripts/regenerate_train_data.py \ + --model meta-llama/Llama-3.1-8B-Instruct \ + --concurrency 128 \ + --max-tokens 98304 \ + --server-address localhost:30000 \ + --temperature 0.8 \ + --input-file-path ./cache/dataset/sharegpt_train.jsonl \ + --output-file-path ./cache/dataset/sharegpt_train_regen.jsonl +``` + +For maximum performance, we recommend to scale the number of GPUs to regenerate the dataset in data parallel mode. To do this, you can simply add more server addresses to the `--server-address` argument, e.g. 
`--server-address localhost:30000 localhost:30001 localhost:30002 localhost:30003`. + + +## 🤩 Prepare your own dataset + +Besides the provided datasets, you can also prepare your own dataset. We support two formats: + +#### Option 1: Conversation Format + +You should prepare the dataset in jsonl format and the schema should look like this: + +```json +{ + "id": "xxxx", + "conversations": [ + { + "role": "user | assistant", + "content": "The message content" + } + ] +} +``` + +#### Option 2: Pre-formatted Text Format + +If you already have conversations formatted with a specific chat template, you can use the pre-formatted text directly: + +```json +{ + "id": "xxxx", + "text": "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nHello<|im_end|>\n<|im_start|>assistant\nHi there!<|im_end|>\n" +} +``` + +This format is useful when you have pre-formatted prompts that were used during training of the target model and have raw generations from the target model. + +To use pre-formatted datasets, add the `--is-preformatted` flag to your training command. Note that the `--chat-template` parameter is still needed and should match the template used in your pre-formatted text, as it is used to identify user/assistant tokens to determine the assistant spans and generate the corresponding loss mask. + +```bash +# Online training with pre-formatted data +torchrun --standalone --nproc_per_node 8 \ + scripts/train_eagle3.py \ + --is-preformatted \ + --train-data-path ./your_preformatted_dataset.jsonl \ + # ... 
other arguments +``` + +For offline training, you can also use `--is-preformatted` when generating hidden states: + +```bash +# Generate hidden states from pre-formatted data +torchrun --nproc_per_node=8 \ + scripts/prepare_hidden_states.py \ + --target-model-path meta-llama/Llama-3.1-8B-Instruct \ + --data-path ./your_preformatted_dataset.jsonl \ + --output-path ./cache/hidden_states \ + --chat-template llama3 \ + --is-preformatted \ + --max-length 2048 +``` + +Once you have the `jsonl` file ready, you can proceed with online training or generate hidden states for offline training. See the Training guide for more details. + + +## ➕ Handling Multiple Datasets + +If you have multiple datasets, you can just merge them into one jsonl file. For example, you can do something like this: + +```bash +cat dataset1.jsonl dataset2.jsonl > merged_dataset.jsonl +``` diff --git a/idea1/docs/basic_usage/training.md b/idea1/docs/basic_usage/training.md new file mode 100644 index 0000000000000000000000000000000000000000..a41b5a0dee1a9a12620f25ae26f613f4711d0b7c --- /dev/null +++ b/idea1/docs/basic_usage/training.md @@ -0,0 +1,62 @@ +## 🚀 Training + +## 📍 Overview + +Existing speculative decoding methods such as EAGLE3 require training in the feature space, which means the draft model relies on the hidden states generated from the target model for autoregressive prediction. In SpecForge, we provide two orthogonal paths to cater to the users' specific needs when training this kind of draft model. We name these two methods `Online` and `Offline`. As the names suggest, they are easy to understand: + +- **`Online`**: the hidden states are generated on the fly during training. +- **`Offline`**: the hidden states are generated beforehand, stored to the disk, and loaded back to GPU during training. + +Online training is suitable for users with limited disk space but sufficient GPUs while offline training is suitable for users with sufficient disk space but limited GPUs. 
+ +| Method | Target Model | Disk Space Requirement | GPU Requirement | One-liner rationale | +| --- | --- | --- | --- | --- | +| Online | Used during training | Small | More GPUs are needed if your target model is large | Generating auxiliary hidden states on the fly | +| Offline | Only used during data preparation | Huge (e.g. ultrachat+sharegpt will need 12TB storage) | As few as 1 GPU, since it only needs to accommodate the draft model | Preparing auxiliary hidden states beforehand and only once | + +> **Why does disk matter?** +> During Eagle3 training, the frozen target model will first generate the hidden states for each token given the data sample. The hidden states are fed to the draft model for training. +> Offline mode stores these hidden states to the local disk, so a small disk can be filled up fast. +> Online mode only generates these hidden states on the fly without storing them to the disk, but needs to keep the target model resident in memory during training, trading GPU RAM for almost-zero disk footprint. + +## 🏎️ Online Training + +We have provided training scripts for the EAGLE3 models in the `examples` directory. These scripts cover a wide range of models, ranging from Llama to Qwen, small to large and dense to MoE. Online training is often conducted in two steps and we will use ShareGPT and Llama3-8B-Instruct as an example. + +**Step 1: Prepare the dataset** + +```bash +# prepare the dataset +python scripts/prepare_data.py --dataset sharegpt +``` + +**Step 2: Start the training** + +```bash +# train llama3-8B-instruct +bash ./examples/run_llama3.1_8b_eagle3_online.sh +``` + +## 💨 Offline Training + +The difference between online and offline training is that, for offline training, we need to generate the hidden states before training. We also use ShareGPT and Llama3-8B-Instruct as an example. 
+ +**Step 1: Prepare the dataset** + +Same as above + +**Step 2: Generate the hidden states and train** + +```bash +# train llama3-8B-instruct in an offline manner +bash ./examples/run_llama3.1_8b_eagle3_offline.sh +``` + +It is important to note that the `run_llama3.1_8b_eagle3_offline.sh` script consists of two steps: + +1. Generate the hidden states using the `prepare_hidden_states.py` script. This script will generate the hidden states for the test and train datasets and save them to the disk. +2. Train the model: supply the `--train-hidden-states-path` argument to the script so that the script will load the hidden states from the disk during training. + +## 📈 Experiment Tracking + +This project supports logging training progress to Wandb, TensorBoard, and SwanLab. You can enable tracking by adding the `--report-to` argument to the command line in your shell script. diff --git a/idea1/docs/benchmarks/benchmark.md b/idea1/docs/benchmarks/benchmark.md new file mode 100644 index 0000000000000000000000000000000000000000..29a51b35d5d7639ebd666202aad3377063e4ee12 --- /dev/null +++ b/idea1/docs/benchmarks/benchmark.md @@ -0,0 +1,67 @@ +# Benchmarking for Speculative Decoding + +## Overview + +We provide a unified script to test the performance of the Speculative Decoding with EAGLE3 algorithm on multiple datasets. You can follow the steps below to run the benchmarks. + +## Run Benchmarks + +### Launch SGLang and Benchmarker Concurrently + +`bench_eagle3.py` can help you launch a SGLang server process and a Benchmarking process concurrently. In this way, you don't have to launch the SGLang server manually; this script will automatically handle the SGLang launch under different speculative decoding configurations. Some important arguments are: +- `--model-path`: the path to the target model. +- `--speculative-draft-model-path`: the path to the draft model. +- `--port`: the port to launch the SGLang server. +- `--trust-remote-code`: trust the remote code. 
+- `--mem-fraction-static`: the memory fraction for the static memory. +- `--tp-size`: the tensor parallelism size. +- `--attention-backend`: the attention backend. +- `--config-list`: the list of speculative decoding configurations to test; the format is `<batch-size>,<num-steps>,<topk>,<num-draft-tokens>`. +- `--benchmark-list`: the list of benchmarks to test; the format is `<benchmark-name>:<num-samples>:<subset>` (the last two fields are optional). + +```shell +python3 bench_eagle3.py \ + --model-path meta-llama/Llama-3.1-8B-Instruct \ + --speculative-draft-model-path lmsys/sglang-EAGLE3-LLaMA3.1-Instruct-8B \ + --port 30000 \ + --trust-remote-code \ + --mem-fraction-static 0.8 \ + --tp-size 1 \ + --attention-backend fa3 \ + --config-list 1,0,0,0 1,3,1,4 \ + --benchmark-list mtbench gsm8k:5 ceval:5:accountant \ + --dtype bfloat16 +``` + +### Launch Benchmarker Independently + +If you want to launch the SGLang server independently, you can use the following command. + +```shell +# you can launch a server +python3 -m sglang.launch_server \ + --model meta-llama/Llama-3.1-8B-Instruct \ + --speculative-algorithm EAGLE3 \ + --speculative-draft-model-path lmsys/sglang-EAGLE3-LLaMA3.1-Instruct-8B \ + --speculative-num-steps 3 \ + --speculative-eagle-topk 1 \ + --speculative-num-draft-tokens 4 \ + --mem-fraction-static 0.75 \ + --cuda-graph-max-bs 1 \ + --tp 1 \ + --trust-remote-code \ + --host 0.0.0.0 \ + --port 30000 \ + --dtype bfloat16 +``` + +Then we can start benchmarking. Note that you should use the same host and port as the one used in the SGLang server. Note that `--skip-launch-server` is required to skip the launch of the SGLang server. 
+ +```bash +python bench_eagle3.py \ + --model-path meta-llama/Llama-3.1-8B-Instruct \ + --port 30000 \ + --config-list 1,3,1,4 \ + --benchmark-list mtbench:5 ceval:5:accountant gsm8k:5 humaneval:5 math500:5 mtbench:5 aime:1 \ + --skip-launch-server +``` diff --git a/idea1/docs/benchmarks/dashboard.md b/idea1/docs/benchmarks/dashboard.md new file mode 100644 index 0000000000000000000000000000000000000000..32209b54ae412153db153fd5efd9bab47db424d6 --- /dev/null +++ b/idea1/docs/benchmarks/dashboard.md @@ -0,0 +1,24 @@ +# Interactive Benchmark Dashboard + +View the interactive benchmark dashboard to explore SpecForge performance results: + +**[🚀 Spec Bundle](../dashboard/index.html)** + + +The dashboard displays the following key metrics: + +- **Acceptance Length**: Average number of tokens accepted per speculation step +- **Throughput**: Output tokens generated per second (tokens/s) +- **Speedup**: Performance improvement ratio over baseline + +## Benchmark Datasets + +View results across multiple benchmarks: +- MTBench +- HumanEval +- GSM8K +- Math500 + +--- + +If the dashboard doesn't load, please ensure JavaScript is enabled in your browser. diff --git a/idea1/docs/community_resources/dashboard.md b/idea1/docs/community_resources/dashboard.md new file mode 100644 index 0000000000000000000000000000000000000000..e7075db4bcb29e8ea3e40b16494fd7413544c3b1 --- /dev/null +++ b/idea1/docs/community_resources/dashboard.md @@ -0,0 +1,19 @@ +# 📈 Performance Dashboard + +## Overview + +To better visualize the performance of the SpecBundle draft models, we have built a dashboard to offer interactive experiences to users to explore the evaluation results. We evaluate the performance of SpecBundle draft models under different speculative decoding configurations (i.e. 
steps, topk, num_draft_tokens) on various benchmarks, the benchmarks include: + +- Conversation + - MTBench +- General Knowledge + - GPQA + - FinanceQA +- Math + - GSM8K + - Math500 +- Coding + - HumanEval + - LiveCodeBench + +Check out the [Performance Dashboard](https://docs.sglang.io/SpecForge/SpecBundle/index.html) for more details. diff --git a/idea1/docs/community_resources/specbundle.md b/idea1/docs/community_resources/specbundle.md new file mode 100644 index 0000000000000000000000000000000000000000..5efb84e84e72a98be42ce445eff6c0a5e7d6bcda --- /dev/null +++ b/idea1/docs/community_resources/specbundle.md @@ -0,0 +1,93 @@ +# 🔥 SpecBundle + +
+ specbundle logo +
+ + +## About SpecBundle + +Speculative decoding, especially EAGLE3, offers strong theoretical guarantees alongside consistent empirical improvements in token acceptance rate and end-to-end inference speed. However, despite these advances, adoption of speculative decoding—especially EAGLE3—remains limited in the open-source ecosystem, due primarily to three key factors. + +1. Lack of production-ready training infrastructure: Existing speculative decoding toolchains are largely research prototypes, offering limited system-level optimization and inadequate support for diverse architectures and large-scale models. +2. Scarcity of high-quality draft models: Effective speculative decoding depends on strong draft models, yet publicly available EAGLE3-compatible checkpoints are extremely limited, primarily originating from the original authors. +3. Insufficient training scale of existing drafts: Most available draft models are trained on small or curated datasets and fail to generalize to the large, diverse corpora used in modern LLM training, resulting in low token acceptance rates and diminished practical speedups. + +**SpecBundle** is a direct response to these limitations. Jointly driven by the open-source community and industry partners including **Ant Group**, **Meituan**, **Nex-AGI** and **EigenAI**, **SpecBundle** represents the **first open initiative** aimed at democratizing speculative decoding by providing high-performance, production-grade EAGLE3 draft model weights for mainstream open-source LLMs. This initiative also serves to verify the robustness of the **SpecForge** framework through multiple scales and architectures. + +We call for all open-source developers and industry partners to join this exciting initiative. + +## Performance Scores + +We evaluate the performance of SpecBundle draft models on various benchmarks; please visit the [Performance Dashboard](https://docs.sglang.io/SpecForge/SpecBundle/index.html) for more details. 
+ +## Usage + +You can use the following command to launch the SGLang server with SpecBundle models. Please add `--tp`, `--ep` and `--mem-fraction-static` arguments when you encounter memory issues. + +```bash +python3 -m sglang.launch_server \ + --model \ + --speculative-algorithm EAGLE3 \ + --speculative-draft-model-path \ + --speculative-num-steps 3 \ + --speculative-eagle-topk 1 \ + --speculative-num-draft-tokens 4 +``` + +## Released Models + +We list the models released by the SpecForge and several industrial partners below. These models are released as part of the SpecBundle models, which are trained on large-scale multi-domain datasets and deliver exceptional performance on various benchmarks. + +> We also include some of the models previously trained by the SpecForge team but not technically part of the SpecBundle release. +> We mark models trained on ShareGPT+Ultrachat datasets with a **\*** mark and models trained on Perfect-Blend datasets but released before SpecBundle with **+** mark. 
+ +### Llama Series + +| Target Model | EAGLE3 Draft Model | Regenerated Dataset | +|---------------|--------------------|--------------------| +| meta-llama/Llama-3.1-8B-Instruct | [🤗 Model](https://huggingface.co/lmsys/SGLang-EAGLE3-Llama-3.1-8B-Instruct-SpecForge) | [🤗 Dataset](https://huggingface.co/datasets/frankleeeee/PerfectBlend-Regenerated-Llama-3.1-8B-Instruct) | +| meta-llama/Llama-3.3-70B-Instruct | [🤗 Model](https://huggingface.co/lmsys/SGLang-EAGLE3-Llama-3.3-70B-Instruct-SpecForge) | [🤗 Dataset](https://huggingface.co/datasets/frankleeeee/PerfectBlend-Regenerated-Llama-3.3-70B-Instruct) | +| meta-llama/Llama-4-Scout-17B-16E-Instruct | [🤗 Model](https://huggingface.co/lmsys/SGLang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-SpecForge) | [🤗 Dataset](https://huggingface.co/datasets/frankleeeee/PerfectBlend-Regenerated-Llama-4-Scout-17B-16E-Instruct) | +| meta-llama/Llama-4-Maverick-17B-128E-Instruct | [🤗 Model *](https://huggingface.co/lmsys/sglang-EAGLE3-Llama-4-Maverick-17B-128E-Instruct-v1) | [🤗 Dataset](https://huggingface.co/datasets/frankleeeee/PerfectBlend-Regenerated-Llama-4-Maverick-17B-128E-Instruct) | + +### Qwen Series + +| Target Model | EAGLE3 Draft Model | Regenerated Dataset | +|---------------|--------------------|--------------------| +| Qwen/Qwen3-30B-A3B-Instruct-2507 | [🤗 Model](https://huggingface.co/lmsys/SGLang-EAGLE3-Qwen3-30B-A3B-Instruct-2507-SpecForge-Nex) | [🤗 Dataset](https://huggingface.co/datasets/lukeysong/qwen-30b-regen-blend) | +| Qwen/Qwen3-235B-A22B-Instruct-2507 | [🤗 Model](https://huggingface.co/lmsys/SGLang-EAGLE3-Qwen3-235B-A22B-Instruct-2507-SpecForge-Meituan) | [🤗 Dataset](https://huggingface.co/datasets/lukeysong/qwen3-235-regen-perfect_blend) | +| Qwen/Qwen3-Next-80B-A3B-Instruct-FP8 | [🤗 Model](https://huggingface.co/lmsys/SGLang-EAGLE3-Qwen3-Next-80B-A3B-Instruct-FP8-perfect-blend-regenerated) | [🤗 Dataset](https://huggingface.co/datasets/lukeysong/qwen3-80b-regen-prefectblend) | + +### Qwen Coder Series + +| 
Target Model | EAGLE3 Draft Model | Regenerated Dataset | +|---------------|--------------------|--------------------| +| Qwen/Qwen3-Coder-30B-A3B-Instruct | [🤗 Model](https://huggingface.co/lmsys/SGLang-EAGLE3-Qwen3-Coder-30B-A3B-Instruct-SpecForge) | [🤗 Dataset](https://huggingface.co/datasets/JinnP/opc_regen_Qwen3-Coder-30B-A3B-Instruct) | +| Qwen/Qwen3-Coder-480B-A35B-Instruct | [🤗 Model](https://huggingface.co/lmsys/SGLang-EAGLE3-Qwen3-Coder-480B-A35B-Instruct-SpecForge-EigenAI) | - | + +### Ling Series + +| Target Model | EAGLE3 Draft Model | Regenerated Dataset | +|---------------|--------------------|--------------------| +| inclusionAI/Ling-flash-2.0 | [🤗 Model](https://huggingface.co/AQ-MedAI/Ling-Flash-2.0-eagle3) | - | + +### Kimi Series + +| Target Model | EAGLE3 Draft Model | Regenerated Dataset | +|---------------|--------------------|--------------------| +| moonshotai/Kimi-K2-Instruct | [🤗 Model](https://huggingface.co/AQ-MedAI/Kimi-K2-Instruct-eagle3) | - | + +### GPT-OSS Series + +| Target Model | EAGLE3 Draft Model | Regenerated Dataset | +|---------------|--------------------|--------------------| +| openai/gpt-oss-20b | [🤗 Model +](https://huggingface.co/zhuyksir/EAGLE3-gpt-oss-20b-bf16) | [🤗 Dataset](https://huggingface.co/datasets/zhuyksir/perfect-blend-gptoss-20B-1M) | +| openai/gpt-oss-120b | [🤗 Model +](https://huggingface.co/lmsys/EAGLE3-gpt-oss-120b-bf16) | - | + +### Nex Series + +| Target Model | EAGLE3 Draft Model | Regenerated Dataset | +|---------------|--------------------|--------------------| +| nex-agi/Qwen3-30B-A3B-Nex-N1 | [🤗 Model](https://huggingface.co/nex-agi/SGLANG-EAGLE3-Qwen3-30B-A3B-Nex-N1) | - | +| nex-agi/Qwen3-32B-Nex-N1 | [🤗 Model](https://huggingface.co/nex-agi/SGLANG-EAGLE3-Qwen3-32B-Nex-N1) | - | diff --git a/idea1/docs/concepts/EAGLE3.md b/idea1/docs/concepts/EAGLE3.md new file mode 100644 index 0000000000000000000000000000000000000000..fff48dc0a8c49a5e2df53917ee723a6515fb7830 --- /dev/null +++ 
b/idea1/docs/concepts/EAGLE3.md @@ -0,0 +1,19 @@ +# 🦅 EAGLE3 + +## 📍 Overview + +In earlier speculative decoding practice, we usually choose a small language model from the same family as the draft model. For example, we can use `Llama-3.1-8B-Instruct` as the draft model and `Llama-3.1-70B-Instruct` as the target model. However, this approach is not always feasible because the small language model may not always be available. Thus, researchers have proposed training a separate small model as the speculator; this type of model usually uses the target model's hidden states or KV cache as input to predict the next few tokens. + +Among models of this type, EAGLE3 is the state of the art and has been integrated into [SGLang](https://github.com/sgl-project/sglang). It relies on the hidden states of the target model and often consists of only one dense decoder layer. Before you read on, you can revisit the details of [speculative decoding](./speculative_decoding.md) first if you are not familiar with it. + +## 🔧 How it works? + +

+ EAGLE3
+ Source: Blog by NVIDIA +

+ +The workflow of EAGLE3 is shown in the animation above. It differs from other speculative decoding methods in several ways: +1. **`Feature-based Drafting`**: Unlike other speculative decoding methods, which directly feed the tokens to the draft model to generate predictions, EAGLE3 operates in the feature space. It extracts 3 hidden states from the target model, taken from 3 layers at different depths, and concatenates them to form a single feature vector. This feature vector is then fed to the draft model to generate predictions. +2. **`Training-time Test`**: During training, EAGLE3 simulates the autoregressive generation process by autoregressively generating the next few tokens. It then computes the loss between the predicted output sequence and the ground truth sequence. This method improves the draft model performance because it reduces the generation errors accumulated from previous tokens, leading to a higher acceptance rate. +3. **`Dynamic Draft Tree`**: EAGLE3 uses a dynamic draft tree to store the candidate tokens as proposed in [EAGLE2](https://arxiv.org/abs/2406.16858). In simple words, it will only store the candidate tokens that are most likely to be accepted by the target model to improve the acceptance rate. diff --git a/idea1/docs/concepts/speculative_decoding.md b/idea1/docs/concepts/speculative_decoding.md new file mode 100644 index 0000000000000000000000000000000000000000..283c6167332e958c923ec6b72da58013676089e1 --- /dev/null +++ b/idea1/docs/concepts/speculative_decoding.md @@ -0,0 +1,30 @@ +# 💭 Speculative Decoding + +## 📍 Overview + +One existing challenge of LLM inference is the latency. As LLMs autoregressively generate the output token by token, the decoding process is largely bottlenecked by the memory bandwidth, i.e. the inference engine needs to load the whole model weights into memory for each token generation. 
The idea of speculative decoding stems from the thought that we can use a small model to predict the next few tokens in advance and let our main model verify these tokens in parallel. As the decoding process is memory-bound, the time taken to verify multiple tokens is comparable to the time taken to generate a single token. In this way, we can speed up the decoding process significantly by speculating the next few tokens in advance. + +## 🔧 How it works? + +In speculative decoding, we have two models: +1. **`Target Model`**: a large model that is intended to serve the users, e.g. the model you want to deploy for production. +2. **`Draft Model`**: a small model that is trained to predict the next few tokens in advance. It can take various forms, e.g. an n-gram model, a pretrained small language model (often from the same model family), or a separately trained small model (EAGLE). + +

+ Drafting
+ Source: Blog by NVIDIA +

+ +The role of the draft model is to predict the next few tokens in advance, and the role of the target model is to verify the tokens predicted by the draft model. As shown in the animation above, the workflow of speculative decoding can be decomposed into 3 stages: + +- **`prefill`**: the target model will first take the prompt as the input and run the prefill stage. +- **`drafting`**: Afterwards, we let the draft model iteratively predict the next N candidate tokens. Since the draft model is often much smaller than the target model, the drafting time is insignificant. +- **`verification`**: We then pass the N candidate tokens to the target model to verify in parallel. Since this stage is memory-bound, increasing the number of tokens does not increase the latency significantly. If a token is accepted by the target model, it will be added to the output sequence; otherwise, it will be discarded. The draft model will continue to predict the next tokens based on the last accepted token and this process will repeat until the end of the sequence is reached. + +One advantage of speculative decoding is that it guarantees the output distribution is the same as that of using the target model alone. This is because the target model will decide the acceptance of the candidate tokens using rejection sampling. The speculative decoding paper provides a mathematical proof of its correctness in the [appendix section](https://arxiv.org/pdf/2211.17192#page=10.10). +In simple words, it will only accept the candidate tokens that are most likely to be correct. Let us denote the probability of a token under the target model as $p(x)$ and the probability of the same token under the draft model as $q(x)$. If $q(x) \le p(x)$, then the token will be accepted. If $q(x) > p(x)$, the target model will reject the token with probability $1 - p(x)/q(x)$ and sample a new token from the distribution $p'(x) = \text{norm}(\max(0, p(x) - q(x)))$. 
Below is an animation of the verification process. + +

+ Verification
+ Source: Blog by NVIDIA +

diff --git a/idea1/docs/conf.py b/idea1/docs/conf.py new file mode 100644 index 0000000000000000000000000000000000000000..f1fef2396e931693259e82aee2e78cdb77d6c256 --- /dev/null +++ b/idea1/docs/conf.py @@ -0,0 +1,188 @@ +import os +import sys +from datetime import datetime +from pathlib import Path + +sys.path.insert(0, os.path.abspath("../..")) + +DOCS_PATH = Path(__file__).parent +ROOT_PATH = DOCS_PATH.parent + +version_file = ROOT_PATH.joinpath("version.txt") +with open(version_file, "r") as f: + __version__ = f.read().strip() + +project = "SGLang" +copyright = f"2025-{datetime.now().year}, SpecForge" +author = "SpecForge Team" + +version = __version__ +release = __version__ + +extensions = [ + "sphinx.ext.autodoc", + "sphinx.ext.autosummary", + "sphinx.ext.napoleon", + "sphinx.ext.viewcode", + "sphinx.ext.autosectionlabel", + "sphinx.ext.intersphinx", + "sphinx_tabs.tabs", + "myst_parser", + "sphinx_copybutton", + "sphinxcontrib.mermaid", + "nbsphinx", + "sphinx.ext.mathjax", +] + +nbsphinx_allow_errors = True +nbsphinx_execute = "never" + +autosectionlabel_prefix_document = True +nbsphinx_allow_directives = True + + +myst_enable_extensions = [ + "dollarmath", + "amsmath", + "deflist", + "colon_fence", + "html_image", + "substitution", +] + +myst_heading_anchors = 5 + +nbsphinx_kernel_name = "python3" +nbsphinx_execute_arguments = [ + "--InlineBackend.figure_formats={'svg', 'pdf'}", + "--InlineBackend.rc={'figure.dpi': 96}", +] + + +nb_render_priority = { + "html": ( + "application/vnd.jupyter.widget-view+json", + "application/javascript", + "text/html", + "image/svg+xml", + "image/png", + "image/jpeg", + "text/markdown", + "text/latex", + "text/plain", + ) +} + +myst_ref_domains = ["std", "py"] + +templates_path = ["_templates"] + +source_suffix = { + ".rst": "restructuredtext", + ".md": "markdown", +} + +master_doc = "index" + +language = "en" + +exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] + +pygments_style = "sphinx" + +html_theme = 
"sphinx_book_theme" +html_logo = ROOT_PATH.joinpath("assets/logo.png").as_posix() +html_favicon = ROOT_PATH.joinpath("assets/logo.ico").as_posix() +html_title = project +html_copy_source = True +html_last_updated_fmt = "" + +html_theme_options = { + "repository_url": "https://github.com/sgl-project/sgl-project.github.io", + "repository_branch": "main", + "show_navbar_depth": 3, + "max_navbar_depth": 4, + "collapse_navbar": True, + "use_edit_page_button": True, + "use_source_button": True, + "use_issues_button": True, + "use_repository_button": True, + "use_download_button": True, + "use_sidenotes": True, + "show_toc_level": 2, +} + +html_context = { + "display_github": True, + "github_user": "sgl-project", + "github_repo": "sgl-project.github.io", + "github_version": "main", + "conf_py_path": "/docs/", +} + +html_static_path = ["_static", "spec_bundle/public"] +html_css_files = ["css/custom_log.css"] + + +def setup(app): + app.add_css_file("css/custom_log.css") + + +htmlhelp_basename = "sglangdoc" + +latex_elements = {} + +latex_documents = [ + (master_doc, "sglang.tex", "sglang Documentation", "SGLang Team", "manual"), +] + +man_pages = [(master_doc, "sglang", "sglang Documentation", [author], 1)] + +texinfo_documents = [ + ( + master_doc, + "sglang", + "sglang Documentation", + author, + "sglang", + "One line description of project.", + "Miscellaneous", + ), +] + +epub_title = project + +epub_exclude_files = ["search.html"] + +copybutton_prompt_text = r">>> |\.\.\. 
" +copybutton_prompt_is_regexp = True + +autodoc_preserve_defaults = True +navigation_with_keys = False + +autodoc_mock_imports = [ + "torch", + "transformers", + "triton", +] + +intersphinx_mapping = { + "python": ("https://docs.python.org/3.12", None), + "typing_extensions": ("https://typing-extensions.readthedocs.io/en/latest", None), + "pillow": ("https://pillow.readthedocs.io/en/stable", None), + "numpy": ("https://numpy.org/doc/stable", None), + "torch": ("https://pytorch.org/docs/stable", None), +} + +html_theme = "sphinx_book_theme" + + +nbsphinx_prolog = """ +.. raw:: html + + +""" diff --git a/idea1/docs/deploy.py b/idea1/docs/deploy.py new file mode 100644 index 0000000000000000000000000000000000000000..75b7ea7f23dce0a5deb17c28d78b5cc59833a4d6 --- /dev/null +++ b/idea1/docs/deploy.py @@ -0,0 +1,22 @@ +# Deploy the documents + +import os +from datetime import datetime + + +def run_cmd(cmd): + print(cmd) + os.system(cmd) + + +run_cmd("cd $DOC_SITE_PATH; git pull") + +# (Optional) Remove old files +# run_cmd("rm -rf $ALPA_SITE_PATH/*") + +run_cmd("cp -r _build/html/* $DOC_SITE_PATH") + +cmd_message = f"Update {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}" +run_cmd( + f"cd $DOC_SITE_PATH; git add .; git commit -m '{cmd_message}'; git push origin main" +) diff --git a/idea1/docs/examples/llama3-eagle3-offline.md b/idea1/docs/examples/llama3-eagle3-offline.md new file mode 100644 index 0000000000000000000000000000000000000000..a8449cc612073c9ca76804891ca9339fad9c8ca1 --- /dev/null +++ b/idea1/docs/examples/llama3-eagle3-offline.md @@ -0,0 +1,57 @@ +# Eagle3 for Llama3 - Offline + +## Introduction + +This document provides a step-by-step guide on how to train the EAGLE3 model for the Llama3.1-8B-Instruct model in an offline manner. In offline training, we generate the hidden states required by EAGLE3 draft model beforehand and store them to the disk. During training, we load them back to the GPU memory. 
As offline training requires a lot of disk space, we do not recommend running this on large datasets such as Perfect-Blend. + +## Training on ShareGPT dataset + +### **Step 1. Prepare ShareGPT dataset** + +First of all, we should download the dataset. + +```shell +python ./scripts/prepare_data.py --dataset sharegpt +``` + +### **Step 2. Prepare Hidden States** + +We need to prepare the hidden states for the training. + +```shell +torchrun \ + --standalone \ + --nproc_per_node 8 \ + scripts/prepare_hidden_states.py \ + --target-model-path meta-llama/Llama-3.1-8B-Instruct \ + --enable-aux-hidden-states \ + --data-path ./cache/dataset/sharegpt_train.jsonl \ + --output-path ./cache/hidden_states/sharegpt_train_Llama-3.1-8B-Instruct \ + --chat-template llama3 \ + --max-length 4096 \ + --tp-size 1 \ + --batch-size 32 +``` + +The hidden states will be saved to the disk in the `output-path` directory. + +### **Step 3. Start Training** + +```shell +torchrun \ + --standalone \ + --nproc_per_node 8 \ + ./scripts/train_eagle3.py \ + --target-model-path meta-llama/Llama-3.1-8B-Instruct \ + --draft-model-config ./configs/llama3-8B-eagle3.json \ + --train-data-path ./cache/dataset/sharegpt_train.jsonl \ + --train-hidden-states-path ./cache/hidden_states/sharegpt_train_Llama-3.1-8B-Instruct \ + --output-dir ./outputs/llama3-8b-eagle3-sharegpt-offline \ + --num-epochs 10 \ + --batch-size 1 \ + --tp-size 1 \ + --learning-rate 1e-4 \ + --max-length 4096 \ + --chat-template llama3 \ + --cache-dir ./cache +``` diff --git a/idea1/docs/examples/llama3-eagle3-online.md b/idea1/docs/examples/llama3-eagle3-online.md new file mode 100644 index 0000000000000000000000000000000000000000..13dd2fdd1c9ed9b9f06505a52b5db272c5a3bd49 --- /dev/null +++ b/idea1/docs/examples/llama3-eagle3-online.md @@ -0,0 +1,75 @@ +# Eagle3 for Llama3 - Online + +## Introduction + +This document provides a step-by-step guide on how to train the EAGLE3 model for the Llama3.1-8B-Instruct model in an online manner. 
In online training, we generate the hidden states required by the EAGLE3 draft model on the fly during training. This example uses the `ShareGPT` dataset for training; the performance is not optimal due to the size and limited coverage of the dataset. If you are looking for optimal performance, we recommend trying more diverse datasets such as [`Perfect-Blend`](https://huggingface.co/datasets/facebook/perfect-blend). We have also included a section on training on the `Perfect-Blend` dataset at the end of this document. + + +## Training on ShareGPT dataset + +### **Step 1. Prepare ShareGPT dataset** + +First of all, we should download the dataset. + +```shell +python ./scripts/prepare_data.py --dataset sharegpt +``` + +### **Step 2. Launch Online Training** + +```shell +torchrun \ + --standalone \ + --nproc_per_node 8 \ + scripts/train_eagle3.py \ + --target-model-path meta-llama/Llama-3.1-8B-Instruct \ + --draft-model-config configs/llama3-8B-eagle3.json \ + --train-data-path ./cache/dataset/sharegpt_train.jsonl \ + --output-dir ./outputs/llama3-8b-eagle3 \ + --num-epochs 2 \ + --batch-size 1 \ + --learning-rate 1e-4 \ + --max-length 4096 \ + --chat-template llama3 \ + --target-model-backend sglang +``` + +### **Step 3. Benchmark** + +For `Llama3.1-8B`, we add a system prompt to all training data, following the approach used in the official repository. Consequently, when benchmarking, we should also include this system prompt to obtain the full accept length. Please uncomment the corresponding line and add the system prompt. + +The four numbers in the config represent: `batch_size, num_steps, topk, num_verify_tokens`. You can adjust the values in the config list to experiment with different test cases. + +A pre-trained EAGLE model is available at [zhuyksir/EAGLE3-Llama-3.1-8B-Instruct](https://huggingface.co/zhuyksir/EAGLE3-Llama-3.1-8B-Instruct) for reference. 
+ +```shell +cd benchmarks + +config_list=( + "4,3,1,4" + "4,7,10,60" +) +python3 bench_eagle3.py \ + --model-path meta-llama/Llama-3.1-8B-Instruct \ + --speculative-draft-model-path /YOUR/PATH/Llama-3.1-8B-Instruct/dev_outputs/epoch_0 \ + --port 30000 \ + --mem-fraction-static 0.8 \ + --tp-size 1 \ + --config-list "${config_list[@]}" \ + --benchmark-list mtbench gsm8k humaneval math500 +``` + + +## Training on Perfect-Blend dataset + +### **Step 1. Prepare Perfect-Blend dataset** + +First of all, we should download the dataset. + +```shell +python ./scripts/prepare_data.py --dataset perfectblend +``` + +### **Step 2. Launch Online Training** + +We just need to change the `--train-data-path` to the path of the Perfect-Blend dataset (e.g. `./cache/dataset/perfectblend_train.jsonl`), then we can launch training smoothly. diff --git a/idea1/docs/get_started/about.md b/idea1/docs/get_started/about.md new file mode 100644 index 0000000000000000000000000000000000000000..e98794b56bc94d4f54cd08ce95405d4792dfb002 --- /dev/null +++ b/idea1/docs/get_started/about.md @@ -0,0 +1,13 @@ +# ⚡️ About SpecForge + +## 💡 Motivation + +Speculative decoding is an important and powerful technique for speeding up inference without losing performance. Industries have used it extensively in production to better serve their users with lower latency and higher throughput. We have seen some open-source projects for training speculative decoding models, but most of them are not well-maintained or not directly compatible with SGLang. 
We prepared this project because we want the open-source community to have a speculative decoding framework that is + +- regularly maintained by the SGLang team: the code is runnable out-of-the-box +- directly compatible with SGLang: no additional effort is needed to port models to SGLang +- capable of performant training: we provide online/offline/tensor-parallel/FSDP training to suit your needs + +## ✅ SGLang-ready + +As SpecForge is built by the SGLang team, we ensure that the draft models trained with SpecForge are directly compatible with [SGLang](https://github.com/sgl-project/sglang). This means that no postprocessing or weight conversion is required, providing users with a seamless experience from training to serving. We export our data in the Hugging Face format, so you can load it into other serving frameworks as well if the model is supported by them. diff --git a/idea1/docs/get_started/installation.md b/idea1/docs/get_started/installation.md new file mode 100644 index 0000000000000000000000000000000000000000..e37169a11389c977667569e069436a348090b30f --- /dev/null +++ b/idea1/docs/get_started/installation.md @@ -0,0 +1,26 @@ +# 🚀 Get Started + +## 📦 Installation + +To install this project, you can simply run the following command. + +- **Install from source (recommended)** + +```bash +# git clone the source code +git clone https://github.com/sgl-project/SpecForge.git +cd SpecForge + +# create a new virtual environment +uv venv -p 3.11 +source .venv/bin/activate + +# install specforge +uv pip install -v . --prerelease=allow +``` + +- **Install from PyPI** + +```bash +pip install specforge +``` diff --git a/idea1/docs/index.rst b/idea1/docs/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..bc2c694798793eddd894f5bd94fde539b9fb06b8 --- /dev/null +++ b/idea1/docs/index.rst @@ -0,0 +1,53 @@ +SpecForge Documentation +======================= + +SpecForge is an ecosystem project developed by the SGLang team. 
It is a framework for training speculative decoding models so that you can smoothly port them over to the SGLang serving framework to speed up your inference. + + +.. toctree:: + :maxdepth: 1 + :caption: Get Started + + get_started/installation.md + get_started/about.md + +.. toctree:: + :maxdepth: 1 + :caption: Concepts + + concepts/speculative_decoding.md + concepts/EAGLE3.md + + +.. toctree:: + :maxdepth: 1 + :caption: Basic Usage + + basic_usage/data_preparation.md + basic_usage/training.md + +.. toctree:: + :maxdepth: 1 + :caption: Advanced Features + + advanced_features/customization.md + +.. toctree:: + :maxdepth: 1 + :caption: Community Resources + + community_resources/specbundle.md + community_resources/dashboard.md + +.. toctree:: + :maxdepth: 1 + :caption: Examples + + examples/llama3-eagle3-online.md + examples/llama3-eagle3-offline.md + +.. toctree:: + :maxdepth: 1 + :caption: Benchmarks + + benchmarks/benchmark.md diff --git a/idea1/docs/requirements.txt b/idea1/docs/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..1a7e5d4eba2f265cb2dce4eff31d770eb71125f3 --- /dev/null +++ b/idea1/docs/requirements.txt @@ -0,0 +1,20 @@ +ipykernel +ipywidgets +jupyter_client +markdown>=3.4.0 +matplotlib +myst-parser +nbconvert +nbsphinx +pandoc +pillow +pydantic +sphinx +sphinx-book-theme +sphinx-copybutton +sphinx-tabs +nbstripout +sphinxcontrib-mermaid +urllib3<2.0.0 +gguf>=0.10.0 +sphinx-autobuild diff --git a/idea1/docs/serve.sh b/idea1/docs/serve.sh new file mode 100644 index 0000000000000000000000000000000000000000..049f767cf497a5fd92b1dac0af2fc13fdcf3fa69 --- /dev/null +++ b/idea1/docs/serve.sh @@ -0,0 +1,3 @@ +# Clean and serve documentation with auto-build +make clean +make serve diff --git a/idea1/docs/spec_bundle/index.html b/idea1/docs/spec_bundle/index.html new file mode 100644 index 0000000000000000000000000000000000000000..ad336a93a9fd136ac55768562e96a1d8f324d001 --- /dev/null +++ b/idea1/docs/spec_bundle/index.html 
@@ -0,0 +1,21 @@ + + + + + + + + + + + SpecBundle + + + +
+ + + + diff --git a/idea1/docs/spec_bundle/package-lock.json b/idea1/docs/spec_bundle/package-lock.json new file mode 100644 index 0000000000000000000000000000000000000000..806b12c06b0402351fc4f8a900bc7303a86faa6a --- /dev/null +++ b/idea1/docs/spec_bundle/package-lock.json @@ -0,0 +1,1438 @@ +{ + "name": "specforge-spec-bundle", + "version": "1.0.0", + "lockfileVersion": 3, + "requires": true, + "packages": { + "": { + "name": "specforge-spec-bundle", + "version": "1.0.0", + "license": "MIT", + "dependencies": { + "echarts": "^6.0.0", + "papaparse": "^5.5.3", + "vue": "^3.5.24", + "vue-echarts": "^8.0.1", + "xlsx": "^0.18.5" + }, + "devDependencies": { + "@vitejs/plugin-vue": "^6.0.1", + "vite": "^7.2.4" + } + }, + "node_modules/@babel/helper-string-parser": { + "version": "7.27.1", + "resolved": "https://registry.npmmirror.com/@babel/helper-string-parser/-/helper-string-parser-7.27.1.tgz", + "integrity": "sha512-qMlSxKbpRlAridDExk92nSobyDdpPijUq2DW6oDnUqd0iOGxmQjyqhMIihI9+zv4LPyZdRje2cavWPbCbWm3eA==", + "license": "MIT", + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/helper-validator-identifier": { + "version": "7.28.5", + "resolved": "https://registry.npmmirror.com/@babel/helper-validator-identifier/-/helper-validator-identifier-7.28.5.tgz", + "integrity": "sha512-qSs4ifwzKJSV39ucNjsvc6WVHs6b7S03sOh2OcHF9UHfVPqWWALUsNUVzhSBiItjRZoLHx7nIarVjqKVusUZ1Q==", + "license": "MIT", + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/parser": { + "version": "7.28.5", + "resolved": "https://registry.npmmirror.com/@babel/parser/-/parser-7.28.5.tgz", + "integrity": "sha512-KKBU1VGYR7ORr3At5HAtUQ+TV3SzRCXmA/8OdDZiLDBIZxVyzXuztPjfLd3BV1PRAQGCMWWSHYhL0F8d5uHBDQ==", + "license": "MIT", + "dependencies": { + "@babel/types": "^7.28.5" + }, + "bin": { + "parser": "bin/babel-parser.js" + }, + "engines": { + "node": ">=6.0.0" + } + }, + "node_modules/@babel/types": { + "version": "7.28.5", + "resolved": 
"https://registry.npmmirror.com/@babel/types/-/types-7.28.5.tgz", + "integrity": "sha512-qQ5m48eI/MFLQ5PxQj4PFaprjyCTLI37ElWMmNs0K8Lk3dVeOdNpB3ks8jc7yM5CDmVC73eMVk/trk3fgmrUpA==", + "license": "MIT", + "dependencies": { + "@babel/helper-string-parser": "^7.27.1", + "@babel/helper-validator-identifier": "^7.28.5" + }, + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@esbuild/aix-ppc64": { + "version": "0.25.12", + "resolved": "https://registry.npmmirror.com/@esbuild/aix-ppc64/-/aix-ppc64-0.25.12.tgz", + "integrity": "sha512-Hhmwd6CInZ3dwpuGTF8fJG6yoWmsToE+vYgD4nytZVxcu1ulHpUQRAB1UJ8+N1Am3Mz4+xOByoQoSZf4D+CpkA==", + "cpu": [ + "ppc64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "aix" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/android-arm": { + "version": "0.25.12", + "resolved": "https://registry.npmmirror.com/@esbuild/android-arm/-/android-arm-0.25.12.tgz", + "integrity": "sha512-VJ+sKvNA/GE7Ccacc9Cha7bpS8nyzVv0jdVgwNDaR4gDMC/2TTRc33Ip8qrNYUcpkOHUT5OZ0bUcNNVZQ9RLlg==", + "cpu": [ + "arm" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/android-arm64": { + "version": "0.25.12", + "resolved": "https://registry.npmmirror.com/@esbuild/android-arm64/-/android-arm64-0.25.12.tgz", + "integrity": "sha512-6AAmLG7zwD1Z159jCKPvAxZd4y/VTO0VkprYy+3N2FtJ8+BQWFXU+OxARIwA46c5tdD9SsKGZ/1ocqBS/gAKHg==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/android-x64": { + "version": "0.25.12", + "resolved": "https://registry.npmmirror.com/@esbuild/android-x64/-/android-x64-0.25.12.tgz", + "integrity": "sha512-5jbb+2hhDHx5phYR2By8GTWEzn6I9UqR11Kwf22iKbNpYrsmRB18aX/9ivc5cabcUiAT/wM+YIZ6SG9QO6a8kg==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + 
"android" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/darwin-arm64": { + "version": "0.25.12", + "resolved": "https://registry.npmmirror.com/@esbuild/darwin-arm64/-/darwin-arm64-0.25.12.tgz", + "integrity": "sha512-N3zl+lxHCifgIlcMUP5016ESkeQjLj/959RxxNYIthIg+CQHInujFuXeWbWMgnTo4cp5XVHqFPmpyu9J65C1Yg==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/darwin-x64": { + "version": "0.25.12", + "resolved": "https://registry.npmmirror.com/@esbuild/darwin-x64/-/darwin-x64-0.25.12.tgz", + "integrity": "sha512-HQ9ka4Kx21qHXwtlTUVbKJOAnmG1ipXhdWTmNXiPzPfWKpXqASVcWdnf2bnL73wgjNrFXAa3yYvBSd9pzfEIpA==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/freebsd-arm64": { + "version": "0.25.12", + "resolved": "https://registry.npmmirror.com/@esbuild/freebsd-arm64/-/freebsd-arm64-0.25.12.tgz", + "integrity": "sha512-gA0Bx759+7Jve03K1S0vkOu5Lg/85dou3EseOGUes8flVOGxbhDDh/iZaoek11Y8mtyKPGF3vP8XhnkDEAmzeg==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "freebsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/freebsd-x64": { + "version": "0.25.12", + "resolved": "https://registry.npmmirror.com/@esbuild/freebsd-x64/-/freebsd-x64-0.25.12.tgz", + "integrity": "sha512-TGbO26Yw2xsHzxtbVFGEXBFH0FRAP7gtcPE7P5yP7wGy7cXK2oO7RyOhL5NLiqTlBh47XhmIUXuGciXEqYFfBQ==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "freebsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-arm": { + "version": "0.25.12", + "resolved": "https://registry.npmmirror.com/@esbuild/linux-arm/-/linux-arm-0.25.12.tgz", + "integrity": 
"sha512-lPDGyC1JPDou8kGcywY0YILzWlhhnRjdof3UlcoqYmS9El818LLfJJc3PXXgZHrHCAKs/Z2SeZtDJr5MrkxtOw==", + "cpu": [ + "arm" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-arm64": { + "version": "0.25.12", + "resolved": "https://registry.npmmirror.com/@esbuild/linux-arm64/-/linux-arm64-0.25.12.tgz", + "integrity": "sha512-8bwX7a8FghIgrupcxb4aUmYDLp8pX06rGh5HqDT7bB+8Rdells6mHvrFHHW2JAOPZUbnjUpKTLg6ECyzvas2AQ==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-ia32": { + "version": "0.25.12", + "resolved": "https://registry.npmmirror.com/@esbuild/linux-ia32/-/linux-ia32-0.25.12.tgz", + "integrity": "sha512-0y9KrdVnbMM2/vG8KfU0byhUN+EFCny9+8g202gYqSSVMonbsCfLjUO+rCci7pM0WBEtz+oK/PIwHkzxkyharA==", + "cpu": [ + "ia32" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-loong64": { + "version": "0.25.12", + "resolved": "https://registry.npmmirror.com/@esbuild/linux-loong64/-/linux-loong64-0.25.12.tgz", + "integrity": "sha512-h///Lr5a9rib/v1GGqXVGzjL4TMvVTv+s1DPoxQdz7l/AYv6LDSxdIwzxkrPW438oUXiDtwM10o9PmwS/6Z0Ng==", + "cpu": [ + "loong64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-mips64el": { + "version": "0.25.12", + "resolved": "https://registry.npmmirror.com/@esbuild/linux-mips64el/-/linux-mips64el-0.25.12.tgz", + "integrity": "sha512-iyRrM1Pzy9GFMDLsXn1iHUm18nhKnNMWscjmp4+hpafcZjrr2WbT//d20xaGljXDBYHqRcl8HnxbX6uaA/eGVw==", + "cpu": [ + "mips64el" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-ppc64": { + "version": 
"0.25.12", + "resolved": "https://registry.npmmirror.com/@esbuild/linux-ppc64/-/linux-ppc64-0.25.12.tgz", + "integrity": "sha512-9meM/lRXxMi5PSUqEXRCtVjEZBGwB7P/D4yT8UG/mwIdze2aV4Vo6U5gD3+RsoHXKkHCfSxZKzmDssVlRj1QQA==", + "cpu": [ + "ppc64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-riscv64": { + "version": "0.25.12", + "resolved": "https://registry.npmmirror.com/@esbuild/linux-riscv64/-/linux-riscv64-0.25.12.tgz", + "integrity": "sha512-Zr7KR4hgKUpWAwb1f3o5ygT04MzqVrGEGXGLnj15YQDJErYu/BGg+wmFlIDOdJp0PmB0lLvxFIOXZgFRrdjR0w==", + "cpu": [ + "riscv64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-s390x": { + "version": "0.25.12", + "resolved": "https://registry.npmmirror.com/@esbuild/linux-s390x/-/linux-s390x-0.25.12.tgz", + "integrity": "sha512-MsKncOcgTNvdtiISc/jZs/Zf8d0cl/t3gYWX8J9ubBnVOwlk65UIEEvgBORTiljloIWnBzLs4qhzPkJcitIzIg==", + "cpu": [ + "s390x" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-x64": { + "version": "0.25.12", + "resolved": "https://registry.npmmirror.com/@esbuild/linux-x64/-/linux-x64-0.25.12.tgz", + "integrity": "sha512-uqZMTLr/zR/ed4jIGnwSLkaHmPjOjJvnm6TVVitAa08SLS9Z0VM8wIRx7gWbJB5/J54YuIMInDquWyYvQLZkgw==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/netbsd-arm64": { + "version": "0.25.12", + "resolved": "https://registry.npmmirror.com/@esbuild/netbsd-arm64/-/netbsd-arm64-0.25.12.tgz", + "integrity": "sha512-xXwcTq4GhRM7J9A8Gv5boanHhRa/Q9KLVmcyXHCTaM4wKfIpWkdXiMog/KsnxzJ0A1+nD+zoecuzqPmCRyBGjg==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + 
"netbsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/netbsd-x64": { + "version": "0.25.12", + "resolved": "https://registry.npmmirror.com/@esbuild/netbsd-x64/-/netbsd-x64-0.25.12.tgz", + "integrity": "sha512-Ld5pTlzPy3YwGec4OuHh1aCVCRvOXdH8DgRjfDy/oumVovmuSzWfnSJg+VtakB9Cm0gxNO9BzWkj6mtO1FMXkQ==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "netbsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/openbsd-arm64": { + "version": "0.25.12", + "resolved": "https://registry.npmmirror.com/@esbuild/openbsd-arm64/-/openbsd-arm64-0.25.12.tgz", + "integrity": "sha512-fF96T6KsBo/pkQI950FARU9apGNTSlZGsv1jZBAlcLL1MLjLNIWPBkj5NlSz8aAzYKg+eNqknrUJ24QBybeR5A==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "openbsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/openbsd-x64": { + "version": "0.25.12", + "resolved": "https://registry.npmmirror.com/@esbuild/openbsd-x64/-/openbsd-x64-0.25.12.tgz", + "integrity": "sha512-MZyXUkZHjQxUvzK7rN8DJ3SRmrVrke8ZyRusHlP+kuwqTcfWLyqMOE3sScPPyeIXN/mDJIfGXvcMqCgYKekoQw==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "openbsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/openharmony-arm64": { + "version": "0.25.12", + "resolved": "https://registry.npmmirror.com/@esbuild/openharmony-arm64/-/openharmony-arm64-0.25.12.tgz", + "integrity": "sha512-rm0YWsqUSRrjncSXGA7Zv78Nbnw4XL6/dzr20cyrQf7ZmRcsovpcRBdhD43Nuk3y7XIoW2OxMVvwuRvk9XdASg==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "openharmony" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/sunos-x64": { + "version": "0.25.12", + "resolved": "https://registry.npmmirror.com/@esbuild/sunos-x64/-/sunos-x64-0.25.12.tgz", + "integrity": 
"sha512-3wGSCDyuTHQUzt0nV7bocDy72r2lI33QL3gkDNGkod22EsYl04sMf0qLb8luNKTOmgF/eDEDP5BFNwoBKH441w==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "sunos" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/win32-arm64": { + "version": "0.25.12", + "resolved": "https://registry.npmmirror.com/@esbuild/win32-arm64/-/win32-arm64-0.25.12.tgz", + "integrity": "sha512-rMmLrur64A7+DKlnSuwqUdRKyd3UE7oPJZmnljqEptesKM8wx9J8gx5u0+9Pq0fQQW8vqeKebwNXdfOyP+8Bsg==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/win32-ia32": { + "version": "0.25.12", + "resolved": "https://registry.npmmirror.com/@esbuild/win32-ia32/-/win32-ia32-0.25.12.tgz", + "integrity": "sha512-HkqnmmBoCbCwxUKKNPBixiWDGCpQGVsrQfJoVGYLPT41XWF8lHuE5N6WhVia2n4o5QK5M4tYr21827fNhi4byQ==", + "cpu": [ + "ia32" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/win32-x64": { + "version": "0.25.12", + "resolved": "https://registry.npmmirror.com/@esbuild/win32-x64/-/win32-x64-0.25.12.tgz", + "integrity": "sha512-alJC0uCZpTFrSL0CCDjcgleBXPnCrEAhTBILpeAp7M/OFgoqtAetfBzX0xM00MUsVVPpVjlPuMbREqnZCXaTnA==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@jridgewell/sourcemap-codec": { + "version": "1.5.5", + "resolved": "https://registry.npmmirror.com/@jridgewell/sourcemap-codec/-/sourcemap-codec-1.5.5.tgz", + "integrity": "sha512-cYQ9310grqxueWbl+WuIUIaiUaDcj7WOq5fVhEljNVgRfOUhY9fy2zTvfoqWsnebh8Sl70VScFbICvJnLKB0Og==", + "license": "MIT" + }, + "node_modules/@rolldown/pluginutils": { + "version": "1.0.0-beta.50", + "resolved": "https://registry.npmmirror.com/@rolldown/pluginutils/-/pluginutils-1.0.0-beta.50.tgz", + "integrity": 
"sha512-5e76wQiQVeL1ICOZVUg4LSOVYg9jyhGCin+icYozhsUzM+fHE7kddi1bdiE0jwVqTfkjba3jUFbEkoC9WkdvyA==", + "dev": true, + "license": "MIT" + }, + "node_modules/@rollup/rollup-android-arm-eabi": { + "version": "4.53.3", + "resolved": "https://registry.npmmirror.com/@rollup/rollup-android-arm-eabi/-/rollup-android-arm-eabi-4.53.3.tgz", + "integrity": "sha512-mRSi+4cBjrRLoaal2PnqH82Wqyb+d3HsPUN/W+WslCXsZsyHa9ZeQQX/pQsZaVIWDkPcpV6jJ+3KLbTbgnwv8w==", + "cpu": [ + "arm" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "android" + ] + }, + "node_modules/@rollup/rollup-android-arm64": { + "version": "4.53.3", + "resolved": "https://registry.npmmirror.com/@rollup/rollup-android-arm64/-/rollup-android-arm64-4.53.3.tgz", + "integrity": "sha512-CbDGaMpdE9sh7sCmTrTUyllhrg65t6SwhjlMJsLr+J8YjFuPmCEjbBSx4Z/e4SmDyH3aB5hGaJUP2ltV/vcs4w==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "android" + ] + }, + "node_modules/@rollup/rollup-darwin-arm64": { + "version": "4.53.3", + "resolved": "https://registry.npmmirror.com/@rollup/rollup-darwin-arm64/-/rollup-darwin-arm64-4.53.3.tgz", + "integrity": "sha512-Nr7SlQeqIBpOV6BHHGZgYBuSdanCXuw09hon14MGOLGmXAFYjx1wNvquVPmpZnl0tLjg25dEdr4IQ6GgyToCUA==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ] + }, + "node_modules/@rollup/rollup-darwin-x64": { + "version": "4.53.3", + "resolved": "https://registry.npmmirror.com/@rollup/rollup-darwin-x64/-/rollup-darwin-x64-4.53.3.tgz", + "integrity": "sha512-DZ8N4CSNfl965CmPktJ8oBnfYr3F8dTTNBQkRlffnUarJ2ohudQD17sZBa097J8xhQ26AwhHJ5mvUyQW8ddTsQ==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ] + }, + "node_modules/@rollup/rollup-freebsd-arm64": { + "version": "4.53.3", + "resolved": "https://registry.npmmirror.com/@rollup/rollup-freebsd-arm64/-/rollup-freebsd-arm64-4.53.3.tgz", + "integrity": 
"sha512-yMTrCrK92aGyi7GuDNtGn2sNW+Gdb4vErx4t3Gv/Tr+1zRb8ax4z8GWVRfr3Jw8zJWvpGHNpss3vVlbF58DZ4w==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "freebsd" + ] + }, + "node_modules/@rollup/rollup-freebsd-x64": { + "version": "4.53.3", + "resolved": "https://registry.npmmirror.com/@rollup/rollup-freebsd-x64/-/rollup-freebsd-x64-4.53.3.tgz", + "integrity": "sha512-lMfF8X7QhdQzseM6XaX0vbno2m3hlyZFhwcndRMw8fbAGUGL3WFMBdK0hbUBIUYcEcMhVLr1SIamDeuLBnXS+Q==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "freebsd" + ] + }, + "node_modules/@rollup/rollup-linux-arm-gnueabihf": { + "version": "4.53.3", + "resolved": "https://registry.npmmirror.com/@rollup/rollup-linux-arm-gnueabihf/-/rollup-linux-arm-gnueabihf-4.53.3.tgz", + "integrity": "sha512-k9oD15soC/Ln6d2Wv/JOFPzZXIAIFLp6B+i14KhxAfnq76ajt0EhYc5YPeX6W1xJkAdItcVT+JhKl1QZh44/qw==", + "cpu": [ + "arm" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-arm-musleabihf": { + "version": "4.53.3", + "resolved": "https://registry.npmmirror.com/@rollup/rollup-linux-arm-musleabihf/-/rollup-linux-arm-musleabihf-4.53.3.tgz", + "integrity": "sha512-vTNlKq+N6CK/8UktsrFuc+/7NlEYVxgaEgRXVUVK258Z5ymho29skzW1sutgYjqNnquGwVUObAaxae8rZ6YMhg==", + "cpu": [ + "arm" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-arm64-gnu": { + "version": "4.53.3", + "resolved": "https://registry.npmmirror.com/@rollup/rollup-linux-arm64-gnu/-/rollup-linux-arm64-gnu-4.53.3.tgz", + "integrity": "sha512-RGrFLWgMhSxRs/EWJMIFM1O5Mzuz3Xy3/mnxJp/5cVhZ2XoCAxJnmNsEyeMJtpK+wu0FJFWz+QF4mjCA7AUQ3w==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-arm64-musl": { + "version": "4.53.3", + "resolved": 
"https://registry.npmmirror.com/@rollup/rollup-linux-arm64-musl/-/rollup-linux-arm64-musl-4.53.3.tgz", + "integrity": "sha512-kASyvfBEWYPEwe0Qv4nfu6pNkITLTb32p4yTgzFCocHnJLAHs+9LjUu9ONIhvfT/5lv4YS5muBHyuV84epBo/A==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-loong64-gnu": { + "version": "4.53.3", + "resolved": "https://registry.npmmirror.com/@rollup/rollup-linux-loong64-gnu/-/rollup-linux-loong64-gnu-4.53.3.tgz", + "integrity": "sha512-JiuKcp2teLJwQ7vkJ95EwESWkNRFJD7TQgYmCnrPtlu50b4XvT5MOmurWNrCj3IFdyjBQ5p9vnrX4JM6I8OE7g==", + "cpu": [ + "loong64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-ppc64-gnu": { + "version": "4.53.3", + "resolved": "https://registry.npmmirror.com/@rollup/rollup-linux-ppc64-gnu/-/rollup-linux-ppc64-gnu-4.53.3.tgz", + "integrity": "sha512-EoGSa8nd6d3T7zLuqdojxC20oBfNT8nexBbB/rkxgKj5T5vhpAQKKnD+h3UkoMuTyXkP5jTjK/ccNRmQrPNDuw==", + "cpu": [ + "ppc64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-riscv64-gnu": { + "version": "4.53.3", + "resolved": "https://registry.npmmirror.com/@rollup/rollup-linux-riscv64-gnu/-/rollup-linux-riscv64-gnu-4.53.3.tgz", + "integrity": "sha512-4s+Wped2IHXHPnAEbIB0YWBv7SDohqxobiiPA1FIWZpX+w9o2i4LezzH/NkFUl8LRci/8udci6cLq+jJQlh+0g==", + "cpu": [ + "riscv64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-riscv64-musl": { + "version": "4.53.3", + "resolved": "https://registry.npmmirror.com/@rollup/rollup-linux-riscv64-musl/-/rollup-linux-riscv64-musl-4.53.3.tgz", + "integrity": "sha512-68k2g7+0vs2u9CxDt5ktXTngsxOQkSEV/xBbwlqYcUrAVh6P9EgMZvFsnHy4SEiUl46Xf0IObWVbMvPrr2gw8A==", + "cpu": [ + "riscv64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + 
] + }, + "node_modules/@rollup/rollup-linux-s390x-gnu": { + "version": "4.53.3", + "resolved": "https://registry.npmmirror.com/@rollup/rollup-linux-s390x-gnu/-/rollup-linux-s390x-gnu-4.53.3.tgz", + "integrity": "sha512-VYsFMpULAz87ZW6BVYw3I6sWesGpsP9OPcyKe8ofdg9LHxSbRMd7zrVrr5xi/3kMZtpWL/wC+UIJWJYVX5uTKg==", + "cpu": [ + "s390x" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-x64-gnu": { + "version": "4.53.3", + "resolved": "https://registry.npmmirror.com/@rollup/rollup-linux-x64-gnu/-/rollup-linux-x64-gnu-4.53.3.tgz", + "integrity": "sha512-3EhFi1FU6YL8HTUJZ51imGJWEX//ajQPfqWLI3BQq4TlvHy4X0MOr5q3D2Zof/ka0d5FNdPwZXm3Yyib/UEd+w==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-x64-musl": { + "version": "4.53.3", + "resolved": "https://registry.npmmirror.com/@rollup/rollup-linux-x64-musl/-/rollup-linux-x64-musl-4.53.3.tgz", + "integrity": "sha512-eoROhjcc6HbZCJr+tvVT8X4fW3/5g/WkGvvmwz/88sDtSJzO7r/blvoBDgISDiCjDRZmHpwud7h+6Q9JxFwq1Q==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-openharmony-arm64": { + "version": "4.53.3", + "resolved": "https://registry.npmmirror.com/@rollup/rollup-openharmony-arm64/-/rollup-openharmony-arm64-4.53.3.tgz", + "integrity": "sha512-OueLAWgrNSPGAdUdIjSWXw+u/02BRTcnfw9PN41D2vq/JSEPnJnVuBgw18VkN8wcd4fjUs+jFHVM4t9+kBSNLw==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "openharmony" + ] + }, + "node_modules/@rollup/rollup-win32-arm64-msvc": { + "version": "4.53.3", + "resolved": "https://registry.npmmirror.com/@rollup/rollup-win32-arm64-msvc/-/rollup-win32-arm64-msvc-4.53.3.tgz", + "integrity": "sha512-GOFuKpsxR/whszbF/bzydebLiXIHSgsEUp6M0JI8dWvi+fFa1TD6YQa4aSZHtpmh2/uAlj/Dy+nmby3TJ3pkTw==", + "cpu": [ + "arm64" + ], + "dev": 
true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ] + }, + "node_modules/@rollup/rollup-win32-ia32-msvc": { + "version": "4.53.3", + "resolved": "https://registry.npmmirror.com/@rollup/rollup-win32-ia32-msvc/-/rollup-win32-ia32-msvc-4.53.3.tgz", + "integrity": "sha512-iah+THLcBJdpfZ1TstDFbKNznlzoxa8fmnFYK4V67HvmuNYkVdAywJSoteUszvBQ9/HqN2+9AZghbajMsFT+oA==", + "cpu": [ + "ia32" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ] + }, + "node_modules/@rollup/rollup-win32-x64-gnu": { + "version": "4.53.3", + "resolved": "https://registry.npmmirror.com/@rollup/rollup-win32-x64-gnu/-/rollup-win32-x64-gnu-4.53.3.tgz", + "integrity": "sha512-J9QDiOIZlZLdcot5NXEepDkstocktoVjkaKUtqzgzpt2yWjGlbYiKyp05rWwk4nypbYUNoFAztEgixoLaSETkg==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ] + }, + "node_modules/@rollup/rollup-win32-x64-msvc": { + "version": "4.53.3", + "resolved": "https://registry.npmmirror.com/@rollup/rollup-win32-x64-msvc/-/rollup-win32-x64-msvc-4.53.3.tgz", + "integrity": "sha512-UhTd8u31dXadv0MopwGgNOBpUVROFKWVQgAg5N1ESyCz8AuBcMqm4AuTjrwgQKGDfoFuz02EuMRHQIw/frmYKQ==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ] + }, + "node_modules/@types/estree": { + "version": "1.0.8", + "resolved": "https://registry.npmmirror.com/@types/estree/-/estree-1.0.8.tgz", + "integrity": "sha512-dWHzHa2WqEXI/O1E9OjrocMTKJl2mSrEolh1Iomrv6U+JuNwaHXsXx9bLu5gG7BUWFIN0skIQJQ/L1rIex4X6w==", + "dev": true, + "license": "MIT" + }, + "node_modules/@vitejs/plugin-vue": { + "version": "6.0.2", + "resolved": "https://registry.npmmirror.com/@vitejs/plugin-vue/-/plugin-vue-6.0.2.tgz", + "integrity": "sha512-iHmwV3QcVGGvSC1BG5bZ4z6iwa1SOpAPWmnjOErd4Ske+lZua5K9TtAVdx0gMBClJ28DViCbSmZitjWZsWO3LA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@rolldown/pluginutils": "1.0.0-beta.50" + }, + "engines": { + "node": "^20.19.0 || 
>=22.12.0" + }, + "peerDependencies": { + "vite": "^5.0.0 || ^6.0.0 || ^7.0.0", + "vue": "^3.2.25" + } + }, + "node_modules/@vue/compiler-core": { + "version": "3.5.25", + "resolved": "https://registry.npmmirror.com/@vue/compiler-core/-/compiler-core-3.5.25.tgz", + "integrity": "sha512-vay5/oQJdsNHmliWoZfHPoVZZRmnSWhug0BYT34njkYTPqClh3DNWLkZNJBVSjsNMrg0CCrBfoKkjZQPM/QVUw==", + "license": "MIT", + "dependencies": { + "@babel/parser": "^7.28.5", + "@vue/shared": "3.5.25", + "entities": "^4.5.0", + "estree-walker": "^2.0.2", + "source-map-js": "^1.2.1" + } + }, + "node_modules/@vue/compiler-dom": { + "version": "3.5.25", + "resolved": "https://registry.npmmirror.com/@vue/compiler-dom/-/compiler-dom-3.5.25.tgz", + "integrity": "sha512-4We0OAcMZsKgYoGlMjzYvaoErltdFI2/25wqanuTu+S4gismOTRTBPi4IASOjxWdzIwrYSjnqONfKvuqkXzE2Q==", + "license": "MIT", + "dependencies": { + "@vue/compiler-core": "3.5.25", + "@vue/shared": "3.5.25" + } + }, + "node_modules/@vue/compiler-sfc": { + "version": "3.5.25", + "resolved": "https://registry.npmmirror.com/@vue/compiler-sfc/-/compiler-sfc-3.5.25.tgz", + "integrity": "sha512-PUgKp2rn8fFsI++lF2sO7gwO2d9Yj57Utr5yEsDf3GNaQcowCLKL7sf+LvVFvtJDXUp/03+dC6f2+LCv5aK1ag==", + "license": "MIT", + "dependencies": { + "@babel/parser": "^7.28.5", + "@vue/compiler-core": "3.5.25", + "@vue/compiler-dom": "3.5.25", + "@vue/compiler-ssr": "3.5.25", + "@vue/shared": "3.5.25", + "estree-walker": "^2.0.2", + "magic-string": "^0.30.21", + "postcss": "^8.5.6", + "source-map-js": "^1.2.1" + } + }, + "node_modules/@vue/compiler-ssr": { + "version": "3.5.25", + "resolved": "https://registry.npmmirror.com/@vue/compiler-ssr/-/compiler-ssr-3.5.25.tgz", + "integrity": "sha512-ritPSKLBcParnsKYi+GNtbdbrIE1mtuFEJ4U1sWeuOMlIziK5GtOL85t5RhsNy4uWIXPgk+OUdpnXiTdzn8o3A==", + "license": "MIT", + "dependencies": { + "@vue/compiler-dom": "3.5.25", + "@vue/shared": "3.5.25" + } + }, + "node_modules/@vue/reactivity": { + "version": "3.5.25", + "resolved": 
"https://registry.npmmirror.com/@vue/reactivity/-/reactivity-3.5.25.tgz", + "integrity": "sha512-5xfAypCQepv4Jog1U4zn8cZIcbKKFka3AgWHEFQeK65OW+Ys4XybP6z2kKgws4YB43KGpqp5D/K3go2UPPunLA==", + "license": "MIT", + "dependencies": { + "@vue/shared": "3.5.25" + } + }, + "node_modules/@vue/runtime-core": { + "version": "3.5.25", + "resolved": "https://registry.npmmirror.com/@vue/runtime-core/-/runtime-core-3.5.25.tgz", + "integrity": "sha512-Z751v203YWwYzy460bzsYQISDfPjHTl+6Zzwo/a3CsAf+0ccEjQ8c+0CdX1WsumRTHeywvyUFtW6KvNukT/smA==", + "license": "MIT", + "dependencies": { + "@vue/reactivity": "3.5.25", + "@vue/shared": "3.5.25" + } + }, + "node_modules/@vue/runtime-dom": { + "version": "3.5.25", + "resolved": "https://registry.npmmirror.com/@vue/runtime-dom/-/runtime-dom-3.5.25.tgz", + "integrity": "sha512-a4WrkYFbb19i9pjkz38zJBg8wa/rboNERq3+hRRb0dHiJh13c+6kAbgqCPfMaJ2gg4weWD3APZswASOfmKwamA==", + "license": "MIT", + "dependencies": { + "@vue/reactivity": "3.5.25", + "@vue/runtime-core": "3.5.25", + "@vue/shared": "3.5.25", + "csstype": "^3.1.3" + } + }, + "node_modules/@vue/server-renderer": { + "version": "3.5.25", + "resolved": "https://registry.npmmirror.com/@vue/server-renderer/-/server-renderer-3.5.25.tgz", + "integrity": "sha512-UJaXR54vMG61i8XNIzTSf2Q7MOqZHpp8+x3XLGtE3+fL+nQd+k7O5+X3D/uWrnQXOdMw5VPih+Uremcw+u1woQ==", + "license": "MIT", + "dependencies": { + "@vue/compiler-ssr": "3.5.25", + "@vue/shared": "3.5.25" + }, + "peerDependencies": { + "vue": "3.5.25" + } + }, + "node_modules/@vue/shared": { + "version": "3.5.25", + "resolved": "https://registry.npmmirror.com/@vue/shared/-/shared-3.5.25.tgz", + "integrity": "sha512-AbOPdQQnAnzs58H2FrrDxYj/TJfmeS2jdfEEhgiKINy+bnOANmVizIEgq1r+C5zsbs6l1CCQxtcj71rwNQ4jWg==", + "license": "MIT" + }, + "node_modules/adler-32": { + "version": "1.3.1", + "resolved": "https://registry.npmmirror.com/adler-32/-/adler-32-1.3.1.tgz", + "integrity": 
"sha512-ynZ4w/nUUv5rrsR8UUGoe1VC9hZj6V5hU9Qw1HlMDJGEJw5S7TfTErWTjMys6M7vr0YWcPqs3qAr4ss0nDfP+A==", + "license": "Apache-2.0", + "engines": { + "node": ">=0.8" + } + }, + "node_modules/cfb": { + "version": "1.2.2", + "resolved": "https://registry.npmmirror.com/cfb/-/cfb-1.2.2.tgz", + "integrity": "sha512-KfdUZsSOw19/ObEWasvBP/Ac4reZvAGauZhs6S/gqNhXhI7cKwvlH7ulj+dOEYnca4bm4SGo8C1bTAQvnTjgQA==", + "license": "Apache-2.0", + "dependencies": { + "adler-32": "~1.3.0", + "crc-32": "~1.2.0" + }, + "engines": { + "node": ">=0.8" + } + }, + "node_modules/codepage": { + "version": "1.15.0", + "resolved": "https://registry.npmmirror.com/codepage/-/codepage-1.15.0.tgz", + "integrity": "sha512-3g6NUTPd/YtuuGrhMnOMRjFc+LJw/bnMp3+0r/Wcz3IXUuCosKRJvMphm5+Q+bvTVGcJJuRvVLuYba+WojaFaA==", + "license": "Apache-2.0", + "engines": { + "node": ">=0.8" + } + }, + "node_modules/crc-32": { + "version": "1.2.2", + "resolved": "https://registry.npmmirror.com/crc-32/-/crc-32-1.2.2.tgz", + "integrity": "sha512-ROmzCKrTnOwybPcJApAA6WBWij23HVfGVNKqqrZpuyZOHqK2CwHSvpGuyt/UNNvaIjEd8X5IFGp4Mh+Ie1IHJQ==", + "license": "Apache-2.0", + "bin": { + "crc32": "bin/crc32.njs" + }, + "engines": { + "node": ">=0.8" + } + }, + "node_modules/csstype": { + "version": "3.2.3", + "resolved": "https://registry.npmmirror.com/csstype/-/csstype-3.2.3.tgz", + "integrity": "sha512-z1HGKcYy2xA8AGQfwrn0PAy+PB7X/GSj3UVJW9qKyn43xWa+gl5nXmU4qqLMRzWVLFC8KusUX8T/0kCiOYpAIQ==", + "license": "MIT" + }, + "node_modules/echarts": { + "version": "6.0.0", + "resolved": "https://registry.npmmirror.com/echarts/-/echarts-6.0.0.tgz", + "integrity": "sha512-Tte/grDQRiETQP4xz3iZWSvoHrkCQtwqd6hs+mifXcjrCuo2iKWbajFObuLJVBlDIJlOzgQPd1hsaKt/3+OMkQ==", + "license": "Apache-2.0", + "dependencies": { + "tslib": "2.3.0", + "zrender": "6.0.0" + } + }, + "node_modules/entities": { + "version": "4.5.0", + "resolved": "https://registry.npmmirror.com/entities/-/entities-4.5.0.tgz", + "integrity": 
"sha512-V0hjH4dGPh9Ao5p0MoRY6BVqtwCjhz6vI5LT8AJ55H+4g9/4vbHx1I54fS0XuclLhDHArPQCiMjDxjaL8fPxhw==", + "license": "BSD-2-Clause", + "engines": { + "node": ">=0.12" + }, + "funding": { + "url": "https://github.com/fb55/entities?sponsor=1" + } + }, + "node_modules/esbuild": { + "version": "0.25.12", + "resolved": "https://registry.npmmirror.com/esbuild/-/esbuild-0.25.12.tgz", + "integrity": "sha512-bbPBYYrtZbkt6Os6FiTLCTFxvq4tt3JKall1vRwshA3fdVztsLAatFaZobhkBC8/BrPetoa0oksYoKXoG4ryJg==", + "dev": true, + "hasInstallScript": true, + "license": "MIT", + "bin": { + "esbuild": "bin/esbuild" + }, + "engines": { + "node": ">=18" + }, + "optionalDependencies": { + "@esbuild/aix-ppc64": "0.25.12", + "@esbuild/android-arm": "0.25.12", + "@esbuild/android-arm64": "0.25.12", + "@esbuild/android-x64": "0.25.12", + "@esbuild/darwin-arm64": "0.25.12", + "@esbuild/darwin-x64": "0.25.12", + "@esbuild/freebsd-arm64": "0.25.12", + "@esbuild/freebsd-x64": "0.25.12", + "@esbuild/linux-arm": "0.25.12", + "@esbuild/linux-arm64": "0.25.12", + "@esbuild/linux-ia32": "0.25.12", + "@esbuild/linux-loong64": "0.25.12", + "@esbuild/linux-mips64el": "0.25.12", + "@esbuild/linux-ppc64": "0.25.12", + "@esbuild/linux-riscv64": "0.25.12", + "@esbuild/linux-s390x": "0.25.12", + "@esbuild/linux-x64": "0.25.12", + "@esbuild/netbsd-arm64": "0.25.12", + "@esbuild/netbsd-x64": "0.25.12", + "@esbuild/openbsd-arm64": "0.25.12", + "@esbuild/openbsd-x64": "0.25.12", + "@esbuild/openharmony-arm64": "0.25.12", + "@esbuild/sunos-x64": "0.25.12", + "@esbuild/win32-arm64": "0.25.12", + "@esbuild/win32-ia32": "0.25.12", + "@esbuild/win32-x64": "0.25.12" + } + }, + "node_modules/estree-walker": { + "version": "2.0.2", + "resolved": "https://registry.npmmirror.com/estree-walker/-/estree-walker-2.0.2.tgz", + "integrity": "sha512-Rfkk/Mp/DL7JVje3u18FxFujQlTNR2q6QfMSMB7AvCBx91NGj/ba3kCfza0f6dVDbw7YlRf/nDrn7pQrCCyQ/w==", + "license": "MIT" + }, + "node_modules/fdir": { + "version": "6.5.0", + "resolved": 
"https://registry.npmmirror.com/fdir/-/fdir-6.5.0.tgz", + "integrity": "sha512-tIbYtZbucOs0BRGqPJkshJUYdL+SDH7dVM8gjy+ERp3WAUjLEFJE+02kanyHtwjWOnwrKYBiwAmM0p4kLJAnXg==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=12.0.0" + }, + "peerDependencies": { + "picomatch": "^3 || ^4" + }, + "peerDependenciesMeta": { + "picomatch": { + "optional": true + } + } + }, + "node_modules/frac": { + "version": "1.1.2", + "resolved": "https://registry.npmmirror.com/frac/-/frac-1.1.2.tgz", + "integrity": "sha512-w/XBfkibaTl3YDqASwfDUqkna4Z2p9cFSr1aHDt0WoMTECnRfBOv2WArlZILlqgWlmdIlALXGpM2AOhEk5W3IA==", + "license": "Apache-2.0", + "engines": { + "node": ">=0.8" + } + }, + "node_modules/fsevents": { + "version": "2.3.3", + "resolved": "https://registry.npmmirror.com/fsevents/-/fsevents-2.3.3.tgz", + "integrity": "sha512-5xoDfX+fL7faATnagmWPpbFtwh/R77WmMMqqHGS65C3vvB0YHrgF+B1YmZ3441tMj5n63k0212XNoJwzlhffQw==", + "dev": true, + "hasInstallScript": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": "^8.16.0 || ^10.6.0 || >=11.0.0" + } + }, + "node_modules/magic-string": { + "version": "0.30.21", + "resolved": "https://registry.npmmirror.com/magic-string/-/magic-string-0.30.21.tgz", + "integrity": "sha512-vd2F4YUyEXKGcLHoq+TEyCjxueSeHnFxyyjNp80yg0XV4vUhnDer/lvvlqM/arB5bXQN5K2/3oinyCRyx8T2CQ==", + "license": "MIT", + "dependencies": { + "@jridgewell/sourcemap-codec": "^1.5.5" + } + }, + "node_modules/nanoid": { + "version": "3.3.11", + "resolved": "https://registry.npmmirror.com/nanoid/-/nanoid-3.3.11.tgz", + "integrity": "sha512-N8SpfPUnUp1bK+PMYW8qSWdl9U+wwNWI4QKxOYDy9JAro3WMX7p2OeVRF9v+347pnakNevPmiHhNmZ2HbFA76w==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/ai" + } + ], + "license": "MIT", + "bin": { + "nanoid": "bin/nanoid.cjs" + }, + "engines": { + "node": "^10 || ^12 || ^13.7 || ^14 || >=15.0.1" + } + }, + "node_modules/papaparse": { + "version": "5.5.3", + "resolved": 
"https://registry.npmmirror.com/papaparse/-/papaparse-5.5.3.tgz", + "integrity": "sha512-5QvjGxYVjxO59MGU2lHVYpRWBBtKHnlIAcSe1uNFCkkptUh63NFRj0FJQm7nR67puEruUci/ZkjmEFrjCAyP4A==", + "license": "MIT" + }, + "node_modules/picocolors": { + "version": "1.1.1", + "resolved": "https://registry.npmmirror.com/picocolors/-/picocolors-1.1.1.tgz", + "integrity": "sha512-xceH2snhtb5M9liqDsmEw56le376mTZkEX/jEb/RxNFyegNul7eNslCXP9FDj/Lcu0X8KEyMceP2ntpaHrDEVA==", + "license": "ISC" + }, + "node_modules/picomatch": { + "version": "4.0.3", + "resolved": "https://registry.npmmirror.com/picomatch/-/picomatch-4.0.3.tgz", + "integrity": "sha512-5gTmgEY/sqK6gFXLIsQNH19lWb4ebPDLA4SdLP7dsWkIXHWlG66oPuVvXSGFPppYZz8ZDZq0dYYrbHfBCVUb1Q==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://github.com/sponsors/jonschlinkert" + } + }, + "node_modules/postcss": { + "version": "8.5.6", + "resolved": "https://registry.npmmirror.com/postcss/-/postcss-8.5.6.tgz", + "integrity": "sha512-3Ybi1tAuwAP9s0r1UQ2J4n5Y0G05bJkpUIO0/bI9MhwmD70S5aTWbXGBwxHrelT+XM1k6dM0pk+SwNkpTRN7Pg==", + "funding": [ + { + "type": "opencollective", + "url": "https://opencollective.com/postcss/" + }, + { + "type": "tidelift", + "url": "https://tidelift.com/funding/github/npm/postcss" + }, + { + "type": "github", + "url": "https://github.com/sponsors/ai" + } + ], + "license": "MIT", + "dependencies": { + "nanoid": "^3.3.11", + "picocolors": "^1.1.1", + "source-map-js": "^1.2.1" + }, + "engines": { + "node": "^10 || ^12 || >=14" + } + }, + "node_modules/rollup": { + "version": "4.53.3", + "resolved": "https://registry.npmmirror.com/rollup/-/rollup-4.53.3.tgz", + "integrity": "sha512-w8GmOxZfBmKknvdXU1sdM9NHcoQejwF/4mNgj2JuEEdRaHwwF12K7e9eXn1nLZ07ad+du76mkVsyeb2rKGllsA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/estree": "1.0.8" + }, + "bin": { + "rollup": "dist/bin/rollup" + }, + "engines": { + "node": ">=18.0.0", + "npm": ">=8.0.0" + }, + 
"optionalDependencies": { + "@rollup/rollup-android-arm-eabi": "4.53.3", + "@rollup/rollup-android-arm64": "4.53.3", + "@rollup/rollup-darwin-arm64": "4.53.3", + "@rollup/rollup-darwin-x64": "4.53.3", + "@rollup/rollup-freebsd-arm64": "4.53.3", + "@rollup/rollup-freebsd-x64": "4.53.3", + "@rollup/rollup-linux-arm-gnueabihf": "4.53.3", + "@rollup/rollup-linux-arm-musleabihf": "4.53.3", + "@rollup/rollup-linux-arm64-gnu": "4.53.3", + "@rollup/rollup-linux-arm64-musl": "4.53.3", + "@rollup/rollup-linux-loong64-gnu": "4.53.3", + "@rollup/rollup-linux-ppc64-gnu": "4.53.3", + "@rollup/rollup-linux-riscv64-gnu": "4.53.3", + "@rollup/rollup-linux-riscv64-musl": "4.53.3", + "@rollup/rollup-linux-s390x-gnu": "4.53.3", + "@rollup/rollup-linux-x64-gnu": "4.53.3", + "@rollup/rollup-linux-x64-musl": "4.53.3", + "@rollup/rollup-openharmony-arm64": "4.53.3", + "@rollup/rollup-win32-arm64-msvc": "4.53.3", + "@rollup/rollup-win32-ia32-msvc": "4.53.3", + "@rollup/rollup-win32-x64-gnu": "4.53.3", + "@rollup/rollup-win32-x64-msvc": "4.53.3", + "fsevents": "~2.3.2" + } + }, + "node_modules/source-map-js": { + "version": "1.2.1", + "resolved": "https://registry.npmmirror.com/source-map-js/-/source-map-js-1.2.1.tgz", + "integrity": "sha512-UXWMKhLOwVKb728IUtQPXxfYU+usdybtUrK/8uGE8CQMvrhOpwvzDBwj0QhSL7MQc7vIsISBG8VQ8+IDQxpfQA==", + "license": "BSD-3-Clause", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/ssf": { + "version": "0.11.2", + "resolved": "https://registry.npmmirror.com/ssf/-/ssf-0.11.2.tgz", + "integrity": "sha512-+idbmIXoYET47hH+d7dfm2epdOMUDjqcB4648sTZ+t2JwoyBFL/insLfB/racrDmsKB3diwsDA696pZMieAC5g==", + "license": "Apache-2.0", + "dependencies": { + "frac": "~1.1.2" + }, + "engines": { + "node": ">=0.8" + } + }, + "node_modules/tinyglobby": { + "version": "0.2.15", + "resolved": "https://registry.npmmirror.com/tinyglobby/-/tinyglobby-0.2.15.tgz", + "integrity": "sha512-j2Zq4NyQYG5XMST4cbs02Ak8iJUdxRM0XI5QyxXuZOzKOINmWurp3smXu3y5wDcJrptwpSjgXHzIQxR0omXljQ==", + 
"dev": true, + "license": "MIT", + "dependencies": { + "fdir": "^6.5.0", + "picomatch": "^4.0.3" + }, + "engines": { + "node": ">=12.0.0" + }, + "funding": { + "url": "https://github.com/sponsors/SuperchupuDev" + } + }, + "node_modules/tslib": { + "version": "2.3.0", + "resolved": "https://registry.npmmirror.com/tslib/-/tslib-2.3.0.tgz", + "integrity": "sha512-N82ooyxVNm6h1riLCoyS9e3fuJ3AMG2zIZs2Gd1ATcSFjSA23Q0fzjjZeh0jbJvWVDZ0cJT8yaNNaaXHzueNjg==", + "license": "0BSD" + }, + "node_modules/vite": { + "version": "7.2.7", + "resolved": "https://registry.npmmirror.com/vite/-/vite-7.2.7.tgz", + "integrity": "sha512-ITcnkFeR3+fI8P1wMgItjGrR10170d8auB4EpMLPqmx6uxElH3a/hHGQabSHKdqd4FXWO1nFIp9rRn7JQ34ACQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "esbuild": "^0.25.0", + "fdir": "^6.5.0", + "picomatch": "^4.0.3", + "postcss": "^8.5.6", + "rollup": "^4.43.0", + "tinyglobby": "^0.2.15" + }, + "bin": { + "vite": "bin/vite.js" + }, + "engines": { + "node": "^20.19.0 || >=22.12.0" + }, + "funding": { + "url": "https://github.com/vitejs/vite?sponsor=1" + }, + "optionalDependencies": { + "fsevents": "~2.3.3" + }, + "peerDependencies": { + "@types/node": "^20.19.0 || >=22.12.0", + "jiti": ">=1.21.0", + "less": "^4.0.0", + "lightningcss": "^1.21.0", + "sass": "^1.70.0", + "sass-embedded": "^1.70.0", + "stylus": ">=0.54.8", + "sugarss": "^5.0.0", + "terser": "^5.16.0", + "tsx": "^4.8.1", + "yaml": "^2.4.2" + }, + "peerDependenciesMeta": { + "@types/node": { + "optional": true + }, + "jiti": { + "optional": true + }, + "less": { + "optional": true + }, + "lightningcss": { + "optional": true + }, + "sass": { + "optional": true + }, + "sass-embedded": { + "optional": true + }, + "stylus": { + "optional": true + }, + "sugarss": { + "optional": true + }, + "terser": { + "optional": true + }, + "tsx": { + "optional": true + }, + "yaml": { + "optional": true + } + } + }, + "node_modules/vue": { + "version": "3.5.25", + "resolved": 
"https://registry.npmmirror.com/vue/-/vue-3.5.25.tgz", + "integrity": "sha512-YLVdgv2K13WJ6n+kD5owehKtEXwdwXuj2TTyJMsO7pSeKw2bfRNZGjhB7YzrpbMYj5b5QsUebHpOqR3R3ziy/g==", + "license": "MIT", + "dependencies": { + "@vue/compiler-dom": "3.5.25", + "@vue/compiler-sfc": "3.5.25", + "@vue/runtime-dom": "3.5.25", + "@vue/server-renderer": "3.5.25", + "@vue/shared": "3.5.25" + }, + "peerDependencies": { + "typescript": "*" + }, + "peerDependenciesMeta": { + "typescript": { + "optional": true + } + } + }, + "node_modules/vue-echarts": { + "version": "8.0.1", + "resolved": "https://registry.npmmirror.com/vue-echarts/-/vue-echarts-8.0.1.tgz", + "integrity": "sha512-23rJTFLu1OUEGRWjJGmdGt8fP+8+ja1gVgzMYPIPaHWpXegcO1viIAaeu2H4QHESlVeHzUAHIxKXGrwjsyXAaA==", + "license": "MIT", + "peerDependencies": { + "echarts": "^6.0.0", + "vue": "^3.3.0" + } + }, + "node_modules/wmf": { + "version": "1.0.2", + "resolved": "https://registry.npmmirror.com/wmf/-/wmf-1.0.2.tgz", + "integrity": "sha512-/p9K7bEh0Dj6WbXg4JG0xvLQmIadrner1bi45VMJTfnbVHsc7yIajZyoSoK60/dtVBs12Fm6WkUI5/3WAVsNMw==", + "license": "Apache-2.0", + "engines": { + "node": ">=0.8" + } + }, + "node_modules/word": { + "version": "0.3.0", + "resolved": "https://registry.npmmirror.com/word/-/word-0.3.0.tgz", + "integrity": "sha512-OELeY0Q61OXpdUfTp+oweA/vtLVg5VDOXh+3he3PNzLGG/y0oylSOC1xRVj0+l4vQ3tj/bB1HVHv1ocXkQceFA==", + "license": "Apache-2.0", + "engines": { + "node": ">=0.8" + } + }, + "node_modules/xlsx": { + "version": "0.18.5", + "resolved": "https://registry.npmmirror.com/xlsx/-/xlsx-0.18.5.tgz", + "integrity": "sha512-dmg3LCjBPHZnQp5/F/+nnTa+miPJxUXB6vtk42YjBBKayDNagxGEeIdWApkYPOf3Z3pm3k62Knjzp7lMeTEtFQ==", + "license": "Apache-2.0", + "dependencies": { + "adler-32": "~1.3.0", + "cfb": "~1.2.1", + "codepage": "~1.15.0", + "crc-32": "~1.2.1", + "ssf": "~0.11.2", + "wmf": "~1.0.1", + "word": "~0.3.0" + }, + "bin": { + "xlsx": "bin/xlsx.njs" + }, + "engines": { + "node": ">=0.8" + } + }, + "node_modules/zrender": { + 
"version": "6.0.0", + "resolved": "https://registry.npmmirror.com/zrender/-/zrender-6.0.0.tgz", + "integrity": "sha512-41dFXEEXuJpNecuUQq6JlbybmnHaqqpGlbH1yxnA5V9MMP4SbohSVZsJIwz+zdjQXSSlR1Vc34EgH1zxyTDvhg==", + "license": "BSD-3-Clause", + "dependencies": { + "tslib": "2.3.0" + } + } + } +} diff --git a/idea1/docs/spec_bundle/package.json b/idea1/docs/spec_bundle/package.json new file mode 100644 index 0000000000000000000000000000000000000000..d4657bbec62cbf2f39add1ccc35c9ce0c7eb412d --- /dev/null +++ b/idea1/docs/spec_bundle/package.json @@ -0,0 +1,40 @@ +{ + "name": "specforge-spec-bundle", + "private": false, + "version": "1.0.0", + "description": "Interactive SpecBundle visualization dashboard for SpecForge", + "type": "module", + "scripts": { + "dev": "vite", + "build": "vite build", + "preview": "vite preview", + "deploy": "npm run build && gh-pages -d dist" + }, + "repository": { + "type": "git", + "url": "https://github.com/sgl-project/SpecForge.git", + "directory": "docs/spec_bundle" + }, + "keywords": [ + "specforge", + "specbundle", + "benchmark", + "visualization", + "speculative-decoding", + "llm", + "performance" + ], + "author": "SpecForge Team", + "license": "MIT", + "dependencies": { + "echarts": "^6.0.0", + "papaparse": "^5.5.3", + "vue": "^3.5.24", + "vue-echarts": "^8.0.1", + "xlsx": "^0.18.5" + }, + "devDependencies": { + "@vitejs/plugin-vue": "^6.0.1", + "vite": "^7.2.4" + } +} diff --git a/idea1/docs/spec_bundle/public/raw_data/data.json b/idea1/docs/spec_bundle/public/raw_data/data.json new file mode 100644 index 0000000000000000000000000000000000000000..f923184be9f11a0be51daa926e7b94ff1821007b --- /dev/null +++ b/idea1/docs/spec_bundle/public/raw_data/data.json @@ -0,0 +1,6422 @@ +{ + "Qwen3-30B-A3B-Instruct-2507": { + "gsm8k": { + "benchmark_name": "gsm8k", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 1071.2940027174511, + 
"accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-30B-A3B-Instruct-2507-SpecForge", + "output_throughput": 1488.3645940190918, + "accept_length": 2.6400593352844486 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 1071.2940027174511, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-30B-A3B-Instruct-2507-SpecForge", + "output_throughput": 1499.6157892300257, + "accept_length": 3.0113471715954674 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 1071.2940027174511, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-30B-A3B-Instruct-2507-SpecForge", + "output_throughput": 1491.1759364152986, + "accept_length": 2.525104073618391 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 1071.2940027174511, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-30B-A3B-Instruct-2507-SpecForge", + "output_throughput": 1438.3989235515564, + "accept_length": 3.1488859094681736 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 1071.2940027174511, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-30B-A3B-Instruct-2507-SpecForge", + "output_throughput": 1478.3371126866896, + "accept_length": 2.515156901620291 + } + ] + } + ] + }, + "math500": { + "benchmark_name": "math500", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 1468.9518188983302, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-30B-A3B-Instruct-2507-SpecForge", + 
"output_throughput": 3022.302541558449, + "accept_length": 3.4018400160943374 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 1468.9518188983302, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-30B-A3B-Instruct-2507-SpecForge", + "output_throughput": 3458.7683757488517, + "accept_length": 4.5001277922609 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 1468.9518188983302, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-30B-A3B-Instruct-2507-SpecForge", + "output_throughput": 2710.0700446913434, + "accept_length": 3.83069810232181 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 1468.9518188983302, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-30B-A3B-Instruct-2507-SpecForge", + "output_throughput": 3636.1457092511932, + "accept_length": 5.29297884876688 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 1468.9518188983302, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-30B-A3B-Instruct-2507-SpecForge", + "output_throughput": 2650.9994915668844, + "accept_length": 3.981701201346221 + } + ] + } + ] + }, + "mtbench": { + "benchmark_name": "mtbench", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 1341.3462205459145, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-30B-A3B-Instruct-2507-SpecForge", + "output_throughput": 2048.689292397081, + "accept_length": 2.495847913511255 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + 
"topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 1341.3462205459145, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-30B-A3B-Instruct-2507-SpecForge", + "output_throughput": 2086.117426859236, + "accept_length": 2.831051301639537 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 1341.3462205459145, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-30B-A3B-Instruct-2507-SpecForge", + "output_throughput": 1698.4151046745978, + "accept_length": 2.5572219713355357 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 1341.3462205459145, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-30B-A3B-Instruct-2507-SpecForge", + "output_throughput": 1998.1600180425269, + "accept_length": 2.9819193324061195 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 1341.3462205459145, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-30B-A3B-Instruct-2507-SpecForge", + "output_throughput": 1742.9797705522778, + "accept_length": 2.7422317575874455 + } + ] + } + ] + }, + "humaneval": { + "benchmark_name": "humaneval", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 1366.6183006362219, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-30B-A3B-Instruct-2507-SpecForge", + "output_throughput": 2618.165602951494, + "accept_length": 3.349328692192939 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 
1366.6183006362219, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-30B-A3B-Instruct-2507-SpecForge", + "output_throughput": 2912.1392571686956, + "accept_length": 4.384426363785289 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 1366.6183006362219, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-30B-A3B-Instruct-2507-SpecForge", + "output_throughput": 2367.016477367958, + "accept_length": 3.7901897758795298 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 1366.6183006362219, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-30B-A3B-Instruct-2507-SpecForge", + "output_throughput": 3069.9815866099266, + "accept_length": 5.124267515923567 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 1366.6183006362219, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-30B-A3B-Instruct-2507-SpecForge", + "output_throughput": 2363.3377665362655, + "accept_length": 4.030938739532834 + } + ] + } + ] + }, + "livecodebench": { + "benchmark_name": "livecodebench", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 1492.6190597361915, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-30B-A3B-Instruct-2507-SpecForge", + "output_throughput": 2911.405162351629, + "accept_length": 3.1783624121672447 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 1492.6190597361915, + "accept_length": 1.0 + }, + { + "Name": 
"lmsys/SGLang-EAGLE3-Qwen3-30B-A3B-Instruct-2507-SpecForge", + "output_throughput": 3265.2547245227543, + "accept_length": 4.018270197787462 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 1492.6190597361915, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-30B-A3B-Instruct-2507-SpecForge", + "output_throughput": 2455.0885550482017, + "accept_length": 3.295517305362425 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 1492.6190597361915, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-30B-A3B-Instruct-2507-SpecForge", + "output_throughput": 3413.029275629196, + "accept_length": 4.576331556763159 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 1492.6190597361915, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-30B-A3B-Instruct-2507-SpecForge", + "output_throughput": 2355.0941391264764, + "accept_length": 3.3973067623684012 + } + ] + } + ] + }, + "financeqa": { + "benchmark_name": "financeqa", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 1320.1266846132082, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-30B-A3B-Instruct-2507-SpecForge", + "output_throughput": 1778.9653109324079, + "accept_length": 2.0810309937160505 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 1320.1266846132082, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-30B-A3B-Instruct-2507-SpecForge", + "output_throughput": 1778.6778684706662, + 
"accept_length": 2.2730321793789288 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 1320.1266846132082, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-30B-A3B-Instruct-2507-SpecForge", + "output_throughput": 1652.1607344416184, + "accept_length": 2.2703352879266276 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 1320.1266846132082, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-30B-A3B-Instruct-2507-SpecForge", + "output_throughput": 1682.9566856293293, + "accept_length": 2.3032779273841584 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 1320.1266846132082, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-30B-A3B-Instruct-2507-SpecForge", + "output_throughput": 1753.6698041448958, + "accept_length": 2.6092096546804138 + } + ] + } + ] + }, + "gpqa": { + "benchmark_name": "gpqa", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 1410.428038868636, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-30B-A3B-Instruct-2507-SpecForge", + "output_throughput": 2237.792328921565, + "accept_length": 2.5958448251993995 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 1410.428038868636, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-30B-A3B-Instruct-2507-SpecForge", + "output_throughput": 2341.298191039886, + "accept_length": 3.0077922694984913 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + 
"metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 1410.428038868636, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-30B-A3B-Instruct-2507-SpecForge", + "output_throughput": 1961.1700111065113, + "accept_length": 2.6947097860315505 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 1410.428038868636, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-30B-A3B-Instruct-2507-SpecForge", + "output_throughput": 2310.2053834681674, + "accept_length": 3.216540452331778 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 1410.428038868636, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-30B-A3B-Instruct-2507-SpecForge", + "output_throughput": 2008.7425535412629, + "accept_length": 2.91748293468006 + } + ] + } + ] + } + }, + "Qwen3-235B-A22B-Instruct-2507": { + "gsm8k": { + "benchmark_name": "gsm8k", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 469.12940470010284, + "accept_length": 1.0 + }, + { + "Name": "lmsys/Qwen3-235B-A22B-EAGLE3", + "output_throughput": 633.4834448509783, + "accept_length": 2.356716526992789 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-235B-A22B-Instruct-2507-SpecForge", + "output_throughput": 718.620120234308, + "accept_length": 2.8762828246719394 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 469.12940470010284, + "accept_length": 1.0 + }, + { + "Name": "lmsys/Qwen3-235B-A22B-EAGLE3", + "output_throughput": 619.3961515217887, + "accept_length": 2.5325967285309847 + }, + { + "Name": 
"lmsys/SGLang-EAGLE3-Qwen3-235B-A22B-Instruct-2507-SpecForge", + "output_throughput": 740.8090293617215, + "accept_length": 3.351527622767857 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 469.12940470010284, + "accept_length": 1.0 + }, + { + "Name": "lmsys/Qwen3-235B-A22B-EAGLE3", + "output_throughput": 685.8224688133159, + "accept_length": 2.2254637464335056 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-235B-A22B-Instruct-2507-SpecForge", + "output_throughput": 718.5200251720828, + "accept_length": 2.5942242348162705 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 469.12940470010284, + "accept_length": 1.0 + }, + { + "Name": "lmsys/Qwen3-235B-A22B-EAGLE3", + "output_throughput": 622.6877352310961, + "accept_length": 2.577754285484885 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-235B-A22B-Instruct-2507-SpecForge", + "output_throughput": 758.2839780669175, + "accept_length": 3.51144398279758 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 469.12940470010284, + "accept_length": 1.0 + }, + { + "Name": "lmsys/Qwen3-235B-A22B-EAGLE3", + "output_throughput": 696.9862910262393, + "accept_length": 2.2957518385545184 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-235B-A22B-Instruct-2507-SpecForge", + "output_throughput": 692.54543613971, + "accept_length": 2.508131344520406 + } + ] + } + ] + }, + "math500": { + "benchmark_name": "math500", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 587.3767625807179, + "accept_length": 1.0 + }, + { + "Name": "lmsys/Qwen3-235B-A22B-EAGLE3", + "output_throughput": 821.7716217768141, + "accept_length": 
2.2131311175007076 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-235B-A22B-Instruct-2507-SpecForge", + "output_throughput": 1165.3481778903413, + "accept_length": 3.2287879445239853 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 587.3767625807179, + "accept_length": 1.0 + }, + { + "Name": "lmsys/Qwen3-235B-A22B-EAGLE3", + "output_throughput": 786.5291154131861, + "accept_length": 2.3811060693210626 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-235B-A22B-Instruct-2507-SpecForge", + "output_throughput": 1263.6658286467714, + "accept_length": 4.021472447253628 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 587.3767625807179, + "accept_length": 1.0 + }, + { + "Name": "lmsys/Qwen3-235B-A22B-EAGLE3", + "output_throughput": 729.1280796475185, + "accept_length": 2.1641727527768047 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-235B-A22B-Instruct-2507-SpecForge", + "output_throughput": 1012.7228976076004, + "accept_length": 3.3166681444513406 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 587.3767625807179, + "accept_length": 1.0 + }, + { + "Name": "lmsys/Qwen3-235B-A22B-EAGLE3", + "output_throughput": 801.9730196026575, + "accept_length": 2.4202165987905055 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-235B-A22B-Instruct-2507-SpecForge", + "output_throughput": 1399.195876342606, + "accept_length": 4.477737029876627 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 587.3767625807179, + "accept_length": 1.0 + }, + { + "Name": "lmsys/Qwen3-235B-A22B-EAGLE3", + "output_throughput": 728.5917394731794, + "accept_length": 2.180077789251727 + }, + { 
+ "Name": "lmsys/SGLang-EAGLE3-Qwen3-235B-A22B-Instruct-2507-SpecForge", + "output_throughput": 966.5149174357106, + "accept_length": 3.0996346930308336 + } + ] + } + ] + }, + "mtbench": { + "benchmark_name": "mtbench", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 529.8952857212083, + "accept_length": 1.0 + }, + { + "Name": "lmsys/Qwen3-235B-A22B-EAGLE3", + "output_throughput": 642.7287443329789, + "accept_length": 1.8722335837366109 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-235B-A22B-Instruct-2507-SpecForge", + "output_throughput": 814.539845630713, + "accept_length": 2.3454133346915906 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 529.8952857212083, + "accept_length": 1.0 + }, + { + "Name": "lmsys/Qwen3-235B-A22B-EAGLE3", + "output_throughput": 617.9738942581079, + "accept_length": 1.9436368219822697 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-235B-A22B-Instruct-2507-SpecForge", + "output_throughput": 779.531140147999, + "accept_length": 2.571956737666924 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 529.8952857212083, + "accept_length": 1.0 + }, + { + "Name": "lmsys/Qwen3-235B-A22B-EAGLE3", + "output_throughput": 579.7478777831109, + "accept_length": 1.879637550849381 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-235B-A22B-Instruct-2507-SpecForge", + "output_throughput": 684.112380410899, + "accept_length": 2.3538604252889965 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 529.8952857212083, + "accept_length": 1.0 + }, + { + "Name": "lmsys/Qwen3-235B-A22B-EAGLE3", + "output_throughput": 607.3644823224199, + 
"accept_length": 1.9674055586107704 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-235B-A22B-Instruct-2507-SpecForge", + "output_throughput": 789.9679697718769, + "accept_length": 2.6698328935795956 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 529.8952857212083, + "accept_length": 1.0 + }, + { + "Name": "lmsys/Qwen3-235B-A22B-EAGLE3", + "output_throughput": 596.0590450290033, + "accept_length": 1.987328547838102 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-235B-A22B-Instruct-2507-SpecForge", + "output_throughput": 670.0058040199536, + "accept_length": 2.329033512672587 + } + ] + } + ] + }, + "humaneval": { + "benchmark_name": "humaneval", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 553.0503522362385, + "accept_length": 1.0 + }, + { + "Name": "lmsys/Qwen3-235B-A22B-EAGLE3", + "output_throughput": 866.1813723921825, + "accept_length": 2.533027363039563 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-235B-A22B-Instruct-2507-SpecForge", + "output_throughput": 1068.373749600453, + "accept_length": 3.238804311590177 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 553.0503522362385, + "accept_length": 1.0 + }, + { + "Name": "lmsys/Qwen3-235B-A22B-EAGLE3", + "output_throughput": 853.4917713020631, + "accept_length": 2.8369721532226433 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-235B-A22B-Instruct-2507-SpecForge", + "output_throughput": 1176.5192650014792, + "accept_length": 4.083723300745958 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 553.0503522362385, + "accept_length": 1.0 + }, + { + "Name": "lmsys/Qwen3-235B-A22B-EAGLE3", + 
"output_throughput": 772.1684975661775, + "accept_length": 2.5123042505592843 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-235B-A22B-Instruct-2507-SpecForge", + "output_throughput": 1032.477913431608, + "accept_length": 3.6360244115082825 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 553.0503522362385, + "accept_length": 1.0 + }, + { + "Name": "lmsys/Qwen3-235B-A22B-EAGLE3", + "output_throughput": 889.8951303902317, + "accept_length": 2.955997016746898 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-235B-A22B-Instruct-2507-SpecForge", + "output_throughput": 1267.5178598410528, + "accept_length": 4.4874762125186445 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 553.0503522362385, + "accept_length": 1.0 + }, + { + "Name": "lmsys/Qwen3-235B-A22B-EAGLE3", + "output_throughput": 736.1010265214783, + "accept_length": 2.3861131594156686 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-235B-A22B-Instruct-2507-SpecForge", + "output_throughput": 983.9906558013464, + "accept_length": 3.412326127536581 + } + ] + } + ] + }, + "livecodebench": { + "benchmark_name": "livecodebench", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 598.1832041732818, + "accept_length": 1.0 + }, + { + "Name": "lmsys/Qwen3-235B-A22B-EAGLE3", + "output_throughput": 803.7805606947842, + "accept_length": 2.090690935434212 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-235B-A22B-Instruct-2507-SpecForge", + "output_throughput": 1062.9796952555507, + "accept_length": 2.8172381425652917 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 598.1832041732818, + "accept_length": 1.0 
+ }, + { + "Name": "lmsys/Qwen3-235B-A22B-EAGLE3", + "output_throughput": 759.6333115912107, + "accept_length": 2.2179516111790765 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-235B-A22B-Instruct-2507-SpecForge", + "output_throughput": 1093.1979234549972, + "accept_length": 3.268498808394456 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 598.1832041732818, + "accept_length": 1.0 + }, + { + "Name": "lmsys/Qwen3-235B-A22B-EAGLE3", + "output_throughput": 708.4447966909656, + "accept_length": 2.077364507787014 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-235B-A22B-Instruct-2507-SpecForge", + "output_throughput": 874.062642276262, + "accept_length": 2.6670587896561795 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 598.1832041732818, + "accept_length": 1.0 + }, + { + "Name": "lmsys/Qwen3-235B-A22B-EAGLE3", + "output_throughput": 767.8685797664081, + "accept_length": 2.2474642743536366 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-235B-A22B-Instruct-2507-SpecForge", + "output_throughput": 1155.6572987907093, + "accept_length": 3.490068495285106 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 598.1832041732818, + "accept_length": 1.0 + }, + { + "Name": "lmsys/Qwen3-235B-A22B-EAGLE3", + "output_throughput": 711.4663371023372, + "accept_length": 2.129619842542645 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-235B-A22B-Instruct-2507-SpecForge", + "output_throughput": 835.6105646149398, + "accept_length": 2.590646146520392 + } + ] + } + ] + }, + "financeqa": { + "benchmark_name": "financeqa", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 
539.5161023038148, + "accept_length": 1.0 + }, + { + "Name": "lmsys/Qwen3-235B-A22B-EAGLE3", + "output_throughput": 689.4282413740445, + "accept_length": 1.941237358311274 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-235B-A22B-Instruct-2507-SpecForge", + "output_throughput": 872.4508905377182, + "accept_length": 2.556773924332344 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 539.5161023038148, + "accept_length": 1.0 + }, + { + "Name": "lmsys/Qwen3-235B-A22B-EAGLE3", + "output_throughput": 636.4408069963314, + "accept_length": 2.027268079304664 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-235B-A22B-Instruct-2507-SpecForge", + "output_throughput": 885.529748337286, + "accept_length": 2.8442245393804413 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 539.5161023038148, + "accept_length": 1.0 + }, + { + "Name": "lmsys/Qwen3-235B-A22B-EAGLE3", + "output_throughput": 642.4958901994291, + "accept_length": 2.0553746448296777 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-235B-A22B-Instruct-2507-SpecForge", + "output_throughput": 730.7331843587357, + "accept_length": 2.4330876223070512 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 539.5161023038148, + "accept_length": 1.0 + }, + { + "Name": "lmsys/Qwen3-235B-A22B-EAGLE3", + "output_throughput": 641.1037073226237, + "accept_length": 2.0361251069493296 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-235B-A22B-Instruct-2507-SpecForge", + "output_throughput": 889.0304393086461, + "accept_length": 2.965008914078923 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 539.5161023038148, + 
"accept_length": 1.0 + }, + { + "Name": "lmsys/Qwen3-235B-A22B-EAGLE3", + "output_throughput": 654.3422430101997, + "accept_length": 2.1356956699218137 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-235B-A22B-Instruct-2507-SpecForge", + "output_throughput": 742.3749721046132, + "accept_length": 2.5176210584474528 + } + ] + } + ] + }, + "gpqa": { + "benchmark_name": "gpqa", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 563.1619467852893, + "accept_length": 1.0 + }, + { + "Name": "lmsys/Qwen3-235B-A22B-EAGLE3", + "output_throughput": 716.6967887897075, + "accept_length": 2.0240035915598344 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-235B-A22B-Instruct-2507-SpecForge", + "output_throughput": 823.4218898853592, + "accept_length": 2.356617214868455 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 563.1619467852893, + "accept_length": 1.0 + }, + { + "Name": "lmsys/Qwen3-235B-A22B-EAGLE3", + "output_throughput": 680.2044274358036, + "accept_length": 2.14011469258975 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-235B-A22B-Instruct-2507-SpecForge", + "output_throughput": 808.934577824737, + "accept_length": 2.6032639643837037 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 563.1619467852893, + "accept_length": 1.0 + }, + { + "Name": "lmsys/Qwen3-235B-A22B-EAGLE3", + "output_throughput": 630.9312870281678, + "accept_length": 1.9776516235921864 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-235B-A22B-Instruct-2507-SpecForge", + "output_throughput": 698.9315763256182, + "accept_length": 2.2587729126518172 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + 
"output_throughput": 563.1619467852893, + "accept_length": 1.0 + }, + { + "Name": "lmsys/Qwen3-235B-A22B-EAGLE3", + "output_throughput": 685.8554308455039, + "accept_length": 2.1591340093176212 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-235B-A22B-Instruct-2507-SpecForge", + "output_throughput": 826.5168292170538, + "accept_length": 2.6672259363465063 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 563.1619467852893, + "accept_length": 1.0 + }, + { + "Name": "lmsys/Qwen3-235B-A22B-EAGLE3", + "output_throughput": 636.0480501999019, + "accept_length": 2.001480647431386 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-235B-A22B-Instruct-2507-SpecForge", + "output_throughput": 683.7427107159214, + "accept_length": 2.241436629482574 + } + ] + } + ] + } + }, + "Qwen3-Next-80B-A3B-Instruct-FP8": { + "gsm8k": { + "benchmark_name": "gsm8k", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 549.6362180919164, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Next-80B-A3B-Instruct-FP8-SpecForge", + "output_throughput": 683.8795985073891, + "accept_length": 3.13391215089175 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 549.6362180919164, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Next-80B-A3B-Instruct-FP8-SpecForge", + "output_throughput": 753.237074543623, + "accept_length": 3.9038018228889597 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 549.6362180919164, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Next-80B-A3B-Instruct-FP8-SpecForge", + "output_throughput": 
746.7222279174218, + "accept_length": 4.022678679117706 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 549.6362180919164, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Next-80B-A3B-Instruct-FP8-SpecForge", + "output_throughput": 771.153101164556, + "accept_length": 4.345554699994077 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 549.6362180919164, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Next-80B-A3B-Instruct-FP8-SpecForge", + "output_throughput": 773.4012327870145, + "accept_length": 4.607604467310829 + } + ] + } + ] + }, + "math500": { + "benchmark_name": "math500", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 863.7773324206034, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Next-80B-A3B-Instruct-FP8-SpecForge", + "output_throughput": 1478.3001038430784, + "accept_length": 3.498551418454351 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 863.7773324206034, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Next-80B-A3B-Instruct-FP8-SpecForge", + "output_throughput": 1764.2064514729698, + "accept_length": 4.677160426045899 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 863.7773324206034, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Next-80B-A3B-Instruct-FP8-SpecForge", + "output_throughput": 1758.0166003158934, + "accept_length": 4.755809947207558 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + 
"topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 863.7773324206034, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Next-80B-A3B-Instruct-FP8-SpecForge", + "output_throughput": 1912.6838622508392, + "accept_length": 5.554967332076544 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 863.7773324206034, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Next-80B-A3B-Instruct-FP8-SpecForge", + "output_throughput": 1853.434631732593, + "accept_length": 5.756492370623537 + } + ] + } + ] + }, + "mtbench": { + "benchmark_name": "mtbench", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 803.4970369348379, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Next-80B-A3B-Instruct-FP8-SpecForge", + "output_throughput": 1095.5102974622082, + "accept_length": 2.581125058112506 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 803.4970369348379, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Next-80B-A3B-Instruct-FP8-SpecForge", + "output_throughput": 1157.636689246293, + "accept_length": 2.9156972910237133 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 803.4970369348379, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Next-80B-A3B-Instruct-FP8-SpecForge", + "output_throughput": 1197.112468072539, + "accept_length": 3.1331585165547646 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 
803.4970369348379, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Next-80B-A3B-Instruct-FP8-SpecForge", + "output_throughput": 1127.4364940073876, + "accept_length": 3.0475279197966354 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 803.4970369348379, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Next-80B-A3B-Instruct-FP8-SpecForge", + "output_throughput": 1198.9417562126052, + "accept_length": 3.4190589216409535 + } + ] + } + ] + }, + "humaneval": { + "benchmark_name": "humaneval", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 788.4509521573036, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Next-80B-A3B-Instruct-FP8-SpecForge", + "output_throughput": 1245.6702060145312, + "accept_length": 3.4647713687985653 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 788.4509521573036, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Next-80B-A3B-Instruct-FP8-SpecForge", + "output_throughput": 1527.7120587214345, + "accept_length": 4.612265133111893 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 788.4509521573036, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Next-80B-A3B-Instruct-FP8-SpecForge", + "output_throughput": 1536.7723048769212, + "accept_length": 4.676180904522613 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 788.4509521573036, + "accept_length": 1.0 + }, + { + "Name": 
"lmsys/SGLang-EAGLE3-Qwen3-Next-80B-A3B-Instruct-FP8-SpecForge", + "output_throughput": 1628.1293604862747, + "accept_length": 5.4577785667790994 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 788.4509521573036, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Next-80B-A3B-Instruct-FP8-SpecForge", + "output_throughput": 1629.7244930267507, + "accept_length": 5.621873496873497 + } + ] + } + ] + }, + "livecodebench": { + "benchmark_name": "livecodebench", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 916.0337036761792, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Next-80B-A3B-Instruct-FP8-SpecForge", + "output_throughput": 1463.1234977160723, + "accept_length": 3.1058026902179443 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 916.0337036761792, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Next-80B-A3B-Instruct-FP8-SpecForge", + "output_throughput": 1724.2207417984275, + "accept_length": 3.8462516284893944 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 916.0337036761792, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Next-80B-A3B-Instruct-FP8-SpecForge", + "output_throughput": 1734.4894352951553, + "accept_length": 3.9821418050654955 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 916.0337036761792, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Next-80B-A3B-Instruct-FP8-SpecForge", + "output_throughput": 
1786.8774464735384, + "accept_length": 4.2761952310299485 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 916.0337036761792, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Next-80B-A3B-Instruct-FP8-SpecForge", + "output_throughput": 1829.5532782765572, + "accept_length": 4.590307145700787 + } + ] + } + ] + }, + "financeqa": { + "benchmark_name": "financeqa", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 827.3050477430119, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Next-80B-A3B-Instruct-FP8-SpecForge", + "output_throughput": 986.4282909200625, + "accept_length": 2.0752097090844193 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 827.3050477430119, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Next-80B-A3B-Instruct-FP8-SpecForge", + "output_throughput": 981.0983772859984, + "accept_length": 2.1801329261720857 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 827.3050477430119, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Next-80B-A3B-Instruct-FP8-SpecForge", + "output_throughput": 1057.6549922432027, + "accept_length": 2.439575219817722 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 827.3050477430119, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Next-80B-A3B-Instruct-FP8-SpecForge", + "output_throughput": 956.6098887389447, + "accept_length": 2.2457481515800852 + } + ] + }, + { + "batch_size": 8, + "steps": 
7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 827.3050477430119, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Next-80B-A3B-Instruct-FP8-SpecForge", + "output_throughput": 1041.5277102267419, + "accept_length": 2.606484877248997 + } + ] + } + ] + }, + "gpqa": { + "benchmark_name": "gpqa", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 909.8620481543201, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Next-80B-A3B-Instruct-FP8-SpecForge", + "output_throughput": 1368.9499756838852, + "accept_length": 2.7362548025140208 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 909.8620481543201, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Next-80B-A3B-Instruct-FP8-SpecForge", + "output_throughput": 1457.9918429280988, + "accept_length": 3.1803662497541225 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 909.8620481543201, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Next-80B-A3B-Instruct-FP8-SpecForge", + "output_throughput": 1511.274616068283, + "accept_length": 3.3682366894832594 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 909.8620481543201, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Next-80B-A3B-Instruct-FP8-SpecForge", + "output_throughput": 1463.9444559000415, + "accept_length": 3.380290412894046 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 
909.8620481543201, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Next-80B-A3B-Instruct-FP8-SpecForge", + "output_throughput": 1541.4580844550508, + "accept_length": 3.7385501251645787 + } + ] + } + ] + } + }, + "Qwen3-Coder-30B-A3B-Instruct": { + "humaneval": { + "benchmark_name": "humaneval", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 1296.1854608851213, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Coder-30B-A3B-Instruct-SpecForge", + "output_throughput": 2621.7139434700584, + "accept_length": 3.394971072541166 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 1296.1854608851213, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Coder-30B-A3B-Instruct-SpecForge", + "output_throughput": 2966.4459091363574, + "accept_length": 4.5011526953450725 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 1296.1854608851213, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Coder-30B-A3B-Instruct-SpecForge", + "output_throughput": 2236.868611380527, + "accept_length": 3.9489230027326796 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 1296.1854608851213, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Coder-30B-A3B-Instruct-SpecForge", + "output_throughput": 3205.2025971977832, + "accept_length": 5.306789266712931 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 1296.1854608851213, + "accept_length": 1.0 + }, + { + "Name": 
"lmsys/SGLang-EAGLE3-Qwen3-Coder-30B-A3B-Instruct-SpecForge", + "output_throughput": 2553.012134540716, + "accept_length": 4.221071958746777 + } + ] + } + ] + }, + "livecodebench": { + "benchmark_name": "livecodebench", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 1506.2936922288973, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Coder-30B-A3B-Instruct-SpecForge", + "output_throughput": 2992.02067556649, + "accept_length": 3.138553878632709 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 1506.2936922288973, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Coder-30B-A3B-Instruct-SpecForge", + "output_throughput": 3328.9058789398114, + "accept_length": 3.9449129401751835 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 1506.2936922288973, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Coder-30B-A3B-Instruct-SpecForge", + "output_throughput": 2541.3931549111803, + "accept_length": 3.336379596827288 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 1506.2936922288973, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Coder-30B-A3B-Instruct-SpecForge", + "output_throughput": 3472.3919294148427, + "accept_length": 4.477776008915068 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 1506.2936922288973, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Coder-30B-A3B-Instruct-SpecForge", + "output_throughput": 2552.5518885328293, + 
"accept_length": 3.5865930607956185 + } + ] + } + ] + } + }, + "Qwen3-Coder-480B-A35B-Instruct": { + "humaneval": { + "benchmark_name": "humaneval", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 470.6571664751315, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Coder-480B-A35B-Instruct-SpecForge-EigenAI", + "output_throughput": 867.5261370310272, + "accept_length": 3.4954686382065345 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 470.6571664751315, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Coder-480B-A35B-Instruct-SpecForge-EigenAI", + "output_throughput": 1044.4475556194586, + "accept_length": 4.68614810868407 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 470.6571664751315, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Coder-480B-A35B-Instruct-SpecForge-EigenAI", + "output_throughput": 945.2207076385645, + "accept_length": 4.2835241878943675 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 470.6571664751315, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Coder-480B-A35B-Instruct-SpecForge-EigenAI", + "output_throughput": 1165.0727231905212, + "accept_length": 5.626203379024545 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 470.6571664751315, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Coder-480B-A35B-Instruct-SpecForge-EigenAI", + "output_throughput": 956.5336674844815, + "accept_length": 
4.574128043621322 + } + ] + } + ] + }, + "livecodebench": { + "benchmark_name": "livecodebench", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 500.99996954994094, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Coder-480B-A35B-Instruct-SpecForge-EigenAI", + "output_throughput": 846.6405796214389, + "accept_length": 3.0936425388083757 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 500.99996954994094, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Coder-480B-A35B-Instruct-SpecForge-EigenAI", + "output_throughput": 946.3806786937351, + "accept_length": 3.8547162126548313 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 500.99996954994094, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Coder-480B-A35B-Instruct-SpecForge-EigenAI", + "output_throughput": 817.5432981932123, + "accept_length": 3.3539182909649066 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 500.99996954994094, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Coder-480B-A35B-Instruct-SpecForge-EigenAI", + "output_throughput": 983.2554936551461, + "accept_length": 4.260473117512835 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 500.99996954994094, + "accept_length": 1.0 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Qwen3-Coder-480B-A35B-Instruct-SpecForge-EigenAI", + "output_throughput": 790.2818911646486, + "accept_length": 3.379611891844464 + } + ] + } + ] + } + }, + 
"Kimi-K2-Instruct": { + "gsm8k": { + "benchmark_name": "gsm8k", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 337.92445122816076, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Kimi-K2-Instruct-eagle3", + "output_throughput": 498.355967400969, + "accept_length": 3.271389121751566 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 337.92445122816076, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Kimi-K2-Instruct-eagle3", + "output_throughput": 538.7660861191819, + "accept_length": 4.120435815920245 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 337.92445122816076, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Kimi-K2-Instruct-eagle3", + "output_throughput": 476.5166831456105, + "accept_length": 3.5748305647840533 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 337.92445122816076, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Kimi-K2-Instruct-eagle3", + "output_throughput": 544.16588655688, + "accept_length": 4.655279611582661 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 337.92445122816076, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Kimi-K2-Instruct-eagle3", + "output_throughput": 459.1757114935756, + "accept_length": 3.4419677544677545 + } + ] + } + ] + }, + "math500": { + "benchmark_name": "math500", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 492.06079685961566, + 
"accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Kimi-K2-Instruct-eagle3", + "output_throughput": 877.2113745892083, + "accept_length": 3.46806357521281 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 492.06079685961566, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Kimi-K2-Instruct-eagle3", + "output_throughput": 995.8769550545389, + "accept_length": 4.610169876195772 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 492.06079685961566, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Kimi-K2-Instruct-eagle3", + "output_throughput": 772.6100737625807, + "accept_length": 3.527844083399639 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 492.06079685961566, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Kimi-K2-Instruct-eagle3", + "output_throughput": 1022.7285831443611, + "accept_length": 5.383128673454291 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 492.06079685961566, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Kimi-K2-Instruct-eagle3", + "output_throughput": 649.083231514055, + "accept_length": 3.1435862587473253 + } + ] + } + ] + }, + "mtbench": { + "benchmark_name": "mtbench", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 430.9240376244664, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Kimi-K2-Instruct-eagle3", + "output_throughput": 533.8166177911393, + "accept_length": 2.3897198230461343 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": 
[ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 430.9240376244664, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Kimi-K2-Instruct-eagle3", + "output_throughput": 526.1187611377575, + "accept_length": 2.738876732312181 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 430.9240376244664, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Kimi-K2-Instruct-eagle3", + "output_throughput": 473.3129895327435, + "accept_length": 2.394141207153502 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 430.9240376244664, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Kimi-K2-Instruct-eagle3", + "output_throughput": 488.46384825810924, + "accept_length": 2.7821796546219706 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 430.9240376244664, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Kimi-K2-Instruct-eagle3", + "output_throughput": 451.126180366313, + "accept_length": 2.536454493323503 + } + ] + } + ] + }, + "humaneval": { + "benchmark_name": "humaneval", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 466.0584238730984, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Kimi-K2-Instruct-eagle3", + "output_throughput": 779.7838793636296, + "accept_length": 3.364936827816644 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 466.0584238730984, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Kimi-K2-Instruct-eagle3", + "output_throughput": 868.550857852841, + "accept_length": 4.423030465709301 + } + ] + }, + { + 
"batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 466.0584238730984, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Kimi-K2-Instruct-eagle3", + "output_throughput": 729.1217213710999, + "accept_length": 3.7321711568938194 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 466.0584238730984, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Kimi-K2-Instruct-eagle3", + "output_throughput": 897.9039799990946, + "accept_length": 5.162398550153652 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 466.0584238730984, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Kimi-K2-Instruct-eagle3", + "output_throughput": 669.271164663664, + "accept_length": 3.7044178210408085 + } + ] + } + ] + }, + "livecodebench": { + "benchmark_name": "livecodebench", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 500.12137141510016, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Kimi-K2-Instruct-eagle3", + "output_throughput": 841.5023790421864, + "accept_length": 3.162685632492396 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 500.12137141510016, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Kimi-K2-Instruct-eagle3", + "output_throughput": 904.3910288246204, + "accept_length": 3.943605886942718 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 500.12137141510016, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Kimi-K2-Instruct-eagle3", + 
"output_throughput": 716.7319007181034, + "accept_length": 3.1374681580049573 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 500.12137141510016, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Kimi-K2-Instruct-eagle3", + "output_throughput": 896.7006322822839, + "accept_length": 4.400262176061309 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 500.12137141510016, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Kimi-K2-Instruct-eagle3", + "output_throughput": 650.4333056536461, + "accept_length": 3.0780193205478037 + } + ] + } + ] + }, + "financeqa": { + "benchmark_name": "financeqa", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 433.44658979995484, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Kimi-K2-Instruct-eagle3", + "output_throughput": 647.3644717982133, + "accept_length": 2.9848269628099175 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 433.44658979995484, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Kimi-K2-Instruct-eagle3", + "output_throughput": 660.0254297132984, + "accept_length": 3.594056395834917 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 433.44658979995484, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Kimi-K2-Instruct-eagle3", + "output_throughput": 523.0340443308603, + "accept_length": 2.8796471741261027 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 
433.44658979995484, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Kimi-K2-Instruct-eagle3", + "output_throughput": 630.5425124127137, + "accept_length": 3.944647875329984 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 433.44658979995484, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Kimi-K2-Instruct-eagle3", + "output_throughput": 389.47080223360666, + "accept_length": 2.5096594789735582 + } + ] + } + ] + }, + "gpqa": { + "benchmark_name": "gpqa", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 505.3742994094499, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Kimi-K2-Instruct-eagle3", + "output_throughput": 783.436424568974, + "accept_length": 2.904452196823693 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 505.3742994094499, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Kimi-K2-Instruct-eagle3", + "output_throughput": 811.3642458480507, + "accept_length": 3.4622853609057755 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 505.3742994094499, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Kimi-K2-Instruct-eagle3", + "output_throughput": 699.8111934038128, + "accept_length": 3.0198274205132876 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 505.3742994094499, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Kimi-K2-Instruct-eagle3", + "output_throughput": 770.4892578818251, + "accept_length": 3.6995331477421103 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 
8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 505.3742994094499, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Kimi-K2-Instruct-eagle3", + "output_throughput": 596.3162033813331, + "accept_length": 2.7901899604967983 + } + ] + } + ] + } + }, + "Ling-flash-2.0": { + "gsm8k": { + "benchmark_name": "gsm8k", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 674.3464018618124, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Ling-Flash-2.0-eagle3", + "output_throughput": 1144.7606179148752, + "accept_length": 3.4351661916604646 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 674.3464018618124, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Ling-Flash-2.0-eagle3", + "output_throughput": 1253.4000030615975, + "accept_length": 4.487906489549112 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 674.3464018618124, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Ling-Flash-2.0-eagle3", + "output_throughput": 1059.7381115819003, + "accept_length": 3.331830155824441 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 674.3464018618124, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Ling-Flash-2.0-eagle3", + "output_throughput": 1323.0093663978187, + "accept_length": 5.148644964283767 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 674.3464018618124, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Ling-Flash-2.0-eagle3", + "output_throughput": 1026.8025294413142, + "accept_length": 
3.126593214481735 + } + ] + } + ] + }, + "math500": { + "benchmark_name": "math500", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 762.7113399535667, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Ling-Flash-2.0-eagle3", + "output_throughput": 1434.6065070935829, + "accept_length": 3.4340471141971713 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 762.7113399535667, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Ling-Flash-2.0-eagle3", + "output_throughput": 1607.3212268988339, + "accept_length": 4.493397164127635 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 762.7113399535667, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Ling-Flash-2.0-eagle3", + "output_throughput": 1383.6720582197756, + "accept_length": 3.7931376508179415 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 762.7113399535667, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Ling-Flash-2.0-eagle3", + "output_throughput": 1685.5692612687462, + "accept_length": 5.218245374511558 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 762.7113399535667, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Ling-Flash-2.0-eagle3", + "output_throughput": 1330.1086623703009, + "accept_length": 3.793696144088135 + } + ] + } + ] + }, + "mtbench": { + "benchmark_name": "mtbench", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 728.5278345617202, 
+ "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Ling-Flash-2.0-eagle3", + "output_throughput": 1022.5890920470158, + "accept_length": 2.392568385378843 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 728.5278345617202, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Ling-Flash-2.0-eagle3", + "output_throughput": 990.0430932236113, + "accept_length": 2.648161574313827 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 728.5278345617202, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Ling-Flash-2.0-eagle3", + "output_throughput": 914.3899001110539, + "accept_length": 2.5161251562049407 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 728.5278345617202, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Ling-Flash-2.0-eagle3", + "output_throughput": 942.3914903299366, + "accept_length": 2.771332137960131 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 728.5278345617202, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Ling-Flash-2.0-eagle3", + "output_throughput": 968.0479918450316, + "accept_length": 2.8558805412179527 + } + ] + } + ] + }, + "humaneval": { + "benchmark_name": "humaneval", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 740.2477168580639, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Ling-Flash-2.0-eagle3", + "output_throughput": 1271.2889448808319, + "accept_length": 3.1471241394625804 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { 
+ "Name": "Wihtout EAGLE3", + "output_throughput": 740.2477168580639, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Ling-Flash-2.0-eagle3", + "output_throughput": 1353.1437889143726, + "accept_length": 3.9318483282257697 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 740.2477168580639, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Ling-Flash-2.0-eagle3", + "output_throughput": 1175.4192382338058, + "accept_length": 3.29687986547923 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 740.2477168580639, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Ling-Flash-2.0-eagle3", + "output_throughput": 1358.9726439538854, + "accept_length": 4.370163501574083 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 740.2477168580639, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Ling-Flash-2.0-eagle3", + "output_throughput": 1141.7913416362687, + "accept_length": 3.3590013964490297 + } + ] + } + ] + }, + "livecodebench": { + "benchmark_name": "livecodebench", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 770.3957537752161, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Ling-Flash-2.0-eagle3", + "output_throughput": 1305.1833791876973, + "accept_length": 2.9790301516097895 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 770.3957537752161, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Ling-Flash-2.0-eagle3", + "output_throughput": 1366.417326281792, + "accept_length": 3.6103649876590875 + } + ] + }, + { + 
"batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 770.3957537752161, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Ling-Flash-2.0-eagle3", + "output_throughput": 1130.7868943433502, + "accept_length": 2.8933133857317164 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 770.3957537752161, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Ling-Flash-2.0-eagle3", + "output_throughput": 1345.6741018953574, + "accept_length": 3.9330923185867093 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 770.3957537752161, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Ling-Flash-2.0-eagle3", + "output_throughput": 1061.6897228931932, + "accept_length": 2.902182106883942 + } + ] + } + ] + }, + "financeqa": { + "benchmark_name": "financeqa", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 747.7098566179897, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Ling-Flash-2.0-eagle3", + "output_throughput": 863.8565336005082, + "accept_length": 1.907102314310342 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 747.7098566179897, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Ling-Flash-2.0-eagle3", + "output_throughput": 833.1235940586521, + "accept_length": 2.047546254809973 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 747.7098566179897, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Ling-Flash-2.0-eagle3", + "output_throughput": 
798.9811798480557, + "accept_length": 1.9372590117256243 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 747.7098566179897, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Ling-Flash-2.0-eagle3", + "output_throughput": 763.2761511276084, + "accept_length": 2.0470985454359427 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 747.7098566179897, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Ling-Flash-2.0-eagle3", + "output_throughput": 779.3060665006524, + "accept_length": 2.045476819601249 + } + ] + } + ] + }, + "gpqa": { + "benchmark_name": "gpqa", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 794.1289733679167, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Ling-Flash-2.0-eagle3", + "output_throughput": 1185.7250147683403, + "accept_length": 2.562389392369937 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 794.1289733679167, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Ling-Flash-2.0-eagle3", + "output_throughput": 1161.8732670284553, + "accept_length": 2.886871902842324 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 794.1289733679167, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Ling-Flash-2.0-eagle3", + "output_throughput": 1052.640023467198, + "accept_length": 2.6017604302340236 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 794.1289733679167, + "accept_length": 1.0 + }, + { + 
"Name": "AQ-MedAI/Ling-Flash-2.0-eagle3", + "output_throughput": 1111.996259596397, + "accept_length": 3.0648124985786733 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 794.1289733679167, + "accept_length": 1.0 + }, + { + "Name": "AQ-MedAI/Ling-Flash-2.0-eagle3", + "output_throughput": 1004.4992021266573, + "accept_length": 2.6709053367549105 + } + ] + } + ] + } + }, + "Llama-3.1-8B-Instruct": { + "gsm8k": { + "benchmark_name": "gsm8k", + "results": [ + { + "batch_size": 1, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 181.81151788749455, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.1-Instruct-8B", + "output_throughput": 228.64232714994796, + "accept_length": 1.7165139181419709 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.1-8B-Instruct-SpecForge", + "output_throughput": 321.2528041157779, + "accept_length": 2.5481878001819607 + } + ] + }, + { + "batch_size": 1, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 181.81151788749455, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.1-Instruct-8B", + "output_throughput": 213.550264904667, + "accept_length": 1.7634936642258956 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.1-8B-Instruct-SpecForge", + "output_throughput": 329.6873220645443, + "accept_length": 2.8537845395516377 + } + ] + }, + { + "batch_size": 1, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 181.81151788749455, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.1-Instruct-8B", + "output_throughput": 195.13619448514442, + "accept_length": 1.7528912619638426 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.1-8B-Instruct-SpecForge", + "output_throughput": 
251.43922505539766, + "accept_length": 2.2820562939796716 + } + ] + }, + { + "batch_size": 1, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 181.81151788749455, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.1-Instruct-8B", + "output_throughput": 197.901650893672, + "accept_length": 1.7742552127753433 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.1-8B-Instruct-SpecForge", + "output_throughput": 317.61058794222197, + "accept_length": 2.9733251079580505 + } + ] + }, + { + "batch_size": 1, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 181.81151788749455, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.1-Instruct-8B", + "output_throughput": 182.0257072155964, + "accept_length": 1.789228234172427 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.1-8B-Instruct-SpecForge", + "output_throughput": 240.85801894998306, + "accept_length": 2.367398432594591 + } + ] + } + ] + }, + "math500": { + "benchmark_name": "math500", + "results": [ + { + "batch_size": 1, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 191.04076784280642, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.1-Instruct-8B", + "output_throughput": 399.2995452070592, + "accept_length": 2.7825411590459592 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.1-8B-Instruct-SpecForge", + "output_throughput": 492.28246574028134, + "accept_length": 3.4786948176583494 + } + ] + }, + { + "batch_size": 1, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 191.04076784280642, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.1-Instruct-8B", + "output_throughput": 422.40466722576286, + "accept_length": 3.254684892147128 + }, + { + "Name": 
"lmsys/SGLang-EAGLE3-Llama-3.1-8B-Instruct-SpecForge", + "output_throughput": 594.5033645961273, + "accept_length": 4.624857400180126 + } + ] + }, + { + "batch_size": 1, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 191.04076784280642, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.1-Instruct-8B", + "output_throughput": 387.0489467031037, + "accept_length": 3.3070174292508296 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.1-8B-Instruct-SpecForge", + "output_throughput": 480.43534296060534, + "accept_length": 4.116159164796923 + } + ] + }, + { + "batch_size": 1, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 191.04076784280642, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.1-Instruct-8B", + "output_throughput": 413.57783551553456, + "accept_length": 3.489213277012106 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.1-8B-Instruct-SpecForge", + "output_throughput": 638.0439777096752, + "accept_length": 5.402844266750837 + } + ] + }, + { + "batch_size": 1, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 191.04076784280642, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.1-Instruct-8B", + "output_throughput": 326.8790406711244, + "accept_length": 3.072066504990206 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.1-8B-Instruct-SpecForge", + "output_throughput": 453.306808098541, + "accept_length": 4.25573095185686 + } + ] + } + ] + }, + "mtbench": { + "benchmark_name": "mtbench", + "results": [ + { + "batch_size": 1, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 189.98120707576373, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.1-Instruct-8B", + "output_throughput": 414.90616666264776, 
+ "accept_length": 2.930670028119849 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.1-8B-Instruct-SpecForge", + "output_throughput": 404.24667749722187, + "accept_length": 2.8980726819445777 + } + ] + }, + { + "batch_size": 1, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 189.98120707576373, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.1-Instruct-8B", + "output_throughput": 453.73692243041774, + "accept_length": 3.554148008484563 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.1-8B-Instruct-SpecForge", + "output_throughput": 446.6366476858434, + "accept_length": 3.5164393144456105 + } + ] + }, + { + "batch_size": 1, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 189.98120707576373, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.1-Instruct-8B", + "output_throughput": 338.6308027570883, + "accept_length": 2.9393909722902185 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.1-8B-Instruct-SpecForge", + "output_throughput": 346.46724606666106, + "accept_length": 3.0061221366256823 + } + ] + }, + { + "batch_size": 1, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 189.98120707576373, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.1-Instruct-8B", + "output_throughput": 454.730035166582, + "accept_length": 3.906676145543851 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.1-8B-Instruct-SpecForge", + "output_throughput": 450.03198538047087, + "accept_length": 3.855839765261211 + } + ] + }, + { + "batch_size": 1, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 189.98120707576373, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.1-Instruct-8B", + "output_throughput": 305.1648971387325, + 
"accept_length": 2.9089536379397125 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.1-8B-Instruct-SpecForge", + "output_throughput": 308.00561770283963, + "accept_length": 2.938163437236731 + } + ] + } + ] + }, + "humaneval": { + "benchmark_name": "humaneval", + "results": [ + { + "batch_size": 1, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 190.91017930680567, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.1-Instruct-8B", + "output_throughput": 432.8677712430711, + "accept_length": 3.0469174293472796 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.1-8B-Instruct-SpecForge", + "output_throughput": 465.1765542307934, + "accept_length": 3.3398192040568846 + } + ] + }, + { + "batch_size": 1, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 190.91017930680567, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.1-Instruct-8B", + "output_throughput": 479.1212006261437, + "accept_length": 3.7445769729930163 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.1-8B-Instruct-SpecForge", + "output_throughput": 548.9370103875078, + "accept_length": 4.318366474235621 + } + ] + }, + { + "batch_size": 1, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 190.91017930680567, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.1-Instruct-8B", + "output_throughput": 340.2704451839945, + "accept_length": 2.9425913908717285 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.1-8B-Instruct-SpecForge", + "output_throughput": 377.47349118830954, + "accept_length": 3.2519286521546853 + } + ] + }, + { + "batch_size": 1, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 190.91017930680567, + "accept_length": 1.0 + }, + { + "Name": 
"lmsys/sglang-EAGLE3-LLaMA3.1-Instruct-8B", + "output_throughput": 480.3152659024827, + "accept_length": 4.0959237477185155 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.1-8B-Instruct-SpecForge", + "output_throughput": 571.4886457684788, + "accept_length": 4.910129659643436 + } + ] + }, + { + "batch_size": 1, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 190.91017930680567, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.1-Instruct-8B", + "output_throughput": 311.1051926955927, + "accept_length": 2.9338537387017256 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.1-8B-Instruct-SpecForge", + "output_throughput": 330.15665770360005, + "accept_length": 3.126203604641593 + } + ] + } + ] + }, + "livecodebench": { + "benchmark_name": "livecodebench", + "results": [ + { + "batch_size": 1, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 189.70410640395912, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.1-Instruct-8B", + "output_throughput": 380.6915537026263, + "accept_length": 2.6893540748536475 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.1-8B-Instruct-SpecForge", + "output_throughput": 439.67672671912396, + "accept_length": 3.16861704188786 + } + ] + }, + { + "batch_size": 1, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 189.70410640395912, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.1-Instruct-8B", + "output_throughput": 398.3738662742165, + "accept_length": 3.1199565043209523 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.1-8B-Instruct-SpecForge", + "output_throughput": 506.22686693578754, + "accept_length": 3.9957244075250427 + } + ] + }, + { + "batch_size": 1, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + 
"output_throughput": 189.70410640395912, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.1-Instruct-8B", + "output_throughput": 322.29847741557273, + "accept_length": 2.771756050751679 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.1-8B-Instruct-SpecForge", + "output_throughput": 375.34956052924895, + "accept_length": 3.236171472299629 + } + ] + }, + { + "batch_size": 1, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 189.70410640395912, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.1-Instruct-8B", + "output_throughput": 391.25705242634194, + "accept_length": 3.334862665932587 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.1-8B-Instruct-SpecForge", + "output_throughput": 516.904537338255, + "accept_length": 4.466856034741759 + } + ] + }, + { + "batch_size": 1, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 189.70410640395912, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.1-Instruct-8B", + "output_throughput": 287.68205157705233, + "accept_length": 2.7148899046029547 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.1-8B-Instruct-SpecForge", + "output_throughput": 378.8468257829908, + "accept_length": 3.585376494197714 + } + ] + } + ] + }, + "financeqa": { + "benchmark_name": "financeqa", + "results": [ + { + "batch_size": 1, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 185.6534194378935, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.1-Instruct-8B", + "output_throughput": 237.18050733350836, + "accept_length": 1.713236561734993 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.1-8B-Instruct-SpecForge", + "output_throughput": 258.6437346257605, + "accept_length": 1.9050339301460721 + } + ] + }, + { + "batch_size": 1, + "steps": 5, + "topk": 1, + 
"num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 185.6534194378935, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.1-Instruct-8B", + "output_throughput": 226.67848476067016, + "accept_length": 1.8075300109130592 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.1-8B-Instruct-SpecForge", + "output_throughput": 254.48969338840087, + "accept_length": 2.043805528134255 + } + ] + }, + { + "batch_size": 1, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 185.6534194378935, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.1-Instruct-8B", + "output_throughput": 210.94791438286492, + "accept_length": 1.8654798891594593 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.1-8B-Instruct-SpecForge", + "output_throughput": 251.07710462288492, + "accept_length": 2.2264818220398923 + } + ] + }, + { + "batch_size": 1, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 185.6534194378935, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.1-Instruct-8B", + "output_throughput": 211.18454065719607, + "accept_length": 1.8434056761268782 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.1-8B-Instruct-SpecForge", + "output_throughput": 240.6034453504167, + "accept_length": 2.1029710512950737 + } + ] + }, + { + "batch_size": 1, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 185.6534194378935, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.1-Instruct-8B", + "output_throughput": 183.72672690273865, + "accept_length": 1.7817737292479987 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.1-8B-Instruct-SpecForge", + "output_throughput": 229.82170237350869, + "accept_length": 2.250341575212658 + } + ] + } + ] + }, + "gpqa": { + "benchmark_name": "gpqa", + 
"results": [ + { + "batch_size": 1, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 190.4500188461883, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.1-Instruct-8B", + "output_throughput": 409.86415544506445, + "accept_length": 2.8552892726009724 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.1-8B-Instruct-SpecForge", + "output_throughput": 442.54523731909666, + "accept_length": 3.135712400558006 + } + ] + }, + { + "batch_size": 1, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 190.4500188461883, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.1-Instruct-8B", + "output_throughput": 438.0519648397228, + "accept_length": 3.3792158666871135 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.1-8B-Instruct-SpecForge", + "output_throughput": 507.1290934019136, + "accept_length": 3.936040126357265 + } + ] + }, + { + "batch_size": 1, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 190.4500188461883, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.1-Instruct-8B", + "output_throughput": 352.1689105895484, + "accept_length": 3.026258098612226 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.1-8B-Instruct-SpecForge", + "output_throughput": 413.1686528229548, + "accept_length": 3.5475168823860437 + } + ] + }, + { + "batch_size": 1, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 190.4500188461883, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.1-Instruct-8B", + "output_throughput": 434.1788724748705, + "accept_length": 3.6819800875461333 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.1-8B-Instruct-SpecForge", + "output_throughput": 514.2312383540044, + "accept_length": 4.357665531437638 + } + ] + }, 
+ { + "batch_size": 1, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 190.4500188461883, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.1-Instruct-8B", + "output_throughput": 311.5910755177637, + "accept_length": 2.9283727399165507 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.1-8B-Instruct-SpecForge", + "output_throughput": 390.64506651929287, + "accept_length": 3.692280754414928 + } + ] + } + ] + } + }, + "Llama-3.3-70B-Instruct": { + "gsm8k": { + "benchmark_name": "gsm8k", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 453.2156138501392, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.3-Instruct-70B", + "output_throughput": 521.4502791575164, + "accept_length": 1.2760798037239203 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.3-70B-Instruct-Spec for ge", + "output_throughput": 837.9426300003847, + "accept_length": 2.3179247901200304 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 453.2156138501392, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.3-Instruct-70B", + "output_throughput": 500.5534332009228, + "accept_length": 1.2836005168205962 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.3-70B-Instruct-Specforge", + "output_throughput": 855.6400225608106, + "accept_length": 2.4851382017038057 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 453.2156138501392, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.3-Instruct-70B", + "output_throughput": 500.33326156436937, + "accept_length": 1.3482255389718076 + }, + { + "Name": 
"lmsys/SGLang-EAGLE3-Llama-3.3-70B-Instruct-Specforge", + "output_throughput": 758.9001336688345, + "accept_length": 2.12511673151751 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 453.2156138501392, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.3-Instruct-70B", + "output_throughput": 483.12653680688, + "accept_length": 1.2856745693167546 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.3-70B-Instruct-Specforge", + "output_throughput": 820.5175400063332, + "accept_length": 2.516910489405022 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 453.2156138501392, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.3-Instruct-70B", + "output_throughput": 480.4218686725539, + "accept_length": 1.3936331604189096 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.3-70B-Instruct-Specforge", + "output_throughput": 739.405741336959, + "accept_length": 2.222061210294459 + } + ] + } + ] + }, + "math500": { + "benchmark_name": "math500", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 567.3739460148672, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.3-Instruct-70B", + "output_throughput": 1088.844896763402, + "accept_length": 2.3720131878590123 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.3-70B-Instruct-Specforge", + "output_throughput": 1273.7733416283656, + "accept_length": 2.841736535013628 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 567.3739460148672, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.3-Instruct-70B", + "output_throughput": 
1122.2476729474943, + "accept_length": 2.5920045204124875 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.3-70B-Instruct-Specforge", + "output_throughput": 1382.9357431087456, + "accept_length": 3.243898689873717 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 567.3739460148672, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.3-Instruct-70B", + "output_throughput": 1112.8479569335152, + "accept_length": 2.792588962605549 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.3-70B-Instruct-Specforge", + "output_throughput": 1274.2110431983278, + "accept_length": 3.2416170775479363 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 567.3739460148672, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.3-Instruct-70B", + "output_throughput": 1079.9951811356827, + "accept_length": 2.6718376973892366 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.3-70B-Instruct-Specforge", + "output_throughput": 1327.6044700788502, + "accept_length": 3.3766338373668217 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 567.3739460148672, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.3-Instruct-70B", + "output_throughput": 1090.3170854344964, + "accept_length": 2.966812280063099 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.3-70B-Instruct-Specforge", + "output_throughput": 1215.8347875575441, + "accept_length": 3.3641021480547684 + } + ] + } + ] + }, + "mtbench": { + "benchmark_name": "mtbench", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 540.4640557255416, + "accept_length": 1.0 + }, + { + 
"Name": "lmsys/sglang-EAGLE3-LLaMA3.3-Instruct-70B", + "output_throughput": 1234.647877556777, + "accept_length": 2.9232673267326734 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.3-70B-Instruct-Specforge", + "output_throughput": 1238.4736758319698, + "accept_length": 2.9606951984177083 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 540.4640557255416, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.3-Instruct-70B", + "output_throughput": 1377.8052334866013, + "accept_length": 3.5324281309061973 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.3-70B-Instruct-Specforge", + "output_throughput": 1409.5100765643524, + "accept_length": 3.6175162329362442 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 540.4640557255416, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.3-Instruct-70B", + "output_throughput": 1129.6661036217977, + "accept_length": 3.143848893296669 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.3-70B-Instruct-Specforge", + "output_throughput": 1108.3072501756835, + "accept_length": 3.2248797608215263 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 540.4640557255416, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.3-Instruct-70B", + "output_throughput": 1425.2993761886291, + "accept_length": 3.8789368991048736 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.3-70B-Instruct-Specforge", + "output_throughput": 1440.3671955624673, + "accept_length": 3.97791186891054 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 540.4640557255416, + "accept_length": 1.0 + }, + { + 
"Name": "lmsys/sglang-EAGLE3-LLaMA3.3-Instruct-70B", + "output_throughput": 1069.4986663607351, + "accept_length": 3.1943331425300516 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.3-70B-Instruct-Specforge", + "output_throughput": 1033.773238205561, + "accept_length": 3.2422141262192974 + } + ] + } + ] + }, + "humaneval": { + "benchmark_name": "humaneval", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 560.9500728009846, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.3-Instruct-70B", + "output_throughput": 1194.0875984832494, + "accept_length": 2.6663626344392504 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.3-70B-Instruct-Specforge", + "output_throughput": 1290.1122375104421, + "accept_length": 2.925804965875309 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 560.9500728009846, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.3-Instruct-70B", + "output_throughput": 1282.7936401185236, + "accept_length": 3.0671719811813904 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.3-70B-Instruct-Specforge", + "output_throughput": 1426.372333907719, + "accept_length": 3.436568804650481 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 560.9500728009846, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.3-Instruct-70B", + "output_throughput": 1090.1088508973057, + "accept_length": 2.8127895941495002 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.3-70B-Instruct-Specforge", + "output_throughput": 1174.0867819009864, + "accept_length": 3.0611013660766493 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + 
"output_throughput": 560.9500728009846, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.3-Instruct-70B", + "output_throughput": 1267.8737053510965, + "accept_length": 3.1906793120660706 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.3-70B-Instruct-Specforge", + "output_throughput": 1407.8140138598972, + "accept_length": 3.6735002608242047 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 560.9500728009846, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.3-Instruct-70B", + "output_throughput": 1013.2705272855593, + "accept_length": 2.7776112847805305 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.3-70B-Instruct-Specforge", + "output_throughput": 968.2027451202639, + "accept_length": 2.742653690956563 + } + ] + } + ] + }, + "livecodebench": { + "benchmark_name": "livecodebench", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 560.8834615148919, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.3-Instruct-70B", + "output_throughput": 1210.6010917932015, + "accept_length": 2.723797958423008 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.3-70B-Instruct-Specforge", + "output_throughput": 1295.014267720614, + "accept_length": 2.952023346303502 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 560.8834615148919, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.3-Instruct-70B", + "output_throughput": 1303.4195570335166, + "accept_length": 3.133414966360772 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.3-70B-Instruct-Specforge", + "output_throughput": 1423.2736941362525, + "accept_length": 3.4980468448438247 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + 
"topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 560.8834615148919, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.3-Instruct-70B", + "output_throughput": 1070.711661408102, + "accept_length": 2.735034762087001 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.3-70B-Instruct-Specforge", + "output_throughput": 1154.785652335772, + "accept_length": 2.9811645516106386 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 560.8834615148919, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.3-Instruct-70B", + "output_throughput": 1279.5345355421975, + "accept_length": 3.284394784770605 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.3-70B-Instruct-Specforge", + "output_throughput": 1399.3991191944933, + "accept_length": 3.716324359708698 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 560.8834615148919, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.3-Instruct-70B", + "output_throughput": 1013.3765756840332, + "accept_length": 2.773990564681233 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.3-70B-Instruct-Specforge", + "output_throughput": 1035.4140338795994, + "accept_length": 2.933293078243183 + } + ] + } + ] + }, + "financeqa": { + "benchmark_name": "financeqa", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 512.5751663875466, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.3-Instruct-70B", + "output_throughput": 704.0737829344649, + "accept_length": 1.645732050137249 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.3-70B-Instruct-Specforge", + "output_throughput": 936.4940018423655, + "accept_length": 
2.2541347317466722 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 512.5751663875466, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.3-Instruct-70B", + "output_throughput": 684.0195321200449, + "accept_length": 1.702027072988232 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.3-70B-Instruct-Specforge", + "output_throughput": 933.0572305312112, + "accept_length": 2.39442380929992 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 512.5751663875466, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.3-Instruct-70B", + "output_throughput": 618.4946534541955, + "accept_length": 1.7860533893688224 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.3-70B-Instruct-Specforge", + "output_throughput": 700.886442439991, + "accept_length": 2.281622206910129 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 512.5751663875466, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.3-Instruct-70B", + "output_throughput": 652.1412786559076, + "accept_length": 1.7116903633491312 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.3-70B-Instruct-Specforge", + "output_throughput": 887.7001871678323, + "accept_length": 2.452738257649581 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 512.5751663875466, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.3-Instruct-70B", + "output_throughput": 635.2599880909434, + "accept_length": 1.9610333607746286 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.3-70B-Instruct-Specforge", + "output_throughput": 854.0347909075315, + "accept_length": 
2.589833798374378 + } + ] + } + ] + }, + "gpqa": { + "benchmark_name": "gpqa", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 575.6879373469175, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.3-Instruct-70B", + "output_throughput": 962.5545831639148, + "accept_length": 2.0451300999292217 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.3-70B-Instruct-Specforge", + "output_throughput": 1020.0538308626681, + "accept_length": 2.1911976817371235 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 575.6879373469175, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.3-Instruct-70B", + "output_throughput": 963.8356757692138, + "accept_length": 2.1687507495755036 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.3-70B-Instruct-Specforge", + "output_throughput": 1039.643962895085, + "accept_length": 2.3552079123829617 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 575.6879373469175, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.3-Instruct-70B", + "output_throughput": 890.1003387342033, + "accept_length": 2.226321240698847 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.3-70B-Instruct-Specforge", + "output_throughput": 960.5616523564485, + "accept_length": 2.4811411267352264 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 575.6879373469175, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.3-Instruct-70B", + "output_throughput": 916.6826693888017, + "accept_length": 2.1849745643049188 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.3-70B-Instruct-Specforge", + 
"output_throughput": 984.4877550429275, + "accept_length": 2.4152394292465176 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 575.6879373469175, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-LLaMA3.3-Instruct-70B", + "output_throughput": 838.0962787179271, + "accept_length": 2.3145643059121785 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-3.3-70B-Instruct-Specforge", + "output_throughput": 924.0808096194634, + "accept_length": 2.573260793115575 + } + ] + } + ] + } + }, + "Llama-4-Scout-17B-16E-Instruct": { + "gsm8k": { + "benchmark_name": "gsm8k", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 455.9311905316165, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-v1", + "output_throughput": 816.6176343207234, + "accept_length": 2.435108707729916 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-SpecForge", + "output_throughput": 908.8655650704263, + "accept_length": 3.1118742007294085 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 455.9311905316165, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-v1", + "output_throughput": 806.5328373116205, + "accept_length": 2.6234459324405357 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-SpecForge", + "output_throughput": 971.8534490877095, + "accept_length": 3.8715801886792454 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 455.9311905316165, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-v1", + 
"output_throughput": 708.8133468064259, + "accept_length": 2.146746247607535 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-SpecForge", + "output_throughput": 818.3072714693558, + "accept_length": 2.918526679710503 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 455.9311905316165, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-v1", + "output_throughput": 765.9810114809961, + "accept_length": 2.675257522087863 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-SpecForge", + "output_throughput": 957.227019602509, + "accept_length": 4.307217442700466 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 455.9311905316165, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-v1", + "output_throughput": 675.0775309782273, + "accept_length": 2.144316290813106 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-SpecForge", + "output_throughput": 814.5839518607636, + "accept_length": 2.627502101582583 + } + ] + } + ] + }, + "math500": { + "benchmark_name": "math500", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 561.835811548351, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-v1", + "output_throughput": 1478.9989946720648, + "accept_length": 2.366719134681358 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-SpecForge", + "output_throughput": 1884.3462895109676, + "accept_length": 3.238557789111507 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + 
"output_throughput": 561.835811548351, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-v1", + "output_throughput": 1447.5513200323323, + "accept_length": 2.5898901840327406 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-SpecForge", + "output_throughput": 2100.7682204066577, + "accept_length": 4.153214423200308 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 561.835811548351, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-v1", + "output_throughput": 1199.1485073659853, + "accept_length": 2.489558557182447 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-SpecForge", + "output_throughput": 1457.2169829849418, + "accept_length": 3.2046972238757507 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 561.835811548351, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-v1", + "output_throughput": 1330.0337890073868, + "accept_length": 2.648556845221877 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-SpecForge", + "output_throughput": 2110.3314050998847, + "accept_length": 4.7805795395081105 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 561.835811548351, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-v1", + "output_throughput": 1153.7706965189202, + "accept_length": 2.6314392278632304 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-SpecForge", + "output_throughput": 1369.6607164745208, + "accept_length": 3.2076523352436657 + } + ] + } + ] + }, + "mtbench": { + "benchmark_name": 
"mtbench", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 502.10114738381606, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-v1", + "output_throughput": 1252.9681990096112, + "accept_length": 2.3541095408844828 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-SpecForge", + "output_throughput": 1302.3829223511154, + "accept_length": 2.4913843888070693 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 502.10114738381606, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-v1", + "output_throughput": 1225.4607594389363, + "accept_length": 2.5648559607722956 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-SpecForge", + "output_throughput": 1312.399917450856, + "accept_length": 2.836414637256152 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 502.10114738381606, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-v1", + "output_throughput": 953.148992300308, + "accept_length": 2.222710749523974 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-SpecForge", + "output_throughput": 967.1281111811169, + "accept_length": 2.3256101583113455 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 502.10114738381606, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-v1", + "output_throughput": 1157.0433602013916, + "accept_length": 2.649528603387664 + }, + { + "Name": 
"lmsys/SGLang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-SpecForge", + "output_throughput": 1276.9552963643773, + "accept_length": 3.0189181867437243 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 502.10114738381606, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-v1", + "output_throughput": 940.9893388280037, + "accept_length": 2.3959043407227965 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-SpecForge", + "output_throughput": 1010.4098410869198, + "accept_length": 2.7008052625609618 + } + ] + } + ] + }, + "humaneval": { + "benchmark_name": "humaneval", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 631.8746804703884, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-v1", + "output_throughput": 1515.800628974162, + "accept_length": 2.664927494512612 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-SpecForge", + "output_throughput": 1749.0012751674196, + "accept_length": 3.224152798137449 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 631.8746804703884, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-v1", + "output_throughput": 1556.515161340629, + "accept_length": 3.085438335809807 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-SpecForge", + "output_throughput": 1921.2922045342316, + "accept_length": 4.140846637369973 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 631.8746804703884, + "accept_length": 1.0 + }, + { + "Name": 
"lmsys/sglang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-v1", + "output_throughput": 1201.849883743592, + "accept_length": 2.6006220481511346 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-SpecForge", + "output_throughput": 1393.1592557980014, + "accept_length": 3.1744799971652315 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 631.8746804703884, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-v1", + "output_throughput": 1456.346786965349, + "accept_length": 3.2582381225462083 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-SpecForge", + "output_throughput": 1944.8214954525663, + "accept_length": 4.7947306331104995 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 631.8746804703884, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-v1", + "output_throughput": 1109.058302621911, + "accept_length": 2.6508010386556267 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-SpecForge", + "output_throughput": 1234.7042057027743, + "accept_length": 3.0442784990549376 + } + ] + } + ] + }, + "livecodebench": { + "benchmark_name": "livecodebench", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 484.2501137181978, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-v1", + "output_throughput": 1598.2921930690502, + "accept_length": 2.487202280374381 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-SpecForge", + "output_throughput": 1933.9962764283844, + "accept_length": 3.14740116583215 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 
1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 484.2501137181978, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-v1", + "output_throughput": 1601.2688464385185, + "accept_length": 2.8043640587405627 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-SpecForge", + "output_throughput": 2144.3319751584095, + "accept_length": 3.983057732747085 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 484.2501137181978, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-v1", + "output_throughput": 1051.7266219288254, + "accept_length": 2.1138485934104656 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-SpecForge", + "output_throughput": 1320.656674087923, + "accept_length": 2.7145795398417976 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 484.2501137181978, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-v1", + "output_throughput": 1501.558947290443, + "accept_length": 2.929916684169992 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-SpecForge", + "output_throughput": 2170.188140733029, + "accept_length": 4.55060712303548 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 484.2501137181978, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-v1", + "output_throughput": 1009.5574686537159, + "accept_length": 2.2590065740745002 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-SpecForge", + "output_throughput": 1249.8114756626915, + "accept_length": 
2.8130523194007555 + } + ] + } + ] + }, + "financeqa": { + "benchmark_name": "financeqa", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 288.9007335547823, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-v1", + "output_throughput": 1022.713052476267, + "accept_length": 1.7952034022379475 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-SpecForge", + "output_throughput": 1189.61672405822, + "accept_length": 2.2164571332464367 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 288.9007335547823, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-v1", + "output_throughput": 963.8209003406079, + "accept_length": 1.8240590609583607 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-SpecForge", + "output_throughput": 1171.8275957081507, + "accept_length": 2.408275220827522 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 288.9007335547823, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-v1", + "output_throughput": 755.8055387643059, + "accept_length": 1.780077619663648 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-SpecForge", + "output_throughput": 887.65933899505, + "accept_length": 2.1907344347752975 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 288.9007335547823, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-v1", + "output_throughput": 885.0003924094965, + "accept_length": 1.864155494076754 
+ }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-SpecForge", + "output_throughput": 1084.5573704005851, + "accept_length": 2.459442783236034 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 288.9007335547823, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-v1", + "output_throughput": 773.7660016870891, + "accept_length": 2.05643096671835 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-SpecForge", + "output_throughput": 838.3207906571789, + "accept_length": 2.1910908349096845 + } + ] + } + ] + }, + "gpqa": { + "benchmark_name": "gpqa", + "results": [ + { + "batch_size": 8, + "steps": 3, + "topk": 1, + "num_draft_tokens": 4, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 541.0010469896803, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-v1", + "output_throughput": 1320.0198779778916, + "accept_length": 2.0166714112874526 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-SpecForge", + "output_throughput": 1482.2781495871964, + "accept_length": 2.3200242800296755 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 1, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 541.0010469896803, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-v1", + "output_throughput": 1258.0775283103167, + "accept_length": 2.135039169677331 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-SpecForge", + "output_throughput": 1468.3432054658438, + "accept_length": 2.5528455284552845 + } + ] + }, + { + "batch_size": 8, + "steps": 5, + "topk": 3, + "num_draft_tokens": 6, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 541.0010469896803, + "accept_length": 1.0 + }, + { + 
"Name": "lmsys/sglang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-v1", + "output_throughput": 1405.110892125768, + "accept_length": 2.8834021014937705 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-SpecForge", + "output_throughput": 1502.213627081269, + "accept_length": 3.0623772161357583 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 1, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 541.0010469896803, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-v1", + "output_throughput": 1148.5409144989237, + "accept_length": 2.1684843736177633 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-SpecForge", + "output_throughput": 1379.1223204247422, + "accept_length": 2.672381928590287 + } + ] + }, + { + "batch_size": 8, + "steps": 7, + "topk": 4, + "num_draft_tokens": 8, + "metrics": [ + { + "Name": "Wihtout EAGLE3", + "output_throughput": 541.0010469896803, + "accept_length": 1.0 + }, + { + "Name": "lmsys/sglang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-v1", + "output_throughput": 1345.7377508882935, + "accept_length": 3.044341630328194 + }, + { + "Name": "lmsys/SGLang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-SpecForge", + "output_throughput": 1474.1967930541948, + "accept_length": 3.315005686664771 + } + ] + } + ] + } + } +} diff --git a/idea1/docs/spec_bundle/public/vite.svg b/idea1/docs/spec_bundle/public/vite.svg new file mode 100644 index 0000000000000000000000000000000000000000..ee9fadaf9c4a762ac0ec010ca16ce8fa39a09e56 --- /dev/null +++ b/idea1/docs/spec_bundle/public/vite.svg @@ -0,0 +1 @@ + diff --git a/idea1/docs/spec_bundle/src/App.vue b/idea1/docs/spec_bundle/src/App.vue new file mode 100644 index 0000000000000000000000000000000000000000..9dcb55de3bd93c89fa2d56f202e66a7dce8f7bb3 --- /dev/null +++ b/idea1/docs/spec_bundle/src/App.vue @@ -0,0 +1,17 @@ + + + + + diff --git a/idea1/docs/spec_bundle/src/components/BenchmarkChart.vue 
b/idea1/docs/spec_bundle/src/components/BenchmarkChart.vue new file mode 100644 index 0000000000000000000000000000000000000000..c8022be25e3da8ad72760690c177075d12ec607c --- /dev/null +++ b/idea1/docs/spec_bundle/src/components/BenchmarkChart.vue @@ -0,0 +1,289 @@ + + + + + diff --git a/idea1/docs/spec_bundle/src/components/BenchmarkDashboard.vue b/idea1/docs/spec_bundle/src/components/BenchmarkDashboard.vue new file mode 100644 index 0000000000000000000000000000000000000000..a5d33cc912211e9311f78422b0436a3e605a0dbd --- /dev/null +++ b/idea1/docs/spec_bundle/src/components/BenchmarkDashboard.vue @@ -0,0 +1,601 @@ + + + + + diff --git a/idea1/docs/spec_bundle/src/components/BenchmarkTable.vue b/idea1/docs/spec_bundle/src/components/BenchmarkTable.vue new file mode 100644 index 0000000000000000000000000000000000000000..69bd280e025b1158882f77828e850959a51e3876 --- /dev/null +++ b/idea1/docs/spec_bundle/src/components/BenchmarkTable.vue @@ -0,0 +1,364 @@ + + + + + diff --git a/idea1/docs/spec_bundle/src/components/FilterControls.vue b/idea1/docs/spec_bundle/src/components/FilterControls.vue new file mode 100644 index 0000000000000000000000000000000000000000..d5b2ff6a43807e872b1c6e316e0f420ad526fa19 --- /dev/null +++ b/idea1/docs/spec_bundle/src/components/FilterControls.vue @@ -0,0 +1,189 @@ + + + + + diff --git a/idea1/docs/spec_bundle/src/components/HelloWorld.vue b/idea1/docs/spec_bundle/src/components/HelloWorld.vue new file mode 100644 index 0000000000000000000000000000000000000000..546ebbc624b0e3baf58efc6a8dd149ac5e6074e6 --- /dev/null +++ b/idea1/docs/spec_bundle/src/components/HelloWorld.vue @@ -0,0 +1,43 @@ + + + + + diff --git a/idea1/docs/spec_bundle/src/main.js b/idea1/docs/spec_bundle/src/main.js new file mode 100644 index 0000000000000000000000000000000000000000..2425c0f745bef4d009cb6661b62fd9dfd62960b0 --- /dev/null +++ b/idea1/docs/spec_bundle/src/main.js @@ -0,0 +1,5 @@ +import { createApp } from 'vue' +import './style.css' +import App from './App.vue' 
+ +createApp(App).mount('#app') diff --git a/idea1/docs/spec_bundle/src/style.css b/idea1/docs/spec_bundle/src/style.css new file mode 100644 index 0000000000000000000000000000000000000000..7d3583b0d14cd41cf4c13824b049e80b7e16f17b --- /dev/null +++ b/idea1/docs/spec_bundle/src/style.css @@ -0,0 +1,82 @@ +:root { + --font-sans: 'Inter', system-ui, -apple-system, sans-serif; + --font-display: 'Outfit', system-ui, -apple-system, sans-serif; + + --color-primary: #4F46E5; + /* Indigo 600 */ + --color-primary-dark: #4338CA; + --color-primary-light: #818CF8; + + --color-background: #F8FAFC; + /* Slate 50 */ + --color-surface: #FFFFFF; + + --color-text-main: #0F172A; + /* Slate 900 */ + --color-text-secondary: #64748B; + /* Slate 500 */ + --color-text-muted: #94A3B8; + /* Slate 400 */ + + --color-success: #10B981; + --color-warning: #F59E0B; + --color-danger: #EF4444; + + /* Premium Shadows - Softer and tinted */ + --shadow-sm: 0 1px 2px 0 rgba(15, 23, 42, 0.05); + --shadow-md: 0 4px 6px -1px rgba(15, 23, 42, 0.05), 0 2px 4px -2px rgba(15, 23, 42, 0.05); + --shadow-lg: 0 10px 15px -3px rgba(15, 23, 42, 0.05), 0 4px 6px -4px rgba(15, 23, 42, 0.04); + --shadow-xl: 0 20px 25px -5px rgba(15, 23, 42, 0.05), 0 8px 10px -6px rgba(15, 23, 42, 0.04); + + /* Feature Shadow - Diffuse Glow */ + --shadow-glow: 0 0 40px -10px rgba(79, 70, 229, 0.15); + + /* Tighter, more technical radii */ + --radius-lg: 8px; + --radius-xl: 12px; + --radius-2xl: 16px; +} + +* { + box-sizing: border-box; + margin: 0; + padding: 0; +} + +body { + font-family: var(--font-sans); + background-color: var(--color-background); + color: var(--color-text-main); + line-height: 1.5; + -webkit-font-smoothing: antialiased; + -moz-osx-font-smoothing: grayscale; +} + +button { + cursor: pointer; + font-family: inherit; +} + +a { + color: var(--color-primary); + text-decoration: none; +} + +/* Custom Scrollbar */ +::-webkit-scrollbar { + width: 8px; + height: 8px; +} + +::-webkit-scrollbar-track { + background: 
transparent; +} + +::-webkit-scrollbar-thumb { + background: #cbd5e1; + border-radius: 4px; +} + +::-webkit-scrollbar-thumb:hover { + background: #94a3b8; +} diff --git a/idea1/docs/spec_bundle/src/utils/dataProcessor.js b/idea1/docs/spec_bundle/src/utils/dataProcessor.js new file mode 100644 index 0000000000000000000000000000000000000000..87ae9f586d99202e532ebd8ad07b0d47d5e3e7d7 --- /dev/null +++ b/idea1/docs/spec_bundle/src/utils/dataProcessor.js @@ -0,0 +1,117 @@ +export async function loadAllData() { + try { + const response = await fetch('./raw_data/data.json'); + const jsonData = await response.json(); + return jsonData; + } catch (error) { + console.error('Error loading JSON data:', error); + return {}; + } +} + +export function calculateSpeedup(specValue, baselineValue) { + if (!specValue || !baselineValue || baselineValue === 0) return null; + return (specValue / baselineValue).toFixed(2); +} + +export function processModelData(modelData, targetModelName) { + if (!modelData || !targetModelName) return []; + + // Map to hold aggregated entries by unique key (draftModel + config) + const entriesMap = new Map(); + + // Iterate through each benchmark in the model + Object.entries(modelData).forEach(([, benchmarkData]) => { + const benchmarkName = benchmarkData.benchmark_name; + const results = benchmarkData.results || []; + + results.forEach(result => { + const { batch_size, steps, topk, num_draft_tokens, metrics } = result; + + // Find baseline (Without EAGLE3) + const baselineMetric = metrics.find(m => m.Name === 'Wihtout EAGLE3'); + + // Process each metric entry (including baseline and EAGLE3 models) + metrics.forEach(metric => { + const isBaseline = metric.Name === 'Wihtout EAGLE3'; + const config = isBaseline ? 'baseline' : `${batch_size}-${steps}-${topk}-${num_draft_tokens}`; + + // draftModel is the Name from metrics array + const draftModel = isBaseline ? 
'None' : metric.Name; + + // Use a combination of draftModel and config as the key + // This ensures baseline and EAGLE3 configs are separate entries + const key = `${draftModel}|${config}`; + + // Get or create entry + if (!entriesMap.has(key)) { + entriesMap.set(key, { + targetModel: targetModelName, + draftModel: draftModel, + config, + batch_size, + steps, + topk, + num_draft_tokens, + metrics: {}, + baseline: {} + }); + } + + const entry = entriesMap.get(key); + + // Add this benchmark's metrics + entry.metrics[benchmarkName] = { + throughput: metric.output_throughput, + accLen: metric.accept_length + }; + + // Add baseline for this benchmark + if (baselineMetric) { + entry.baseline[benchmarkName] = { + throughput: baselineMetric.output_throughput, + accLen: baselineMetric.accept_length + }; + } + }); + }); + }); + + return Array.from(entriesMap.values()); +} + +export function getTargetModels(allData) { + return Object.keys(allData); +} + +export function extractUniqueTargetModels(processedData) { + return [...new Set(processedData.map(d => d.targetModel).filter(Boolean))]; +} + +export function removeSGLangPrefix(modelName) { + if (!modelName) return modelName; + // Remove "SGLang-EAGLE3" prefix if present (handles various formats) + // Examples: "lmsys/SGLang-EAGLE3-..." -> "lmsys/..." + // "SGLang-EAGLE3/..." -> "..." + // "SGLang-EAGLE3-..." -> "..." 
+ let cleaned = String(modelName); + + // Remove "SGLang-EAGLE3-" pattern (with hyphen after, can be preceded by / or start of string) + cleaned = cleaned.replace(/(^|\/)SGLang-EAGLE3-/gi, '$1'); + + // Remove "SGLang-EAGLE3/" pattern (with slash after) + cleaned = cleaned.replace(/(^|\/)SGLang-EAGLE3\//gi, '$1'); + + // Remove standalone "SGLang-EAGLE3" at the start (not followed by - or /) + cleaned = cleaned.replace(/^SGLang-EAGLE3(?![-\/])/gi, ''); + + // Clean up any double slashes + cleaned = cleaned.replace(/\/+/g, '/'); + + // Remove leading slash if present (unless it's the only character) + if (cleaned.length > 1) { + cleaned = cleaned.replace(/^\//, ''); + } + + return cleaned || modelName; +} diff --git a/idea1/docs/spec_bundle/vite.config.js b/idea1/docs/spec_bundle/vite.config.js new file mode 100644 index 0000000000000000000000000000000000000000..d747468c3295796728aabd7aae67de54928095c6 --- /dev/null +++ b/idea1/docs/spec_bundle/vite.config.js @@ -0,0 +1,23 @@ +import { defineConfig } from 'vite' +import vue from '@vitejs/plugin-vue' + +// https://vite.dev/config/ +export default defineConfig({ + plugins: [vue()], + base: './', // Use relative paths for deployment + build: { + outDir: 'dist', + assetsDir: 'assets', + sourcemap: false, + minify: 'esbuild', // Use esbuild for faster minification (Vite built-in) + rollupOptions: { + output: { + manualChunks: { + 'vue-vendor': ['vue'], + 'echarts-vendor': ['echarts', 'vue-echarts'], + 'csv-vendor': ['papaparse'] + } + } + } + } +}) diff --git a/idea1/examples/README.md b/idea1/examples/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ff5a6f3a8a5aae6c9ff7645afc266dc6cd7363bc --- /dev/null +++ b/idea1/examples/README.md @@ -0,0 +1,9 @@ +# Run SpecForge Examples + +This folder contains the examples of running SpecForge on different models. 
The scripts can be invoked by the following command, where SCRIPT_NAME is one of the scripts in this folder: + +```bash +bash examples/SCRIPT_NAME.sh [NUM_GPUS] [TP_SIZE] +``` + +We use the ShareGPT dataset for all the examples for now, but you can replace it with more robust datasets such as perfectblend, magpie-qwen2.5-pro-1m-v0.1, etc. diff --git a/idea1/examples/run_deepseek_v2_lite_eagle3_online.sh b/idea1/examples/run_deepseek_v2_lite_eagle3_online.sh new file mode 100644 index 0000000000000000000000000000000000000000..283c62ce80743d52bc29aaddc3b0b7a9829890c7 --- /dev/null +++ b/idea1/examples/run_deepseek_v2_lite_eagle3_online.sh @@ -0,0 +1,25 @@ +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +ROOT_DIR=$(dirname $SCRIPT_DIR) + +# train eagle3 for deepseek-v2-lite +NUM_GPUS=${1:-8} +TP_SIZE=${2:-1} +BUILD_DATASET_NUM_PROC=${BUILD_DATASET_NUM_PROC:-64} + +torchrun \ + --standalone \ + --nproc_per_node $NUM_GPUS \ + $ROOT_DIR/scripts/train_eagle3.py \ + --target-model-path deepseek-ai/DeepSeek-V2-Lite \ + --draft-model-config $ROOT_DIR/configs/deepseek-v2-lite-eagle3.json \ + --train-data-path $ROOT_DIR/cache/dataset/sharegpt_train.jsonl \ + --build-dataset-num-proc $BUILD_DATASET_NUM_PROC \ + --output-dir $ROOT_DIR/outputs/deepseek-v2-lite-eagle3-sharegpt \ + --num-epochs 10 \ + --batch-size 1 \ + --tp-size $TP_SIZE \ + --learning-rate 1e-4 \ + --max-length 4096 \ + --chat-template deepseek \ + --target-model-backend hf \ + --cache-dir $ROOT_DIR/cache diff --git a/idea1/examples/run_deepseek_v3_671b_eagle3_offline.sh b/idea1/examples/run_deepseek_v3_671b_eagle3_offline.sh new file mode 100644 index 0000000000000000000000000000000000000000..4bede1dd50be44503365fb03fce8624e1bed2d4e --- /dev/null +++ b/idea1/examples/run_deepseek_v3_671b_eagle3_offline.sh @@ -0,0 +1,43 @@ + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +ROOT_DIR=$(dirname $SCRIPT_DIR) + +# train eagle3 for deepseek-v3 +NUM_GPUS=${1:-8} +TP_SIZE=${2:-8} 
+BUILD_DATASET_NUM_PROC=${BUILD_DATASET_NUM_PROC:-64} + +# generate hidden states +torchrun \ + --standalone \ + --nproc_per_node $NUM_GPUS \ + scripts/prepare_hidden_states.py \ + --target-model-path deepseek-ai/DeepSeek-V3 \ + --enable-aux-hidden-states \ + --data-path $ROOT_DIR/cache/dataset/perfect-blend.jsonl \ + --output-path $ROOT_DIR/cache/hidden_states/perfect-blend-deepseek-v3 \ + --chat-template deepseek-v3 \ + --max-length 2048 \ + --tp-size 8 \ + --batch-size 4 \ + --sglang-mem-fraction-static 0.75 + +# train eagle3 offline +torchrun \ + --standalone \ + --nproc_per_node $NUM_GPUS \ + $ROOT_DIR/scripts/train_eagle3.py \ + --target-model-path deepseek-ai/DeepSeek-V3 \ + --draft-model-config $ROOT_DIR/configs/deepseek-v3-671b-eagle3.json \ + --train-data-path $ROOT_DIR/cache/dataset/perfect-blend.jsonl \ + --train-hidden-states-path $ROOT_DIR/cache/hidden_states/perfect-blend-deepseek-v3 \ + --build-dataset-num-proc $BUILD_DATASET_NUM_PROC \ + --output-dir $ROOT_DIR/outputs/deepseek-v3-671B-eagle3-perfect-blend-offline \ + --num-epochs 10 \ + --batch-size 1 \ + --tp-size $TP_SIZE \ + --target-model-backend sglang \ + --learning-rate 5e-5 \ + --max-length 2048 \ + --chat-template deepseek-v3 \ + --cache-dir $ROOT_DIR/cache diff --git a/idea1/examples/run_deepseek_v3_671b_eagle3_online.sh b/idea1/examples/run_deepseek_v3_671b_eagle3_online.sh new file mode 100644 index 0000000000000000000000000000000000000000..2eb2769f9b5582a811a83305b72ea67bef5b514b --- /dev/null +++ b/idea1/examples/run_deepseek_v3_671b_eagle3_online.sh @@ -0,0 +1,29 @@ + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +ROOT_DIR=$(dirname $SCRIPT_DIR) + +# train eagle3 for deepseek-v3 +NUM_GPUS=${1:-8} +TP_SIZE=${2:-8} +BUILD_DATASET_NUM_PROC=${BUILD_DATASET_NUM_PROC:-64} + +# train eagle3 online +torchrun \ + --standalone \ + --nproc_per_node $NUM_GPUS \ + $ROOT_DIR/scripts/train_eagle3.py \ + --target-model-path deepseek-ai/DeepSeek-V3 \ + 
--draft-model-config $ROOT_DIR/configs/deepseek-v3-671b-eagle3.json \ + --train-data-path $ROOT_DIR/cache/dataset/perfect-blend.jsonl \ + --build-dataset-num-proc $BUILD_DATASET_NUM_PROC \ + --output-dir $ROOT_DIR/outputs/deepseek-v3-671B-eagle3-perfect-blend-online \ + --tp-size $TP_SIZE \ + --target-model-backend sglang \ + --num-epochs 10 \ + --batch-size 1 \ + --learning-rate 5e-5 \ + --max-length 2048 \ + --chat-template deepseek-v3 \ + --cache-dir $ROOT_DIR/cache \ + --dist-timeout 60 \ + --sglang-mem-fraction-static 0.75 diff --git a/idea1/examples/run_eval_dflash.sh b/idea1/examples/run_eval_dflash.sh new file mode 100644 index 0000000000000000000000000000000000000000..bd1bbf953e33ccf777f59ffa6702ee29f59d9aa6 --- /dev/null +++ b/idea1/examples/run_eval_dflash.sh @@ -0,0 +1,80 @@ +#!/bin/bash + +# DFlash evaluation: compare baseline vs multi-step denoising (8 GPU data parallel) +# +# Usage: +# bash examples/run_eval_dflash.sh # run step=1,2,3 all +# bash examples/run_eval_dflash.sh 2 # only step=2 +# +# Each GPU loads target+draft model independently, samples are split across GPUs. 
+ +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +ROOT_DIR=$(dirname $SCRIPT_DIR) + +# Activate conda env +source /workspace/miniconda3/etc/profile.d/conda.sh +conda activate specforge + +export PYTHONPATH=$ROOT_DIR:$PYTHONPATH +export HF_DATASETS_CACHE=/workspace/hanrui/datasets +export HF_HOME=/workspace/hanrui/cache/specforge_hf_home +export HF_DATASETS_OFFLINE=1 +export HF_HUB_OFFLINE=1 +export TRANSFORMERS_OFFLINE=1 + +# ============ Configuration ============ +NUM_GPUS=${NUM_GPUS:-8} +TARGET_MODEL=${TARGET_MODEL:-"/workspace/models/Qwen3-8B"} +DRAFT_MODEL=${DRAFT_MODEL:-"/workspace/models/Qwen3-8B-DFlash-b16"} +DATASET=${DATASET:-"math500"} +MAX_SAMPLES=${MAX_SAMPLES:-500} +MAX_NEW_TOKENS=${MAX_NEW_TOKENS:-512} +TEMPERATURE=${TEMPERATURE:-0.0} +OUTPUT_DIR=${OUTPUT_DIR:-"$ROOT_DIR/results/dflash_eval"} +# ======================================== + +mkdir -p $OUTPUT_DIR + +run_eval() { + local steps=$1 + echo "" + echo "============================================" + echo " Running DFlash eval: denoise_steps=$steps" + echo " GPUs: $NUM_GPUS, Samples: $MAX_SAMPLES" + echo "============================================" + + torchrun \ + --standalone \ + --nproc_per_node $NUM_GPUS \ + $ROOT_DIR/scripts/eval_dflash.py \ + --target-model-path $TARGET_MODEL \ + --draft-model-path $DRAFT_MODEL \ + --dataset $DATASET \ + --max-samples $MAX_SAMPLES \ + --max-new-tokens $MAX_NEW_TOKENS \ + --num-denoise-steps $steps \ + --temperature $TEMPERATURE \ + --output-file $OUTPUT_DIR/${DATASET}_steps${steps}.json \ + 2>&1 | tee $OUTPUT_DIR/${DATASET}_steps${steps}.log +} + +if [ -n "$1" ]; then + run_eval $1 +else + run_eval 1 + run_eval 2 + run_eval 3 + + echo "" + echo "============================================" + echo " All evaluations complete!" 
+ echo " Results in: $OUTPUT_DIR/" + echo "============================================" + echo "" + echo "Quick comparison:" + for f in $OUTPUT_DIR/${DATASET}_steps*.json; do + steps=$(echo $f | grep -oP 'steps\K[0-9]+') + tau=$(python -c "import json; d=json.load(open('$f')); print(f'{d[\"results\"][\"avg_tau\"]:.2f}')" 2>/dev/null || echo "N/A") + echo " steps=$steps avg_tau=$tau" + done +fi diff --git a/idea1/examples/run_gemma3_1b_eagle3_online.sh b/idea1/examples/run_gemma3_1b_eagle3_online.sh new file mode 100644 index 0000000000000000000000000000000000000000..a1365069594baae0f7b8acbc45640c5ea39e0731 --- /dev/null +++ b/idea1/examples/run_gemma3_1b_eagle3_online.sh @@ -0,0 +1,26 @@ +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +ROOT_DIR=$(dirname $SCRIPT_DIR) +export TORCHINDUCTOR_CACHE_DIR=$ROOT_DIR/cache/compiled_kernels + +# train eagle3 for gemma3-1b +NUM_GPUS=${1:-1} +TP_SIZE=${2:-1} + +torchrun \ + --standalone \ + --nproc_per_node $NUM_GPUS \ + $ROOT_DIR/scripts/train_eagle3.py \ + --target-model-path google/gemma-3-1b-it \ + --draft-model-config $ROOT_DIR/configs/gemma3-1b-eagle3.json \ + --train-data-path $ROOT_DIR/cache/dataset/sharegpt_train.jsonl \ + --output-dir $ROOT_DIR/outputs/gemma3-1b-eagle3-sharegpt \ + --num-epochs 10 \ + --batch-size 1 \ + --tp-size $TP_SIZE \ + --learning-rate 1e-4 \ + --max-length 4096 \ + --chat-template gemma \ + --cache-dir $ROOT_DIR/cache \ + --attention-backend sdpa \ + --target-model-backend hf \ + --log-interval 10 diff --git a/idea1/examples/run_gpt_oss_120b_eagle3_online.sh b/idea1/examples/run_gpt_oss_120b_eagle3_online.sh new file mode 100644 index 0000000000000000000000000000000000000000..eea5afbd8a6512945d35f4b005c338fc5f1671a8 --- /dev/null +++ b/idea1/examples/run_gpt_oss_120b_eagle3_online.sh @@ -0,0 +1,26 @@ +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +ROOT_DIR=$(dirname $SCRIPT_DIR) + +# train eagle3 for GPT-OSS-120B 
+NUM_GPUS=${1:-8} +TP_SIZE=${2:-8} +BUILD_DATASET_NUM_PROC=${BUILD_DATASET_NUM_PROC:-64} + +torchrun \ + --standalone \ + --nproc_per_node $NUM_GPUS \ + $ROOT_DIR/scripts/train_eagle3.py \ + --target-model-path openai/gpt-oss-120b \ + --draft-model-config $ROOT_DIR/configs/gpt-oss-20B-eagle3.json \ + --train-data-path $ROOT_DIR/cache/dataset/perfect-blend-gptoss-20B.jsonl \ + --build-dataset-num-proc $BUILD_DATASET_NUM_PROC \ + --output-dir $ROOT_DIR/outputs/gpt-oss-20b-eagle3 \ + --tp-size $TP_SIZE \ + --target-model-backend sglang \ + --num-epochs 10 \ + --batch-size 1 \ + --learning-rate 1e-4 \ + --max-length 4096 \ + --chat-template gpt-oss \ + --cache-dir $ROOT_DIR/cache \ + --dist-timeout 60 diff --git a/idea1/examples/run_gpt_oss_20b_eagle3_online.sh b/idea1/examples/run_gpt_oss_20b_eagle3_online.sh new file mode 100644 index 0000000000000000000000000000000000000000..55baeac1c49576e25162b6ff78558a7df8c4ee2d --- /dev/null +++ b/idea1/examples/run_gpt_oss_20b_eagle3_online.sh @@ -0,0 +1,26 @@ +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +ROOT_DIR=$(dirname $SCRIPT_DIR) + +# train eagle3 for GPT-OSS-20B +NUM_GPUS=${1:-8} +TP_SIZE=${2:-2} +BUILD_DATASET_NUM_PROC=${BUILD_DATASET_NUM_PROC:-64} + +torchrun \ + --standalone \ + --nproc_per_node $NUM_GPUS \ + $ROOT_DIR/scripts/train_eagle3.py \ + --target-model-path openai/gpt-oss-20b \ + --draft-model-config $ROOT_DIR/configs/gpt-oss-20B-eagle3.json \ + --train-data-path $ROOT_DIR/cache/dataset/perfect-blend-gptoss-20B.jsonl \ + --build-dataset-num-proc $BUILD_DATASET_NUM_PROC \ + --output-dir $ROOT_DIR/outputs/perfect-blend-gptoss-20b-eagle3 \ + --num-epochs 10 \ + --batch-size 1 \ + --learning-rate 1e-4 \ + --max-length 4096 \ + --chat-template gpt-oss \ + --tp-size $TP_SIZE \ + --target-model-backend sglang \ + --cache-dir $ROOT_DIR/cache \ + --dist-timeout 60 diff --git a/idea1/examples/run_ling_flash_2.0_eagle3_offline.sh b/idea1/examples/run_ling_flash_2.0_eagle3_offline.sh 
new file mode 100644 index 0000000000000000000000000000000000000000..f7f2925b4bd3381d0d9984a59db7e3b0f3699faa --- /dev/null +++ b/idea1/examples/run_ling_flash_2.0_eagle3_offline.sh @@ -0,0 +1,45 @@ +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +ROOT_DIR=$(dirname $SCRIPT_DIR) + +# train eagle3 for ling-flash-2.0 +NUM_GPUS=${1:-8} +TP_SIZE=${2:-8} +BUILD_DATASET_NUM_PROC=${BUILD_DATASET_NUM_PROC:-64} + +# generate hidden states +torchrun \ + --standalone \ + --nproc_per_node $NUM_GPUS \ + scripts/prepare_hidden_states.py \ + --target-model-path inclusionAI/Ling-flash-2.0 \ + --enable-aux-hidden-states \ + --data-path $ROOT_DIR/cache/dataset/perfect-blend.jsonl \ + --output-path $ROOT_DIR/cache/hidden_states/perfect-blend-ling-flash-2.0 \ + --chat-template ling-flash-2.0 \ + --max-length 2048 \ + --tp-size $TP_SIZE \ + --batch-size 4 \ + --sglang-mem-fraction-static 0.75 \ + --trust-remote-code + +# train eagle3 offline +torchrun \ + --standalone \ + --nproc_per_node $NUM_GPUS \ + $ROOT_DIR/scripts/train_eagle3.py \ + --target-model-path inclusionAI/Ling-flash-2.0 \ + --draft-model-config $ROOT_DIR/configs/ling-flash-2.0-eagle3.json \ + --train-data-path $ROOT_DIR/cache/dataset/perfect-blend.jsonl \ + --train-hidden-states-path $ROOT_DIR/cache/hidden_states/perfect-blend-ling-flash-2.0 \ + --build-dataset-num-proc $BUILD_DATASET_NUM_PROC \ + --output-dir $ROOT_DIR/outputs/ling-flash-2.0-eagle3-perfect-blend-offline \ + --num-epochs 10 \ + --batch-size 1 \ + --tp-size $TP_SIZE \ + --target-model-backend sglang \ + --learning-rate 5e-5 \ + --max-length 2048 \ + --chat-template ling-flash-2.0 \ + --embedding-key 'model.word_embeddings.weight' \ + --cache-dir $ROOT_DIR/cache \ + --trust-remote-code diff --git a/idea1/examples/run_ling_flash_2.0_eagle3_online.sh b/idea1/examples/run_ling_flash_2.0_eagle3_online.sh new file mode 100644 index 0000000000000000000000000000000000000000..8f9d1cc3d87905107b0708b99ec8a32a832831a2 --- /dev/null 
+++ b/idea1/examples/run_ling_flash_2.0_eagle3_online.sh @@ -0,0 +1,30 @@ +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +ROOT_DIR=$(dirname $SCRIPT_DIR) + +# train eagle3 for ling-flash-2.0 +NUM_GPUS=${1:-8} +TP_SIZE=${2:-8} +BUILD_DATASET_NUM_PROC=${BUILD_DATASET_NUM_PROC:-64} + +# train eagle3 online +torchrun \ + --standalone \ + --nproc_per_node $NUM_GPUS \ + $ROOT_DIR/scripts/train_eagle3.py \ + --target-model-path inclusionAI/Ling-flash-2.0 \ + --draft-model-config $ROOT_DIR/configs/ling-flash-2.0-eagle3.json \ + --train-data-path $ROOT_DIR/cache/dataset/perfect-blend.jsonl \ + --build-dataset-num-proc $BUILD_DATASET_NUM_PROC \ + --output-dir $ROOT_DIR/outputs/ling-flash-2.0-eagle3-perfect-blend-online \ + --tp-size $TP_SIZE \ + --target-model-backend sglang \ + --num-epochs 10 \ + --batch-size 1 \ + --learning-rate 5e-5 \ + --max-length 2048 \ + --chat-template ling-flash-2.0 \ + --cache-dir $ROOT_DIR/cache \ + --dist-timeout 60 \ + --sglang-mem-fraction-static 0.75 \ + --embedding-key 'model.word_embeddings.weight' \ + --trust-remote-code diff --git a/idea1/examples/run_llama3.1_8b_eagle3_offline.sh b/idea1/examples/run_llama3.1_8b_eagle3_offline.sh new file mode 100644 index 0000000000000000000000000000000000000000..dffcbef845b727e6be2eeb2f24d63de2cc8b693f --- /dev/null +++ b/idea1/examples/run_llama3.1_8b_eagle3_offline.sh @@ -0,0 +1,39 @@ +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +ROOT_DIR=$(dirname $SCRIPT_DIR) +NUM_GPUS=${1:-1} +TP_SIZE=${2:-1} +BUILD_DATASET_NUM_PROC=${BUILD_DATASET_NUM_PROC:-64} + +# generate hidden states +torchrun \ + --standalone \ + --nproc_per_node $NUM_GPUS \ + scripts/prepare_hidden_states.py \ + --target-model-path meta-llama/Llama-3.1-8B-Instruct \ + --enable-aux-hidden-states \ + --data-path $ROOT_DIR/cache/dataset/sharegpt_train.jsonl \ + --output-path $ROOT_DIR/cache/hidden_states/sharegpt_train_Llama-3.1-8B-Instruct \ + --chat-template llama3 \ + 
--max-length 4096 \ + --tp-size $TP_SIZE \ + --batch-size 32 + +# train eagle3 offline +torchrun \ + --standalone \ + --nproc_per_node $NUM_GPUS \ + $ROOT_DIR/scripts/train_eagle3.py \ + --target-model-path meta-llama/Llama-3.1-8B-Instruct \ + --draft-model-config $ROOT_DIR/configs/llama3-8B-eagle3.json \ + --train-data-path $ROOT_DIR/cache/dataset/sharegpt_train.jsonl \ + --train-hidden-states-path $ROOT_DIR/cache/hidden_states/sharegpt_train_Llama-3.1-8B-Instruct \ + --build-dataset-num-proc $BUILD_DATASET_NUM_PROC \ + --output-dir $ROOT_DIR/outputs/llama3-8b-eagle3-sharegpt-offline \ + --num-epochs 10 \ + --batch-size 1 \ + --tp-size $TP_SIZE \ + --target-model-backend sglang \ + --learning-rate 1e-4 \ + --max-length 4096 \ + --chat-template llama3 \ + --cache-dir $ROOT_DIR/cache diff --git a/idea1/examples/run_llama3.1_8b_eagle3_online.sh b/idea1/examples/run_llama3.1_8b_eagle3_online.sh new file mode 100644 index 0000000000000000000000000000000000000000..d47c1797fa14aaaba21784f108ea2c270163b805 --- /dev/null +++ b/idea1/examples/run_llama3.1_8b_eagle3_online.sh @@ -0,0 +1,29 @@ +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +ROOT_DIR=$(dirname $SCRIPT_DIR) + +export TORCHINDUCTOR_CACHE_DIR=$ROOT_DIR/cache/compiled_kernels +# train eagle3 for llama3.1-8b +NUM_GPUS=${1:-1} +TP_SIZE=${2:-1} +BUILD_DATASET_NUM_PROC=${BUILD_DATASET_NUM_PROC:-64} + +torchrun \ + --standalone \ + --nproc_per_node $NUM_GPUS \ + $ROOT_DIR/scripts/train_eagle3.py \ + --target-model-path meta-llama/Llama-3.1-8B-Instruct \ + --draft-model-config $ROOT_DIR/configs/llama3-8B-eagle3.json \ + --train-data-path $ROOT_DIR/cache/dataset/sharegpt_train.jsonl \ + --build-dataset-num-proc $BUILD_DATASET_NUM_PROC \ + --output-dir $ROOT_DIR/outputs/llama3-8b-eagle3-sharegpt \ + --num-epochs 10 \ + --batch-size 1 \ + --tp-size $TP_SIZE \ + --learning-rate 1e-4 \ + --max-length 4096 \ + --chat-template llama3 \ + --cache-dir $ROOT_DIR/cache \ + --attention-backend sdpa 
\ + --target-model-backend sglang \ + --log-interval 10 \ + --sglang-mem-fraction-static 0.25 diff --git a/idea1/examples/run_llama3.3_70b_eagle3_online.sh b/idea1/examples/run_llama3.3_70b_eagle3_online.sh new file mode 100644 index 0000000000000000000000000000000000000000..0ea80413df676a319d8a1a38eaeb036d21d3321d --- /dev/null +++ b/idea1/examples/run_llama3.3_70b_eagle3_online.sh @@ -0,0 +1,25 @@ +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +ROOT_DIR=$(dirname $SCRIPT_DIR) + +# train eagle3 for llama3.3-70b +NUM_GPUS=${1:-8} +TP_SIZE=${2:-4} +BUILD_DATASET_NUM_PROC=${BUILD_DATASET_NUM_PROC:-64} + +torchrun \ + --standalone \ + --nproc_per_node $NUM_GPUS \ + $ROOT_DIR/scripts/train_eagle3.py \ + --target-model-path meta-llama/Llama-3.3-70B-Instruct \ + --draft-model-config $ROOT_DIR/configs/llama3-70B-ealge3.json \ + --train-data-path $ROOT_DIR/cache/dataset/sharegpt.jsonl \ + --build-dataset-num-proc $BUILD_DATASET_NUM_PROC \ + --output-dir $ROOT_DIR/outputs/llama3.3-70b-eagle3 \ + --num-epochs 10 \ + --batch-size 1 \ + --tp-size $TP_SIZE \ + --learning-rate 1e-4 \ + --max-length 4096 \ + --chat-template llama3 \ + --cache-dir $ROOT_DIR/cache \ + --target-model-backend sglang diff --git a/idea1/examples/run_llama4_scout_eagle3_online.sh b/idea1/examples/run_llama4_scout_eagle3_online.sh new file mode 100644 index 0000000000000000000000000000000000000000..73ed03a617297b64569cafecfd5bce9a2cf8f940 --- /dev/null +++ b/idea1/examples/run_llama4_scout_eagle3_online.sh @@ -0,0 +1,25 @@ +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +ROOT_DIR=$(dirname $SCRIPT_DIR) +export TORCHINDUCTOR_CACHE_DIR=$ROOT_DIR/cache/compiled_kernels + +# train eagle3 for llama4-scout-17b-16e +NUM_GPUS=${1:-8} +BUILD_DATASET_NUM_PROC=${BUILD_DATASET_NUM_PROC:-64} + +torchrun \ + --standalone \ + --nproc_per_node $NUM_GPUS \ + $ROOT_DIR/scripts/train_eagle3.py \ + --target-model-path meta-llama/Llama-4-Scout-17B-16E-Instruct \ + 
--draft-model-config $ROOT_DIR/configs/llama4-scout-17B-16E-eagle3.json \ + --train-data-path $ROOT_DIR/cache/dataset/sharegpt.jsonl \ + --build-dataset-num-proc $BUILD_DATASET_NUM_PROC \ + --output-dir $ROOT_DIR/outputs/llama4-scout-17B-16E-eagle3 \ + --num-epochs 10 \ + --batch-size 1 \ + --learning-rate 1e-4 \ + --max-length 4096 \ + --chat-template llama4 \ + --cache-dir $ROOT_DIR/cache \ + --tp-size 8 \ + --embedding-key language_model.model.embed_tokens.weight \ diff --git a/idea1/examples/run_longcat_flash_dflash_online.sh b/idea1/examples/run_longcat_flash_dflash_online.sh new file mode 100644 index 0000000000000000000000000000000000000000..eea8c02b903f2519b5e7e568cb5d8f4139e67af5 --- /dev/null +++ b/idea1/examples/run_longcat_flash_dflash_online.sh @@ -0,0 +1,43 @@ +#!/bin/bash + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +ROOT_DIR=$(dirname $SCRIPT_DIR) + +export TORCHINDUCTOR_CACHE_DIR=$ROOT_DIR/cache/compiled_kernels +export SPECFORGE_DATA_NUM_PROC=${SPECFORGE_DATA_NUM_PROC:-64} + +NUM_GPUS=${1:-1} +BUILD_DATASET_NUM_PROC=${BUILD_DATASET_NUM_PROC:-64} +WANDB_MODE=offline +SGL_JIT_DEEPGEMM_PRECOMPILE=false +SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN=1 + +torchrun \ + --standalone \ + --nproc_per_node $NUM_GPUS \ + $ROOT_DIR/scripts/train_dflash.py \ + --target-model-path meituan-longcat/LongCat-Flash-Chat-FP8 \ + --target-model-backend sglang \ + --tp-size $NUM_GPUS \ + --sglang-attention-backend flashinfer \ + --sglang-mem-fraction-static 0.75 \ + --sglang-ep-size $NUM_GPUS \ + --draft-config-path $ROOT_DIR/configs/longcat-flash-dflash.json \ + --train-data-path $ROOT_DIR/cache/dataset/sharegpt_train.jsonl \ + --build-dataset-num-proc $BUILD_DATASET_NUM_PROC \ + --output-dir $ROOT_DIR/outputs/longcat-flash-dflash-sharegpt \ + --num-epochs 6 \ + --batch-size 2 \ + --learning-rate 6e-4 \ + --warmup-ratio 0.04 \ + --max-grad-norm 1.0 \ + --max-length 3072 \ + --chat-template longcat \ + --num-anchors 512 \ + 
--loss-decay-gamma 7.0 \ + --log-interval 50 \ + --save-interval 1000 \ + --report-to wandb \ + --wandb-project specforge-longcat-flash-dflash \ + --wandb-name longcat-flash-dflash-sharegpt \ + --mask-token-id 2 diff --git a/idea1/examples/run_longcat_flash_eagle3_online.sh b/idea1/examples/run_longcat_flash_eagle3_online.sh new file mode 100644 index 0000000000000000000000000000000000000000..f89cb502009610f0f19db6fef25c268ff1c8f641 --- /dev/null +++ b/idea1/examples/run_longcat_flash_eagle3_online.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +ROOT_DIR=$(dirname $SCRIPT_DIR) + +export TORCHINDUCTOR_CACHE_DIR=$ROOT_DIR/cache/compiled_kernels +NUM_GPUS=${1:-1} +TP_SIZE=${2:-1} +BUILD_DATASET_NUM_PROC=${BUILD_DATASET_NUM_PROC:-64} + +torchrun \ + --standalone \ + --nproc_per_node $NUM_GPUS \ + $ROOT_DIR/scripts/train_eagle3.py \ + --target-model-path meituan-longcat/LongCat-Flash-Chat-FP8 \ + --draft-model-config $ROOT_DIR/configs/longcat-flash-eagle3.json \ + --train-data-path $ROOT_DIR/cache/dataset/sharegpt_train.jsonl \ + --build-dataset-num-proc $BUILD_DATASET_NUM_PROC \ + --output-dir $ROOT_DIR/outputs/longcat-flash-eagle3-sharegpt \ + --num-epochs 10 \ + --batch-size 1 \ + --tp-size $TP_SIZE \ + --learning-rate 1e-4 \ + --max-length 2048 \ + --chat-template longcat \ + --cache-dir $ROOT_DIR/cache \ + --attention-backend sdpa \ + --target-model-backend sglang \ + --log-interval 10 \ + --sglang-mem-fraction-static 0.75 \ + --sglang-attention-backend flashinfer \ + --sglang-ep-size $NUM_GPUS diff --git a/idea1/examples/run_phi4_eagle3_online.sh b/idea1/examples/run_phi4_eagle3_online.sh new file mode 100644 index 0000000000000000000000000000000000000000..f306d22e71941b295bab03695a7f3c3187fc54d5 --- /dev/null +++ b/idea1/examples/run_phi4_eagle3_online.sh @@ -0,0 +1,27 @@ +#!/bin/bash +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +ROOT_DIR=$(dirname 
$SCRIPT_DIR) +export TORCHINDUCTOR_CACHE_DIR=$ROOT_DIR/cache/compiled_kernels + +NUM_GPUS=${1:-1} +TP_SIZE=${2:-1} +BUILD_DATASET_NUM_PROC=${BUILD_DATASET_NUM_PROC:-64} + +torchrun \ + --standalone \ + --nproc_per_node $NUM_GPUS \ + $ROOT_DIR/scripts/train_eagle3.py \ + --target-model-path microsoft/phi-4 \ + --draft-model-config $ROOT_DIR/configs/phi4-eagle3.json \ + --train-data-path $ROOT_DIR/cache/dataset/sharegpt_train.jsonl \ + --build-dataset-num-proc $BUILD_DATASET_NUM_PROC \ + --output-dir $ROOT_DIR/outputs/phi4-eagle3-sharegpt \ + --num-epochs 10 \ + --batch-size 1 \ + --tp-size $TP_SIZE \ + --learning-rate 1e-4 \ + --max-length 2048 \ + --chat-template phi4 \ + --cache-dir $ROOT_DIR/cache \ + --target-model-backend sglang \ + --embedding-key model.embed_tokens.weight diff --git a/idea1/examples/run_qwen2.5_32b_vl_eagle3_online.sh b/idea1/examples/run_qwen2.5_32b_vl_eagle3_online.sh new file mode 100644 index 0000000000000000000000000000000000000000..a7c86b0e502e19a1e39f42860d4804c768e84642 --- /dev/null +++ b/idea1/examples/run_qwen2.5_32b_vl_eagle3_online.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +ROOT_DIR=$(dirname $SCRIPT_DIR) + +# train eagle3 for qwen2.5-vl-32b-instruct (the command below passes --tp-size 4) +NUM_GPUS=${1:-1} +BUILD_DATASET_NUM_PROC=${BUILD_DATASET_NUM_PROC:-64} + +torchrun \ + --standalone \ + --nproc_per_node $NUM_GPUS \ + $ROOT_DIR/scripts/train_eagle3.py \ + --target-model-path Qwen/Qwen2.5-VL-32B-Instruct \ + --draft-model-config $ROOT_DIR/configs/qwen2.5-vl-32b-eagle3.json \ + --train-data-path $ROOT_DIR/cache/allava4v_train.jsonl \ + --build-dataset-num-proc $BUILD_DATASET_NUM_PROC \ + --output-dir $ROOT_DIR/outputs/qwen2.5-vl-32b-eagle3 \ + --num-epochs 10 \ + --batch-size 1 \ + --learning-rate 1e-4 \ + --max-length 4096 \ + --dist-timeout 360 \ + --chat-template qwen2-vl \ + --target-model-backend sglang \ + --cache-dir $ROOT_DIR/cache \ + --embedding-key 
model.embed_tokens.weight \ + --tp-size 4 \ + --sglang-mem-fraction-static 0.5 \ + --is-vlm \ + --min-pixels 200704 \ + --max-pixels 1003520 diff --git a/idea1/examples/run_qwen2.5_7b_vl_eagle3_online.sh b/idea1/examples/run_qwen2.5_7b_vl_eagle3_online.sh new file mode 100644 index 0000000000000000000000000000000000000000..e94e6e39882484f0e6590e72686d594ba4bf1ff0 --- /dev/null +++ b/idea1/examples/run_qwen2.5_7b_vl_eagle3_online.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +ROOT_DIR=$(dirname $SCRIPT_DIR) + +# support tp1 train eagle3 for qwen2.5-vl-7b-instruct +NUM_GPUS=${1:-1} +BUILD_DATASET_NUM_PROC=${BUILD_DATASET_NUM_PROC:-64} + +torchrun \ + --standalone \ + --nproc_per_node $NUM_GPUS \ + $ROOT_DIR/scripts/train_eagle3.py \ + --target-model-path Qwen/Qwen2.5-VL-7B-Instruct \ + --draft-model-config $ROOT_DIR/configs/qwen2-5-vl-eagle3.json \ + --train-data-path $ROOT_DIR/cache/dataset/allava4v_train.jsonl \ + --build-dataset-num-proc $BUILD_DATASET_NUM_PROC \ + --output-dir $ROOT_DIR/outputs/Qwen2.5-VL-7B-eagle3 \ + --num-epochs 10 \ + --batch-size 1 \ + --learning-rate 1e-4 \ + --max-length 8192 \ + --dist-timeout 360 \ + --chat-template qwen2-vl \ + --cache-dir $ROOT_DIR/cache \ + --embedding-key model.embed_tokens.weight \ + --tp-size 1 \ + --is-vlm \ + --min-pixels 50176 \ + --max-pixels 802816 diff --git a/idea1/examples/run_qwen3.5_35b_a3b_dflash_online.sh b/idea1/examples/run_qwen3.5_35b_a3b_dflash_online.sh new file mode 100644 index 0000000000000000000000000000000000000000..a44b5c2dd3801c8a987ad754d7c00b5c2ebe7327 --- /dev/null +++ b/idea1/examples/run_qwen3.5_35b_a3b_dflash_online.sh @@ -0,0 +1,41 @@ + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +ROOT_DIR=$(dirname $SCRIPT_DIR) + +# train dflash for Qwen3.5-35B-A3B on opc_train_regen_first_turn with online data collection and training +TP_SIZE=1 +BUILD_DATASET_NUM_PROC=64 + +export 
#!/bin/bash
# Offline EAGLE3 pipeline for Qwen3.5-35B-A3B.
#
# Stage 1 (active): dump target-model hidden states for the ultrachat split.
# Stage 2 (commented out below): train the draft model from those hidden states.
#
# The shebang is required because SCRIPT_DIR is derived from the bash-only
# BASH_SOURCE array.

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
ROOT_DIR=$(dirname -- "$SCRIPT_DIR")

NUM_GPUS=4
# TP_SIZE=${2:-8}
BUILD_DATASET_NUM_PROC=${BUILD_DATASET_NUM_PROC:-64}

# Stage 1: generate hidden states.
# The script path is anchored at ROOT_DIR (like every other path here) so the
# launcher no longer depends on being started from the repository root.
CUDA_VISIBLE_DEVICES=1,2,3,5 torchrun \
    --standalone \
    --nproc_per_node "$NUM_GPUS" \
    "$ROOT_DIR/scripts/prepare_hidden_states.py" \
    --target-model-path /data/jiapingW/pretrained_models/Qwen3.5-35B-A3B \
    --enable-aux-hidden-states \
    --data-path "$ROOT_DIR/cache/dataset/ultrachat_train.jsonl" \
    --output-path "$ROOT_DIR/cache/hidden_states/qwen3.5-35b-a3b-ultrachat" \
    --chat-template qwen \
    --max-length 4096 \
    --tp-size 1 \
    --batch-size 4 \
    --sglang-mem-fraction-static 0.7


# Stage 2: train the draft model on the dumped hidden states (enable manually).
# NUM_GPUS=2
# CUDA_VISIBLE_DEVICES=6,7 torchrun \
#     --standalone \
#     --nproc_per_node $NUM_GPUS \
#     $ROOT_DIR/scripts/train_eagle3.py \
#     --target-model-path /data/jiapingW/pretrained_models/Qwen3.5-35B-A3B \
#     --draft-model-config $ROOT_DIR/configs/qwen3.5-35b-a3b-eagle3.json \
#     --train-data-path $ROOT_DIR/cache/dataset/ultrachat_train.jsonl \
#     --train-hidden-states-path $ROOT_DIR/cache/hidden_states/qwen3.5-35b-a3b-ultrachat \
#     --build-dataset-num-proc $BUILD_DATASET_NUM_PROC \
#     --output-dir $ROOT_DIR/outputs/qwen3.5-35b-a3b-ultrachat \
#     --num-epochs 10 \
#     --batch-size 1 \
#     --tp-size 1 \
#     --learning-rate 5e-5 \
#     --max-length 4096 \
#     --chat-template qwen \
#     --cache-dir $ROOT_DIR/cache \
#     --embedding-key "model.language_model.embed_tokens.weight"
#!/bin/bash
# Online EAGLE3 draft-model training with tensor parallelism 4 on 8 GPUs.
#
# NOTE(review): the file name says qwen3_235b_a22b, but the command below
# trains against Qwen3-Next-80B-A3B-Instruct-FP8 with the matching 80B draft
# config -- confirm which model this example is meant to cover.

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
ROOT_DIR=$(dirname -- "$SCRIPT_DIR")
export TORCHINDUCTOR_CACHE_DIR="$ROOT_DIR/cache/compiled_kernels"

NUM_GPUS=8
TP_SIZE=4
BUILD_DATASET_NUM_PROC=${BUILD_DATASET_NUM_PROC:-64}

# Model, data, output and cache live under the machine-specific /workdir tree.
# Expansions are quoted so a checkout path containing spaces still works.
torchrun \
    --standalone \
    --nproc_per_node "$NUM_GPUS" \
    "$ROOT_DIR/scripts/train_eagle3.py" \
    --target-model-path /workdir/huggingface.co/Qwen/Qwen3-Next-80B-A3B-Instruct-FP8/ \
    --draft-model-config "$ROOT_DIR/configs/qwen3-next-80b-a3b-eagle3.json" \
    --train-data-path /workdir/data_qwen80b/qwen3_80b_perfectblend_train_regen.jsonl \
    --build-dataset-num-proc "$BUILD_DATASET_NUM_PROC" \
    --output-dir /workdir/qwen3-80b-regen-blend \
    --num-epochs 2 \
    --batch-size 1 \
    --learning-rate 1e-4 \
    --max-length 4096 \
    --chat-template qwen \
    --cache-dir /workdir/cache \
    --embedding-key model.embed_tokens.weight \
    --tp-size "$TP_SIZE" \
    --target-model-backend sglang
#!/bin/bash
# Online EAGLE3 draft-model training for Qwen/Qwen3-30B-A3B-Instruct-2507.
#
# Usage: bash examples/run_qwen3_30b_a3b_eagle3_online.sh [NUM_GPUS] [TP_SIZE]
#   NUM_GPUS                torchrun worker processes (default: 4)
#   TP_SIZE                 tensor-parallel size passed to the trainer (default: 4)
#   BUILD_DATASET_NUM_PROC  env var, dataset preprocessing workers (default: 64)

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
ROOT_DIR=$(dirname -- "$SCRIPT_DIR")
export TORCHINDUCTOR_CACHE_DIR="$ROOT_DIR/cache/compiled_kernels"

NUM_GPUS=${1:-4}
TP_SIZE=${2:-4}
BUILD_DATASET_NUM_PROC=${BUILD_DATASET_NUM_PROC:-64}

# Expansions are quoted so a checkout path containing spaces is passed to
# torchrun as a single argument instead of being word-split.
torchrun \
    --standalone \
    --nproc_per_node "$NUM_GPUS" \
    "$ROOT_DIR/scripts/train_eagle3.py" \
    --target-model-path Qwen/Qwen3-30B-A3B-Instruct-2507 \
    --draft-model-config "$ROOT_DIR/configs/qwen3-30B-A3B-eagle3.json" \
    --train-data-path "$ROOT_DIR/cache/dataset/sharegpt_train.jsonl" \
    --build-dataset-num-proc "$BUILD_DATASET_NUM_PROC" \
    --output-dir "$ROOT_DIR/outputs/qwen3-30b-a3b-instruct-eagle3-sharegpt" \
    --num-epochs 10 \
    --batch-size 1 \
    --learning-rate 1e-4 \
    --max-length 4096 \
    --chat-template qwen \
    --cache-dir "$ROOT_DIR/cache" \
    --embedding-key model.embed_tokens.weight \
    --tp-size "$TP_SIZE" \
    --target-model-backend sglang
#!/bin/bash
# Online DFlash draft-model training for Qwen/Qwen3-8B on the regenerated
# perfectblend data, using the sglang target-model backend.
#
# Usage: bash examples/run_qwen3_8b_dflash_online.sh [NUM_GPUS] [ATTENTION_BACKEND]
#   NUM_GPUS           torchrun worker processes (default: 8)
#   ATTENTION_BACKEND  value for --attention-backend (default: flex_attention)

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
ROOT_DIR=$(dirname -- "$SCRIPT_DIR")
export TORCHINDUCTOR_CACHE_DIR="$ROOT_DIR/cache/compiled_kernels"
export SPECFORGE_DATA_NUM_PROC=32
NUM_GPUS=${1:-8}

ATTENTION_BACKEND=${2:-flex_attention}

# --num-anchors used to be passed twice with the same value; it is now given
# once. Expansions are quoted so paths with spaces are not word-split.
torchrun \
    --standalone \
    --nproc_per_node "$NUM_GPUS" \
    "$ROOT_DIR/scripts/train_dflash.py" \
    --target-model-path Qwen/Qwen3-8B \
    --draft-config-path "$ROOT_DIR/configs/qwen3-8b-dflash.json" \
    --train-data-path "$ROOT_DIR/cache/dataset/perfectblend_qwen3-8b_regen.jsonl" \
    --output-dir "$ROOT_DIR/outputs/qwen3-8b-perfectblend" \
    --num-epochs 6 \
    --batch-size 4 \
    --learning-rate 6e-4 \
    --warmup-ratio 0.04 \
    --max-grad-norm 1.0 \
    --max-length 3072 \
    --chat-template qwen \
    --attention-backend "$ATTENTION_BACKEND" \
    --num-anchors 512 \
    --loss-decay-gamma 7.0 \
    --log-interval 50 \
    --save-interval 1000 \
    --report-to wandb \
    --wandb-project specforge-qwen3-8b-dflash \
    --target-model-backend sglang \
    --block-size 16 \
    --wandb-name qwen3-8b-dflash-perfectblend
#!/bin/bash
# Online EAGLE3 draft-model training for Qwen/Qwen3-Coder-30B-A3B-Instruct on
# the regenerated OPC dataset, with tensor parallelism 4 on GPUs 4,5,6,7.

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
ROOT_DIR=$(dirname -- "$SCRIPT_DIR")
export TORCHINDUCTOR_CACHE_DIR="$ROOT_DIR/cache/compiled_kernels"

# GPU configuration: pin the run to the last four GPUs of the node.
export CUDA_VISIBLE_DEVICES=4,5,6,7
NUM_GPUS=4
# Kept next to NUM_GPUS so the two values are changed together.
TP_SIZE=4
BUILD_DATASET_NUM_PROC=${BUILD_DATASET_NUM_PROC:-64}

# Expansions are quoted so a checkout path containing spaces is passed to
# torchrun as a single argument instead of being word-split.
torchrun \
    --standalone \
    --nproc_per_node "$NUM_GPUS" \
    "$ROOT_DIR/scripts/train_eagle3.py" \
    --target-model-path Qwen/Qwen3-Coder-30B-A3B-Instruct \
    --draft-model-config "$ROOT_DIR/configs/qwen3-coder-30B-A3B-instruct-eagle3.json" \
    --train-data-path "$ROOT_DIR/cache/dataset/opc_regenerated.jsonl" \
    --build-dataset-num-proc "$BUILD_DATASET_NUM_PROC" \
    --output-dir "$ROOT_DIR/outputs/qwen3-coder-30b-a3b-instruct-eagle3-opc-regen" \
    --num-epochs 2 \
    --batch-size 1 \
    --learning-rate 1e-4 \
    --max-length 4096 \
    --chat-template qwen \
    --cache-dir "$ROOT_DIR/cache" \
    --embedding-key model.embed_tokens.weight \
    --tp-size "$TP_SIZE" \
    --dist-timeout 60 \
    --log-interval 50 \
    --save-interval 5000 \
    --eval-interval 5000 \
    --report-to wandb \
    --wandb-project specforge-qwen3-coder \
    --wandb-name qwen3-coder-30b-eagle3-tp4-opc-regen
#!/bin/bash
# Offline EAGLE3 draft-model training for Qwen/Qwen3-Coder-480B-A35B-Instruct
# from hidden states already stored under cache/hidden_states.
#
# Usage: bash examples/run_qwen3_coder_eagle3_offline.sh [NUM_GPUS] [TP_SIZE]
#   NUM_GPUS                torchrun worker processes (default: 8)
#   TP_SIZE                 used as the draft global batch size (default: 8)
#   BUILD_DATASET_NUM_PROC  env var, dataset preprocessing workers (default: 64)
#
# The shebang is required because SCRIPT_DIR is derived from the bash-only
# BASH_SOURCE array.

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
ROOT_DIR=$(dirname -- "$SCRIPT_DIR")
export TORCHINDUCTOR_CACHE_DIR="$ROOT_DIR/cache/compiled_kernels"

NUM_GPUS=${1:-8}
TP_SIZE=${2:-8}
BUILD_DATASET_NUM_PROC=${BUILD_DATASET_NUM_PROC:-64}

# NOTE(review): TP_SIZE is only forwarded as --draft-global-batch-size and no
# --tp-size is passed -- confirm that this is intended for the offline path.
torchrun \
    --standalone \
    --nproc_per_node "$NUM_GPUS" \
    "$ROOT_DIR/scripts/train_eagle3.py" \
    --target-model-path Qwen/Qwen3-Coder-480B-A35B-Instruct \
    --draft-model-config "$ROOT_DIR/configs/qwen3-coder-480B-A35B-instruct-eagle3.json" \
    --train-data-path "$ROOT_DIR/cache/dataset/opc.jsonl" \
    --train-hidden-states-path "$ROOT_DIR/cache/hidden_states" \
    --build-dataset-num-proc "$BUILD_DATASET_NUM_PROC" \
    --output-dir "$ROOT_DIR/outputs/Qwen3-Coder-480B-A35B-Instruct" \
    --num-epochs 10 \
    --draft-micro-batch-size 1 \
    --draft-global-batch-size "$TP_SIZE" \
    --learning-rate 1e-4 \
    --max-length 2048 \
    --chat-template qwen \
    --target-model-backend sglang
#!/bin/bash
# Online EAGLE3 draft-model training for Qwen3-Next-80B-A3B-Instruct-FP8 with
# tensor parallelism 4. Model weights and training data are expected inside
# the repository checkout (ROOT_DIR).
#
# Usage: bash examples/run_qwen3_next_80b_eagle3_online.sh [NUM_GPUS]
#   NUM_GPUS                torchrun worker processes (default: 8)
#   BUILD_DATASET_NUM_PROC  env var, dataset preprocessing workers (default: 64)

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
ROOT_DIR=$(dirname -- "$SCRIPT_DIR")
export TORCHINDUCTOR_CACHE_DIR="$ROOT_DIR/cache/compiled_kernels"

NUM_GPUS=${1:-8}
TP_SIZE=4
BUILD_DATASET_NUM_PROC=${BUILD_DATASET_NUM_PROC:-64}

# The target path previously contained a doubled slash and had the line
# continuation glued to it; it is now a single, quoted argument.
torchrun \
    --standalone \
    --nproc_per_node "$NUM_GPUS" \
    "$ROOT_DIR/scripts/train_eagle3.py" \
    --target-model-path "$ROOT_DIR/Qwen/Qwen3-Next-80B-A3B-Instruct-FP8/" \
    --draft-model-config "$ROOT_DIR/configs/qwen3-next-80b-a3b-eagle3.json" \
    --train-data-path "$ROOT_DIR/data_qwen80b/qwen3_80b_perfectblend_train_regen.jsonl" \
    --output-dir "$ROOT_DIR/qwen3-80b-regen-blend" \
    --num-epochs 2 \
    --batch-size 2 \
    --learning-rate 1e-4 \
    --max-length 4096 \
    --chat-template qwen \
    --cache-dir "$ROOT_DIR/cache" \
    --embedding-key model.embed_tokens.weight \
    --tp-size "$TP_SIZE" \
    --sglang-mem-fraction-static 0.5 \
    --build-dataset-num-proc "$BUILD_DATASET_NUM_PROC" \
    --target-model-backend sglang
#!/bin/bash
# Online EAGLE3 draft-model training for Qwen/QwQ-32B on ShareGPT.
#
# Usage: bash examples/run_qwq_eagle3_online.sh [NUM_GPUS] [TP_SIZE]
#   NUM_GPUS                torchrun worker processes (default: 4)
#   TP_SIZE                 tensor-parallel size passed to the trainer (default: 4)
#   BUILD_DATASET_NUM_PROC  env var, dataset preprocessing workers (default: 64)
#
# The shebang is required because SCRIPT_DIR is derived from the bash-only
# BASH_SOURCE array.

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
ROOT_DIR=$(dirname -- "$SCRIPT_DIR")

export TORCHINDUCTOR_CACHE_DIR="$ROOT_DIR/cache/compiled_kernels"

NUM_GPUS=${1:-4}
TP_SIZE=${2:-4}
BUILD_DATASET_NUM_PROC=${BUILD_DATASET_NUM_PROC:-64}

# Expansions are quoted so a checkout path containing spaces is passed to
# torchrun as a single argument instead of being word-split.
torchrun \
    --standalone \
    --nproc_per_node "$NUM_GPUS" \
    "$ROOT_DIR/scripts/train_eagle3.py" \
    --target-model-path Qwen/QwQ-32B \
    --draft-model-config "$ROOT_DIR/configs/qwq-32B-eagle3.json" \
    --train-data-path "$ROOT_DIR/cache/dataset/sharegpt_train.jsonl" \
    --build-dataset-num-proc "$BUILD_DATASET_NUM_PROC" \
    --output-dir "$ROOT_DIR/outputs/qwq-32b-eagle3-sharegpt" \
    --num-epochs 10 \
    --batch-size 1 \
    --learning-rate 1e-4 \
    --max-length 4096 \
    --chat-template qwen \
    --cache-dir "$ROOT_DIR/cache" \
    --embedding-key model.embed_tokens.weight \
    --tp-size "$TP_SIZE" \
    --target-model-backend sglang
+ 2, + 1, + 2, + 2, + 10, + 1, + 5, + 5, + 7, + 1, + 1, + 1, + 2, + 1, + 3, + 3, + 4, + 4, + 16, + 16, + 1, + 3, + 6, + 6 + ] + } +} \ No newline at end of file diff --git a/idea1/results/dflash_eval/math500_steps1.log b/idea1/results/dflash_eval/math500_steps1.log new file mode 100644 index 0000000000000000000000000000000000000000..245e1db25acb4fde766cf3e9c4d805a0df5b394b --- /dev/null +++ b/idea1/results/dflash_eval/math500_steps1.log @@ -0,0 +1,60 @@ +Set TORCH_CUDA_ARCH_LIST to 9.0 +/workspace/hanrui/idea1/specforge/modeling/draft/llama3_eagle.py:29: UserWarning: flash_attn is not found, falling back to flex_attention. Please install flash_attn if you want to use the flash attention backend. + warnings.warn( +:1241: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +:1241: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +============================================================ +DFlash Evaluation (Multi-GPU Data Parallel) +============================================================ + Target model: /workspace/models/Qwen3-8B + Draft model: /workspace/models/Qwen3-8B-DFlash-b16 + Dataset: math500 + Max samples: 2 + Max new tokens: 64 + Denoise steps: 1 + Temperature: 0.0 + GPUs: 1 + Dtype: bfloat16 +============================================================ + +[1/4] Loading tokenizer... +[2/4] Loading target model on 1 GPUs... +`torch_dtype` is deprecated! Use `dtype` instead! + Loading checkpoint shards: 0%| | 0/5 [00:00 Okay, so I need to convert the rectangular coordinates (0, 3) to polar c... + [GPU 0] Sample 2/2 | tokens=64 | tau=4.53 | time=0.5s | Okay, so I need to find a way to express the double sum $\sum_{j = 1}^\i... 
+ +============================================================ +RESULTS SUMMARY +============================================================ + Denoise steps: 1 + GPUs used: 1 + Samples evaluated: 2 + Total blocks: 37 + Total generated tokens: 128 + Total GPU-time: 1.83s + Wall-clock time (approx): 1.29s + --- + Avg acceptance length (tau): 3.73 + Median acceptance length: 2.0 + Per-sample avg tau: ['3.18', '4.53'] + Min per-sample tau: 3.18 + Max per-sample tau: 4.53 +============================================================ + +Results saved to /workspace/hanrui/idea1/results/dflash_eval/math500_steps1.json diff --git a/idea1/results/dflash_eval/math500_steps2.json b/idea1/results/dflash_eval/math500_steps2.json new file mode 100644 index 0000000000000000000000000000000000000000..c684c46e0aca9a07432ec514d2e895b3afaf605d --- /dev/null +++ b/idea1/results/dflash_eval/math500_steps2.json @@ -0,0 +1,104 @@ +{ + "config": { + "target_model": "/workspace/models/Qwen3-8B", + "draft_model": "/workspace/models/Qwen3-8B-DFlash-b16", + "dataset": "math500", + "max_samples": 2, + "max_new_tokens": 64, + "num_denoise_steps": 2, + "temperature": 0.0, + "num_gpus": 1 + }, + "results": { + "avg_tau": 1.6538461538461537, + "median_tau": 1.0, + "per_sample_tau": [ + 1.5609756097560976, + 1.7567567567567568 + ], + "total_blocks": 78, + "total_tokens": 128, + "total_gpu_time": 3.5768094062805176, + "wall_clock_time": 2.1519975662231445, + "all_acceptance_lengths": [ + 1, + 1, + 2, + 1, + 2, + 3, + 2, + 4, + 2, + 1, + 2, + 1, + 1, + 1, + 1, + 1, + 1, + 2, + 1, + 2, + 1, + 1, + 2, + 1, + 1, + 1, + 3, + 1, + 1, + 1, + 1, + 3, + 2, + 2, + 2, + 2, + 1, + 1, + 1, + 3, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 2, + 1, + 1, + 4, + 2, + 3, + 1, + 1, + 3, + 3, + 4, + 1, + 2, + 1, + 1, + 2, + 5, + 1, + 1, + 2, + 2, + 1, + 1, + 1, + 1, + 2, + 3, + 1, + 1, + 2, + 3 + ] + } +} \ No newline at end of file diff --git a/idea1/results/dflash_eval/math500_steps2.log 
b/idea1/results/dflash_eval/math500_steps2.log new file mode 100644 index 0000000000000000000000000000000000000000..5cfc0e394a39112908b4c9306093d867c1518319 --- /dev/null +++ b/idea1/results/dflash_eval/math500_steps2.log @@ -0,0 +1,60 @@ +Set TORCH_CUDA_ARCH_LIST to 9.0 +/workspace/hanrui/idea1/specforge/modeling/draft/llama3_eagle.py:29: UserWarning: flash_attn is not found, falling back to flex_attention. Please install flash_attn if you want to use the flash attention backend. + warnings.warn( +:1241: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +:1241: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +============================================================ +DFlash Evaluation (Multi-GPU Data Parallel) +============================================================ + Target model: /workspace/models/Qwen3-8B + Draft model: /workspace/models/Qwen3-8B-DFlash-b16 + Dataset: math500 + Max samples: 2 + Max new tokens: 64 + Denoise steps: 2 + Temperature: 0.0 + GPUs: 1 + Dtype: bfloat16 +============================================================ + +[1/4] Loading tokenizer... +[2/4] Loading target model on 1 GPUs... +`torch_dtype` is deprecated! Use `dtype` instead! + Loading checkpoint shards: 0%| | 0/5 [00:00 Okay, so I need to convert the rectangular coordinates (0, 3) to polar c... + [GPU 0] Sample 2/2 | tokens=64 | tau=1.76 | time=1.4s | Okay, so I need to find a way to express the double sum $\sum_{j = 1}^\i... 
+ +============================================================ +RESULTS SUMMARY +============================================================ + Denoise steps: 2 + GPUs used: 1 + Samples evaluated: 2 + Total blocks: 78 + Total generated tokens: 128 + Total GPU-time: 3.58s + Wall-clock time (approx): 2.15s + --- + Avg acceptance length (tau): 1.65 + Median acceptance length: 1.0 + Per-sample avg tau: ['1.56', '1.76'] + Min per-sample tau: 1.56 + Max per-sample tau: 1.76 +============================================================ + +Results saved to /workspace/hanrui/idea1/results/dflash_eval/math500_steps2.json diff --git a/idea1/results/dflash_eval/math500_steps3.json b/idea1/results/dflash_eval/math500_steps3.json new file mode 100644 index 0000000000000000000000000000000000000000..3494a17013ca32e6acf3e0350c668721392446a5 --- /dev/null +++ b/idea1/results/dflash_eval/math500_steps3.json @@ -0,0 +1,105 @@ +{ + "config": { + "target_model": "/workspace/models/Qwen3-8B", + "draft_model": "/workspace/models/Qwen3-8B-DFlash-b16", + "dataset": "math500", + "max_samples": 2, + "max_new_tokens": 64, + "num_denoise_steps": 3, + "temperature": 0.0, + "num_gpus": 1 + }, + "results": { + "avg_tau": 1.6329113924050633, + "median_tau": 1, + "per_sample_tau": [ + 1.5609756097560976, + 1.7105263157894737 + ], + "total_blocks": 79, + "total_tokens": 128, + "total_gpu_time": 3.884273052215576, + "wall_clock_time": 2.294543743133545, + "all_acceptance_lengths": [ + 1, + 1, + 1, + 1, + 1, + 1, + 3, + 1, + 2, + 3, + 3, + 3, + 1, + 1, + 1, + 2, + 1, + 1, + 1, + 1, + 2, + 1, + 2, + 1, + 1, + 1, + 1, + 1, + 2, + 1, + 1, + 2, + 3, + 3, + 1, + 1, + 4, + 1, + 1, + 3, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 2, + 1, + 3, + 4, + 4, + 4, + 1, + 2, + 3, + 1, + 1, + 1, + 1, + 4, + 2, + 2, + 1, + 1, + 1, + 1, + 3, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 2, + 1, + 3, + 3 + ] + } +} \ No newline at end of file diff --git a/idea1/results/dflash_eval/math500_steps3.log 
b/idea1/results/dflash_eval/math500_steps3.log new file mode 100644 index 0000000000000000000000000000000000000000..a40a2f00fd9b6fd9cfacb3fa5955a259a8226bc0 --- /dev/null +++ b/idea1/results/dflash_eval/math500_steps3.log @@ -0,0 +1,60 @@ +Set TORCH_CUDA_ARCH_LIST to 9.0 +/workspace/hanrui/idea1/specforge/modeling/draft/llama3_eagle.py:29: UserWarning: flash_attn is not found, falling back to flex_attention. Please install flash_attn if you want to use the flash attention backend. + warnings.warn( +:1241: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +:1241: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +============================================================ +DFlash Evaluation (Multi-GPU Data Parallel) +============================================================ + Target model: /workspace/models/Qwen3-8B + Draft model: /workspace/models/Qwen3-8B-DFlash-b16 + Dataset: math500 + Max samples: 2 + Max new tokens: 64 + Denoise steps: 3 + Temperature: 0.0 + GPUs: 1 + Dtype: bfloat16 +============================================================ + +[1/4] Loading tokenizer... +[2/4] Loading target model on 1 GPUs... +`torch_dtype` is deprecated! Use `dtype` instead! + Loading checkpoint shards: 0%| | 0/5 [00:00 Okay, so I need to convert the rectangular coordinates (0, 3) to polar c... + [GPU 0] Sample 2/2 | tokens=64 | tau=1.71 | time=1.6s | Okay, so I need to find a way to express the double sum $\sum_{j = 1}^\i... 
+ +============================================================ +RESULTS SUMMARY +============================================================ + Denoise steps: 3 + GPUs used: 1 + Samples evaluated: 2 + Total blocks: 79 + Total generated tokens: 128 + Total GPU-time: 3.88s + Wall-clock time (approx): 2.29s + --- + Avg acceptance length (tau): 1.63 + Median acceptance length: 1.0 + Per-sample avg tau: ['1.56', '1.71'] + Min per-sample tau: 1.56 + Max per-sample tau: 1.71 +============================================================ + +Results saved to /workspace/hanrui/idea1/results/dflash_eval/math500_steps3.json diff --git a/idea1/scripts/eval_dflash.py b/idea1/scripts/eval_dflash.py new file mode 100644 index 0000000000000000000000000000000000000000..2e02d24ff529fbba31c9f8a966b686bea53d4377 --- /dev/null +++ b/idea1/scripts/eval_dflash.py @@ -0,0 +1,383 @@ +""" +DFlash evaluation script: measure acceptance length (tau) with optional multi-step denoising. +Supports multi-GPU data parallelism via torchrun. 
def parse_args(argv=None):
    """Parse the command-line options of the DFlash evaluation script.

    Args:
        argv: Optional list of argument strings. ``None`` (the default) makes
            argparse read ``sys.argv[1:]``, so existing ``parse_args()`` calls
            behave as before; passing a list allows programmatic use.

    Returns:
        argparse.Namespace with ``target_model_path``, ``draft_model_path``,
        ``dataset``, ``custom_data_path``, ``max_samples``, ``max_new_tokens``,
        ``num_denoise_steps``, ``temperature``, ``dtype`` and ``output_file``.

    Raises:
        SystemExit: On invalid arguments, including ``--dataset custom``
            without ``--custom-data-path``.
    """
    parser = argparse.ArgumentParser(description="DFlash evaluation: acceptance length")
    parser.add_argument("--target-model-path", type=str, required=True,
                        help="Path to target model (e.g. /workspace/models/Qwen3-8B)")
    parser.add_argument("--draft-model-path", type=str, required=True,
                        help="Path to DFlash draft model (e.g. /workspace/models/Qwen3-8B-DFlash-b16)")
    parser.add_argument("--dataset", type=str, default="math500",
                        choices=["math500", "gsm8k", "custom"],
                        help="Evaluation dataset")
    parser.add_argument("--custom-data-path", type=str, default=None,
                        help="Path to custom jsonl data (when --dataset=custom)")
    parser.add_argument("--max-samples", type=int, default=10,
                        help="Max number of evaluation samples (total across all GPUs)")
    parser.add_argument("--max-new-tokens", type=int, default=512,
                        help="Max new tokens per sample")
    parser.add_argument("--num-denoise-steps", type=int, default=1,
                        help="Number of denoising steps (1=baseline, 2/3=multi-step)")
    parser.add_argument("--temperature", type=float, default=0.0,
                        help="Sampling temperature (0.0=greedy)")
    parser.add_argument("--dtype", type=str, default="bfloat16",
                        choices=["float16", "bfloat16", "float32"])
    parser.add_argument("--output-file", type=str, default=None,
                        help="Save results to JSON file")

    args = parser.parse_args(argv)

    # Reject the inconsistent combination here with a normal usage error,
    # rather than leaving it to a later assert (which `python -O` strips).
    if args.dataset == "custom" and args.custom_data_path is None:
        parser.error("--custom-data-path is required when --dataset=custom")

    return args
args.custom_data_path is not None, "Need --custom-data-path for custom dataset" + with open(args.custom_data_path, "r") as f: + for idx, line in enumerate(f): + if idx >= args.max_samples: + break + data = json.loads(line.strip()) + if "prompt" in data: + prompts.append(data["prompt"]) + elif "conversations" in data: + for msg in data["conversations"]: + if msg["role"] == "user": + prompts.append(msg["content"]) + break + + return prompts + + +def load_draft_model(draft_model_path, torch_dtype, device): + """Load draft model weights into OUR DFlashDraftModel (with multi-step denoising).""" + from transformers import AutoConfig + from safetensors.torch import load_file as load_safetensors + import glob as glob_module + + draft_config = AutoConfig.from_pretrained(draft_model_path, trust_remote_code=True) + draft = DFlashDraftModel(draft_config).to(torch_dtype) + + safetensors_files = sorted(glob_module.glob(os.path.join(draft_model_path, "*.safetensors"))) + bin_files = sorted(glob_module.glob(os.path.join(draft_model_path, "*.bin"))) + + state_dict = {} + if safetensors_files: + for f in safetensors_files: + state_dict.update(load_safetensors(f, device="cpu")) + elif bin_files: + for f in bin_files: + state_dict.update(torch.load(f, map_location="cpu", weights_only=True)) + else: + raise FileNotFoundError(f"No safetensors or bin files found in {draft_model_path}") + + missing, unexpected = draft.load_state_dict(state_dict, strict=False) + if missing: + print(f" [rank {device}] WARNING: missing keys: {missing}") + if unexpected: + print(f" [rank {device}] INFO: unexpected keys (ignored): {unexpected}") + + return draft.to(device).eval() + + +def run_eval_on_samples(draft, target, tokenizer, prompts, args, device, rank): + """Run evaluation on a list of prompts, return results.""" + stop_token_ids = [tokenizer.eos_token_id] + if hasattr(tokenizer, "additional_special_tokens_ids"): + stop_token_ids.extend(tokenizer.additional_special_tokens_ids[:3]) + + 
all_acceptance_lengths = [] + all_taus = [] + all_times = [] + all_generated_tokens = [] + + for i, prompt in enumerate(prompts): + # Tokenize with chat template + messages = [{"role": "user", "content": prompt}] + try: + input_text = tokenizer.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) + input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(device) + except Exception: + input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device) + + num_input_tokens = input_ids.shape[1] + + # Generate with speculative decoding + t0 = time.time() + output_ids, acceptance_lengths = draft.spec_generate( + target=target, + input_ids=input_ids, + max_new_tokens=args.max_new_tokens, + stop_token_ids=stop_token_ids, + temperature=args.temperature, + num_denoise_steps=args.num_denoise_steps, + ) + t1 = time.time() + + num_output_tokens = output_ids.shape[1] - num_input_tokens + elapsed = t1 - t0 + avg_tau = mean(acceptance_lengths) if acceptance_lengths else 0 + + all_acceptance_lengths.extend(acceptance_lengths) + all_taus.append(avg_tau) + all_times.append(elapsed) + all_generated_tokens.append(num_output_tokens) + + output_text = tokenizer.decode(output_ids[0, num_input_tokens:], skip_special_tokens=True) + preview = output_text[:80].replace("\n", " ") + + print(f" [GPU {rank}] Sample {i+1}/{len(prompts)} | " + f"tokens={num_output_tokens} | tau={avg_tau:.2f} | " + f"time={elapsed:.1f}s | {preview}...") + + return { + "acceptance_lengths": all_acceptance_lengths, + "per_sample_taus": all_taus, + "times": all_times, + "generated_tokens": all_generated_tokens, + } + + +def main(): + args = parse_args() + rank, world_size = get_rank_and_world() + device = f"cuda:{rank}" + torch.cuda.set_device(rank) + + dtype_map = { + "float16": torch.float16, + "bfloat16": torch.bfloat16, + "float32": torch.float32, + } + torch_dtype = dtype_map[args.dtype] + + if rank == 0: + print("=" * 60) + print("DFlash Evaluation (Multi-GPU Data 
Parallel)") + print("=" * 60) + print(f" Target model: {args.target_model_path}") + print(f" Draft model: {args.draft_model_path}") + print(f" Dataset: {args.dataset}") + print(f" Max samples: {args.max_samples}") + print(f" Max new tokens: {args.max_new_tokens}") + print(f" Denoise steps: {args.num_denoise_steps}") + print(f" Temperature: {args.temperature}") + print(f" GPUs: {world_size}") + print(f" Dtype: {args.dtype}") + print("=" * 60) + + # ---- Load models (each GPU loads its own copy) ---- + if rank == 0: + print(f"\n[1/4] Loading tokenizer...") + tokenizer = AutoTokenizer.from_pretrained(args.target_model_path, trust_remote_code=True) + if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token + + if rank == 0: + print(f"[2/4] Loading target model on {world_size} GPUs...") + target = AutoModelForCausalLM.from_pretrained( + args.target_model_path, + torch_dtype=torch_dtype, + trust_remote_code=True, + ).to(device).eval() + + if rank == 0: + print(f"[3/4] Loading draft model on {world_size} GPUs...") + draft = load_draft_model(args.draft_model_path, torch_dtype, device) + + if rank == 0: + print(f" Draft layers: {draft.config.num_hidden_layers}") + print(f" Draft block_size: {draft.block_size}") + print(f" Draft mask_token: {draft.mask_token_id}") + print(f" Draft layer_ids: {draft.target_layer_ids}") + + # ---- Load and split data ---- + if rank == 0: + print(f"[4/4] Loading evaluation data...") + all_prompts = load_eval_data(args) + + # Split prompts across GPUs + my_prompts = all_prompts[rank::world_size] + if rank == 0: + print(f" Total prompts: {len(all_prompts)}, ~{len(my_prompts)} per GPU") + + # ---- Run evaluation ---- + if rank == 0: + print("\n" + "=" * 60) + print("Running evaluation...") + print("=" * 60) + + results = run_eval_on_samples(draft, target, tokenizer, my_prompts, args, device, rank) + + # ---- Gather results from all GPUs ---- + if world_size > 1: + import torch.distributed as dist + if not dist.is_initialized(): 
+ dist.init_process_group(backend="nccl") + + # Save per-rank results to shared filesystem (not /tmp which may not be shared) + gather_dir = "/workspace/hanrui/cache/eval_gather" + os.makedirs(gather_dir, exist_ok=True) + tmp_file = os.path.join(gather_dir, f"rank{rank}.json") + with open(tmp_file, "w") as f: + json.dump(results, f) + + dist.barrier(device_ids=[rank]) + + if rank == 0: + # Aggregate all ranks + all_acceptance_lengths = [] + all_per_sample_taus = [] + all_times = [] + all_generated_tokens = [] + + for r in range(world_size): + rf = os.path.join(gather_dir, f"rank{r}.json") + with open(rf, "r") as f: + rank_results = json.load(f) + all_acceptance_lengths.extend(rank_results["acceptance_lengths"]) + all_per_sample_taus.extend(rank_results["per_sample_taus"]) + all_times.extend(rank_results["times"]) + all_generated_tokens.extend(rank_results["generated_tokens"]) + os.remove(rf) + + results = { + "acceptance_lengths": all_acceptance_lengths, + "per_sample_taus": all_per_sample_taus, + "times": all_times, + "generated_tokens": all_generated_tokens, + } + else: + # Wait for rank 0 to finish reading before removing + dist.barrier(device_ids=[rank]) + if os.path.exists(tmp_file): + os.remove(tmp_file) + + if rank == 0: + dist.barrier(device_ids=[rank]) + + dist.destroy_process_group() + + # ---- Print summary (rank 0 only) ---- + if rank == 0: + acc_lens = results["acceptance_lengths"] + per_sample_taus = results["per_sample_taus"] + total_tokens = sum(results["generated_tokens"]) + total_time = sum(results["times"]) + # wall-clock time is max across GPUs (they run in parallel) + wall_time = max(results["times"]) if results["times"] else 0 + + overall_avg_tau = mean(acc_lens) if acc_lens else 0 + overall_median_tau = median(acc_lens) if acc_lens else 0 + + print("\n" + "=" * 60) + print("RESULTS SUMMARY") + print("=" * 60) + print(f" Denoise steps: {args.num_denoise_steps}") + print(f" GPUs used: {world_size}") + print(f" Samples evaluated: 
{len(per_sample_taus)}") + print(f" Total blocks: {len(acc_lens)}") + print(f" Total generated tokens: {total_tokens}") + print(f" Total GPU-time: {total_time:.2f}s") + print(f" Wall-clock time (approx): {wall_time:.2f}s") + print(f" ---") + print(f" Avg acceptance length (tau): {overall_avg_tau:.2f}") + print(f" Median acceptance length: {overall_median_tau:.1f}") + print(f" Per-sample avg tau: {[f'{t:.2f}' for t in per_sample_taus]}") + if per_sample_taus: + print(f" Min per-sample tau: {min(per_sample_taus):.2f}") + print(f" Max per-sample tau: {max(per_sample_taus):.2f}") + print("=" * 60) + + # Save results + if args.output_file: + output = { + "config": { + "target_model": args.target_model_path, + "draft_model": args.draft_model_path, + "dataset": args.dataset, + "max_samples": args.max_samples, + "max_new_tokens": args.max_new_tokens, + "num_denoise_steps": args.num_denoise_steps, + "temperature": args.temperature, + "num_gpus": world_size, + }, + "results": { + "avg_tau": overall_avg_tau, + "median_tau": overall_median_tau, + "per_sample_tau": per_sample_taus, + "total_blocks": len(acc_lens), + "total_tokens": total_tokens, + "total_gpu_time": total_time, + "wall_clock_time": wall_time, + "all_acceptance_lengths": acc_lens, + }, + } + os.makedirs(os.path.dirname(args.output_file) or ".", exist_ok=True) + with open(args.output_file, "w") as f: + json.dump(output, f, indent=2) + print(f"\nResults saved to {args.output_file}") + + +if __name__ == "__main__": + main() diff --git a/idea1/scripts/prepare_data.py b/idea1/scripts/prepare_data.py new file mode 100644 index 0000000000000000000000000000000000000000..9f2c7b24901188535f3944aa7f3ae57fa14b5f92 --- /dev/null +++ b/idea1/scripts/prepare_data.py @@ -0,0 +1,679 @@ +import argparse +import json +import os +import random +import subprocess +from pathlib import Path +from typing import Dict, Tuple + +from tqdm import tqdm + +from datasets import concatenate_datasets, config, load_dataset + +""" +This script 
will convert the ultrachat/sharegpt dataset to the following schema in jsonl format: +{ + "id": str, + "conversations": [ + { + "role": str, + "content": str + } + ], +} +""" + +ROLE_MAPPING = { + "human": "user", + "gpt": "assistant", + "chatgpt": "assistant", + "bing": "assistant", + "bard": "assistant", +} + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--dataset", + type=str, + choices=[ + "ultrachat", + "sharegpt", + "eaglechat", + "perfectblend", + "perfectblend-llama3.1-8b-instruct", + "perfectblend-llama3.3-70b-instruct", + "perfectblend-llama4-scout-instruct", + "perfectblend-llama4-maverick-instruct", + "magpie-qwen2.5-pro-1m-v0.1", + "sharegpt4v", + "allava4v", + "opc", + "gsm8k", + "hendrycks_math", + "math_qa", + "codealpaca-20k", + "opencodeinstruct", + "magicoder-evol-instruct", + "sciq", + "camel", + ], + help="The demo dataset to quickly run the training for speculative decoding", + ) + parser.add_argument( + "--output-path", + type=str, + default=None, + help="The path to save the processed dataset, if not specified, the dataset will be saved in the cache/dataset/dataset_name directory of the root path", + ) + parser.add_argument( + "--data-path", + type=str, + default=None, + help="The path to the custom dataset, if not specified, the default dataset will be loaded", + ) + parser.add_argument( + "--sample-size", + type=int, + default=None, + help="The number of samples to process from the dataset, if not specified, all samples will be processed", + ) + parser.add_argument( + "--split-eval", + action="store_true", + help="Whether to split the dataset into train and eval sets, default is False", + ) + parser.add_argument( + "--opc-subset", + type=str, + default="largescale_diverse_instruct", + choices=[ + "largescale_diverse_instruct", + "filtered_infinity_instruct", + "realuser_instruct", + "all", + ], + help="The subset of OpenCoder opc-sft-stage1 dataset to use, or 'all' to use all subsets (default: 
largescale_diverse_instruct)", + ) + return parser.parse_args() + + +def get_cache_dir(dataset_name): + cache_dir = None + if dataset_name == "sharegpt4v": + raise ValueError("Downloading 'sharegpt4v' is not supported.") + elif dataset_name == "allava4v": + cache_dir = os.path.join( + config.HF_DATASETS_CACHE, "FreedomIntelligence", "ALLaVA" + ) + else: + raise ValueError( + f"Dataset '{dataset_name}' is not a supported VLM dataset for download." + ) + return cache_dir + + +def download_vlm_dataset(dataset_name: str) -> None: + """Download VLM's dataset such as sharegpt4v and allava4v""" + if dataset_name == "sharegpt4v": + raise Exception("Don't Support Download sharegpt4v.") + elif dataset_name == "allava4v": + cache_dir = get_cache_dir(dataset_name) + os.makedirs(cache_dir, exist_ok=True) + script_path = os.path.join( + os.path.dirname(os.path.dirname(os.path.abspath(__file__))), + "datasets", + "download_laion.sh", + ) + os.chmod(script_path, 0o755) + if not os.path.exists( + os.path.join(cache_dir, "allava_laion", "image_chunks", "images_0.zip") + ): + result = subprocess.run( + ["bash", script_path], + cwd=cache_dir, + capture_output=True, + text=True, + ) + if result.returncode != 0: + raise RuntimeError(f"Download image dataset failed: {result.stderr}") + print("##### allava4v dataset Download Complete #####") + else: + print("##### allava4v dataset has existed.") + else: + raise Exception(f"Don't support {dataset_name}") + + +def process_ultrachat_row(row: Dict, dataset_name: str = None) -> Tuple[Dict, int]: + """Process a row from the ultrachat dataset. 
+ + The function expects a row with the following schema: + "messages": [ + { + "role": "user" | "assistant", + "content": str + } + ] + """ + conversations = row["messages"] + formatted_conversations = [] + for message in conversations: + role = message["role"] + content = message["content"] + assert role in ["user", "assistant"] + formatted_conversations.append({"role": role, "content": content}) + row = {"id": row["prompt_id"], "conversations": formatted_conversations} + return row, 0 + + +def process_sharegpt_row(row: Dict, dataset_name: str = None) -> Tuple[Dict, int]: + """ + sharegpt dataset schema: + { + "conversations": [ + { + "from": , + "value": , + }, + ... + ] + } + """ + conversations = row["conversations"] + formatted_conversations = [] + skipped_count = 0 + for message in conversations: + if message["from"] not in ROLE_MAPPING: + skipped_count += 1 + continue + new_role = ROLE_MAPPING[message["from"]] + content = message["value"] + formatted_conversations.append({"role": new_role, "content": content}) + + row = {"id": row["id"], "conversations": formatted_conversations} + return row, skipped_count + + +def process_sharegpt4v_row(row, dataset_name: str = None) -> Dict: + """ + sharegpt4v dataset schema: + { + "id": str, + "image": str, # path to the image + "conversations": [ + { + "from": , + "value": , + }, + ... 
+ ] + } + """ + cache_dir = get_cache_dir(dataset_name) + conversations = row["conversations"] + image = os.path.join(cache_dir, row["image"]) + if not os.path.exists(image): + print(f"Image path {image} does not exist, skipping this sample.") + return None, None + formatted_conversations = [] + skipped_count = 0 + for message in conversations: + if message["from"] not in ROLE_MAPPING: + skipped_count += 1 + continue + new_role = ROLE_MAPPING[message["from"]] + if new_role == "user": + text_content = message["value"].replace("\n", "") + content = text_content + else: + content = message["value"] + formatted_conversations.append({"role": new_role, "content": content}) + + row = {"id": row["id"], "image": image, "conversations": formatted_conversations} + return row, skipped_count + + +def load_dataset_from_path(data_path: Path): + suffix = data_path.suffix.split(".")[1] + ds = load_dataset(suffix, data_files=str(data_path), split="train") + return ds + + +def process_and_save_ds(train_ds, test_ds, output_path, proc_fn, dataset_name): + train_output_jsonl_path = output_path.joinpath(f"{dataset_name}_train.jsonl") + if train_output_jsonl_path.exists(): + print( + f"The dataset {dataset_name} has already been processed and saved in {train_output_jsonl_path}, skipping..." 
+ ) + return + + total_skipped_count = 0 + with open(train_output_jsonl_path, "w") as f: + for item in tqdm(train_ds, desc=f"Processing {dataset_name} dataset"): + if proc_fn is not None: + row, skipped_count = proc_fn(item, dataset_name) + if row is None: + continue + total_skipped_count += skipped_count + else: + row = item + f.write(json.dumps(row, ensure_ascii=False) + "\n") + + if test_ds is not None: + test_output_jsonl_path = output_path.joinpath(f"{dataset_name}_test.jsonl") + with open(test_output_jsonl_path, "w") as f: + for item in tqdm(test_ds, desc=f"Processing {dataset_name} test dataset"): + if proc_fn is not None: + row, skipped_count = proc_fn(item, dataset_name) + if row is None: + continue + total_skipped_count += skipped_count + else: + row = item + f.write(json.dumps(row, ensure_ascii=False) + "\n") + + if total_skipped_count > 0: + total_messages = len(train_ds) + (len(test_ds) if test_ds is not None else 0) + print( + f"Skipped {total_skipped_count}/{total_messages} messages for {dataset_name}" + ) + + +import hashlib + + +def process_opc_sft_stage1(row: Dict, dataset_name: str = None) -> Tuple[Dict, int]: + row_id = hashlib.md5((row["instruction"] + row["output"]).encode()).hexdigest() + processed_row = { + "id": row_id, + "conversations": [ + {"role": "user", "content": row["instruction"]}, + {"role": "assistant", "content": row["output"]}, + ], + } + return processed_row, 0 + + +def process_codealpaca_row(row: Dict, dataset_name: str = None) -> Tuple[Dict, int]: + """Process a row from the CodeAlpaca-20k dataset. 
+ + The function expects a row with the following schema: + { + "instruction": str, + "input": str, + "output": str + } + """ + row_id = hashlib.md5((row["instruction"] + row["output"]).encode()).hexdigest() + processed_row = { + "id": row_id, + "conversations": [ + {"role": "user", "content": row["instruction"]}, + {"role": "assistant", "content": row["output"]}, + ], + } + return processed_row, 0 + + +def process_opencodeinstruct_row( + row: Dict, dataset_name: str = None +) -> Tuple[Dict, int]: + """Process a row from the nvidia/OpenCodeInstruct dataset. + + The function expects a row with the following schema: + { + "id": str, + "input": str, + "output": str, + "domain": str, + "generation_algorithm": str, + "llm_judgement": str, + "unit_tests": str, + "tests_execution_status": str, + "average_test_score": float + } + """ + # Use the existing id if available, otherwise generate one + row_id = row.get("id") + if row_id is None: + row_id = hashlib.md5((row["input"] + row["output"]).encode()).hexdigest() + + processed_row = { + "id": row_id, + "conversations": [ + {"role": "user", "content": row["input"]}, + {"role": "assistant", "content": row["output"]}, + ], + } + return processed_row, 0 + + +def process_magicoder_evol_instruct_row( + row: Dict, dataset_name: str = None +) -> Tuple[Dict, int]: + """Process a row from the ise-uiuc/Magicoder-Evol-Instruct-110K dataset. + + The function expects a row with the following schema: + { + "instruction": str, + "response": str + } + """ + row_id = hashlib.md5((row["instruction"] + row["response"]).encode()).hexdigest() + processed_row = { + "id": row_id, + "conversations": [ + {"role": "user", "content": row["instruction"]}, + {"role": "assistant", "content": row["response"]}, + ], + } + return processed_row, 0 + + +def process_gsm8k_row(row: Dict, dataset_name: str = None) -> Tuple[Dict, int]: + """Process a row from the gsm8k dataset. 
+ + The function expects a row with the following schema: + { + "question": str, + "answer": str + } + """ + row_id = hashlib.md5((row["question"] + row["answer"]).encode()).hexdigest() + processed_row = { + "id": row_id, + "conversations": [ + {"role": "user", "content": row["question"]}, + {"role": "assistant", "content": row["answer"]}, + ], + } + return processed_row, 0 + + +def process_hendrycks_math_row(row: Dict, dataset_name: str = None) -> Tuple[Dict, int]: + """Process a row from the hendrycks_math dataset. + + The function expects a row with the following schema: + { + "problem": str, + "solution": str, + "level": str, + "type": str + } + """ + row_id = hashlib.md5((row["problem"] + row["solution"]).encode()).hexdigest() + processed_row = { + "id": row_id, + "conversations": [ + {"role": "user", "content": row["problem"]}, + {"role": "assistant", "content": row["solution"]}, + ], + } + return processed_row, 0 + + +def process_math_qa_row(row: Dict, dataset_name: str = None) -> Tuple[Dict, int]: + """Process a row from the allenai/math_qa dataset. + + The function expects a row with the following schema: + { + "Problem": str, + "Rationale": str, + "options": str, # format: "a) option1 b) option2 c) option3 d) option4" + "correct": str, + "annotated_formula": str, + "linear_formula": str, + "category": str + } + """ + # Combine Problem and options as user input + problem = row["Problem"] + options = row["options"] + user_content = f"{problem}\n{options}" + + # Use Rationale as assistant response + rationale = row["Rationale"] + + row_id = hashlib.md5((user_content + rationale).encode()).hexdigest() + processed_row = { + "id": row_id, + "conversations": [ + {"role": "user", "content": user_content}, + {"role": "assistant", "content": rationale}, + ], + } + return processed_row, 0 + + +def process_sciq_row(row: Dict, dataset_name: str = None) -> Tuple[Dict, int]: + """Process a row from the allenai/sciq dataset. 
+ + The function expects a row with the following schema: + { + "question": str, + "distractor3": str, + "distractor1": str, + "distractor2": str, + "correct_answer": str, + "support": str + } + """ + question = row["question"] + correct_answer = row["correct_answer"] + distractor1 = row["distractor1"] + distractor2 = row["distractor2"] + distractor3 = row["distractor3"] + support = row["support"] + + # Create a list of all answers and randomly shuffle them + answers_list = [distractor3, distractor1, distractor2, correct_answer] + random.shuffle(answers_list) + + # Assign shuffled answers to labels a, b, c, d + labels = ["a", "b", "c", "d"] + options_list = [(labels[i], answers_list[i]) for i in range(4)] + + # Find the correct answer label after shuffling + correct_label = None + for label, answer in options_list: + if answer == correct_answer: + correct_label = label + break + + # Format options as a string + options_text = "\n".join([f"{label}) {answer}" for label, answer in options_list]) + user_content = f"{question}\n{options_text}" + + # Combine support with answer + assistant_content = f"{support}\nanswer: {correct_label}) {correct_answer}" + + row_id = hashlib.md5((user_content + assistant_content).encode()).hexdigest() + processed_row = { + "id": row_id, + "conversations": [ + {"role": "user", "content": user_content}, + {"role": "assistant", "content": assistant_content}, + ], + } + return processed_row, 0 + + +def process_camel_row(row: Dict, dataset_name: str = None) -> Tuple[Dict, int]: + """Process a row from the camel-ai dataset. 
+ + The function expects a row with the following schema: + { + "message_1": str, # user message + "message_2": str, # assistant message + } + """ + message_1 = row["message_1"] + message_2 = row["message_2"] + + row_id = hashlib.md5((message_1 + message_2).encode()).hexdigest() + processed_row = { + "id": row_id, + "conversations": [ + {"role": "user", "content": message_1}, + {"role": "assistant", "content": message_2}, + ], + } + return processed_row, 0 + + +def add_index(row, idx) -> Dict: + row["id"] = idx + return row + + +def main(): + args = parse_args() + # load dataset + if args.dataset == "ultrachat": + ds = load_dataset("HuggingFaceH4/ultrachat_200k")["train_sft"] + proc_fn = process_ultrachat_row + elif args.dataset == "sharegpt": + if args.data_path is None: + ds = load_dataset("Aeala/ShareGPT_Vicuna_unfiltered")["train"] + else: + print("Loading dataset from custom data path: ", args.data_path) + ds = load_dataset_from_path(Path(args.data_path)) + proc_fn = process_sharegpt_row + elif args.dataset == "eaglechat": + ds = load_dataset("zhaode/EagleChat")["train"] + proc_fn = lambda row, name: (row, 0) + elif args.dataset == "perfectblend": + ds = load_dataset("mlabonne/open-perfectblend")["train"] + ds = ds.map(add_index, with_indices=True) + proc_fn = process_sharegpt_row + elif args.dataset == "perfectblend-llama3.1-8b-instruct": + ds = load_dataset("frankleeeee/PerfectBlend-Regenerated-Llama-3.1-8B-Instruct")[ + "train" + ] + ds = ds.map(add_index, with_indices=True) + proc_fn = None + elif args.dataset == "perfectblend-llama3.3-70b-instruct": + ds = load_dataset( + "frankleeeee/PerfectBlend-Regenerated-Llama-3.3-70B-Instruct" + )["train"] + ds = ds.map(add_index, with_indices=True) + proc_fn = None + elif args.dataset == "perfectblend-llama4-scout-instruct": + ds = load_dataset( + "frankleeeee/PerfectBlend-Regenerated-Llama-4-Scout-17B-16E-Instruct" + )["train"] + ds = ds.map(add_index, with_indices=True) + proc_fn = None + elif args.dataset == 
"perfectblend-llama4-maverick-instruct": + ds = load_dataset( + "frankleeeee/PerfectBlend-Regenerated-Llama-4-Maverick-17B-128E-Instruct" + )["train"] + ds = ds.map(add_index, with_indices=True) + proc_fn = None + elif args.dataset == "magpie-qwen2.5-pro-1m-v0.1": + ds = load_dataset("Magpie-Align/Magpie-Qwen2.5-Pro-1M-v0.1")["train"] + ds = ds.rename_column("uuid", "id") + proc_fn = process_sharegpt_row + elif args.dataset == "sharegpt4v": + ds = load_dataset("Lin-Chen/ShareGPT4V", "ShareGPT4V")["train"] + raise Exception("Not supported sharegpt4v now") + download_vlm_dataset(args.dataset) + proc_fn = process_sharegpt4v_row + elif args.dataset == "allava4v": + ds = load_dataset("FreedomIntelligence/ALLaVA-4V", name="allava_laion")[ + "instruct" + ] + download_vlm_dataset(args.dataset) + proc_fn = process_sharegpt4v_row + elif args.dataset == "opc": + if args.opc_subset == "all": + # Load all subsets and concatenate them + subsets = [ + "largescale_diverse_instruct", + "filtered_infinity_instruct", + "realuser_instruct", + ] + datasets_list = [ + load_dataset("OpenCoder-LLM/opc-sft-stage1", subset)["train"] + for subset in subsets + ] + ds = concatenate_datasets(datasets_list) + else: + ds = load_dataset("OpenCoder-LLM/opc-sft-stage1", args.opc_subset)["train"] + proc_fn = process_opc_sft_stage1 + elif args.dataset == "gsm8k": + ds = load_dataset("openai/gsm8k", "main")["train"] + proc_fn = process_gsm8k_row + elif args.dataset == "hendrycks_math": + # Load all subjects and concatenate them + subjects = [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus", + ] + datasets_list = [ + load_dataset("EleutherAI/hendrycks_math", subject)["train"] + for subject in subjects + ] + ds = concatenate_datasets(datasets_list) + proc_fn = process_hendrycks_math_row + elif args.dataset == "math_qa": + ds = load_dataset("allenai/math_qa", trust_remote_code=True)["train"] + proc_fn = 
process_math_qa_row + elif args.dataset == "codealpaca-20k": + ds = load_dataset("sahil2801/CodeAlpaca-20k", trust_remote_code=True)["train"] + proc_fn = process_codealpaca_row + elif args.dataset == "opencodeinstruct": + ds = load_dataset("nvidia/OpenCodeInstruct", trust_remote_code=True)["train"] + proc_fn = process_opencodeinstruct_row + elif args.dataset == "magicoder-evol-instruct": + ds = load_dataset( + "ise-uiuc/Magicoder-Evol-Instruct-110K", trust_remote_code=True + )["train"] + proc_fn = process_magicoder_evol_instruct_row + elif args.dataset == "sciq": + ds = load_dataset("allenai/sciq", trust_remote_code=True)["train"] + proc_fn = process_sciq_row + elif args.dataset == "camel": + # Load all three camel-ai datasets and concatenate them + camel_datasets = [ + load_dataset("camel-ai/biology", split="train"), + load_dataset("camel-ai/chemistry", split="train"), + load_dataset("camel-ai/physics", split="train"), + ] + ds = concatenate_datasets(camel_datasets) + proc_fn = process_camel_row + else: + raise ValueError( + f"This script only supports ultrachat, sharegpt, sharegpt4v, allava4v, opc, gsm8k, hendrycks_math, math_qa, codealpaca-20k, opencodeinstruct, magicoder-evol-instruct, sciq, camel, and perfect-blend-gptoss-20B datasets for demo purpose, if you wish to use other datasets, please modify this script." 
+ ) + # filter and split dataset + if args.sample_size is not None and args.sample_size < len(ds): + ds = ds.select(range(args.sample_size)) + print(f"Processing {args.sample_size} samples from the dataset {args.dataset}") + if args.split_eval: + ds = ds.train_test_split(test_size=0.05) + train_ds = ds["train"] + test_ds = ds["test"] + else: + train_ds = ds + test_ds = None + + if args.output_path is None: + root_path = Path(__file__).parent.parent + output_path = root_path.joinpath("cache", "dataset") + output_path.mkdir(parents=True, exist_ok=True) + else: + output_path = Path(args.output_path) + output_path.mkdir(parents=True, exist_ok=True) + + process_and_save_ds(train_ds, test_ds, output_path, proc_fn, args.dataset) + + +if __name__ == "__main__": + main() diff --git a/idea1/scripts/prepare_hidden_states.py b/idea1/scripts/prepare_hidden_states.py new file mode 100644 index 0000000000000000000000000000000000000000..30ce919422d77607ae5ead658dc8cfaeb70282e2 --- /dev/null +++ b/idea1/scripts/prepare_hidden_states.py @@ -0,0 +1,720 @@ +""" +This script will generate the hidden states for the dataset use transformer as the target model backend. +By generating hidden states in advance, we can avoid: +- the memory overhead of loading target model +- the latency overhead of generating hidden states for each request. + +Optimized for lower memory usage and higher efficiency. 
def parse_args() -> argparse.Namespace:
    """
    Build the command-line interface of the hidden-state preparation script
    and parse the process arguments.

    Options are organised in argparse groups ("model", "data", "inference",
    "others", "sglang"). The "sglang" group is populated by
    ``SGLangBackendArgs.add_args``.

    Returns:
        argparse.Namespace: The parsed arguments. ``--target-model-path`` and
        ``--data-path`` are required; every other option has a default.
    """
    parser = argparse.ArgumentParser()

    # model-related arguments
    model_group = parser.add_argument_group("model")
    model_group.add_argument("--target-model-path", type=str, required=True)
    model_group.add_argument(
        "--trust-remote-code",
        action="store_true",
        help="Trust remote code when loading models",
    )
    model_group.add_argument(
        "--is-vlm", action="store_true", help="Whether the target model is a VLM"
    )
    model_group.add_argument("--enable-aux-hidden-states", action="store_true")
    # Kept as a raw string here; main() splits it on "," and converts to ints.
    model_group.add_argument("--aux-hidden-states-layers", type=str, default=None)

    # dataset-related arguments
    data_group = parser.add_argument_group("data")
    data_group.add_argument("--data-path", type=str, required=True)
    data_group.add_argument("--max-length", type=int, default=2048)
    data_group.add_argument("--chat-template", type=str, default="llama3")
    data_group.add_argument(
        "--is-preformatted",
        action="store_true",
        help="Whether the input data is preformatted text with the chat template already applied to the conversation messages.",
    )
    # None means "use the whole dataset" (main() only calls select() otherwise).
    data_group.add_argument("--num-samples", type=int, default=None)
    data_group.add_argument("--build-dataset-num-proc", type=int, default=8)

    # inference-related arguments
    inference_group = parser.add_argument_group("inference")
    inference_group.add_argument("--tp-size", type=int, default=1)
    inference_group.add_argument("--batch-size", type=int, default=32)

    # caching, output location and I/O tuning
    others_group = parser.add_argument_group("others")
    others_group.add_argument("--cache-dir", type=str, default="./cache")
    # When left as None, main() falls back to <repo>/cache/hidden_states.
    others_group.add_argument("--output-path", type=str, default=None)
    others_group.add_argument(
        "--model-download-dir",
        type=str,
        default=None,
        help="The directory to download the target model to",
    )
    others_group.add_argument(
        "--dist-timeout",
        type=int,
        default=2000,
        help="Timeout for collective communication in minutes, default to 2000 so that it does not go timeout",
    )
    # When left as None, main() replaces it with os.cpu_count() (at least 1).
    others_group.add_argument(
        "--num-io-threads",
        type=int,
        default=None,
        help="Number of threads for async I/O operations (default: all of CPU cores).",
    )
    others_group.add_argument(
        "--num-workers", type=int, default=4, help="Number of workers for DataLoader"
    )
    others_group.add_argument(
        "--io-queue-size",
        type=int,
        default=50,
        help="Max number of pending I/O futures.",
    )
    others_group.add_argument(
        "--file-group-size",
        type=int,
        default=2000,
        help="Number of files per subdirectory.",
    )
    others_group.add_argument(
        "--compress",
        action="store_true",
        help="Compress hidden state files on disk (gzip).",
    )
    # NOTE(review): the documented 1-9 range is not enforced by argparse here.
    others_group.add_argument(
        "--compression-level",
        type=int,
        default=6,
        help="Gzip compression level (1-9).",
    )

    # backend-specific options are registered by the SGLang argument helper
    sglang_group = parser.add_argument_group("sglang")
    SGLangBackendArgs.add_args(sglang_group)
    return parser.parse_args()
class HiddenStatesGenerator:
    """
    This is a generator for creating and saving the hidden states based on the target model.
    It includes the following features:
    1. Fixes a potential deadlock in TP > 1 scenarios when a batch is skipped.
    2. Implements a context manager (`with` statement) for robust resource handling.
    3. Makes internal settings (like queue sizes, group sizes) configurable.
    4. Centralizes resource cleanup logic.

    Only TP rank 0 owns the I/O thread pool and writes files; the other TP ranks
    take part in the forward pass and in the collectives only.
    """

    def __init__(
        self,
        target_model,
        enable_aux_hidden_states: bool = True,
        num_io_threads: int = 4,
        io_queue_size: int = 50,
        file_group_size: int = 2000,
        compress: bool = False,
        compression_level: int = 6,
    ):
        """
        Args:
            target_model: The model for inference.
            enable_aux_hidden_states: Whether to save auxiliary hidden states.
            num_io_threads: Number of threads for async I/O.
            io_queue_size: Max number of pending I/O futures before cleanup.
            file_group_size: Number of files per subdirectory.
            compress: Whether to gzip the saved files (extension ".ckpt.gz"
                instead of ".ckpt").
            compression_level: Gzip compression level used when ``compress`` is set.
        """
        self.model = target_model
        # NOTE(review): stored for callers, but no method of this class reads it;
        # generate() saves whatever aux hidden states the model returns.
        self.enable_aux_hidden_states = enable_aux_hidden_states

        # --- Configurable parameters ---
        self.num_io_threads = num_io_threads
        self.io_queue_size = io_queue_size
        self.file_group_size = file_group_size
        self.compress = compress
        self.compression_level = compression_level
        self.file_extension = ".ckpt.gz" if self.compress else ".ckpt"

        # progress bar should only be shown on TP rank = 0
        self.show_progress = dist.get_rank(get_tp_group()) == 0

        # The thread pool is created in __enter__ and torn down in __exit__.
        self.io_executor = None
        self.pending_futures = []

    def __enter__(self):
        """Initializes resources when entering a 'with' block."""
        if is_tp_rank_0():
            self.io_executor = ThreadPoolExecutor(max_workers=self.num_io_threads)
        self.pending_futures = []
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Cleans up resources when exiting a 'with' block."""
        if is_tp_rank_0() and self.io_executor is not None:
            if self.show_progress:
                print("\nWaiting for all async I/O operations to complete...")
            try:
                self._wait_all_saves()
            finally:
                # Release the worker threads even when a pending save failed.
                self.io_executor.shutdown(wait=True)
                self.io_executor = None  # Reset for safety

        # Final barrier to ensure all processes exit generate() cleanly
        dist.barrier()

    def _save_tensor_sync(self, data_point: "DataPoint", output_file: str) -> None:
        """
        Save a data point to a file synchronously. If there is any NaN value in the data, this datapoint will be skipped.

        The data is first written to ``<output_file>.tmp`` and then moved into
        place with ``os.replace``. Resumed runs skip every file that already
        exists, so a partially written file left behind by an interrupted job
        must never appear under its final name.

        Args:
            data_point (DataPoint): The data point to save.
            output_file (str): The path to the output file.
        """
        if data_point.hidden_state is not None and torch.any(
            torch.isnan(data_point.hidden_state)
        ):
            print(
                f"Warning: NaN found in hidden_state for {output_file}. Skipping save."
            )
            return

        if data_point.aux_hidden_state is not None and torch.any(
            torch.isnan(data_point.aux_hidden_state)
        ):
            print(
                f"Warning: NaN found in aux_hidden_state for {output_file}. Skipping save."
            )
            return

        tmp_file = f"{output_file}.tmp"
        try:
            if self.compress:
                with gzip.open(
                    tmp_file, "wb", compresslevel=self.compression_level
                ) as f:
                    torch.save(asdict(data_point), f)
            else:
                torch.save(asdict(data_point), tmp_file)
            os.replace(tmp_file, output_file)
        except BaseException:
            # Do not leave a half-written temp file behind on failure.
            if os.path.exists(tmp_file):
                os.remove(tmp_file)
            raise

    def _save_tensor_async(self, data_point: "DataPoint", output_file: str) -> None:
        """
        Submit a job to the io_executor to save the data point asynchronously.

        Args:
            data_point (DataPoint): The data point to save.
            output_file (str): The path to the output file.

        Raises:
            Exception: Any exception raised by a previously submitted save is
                re-raised here once that save is found to have finished.
        """
        assert is_tp_rank_0(), "Only tp_rank=0 should call _save_tensor_async"
        # If the queue of pending save operations is full, we must wait.
        if len(self.pending_futures) >= self.io_queue_size:
            # First, drop the futures that already finished. Calling result() on
            # them does not block, but it surfaces write errors (e.g. disk full)
            # that would otherwise be discarded together with the future.
            still_running = []
            for future in self.pending_futures:
                if future.done():
                    future.result()
                else:
                    still_running.append(future)
            self.pending_futures = still_running
            # If the queue is *still* full, it means all I/O threads are busy and we have
            # a backlog. We must now block the main generation loop and wait for the
            # oldest I/O operation to complete before proceeding.
            if len(self.pending_futures) >= self.io_queue_size:
                self.pending_futures.pop(0).result()

        future = self.io_executor.submit(
            self._save_tensor_sync, data_point, output_file
        )
        self.pending_futures.append(future)

    def _wait_all_saves(self):
        """
        This method is to ensure that all submitted jobs are completed.
        """
        if is_tp_rank_0() and self.pending_futures:
            for future in tqdm(
                self.pending_futures,
                desc="Finalizing Writes",
                disable=not self.show_progress,
            ):
                future.result()  # Wait and raise exception if any
            self.pending_futures.clear()

    def _prepare_output_dirs(
        self, output_path: str, start_idx: int, total_samples: int
    ) -> None:
        """
        The dataset is organized into groups of files, each group has a folder which contains the files for this group. For example, if the
        file_group_size is 2000, the 0-1999 samples will be saved in the folder "rows_0-2000", the 2000-3999 samples will be saved in the folder "rows_2000-4000", etc.

        Args:
            output_path (str): The path to the output directory.
            start_idx (int): The starting index of the samples to save.
            total_samples (int): The total number of samples to save.

        Returns:
            None
        """
        if not is_tp_rank_0() or total_samples == 0:
            return
        start_group = (start_idx // self.file_group_size) * self.file_group_size
        end_sample_idx = start_idx + total_samples - 1
        end_group = (end_sample_idx // self.file_group_size) * self.file_group_size
        for group_start_idx in range(start_group, end_group + 1, self.file_group_size):
            grouped_subdir = (
                f"rows_{group_start_idx}-{group_start_idx + self.file_group_size}"
            )
            output_dir = os.path.join(output_path, grouped_subdir)
            os.makedirs(output_dir, exist_ok=True)

    def _check_existing_files_batch(
        self, output_path: str, global_indices: List[int]
    ) -> List[bool]:
        """
        A helper function to check if the files for the given global indices exist.

        A sample counts as existing when either its uncompressed (".ckpt") or
        its compressed (".ckpt.gz") file is present, whatever the current
        ``compress`` setting is.

        Args:
            output_path (str): The path to the output directory.
            global_indices (List[int]): The global indices of the samples to check.

        Returns:
            List[bool]: A list of booleans indicating if the files for the given global indices exist.
        """
        if not is_tp_rank_0():
            return [False] * len(global_indices)

        def check_single_file(idx):
            # self.file_extension is always one of these two, so checking both
            # also covers the currently configured extension.
            return any(
                os.path.exists(self._get_file_path(output_path, idx, extension=ext))
                for ext in (".ckpt", ".ckpt.gz")
            )

        # Parallel file existence check
        with ThreadPoolExecutor(max_workers=self.num_io_threads) as executor:
            return list(executor.map(check_single_file, global_indices))

    def _get_file_path(
        self, output_path: str, idx: int, extension: Optional[str] = None
    ) -> str:
        """
        A helper function to get the standard file path for the data point with the given index.

        Args:
            output_path (str): The path to the output directory.
            idx (int): The global index of the data point.
            extension (Optional[str]): File extension to use; defaults to the
                extension selected by the ``compress`` setting.

        Returns:
            str: The file path for the data point.
        """
        ext = self.file_extension if extension is None else extension
        group_idx = (idx // self.file_group_size) * self.file_group_size
        grouped_subdir = f"rows_{group_idx}-{group_idx + self.file_group_size}"
        return os.path.join(output_path, grouped_subdir, f"data_{idx}{ext}")

    def _update_progress(self, progress_bar, processed: int, skipped: int) -> None:
        """Refresh the progress-bar postfix; a no-op on ranks that show no bar."""
        if not self.show_progress:
            return
        progress_bar.set_postfix(
            {
                "processed": processed,
                "skipped": skipped,
                "pending_io": (len(self.pending_futures) if is_tp_rank_0() else 0),
            }
        )

    @torch.no_grad()
    def generate(
        self,
        data_loader: torch.utils.data.DataLoader,
        output_path: str,
        start_idx: int = 0,
        samples_per_dp: int = 0,
    ):
        """
        This version prioritizes minimal CPU RAM usage above all else, even at the cost of performance.
        - It processes samples one-by-one within the tp_rank_0 process.
        - It avoids batching GPU-to-CPU transfers.
        - It ensures only one sample's data is in RAM for I/O at any given time.

        Args:
            data_loader: Yields batches with "input_ids", "attention_mask" and
                "loss_mask" tensors.
            output_path (str): The root directory for the generated files.
            start_idx (int): The global index assigned to the first sample.
            samples_per_dp (int): The number of samples handled by this rank,
                used to pre-create the output sub-directories.
        """
        self._prepare_output_dirs(output_path, start_idx, samples_per_dp)

        tp_group = get_tp_group()
        tp_group_ranks = dist.get_process_group_ranks(tp_group)
        tp_rank_0_global = tp_group_ranks[0]
        global_idx = start_idx

        progress_bar = tqdm(
            data_loader,
            disable=(not self.show_progress),
            desc="Generating Hidden States",
            position=dist.get_rank(get_dp_group()),
            leave=True,
        )

        total_skipped, total_processed = 0, 0

        for batch_idx, batch in enumerate(progress_bar):
            batch_size = batch["input_ids"].size(0)
            current_batch_indices = list(range(global_idx, global_idx + batch_size))

            # Step 1: TP rank 0 checks which files already exist and broadcasts
            # the result, so every TP rank skips exactly the same samples and
            # the group stays in lock-step for the forward pass below.
            if is_tp_rank_0():
                exists_list = self._check_existing_files_batch(
                    output_path, current_batch_indices
                )
                exists_tensor = torch.tensor(
                    exists_list, dtype=torch.bool, device="cuda"
                )
            else:
                exists_tensor = torch.tensor(
                    [False] * batch_size, dtype=torch.bool, device="cuda"
                )
            dist.broadcast(exists_tensor, src=tp_rank_0_global, group=tp_group)

            # One device-to-host copy instead of one per element.
            exists_flags = exists_tensor.tolist()
            valid_indices_in_batch = [
                i for i, exists in enumerate(exists_flags) if not exists
            ]
            sample_global_indices = [
                current_batch_indices[i] for i in valid_indices_in_batch
            ]
            num_valid = len(valid_indices_in_batch)
            total_skipped += batch_size - num_valid

            # Step 2: Filter batch before moving to GPU to save memory
            global_idx += batch_size
            filtered_batch = {
                "input_ids": batch["input_ids"][valid_indices_in_batch],
                "attention_mask": batch["attention_mask"][valid_indices_in_batch],
                "loss_mask": batch["loss_mask"][valid_indices_in_batch],
            }
            del batch
            if num_valid == 0:
                # Data has already been generated, no sample processing, update progress bar.
                self._update_progress(progress_bar, total_processed, total_skipped)
                continue

            filtered_batch_gpu = {
                k: v.cuda(non_blocking=True) for k, v in filtered_batch.items()
            }
            _, _, aux_hidden_states_list, last_hidden_states_list = self.model.extend(
                **filtered_batch_gpu,
                return_last_hidden_states=True,
                return_logits=False,
            )

            del filtered_batch_gpu

            if is_tp_rank_0():
                for i, (
                    current_global_idx,
                    aux_hidden_states,
                    last_hidden_states,
                ) in enumerate(
                    zip(
                        sample_global_indices,
                        aux_hidden_states_list,
                        last_hidden_states_list,
                    )
                ):
                    # Process ONE sample at a time to minimize CPU RAM footprint
                    # 1. Transfer only the required slice for one sample to CPU
                    aux_hidden_states = (
                        aux_hidden_states.cpu().clone().unsqueeze(0)
                        if aux_hidden_states is not None
                        else None
                    )
                    last_hidden_states = (
                        last_hidden_states.cpu().clone().unsqueeze(0)
                        if last_hidden_states is not None
                        else None
                    )
                    # 2. Bundle the sample with its own copy of the token data
                    data_point = DataPoint(
                        input_ids=filtered_batch["input_ids"][i].clone(),
                        loss_mask=filtered_batch["loss_mask"][i].clone(),
                        hidden_state=last_hidden_states,
                        aux_hidden_state=aux_hidden_states,
                    )

                    # 3. Save asynchronously (the backpressure logic is still crucial)
                    output_file = self._get_file_path(output_path, current_global_idx)
                    self._save_tensor_async(data_point, output_file)

                    # 4. Immediately clean up the single-sample CPU tensors
                    del last_hidden_states, aux_hidden_states

                total_processed += len(sample_global_indices)

            # Clean up the large GPU and CPU batch data
            del aux_hidden_states_list, last_hidden_states_list, filtered_batch

            if batch_idx % 5 == 0:  # Make GC and cache clearing more frequent
                torch.cuda.empty_cache()
                gc.collect()

            self._update_progress(progress_bar, total_processed, total_skipped)

        if self.show_progress:
            print(
                f"\nGeneration loop finished. Processed: {total_processed}, Skipped: {total_skipped}"
            )
        dist.barrier()
generator=safe_conversations_generator, + gen_kwargs={"file_path": args.data_path}, + cache_dir=os.path.join( + os.path.dirname(os.path.dirname(os.path.abspath(__file__))), + "cache", + "hf_dataset", + ), + num_proc=min(args.build_dataset_num_proc, 32), + ) + if args.num_samples is not None: + dataset = dataset.select(range(args.num_samples)) + # Tokenizer and cache key + tokenizer = AutoTokenizer.from_pretrained( + args.target_model_path, trust_remote_code=True + ) + cache_params_string = f"{args.data_path}-{args.max_length}-{args.chat_template}-{args.target_model_path}-{args.num_samples}-{args.is_preformatted}" + cache_key = hashlib.md5(cache_params_string.encode()).hexdigest() + + # Preprocess on complete, un-sharded dataset + with rank_0_priority(): + print_with_rank("Main process is building the dataset cache...") + eagle3_dataset = build_eagle3_dataset( + dataset=dataset, + tokenizer=tokenizer, + chat_template=args.chat_template, + max_length=args.max_length, + cache_dir=os.path.join(args.cache_dir, "processed_dataset"), + cache_key=cache_key, + is_vlm=args.is_vlm, + is_preformatted=args.is_preformatted, + processor=processor, + num_proc=args.build_dataset_num_proc, + ) + print_with_rank(f"Dataset prepared with {len(eagle3_dataset)} samples.") + + # Create DP-sharded dataloader + data_loader = prepare_dp_dataloaders( + dataset=eagle3_dataset, + batch_size=args.batch_size, + num_workers=args.num_workers, + shuffle=False, + process_group=get_dp_group(), + is_vlm=args.is_vlm, + ) + + print_with_rank( + f"DataLoader created for DP Rank {dist.get_rank(get_dp_group())}. 
" + f"Number of batches: {len(data_loader)}" + ) + + # Calculate starting index and sample count for current DP rank + total = len(eagle3_dataset) + dp_rank = dist.get_rank(get_dp_group()) + dp_size = dist.get_world_size(get_dp_group()) + + # Calculate samples per DP rank (handle non-divisible case) + samples_per_dp = total // dp_size + remainder = total % dp_size + + # Earlier ranks handle one extra sample if there's a remainder + if dp_rank < remainder: + samples_per_dp += 1 + start_idx = dp_rank * samples_per_dp + else: + start_idx = dp_rank * samples_per_dp + remainder + + print_with_rank( + f"DP Rank {dp_rank} will process {samples_per_dp} samples, " + f"starting from index {start_idx}" + ) + + # Generate hidden states + try: + # Pass configurable arguments from args if needed + with HiddenStatesGenerator( + target_model, + enable_aux_hidden_states=args.enable_aux_hidden_states, + num_io_threads=args.num_io_threads, + io_queue_size=args.io_queue_size, + file_group_size=args.file_group_size, + compress=args.compress, + compression_level=args.compression_level, + # Other params like io_queue_size can also be added to argparse + ) as hidden_states_generator: + + # Generate hidden states + hidden_states_generator.generate( + data_loader, + output_path=args.output_path, + start_idx=start_idx, + samples_per_dp=samples_per_dp, + ) + + finally: + # The finally block ensures destroy_distributed is always called + print_with_rank("All hidden states generated or job finished.") + destroy_distributed() + + +if __name__ == "__main__": + main() diff --git a/idea1/scripts/regenerate_train_data.py b/idea1/scripts/regenerate_train_data.py new file mode 100644 index 0000000000000000000000000000000000000000..d38392b6982df540e20cff5e1d6961c3612c061d --- /dev/null +++ b/idea1/scripts/regenerate_train_data.py @@ -0,0 +1,456 @@ +""" +This script will re-generate the dataset from target model, +which better aligns the draft model with the target model’s output distribution. 
def parse_arguments():
    """
    Parse command line arguments of the dataset regeneration script.

    ``--model``, ``--input-file-path``, ``--output-file-path`` and
    ``--server-address`` are required. The server address is mandatory because
    the script iterates over the given addresses; without it there is nothing
    to send requests to.

    Returns:
        argparse.Namespace: The parsed arguments.
    """
    parser = argparse.ArgumentParser(
        description="Re-generate training data using sglang model server"
    )

    # model related arguments
    model_group = parser.add_argument_group("model")
    model_group.add_argument("--model", type=str, required=True)
    model_group.add_argument(
        "--is-reasoning-model",
        action="store_true",
        help="Whether the model is a reasoning model",
    )
    model_group.add_argument(
        "--is-gpt-oss",
        action="store_true",
        help="Whether the model is a GPT-OSS model",
    )

    # sampling params
    sampling_params_group = parser.add_argument_group("sampling parameters")
    sampling_params_group.add_argument(
        "--temperature",
        type=float,
        default=0.7,
        help="Temperature for sglang model server",
    )
    sampling_params_group.add_argument(
        "--top-p",
        type=float,
        default=None,
        help="Nucleus sampling top_p",
    )
    sampling_params_group.add_argument(
        "--top-k",
        type=int,
        default=None,
        help="Top-k sampling value sent via extra_body",
    )
    sampling_params_group.add_argument(
        "--repetition-penalty",
        type=float,
        default=None,
        help="Mapped to presence_penalty in the OpenAI API",
    )
    sampling_params_group.add_argument(
        "--max-tokens",
        type=int,
        default=4096,
        help="Maximum number of tokens (default: 4096)",
    )

    # optimization
    optimization_group = parser.add_argument_group("optimization")
    optimization_group.add_argument(
        "--concurrency",
        type=int,
        default=64,
        help="The number of requests to send to a single server concurrently, the total number of concurrent requests is concurrency * number of server addresses",
    )

    # data related arguments
    data_group = parser.add_argument_group("data")
    data_group.add_argument(
        "--input-file-path", type=str, required=True, help="Path to the input file"
    )
    data_group.add_argument(
        "--output-file-path", type=str, required=True, help="Path to the output file"
    )
    data_group.add_argument(
        "--num-samples",
        type=int,
        default=None,
        help="The number of samples to regenerate, if not provided, all samples will be regenerated",
    )
    data_group.add_argument(
        "--resume",
        action="store_true",
        help="Resume from existing output file, skip already processed samples",
    )

    # sglang server
    server_group = parser.add_argument_group("sglang server")
    server_group.add_argument(
        "--server-address",
        type=str,
        nargs="+",
        # Without this the attribute would be None and the script would fail
        # later with an unhelpful TypeError instead of a usage message.
        required=True,
        help="Server address and port for sglang model server",
    )
    return parser.parse_args()
def compute_context_length(conversations: List[Dict[str, Any]]) -> int:
    """
    Roughly estimate the size of a conversation as the number of
    whitespace-separated words in its messages (no tokenizer is involved).

    String contents are counted directly; list contents contribute the "text"
    field of every dict part. Anything else counts as zero.
    """

    def _words_in(content) -> int:
        if isinstance(content, str):
            # {"role": "assistant", "content": "Hi, how can I help?"}
            return len(content.split())
        if not isinstance(content, list):
            return 0
        texts = (part.get("text") for part in content if isinstance(part, dict))
        return sum(len(text.split()) for text in texts if isinstance(text, str))

    return sum(_words_in(message.get("content")) for message in conversations)


def build_query_kwargs(args, messages, max_tokens=None):
    """
    Assemble the keyword arguments of one chat-completion request.

    An explicit ``max_tokens`` overrides ``args.max_tokens``. Optional sampling
    settings are included only when they were set on ``args``.
    """
    token_budget = args.max_tokens if max_tokens is None else max_tokens

    request = {
        "model": args.model,
        "messages": messages,
        "max_tokens": token_budget,
        "temperature": args.temperature,
        "stream": False,
    }
    if args.top_p is not None:
        request["top_p"] = args.top_p
    if args.repetition_penalty is not None:
        request["presence_penalty"] = args.repetition_penalty
    if args.top_k is not None:
        # top_k travels in extra_body rather than as a top-level argument.
        request["extra_body"] = {"top_k": args.top_k}
    if args.is_gpt_oss:
        request["reasoning_effort"] = get_random_reasoning_effort()
    return request
def _count_lines(path: str) -> int:
    """Return the number of lines in the file at ``path``, closing it afterwards."""
    with open(path, "r", encoding="utf-8") as handle:
        return sum(1 for _ in handle)


def main():
    """
    Regenerate the assistant turns of a JSONL dataset with one or more sglang
    servers.

    Each input line is sent to a server chosen round-robin. Successful results
    are appended to the output file and failed ones to a sibling
    ``*_error.jsonl`` file. At most ``--concurrency`` requests are in flight
    per server.

    NOTE: results are written in completion order, so ``--resume`` (which skips
    as many input lines as there are lines in the two result files) assumes the
    previous run finished the samples it had started.

    Raises:
        ValueError: If the temperature or max-tokens arguments are out of
            range, or if none of the given servers answers the probe request.
    """
    # Function-scope import: only this function needs these names.
    from concurrent.futures import FIRST_COMPLETED, wait

    # Parse command line arguments
    args = parse_arguments()

    # Validate parameters
    if not (0.0 <= args.temperature <= 1.0):
        raise ValueError("Temperature must be between 0.0 and 1.0")

    if args.max_tokens <= 0:
        raise ValueError("Max tokens must be greater than 0")

    print("Configuration:")
    print(f" Model path: {args.model}")
    print(f" Max tokens: {args.max_tokens}")
    print(f" Concurrency: {args.concurrency}")
    print(f" Temperature: {args.temperature}")
    print(f" API URL: {args.server_address}")
    print(f" Input file: {args.input_file_path}")
    print(f" Output file: {args.output_file_path}")
    print(f" Resume mode: {args.resume}")
    print("-" * 50)
    total_lines = _count_lines(args.input_file_path)

    skip_lines = 0
    # Derive the error log from the output path. Only a trailing ".jsonl" is
    # rewritten, and a path without that suffix still gets a distinct file,
    # so the two handles never point at the same file.
    suffix = ".jsonl"
    if args.output_file_path.endswith(suffix):
        error_file_path = args.output_file_path[: -len(suffix)] + "_error.jsonl"
    else:
        error_file_path = args.output_file_path + "_error.jsonl"

    if args.resume and os.path.exists(args.output_file_path):
        existing_success = _count_lines(args.output_file_path)
        existing_error = 0
        if os.path.exists(error_file_path):
            existing_error = _count_lines(error_file_path)
        skip_lines = existing_success + existing_error
        print("Resume mode enabled:")
        print(f" Found {existing_success} successful samples in output file")
        print(f" Found {existing_error} error samples in error file")
        print(f" Skipping first {skip_lines} input samples")
        print("-" * 50)

        if skip_lines >= total_lines:
            print(f"All {total_lines} samples already processed. Nothing to do.")
            return

    # test all server addresses
    valid_server_addresses = []
    for server_address in args.server_address:
        dummy_data = dict(
            conversations=[{"role": "user", "content": "Hello, how are you?"}]
        )
        result = call_sglang(
            args,
            server_address,
            dummy_data,
            max_tokens=1,
        )
        # call_sglang reports failures through the "status" field rather than
        # by returning None, so the status has to be inspected.
        if isinstance(result, dict) and result.get("status") == "success":
            valid_server_addresses.append(server_address)
        else:
            print(f"Server {server_address} is not available")

    if len(valid_server_addresses) == 0:
        raise ValueError("No server address is available")
    print(
        f"Using {len(valid_server_addresses)} server addresses: {valid_server_addresses}"
    )
    print("-" * 50)

    # Determine file open mode based on resume flag
    file_mode = "a" if (args.resume and skip_lines > 0) else "w"
    print(
        f"Regenerating dataset and saving the output to {args.output_file_path} and error log to {error_file_path}"
    )
    print(
        f"File open mode: {file_mode} ({'append' if file_mode == 'a' else 'overwrite'})"
    )
    print("-" * 50)
    context_token_sum = 0
    context_token_min = None
    context_token_max = 0
    success_samples = 0
    error_samples = 0

    with (
        open(args.input_file_path, "r", encoding="utf-8") as input_file,
        open(args.output_file_path, file_mode, encoding="utf-8") as output_file_handle,
        open(error_file_path, file_mode, encoding="utf-8") as error_file_handle,
        ThreadPoolExecutor(
            max_workers=args.concurrency * len(valid_server_addresses)
        ) as executor,
        tqdm(total=total_lines, desc="Processing", initial=skip_lines) as pbar,
    ):

        def record(regen_data):
            """Write one finished sample to the right file and update the stats."""
            nonlocal context_token_sum, context_token_min, context_token_max
            nonlocal success_samples, error_samples

            serialized = json.dumps(regen_data, ensure_ascii=False) + "\n"
            if regen_data["status"] == "error":
                error_file_handle.write(serialized)
                error_samples += 1
                return

            ctx_len = compute_context_length(regen_data.get("conversations", []))
            context_token_sum += ctx_len
            if context_token_min is None:
                context_token_min = ctx_len
            else:
                context_token_min = min(context_token_min, ctx_len)
            context_token_max = max(context_token_max, ctx_len)
            output_file_handle.write(serialized)
            success_samples += 1

        waiting_queue = {
            server_address: [] for server_address in valid_server_addresses
        }
        submitted = 0

        if skip_lines > 0:
            print(f"Skipping {skip_lines} already processed samples...")
            for _ in range(skip_lines):
                next(input_file, None)
            print(f"Resuming from sample {skip_lines + 1}")

        for line in input_file:
            # Cap on *submitted* requests; counting only finished ones would
            # let in-flight requests overshoot the requested number.
            if args.num_samples is not None and submitted >= args.num_samples:
                break

            data = json.loads(line.strip())

            # round-robin over the available servers
            server_address = valid_server_addresses[
                submitted % len(valid_server_addresses)
            ]
            in_flight = waiting_queue[server_address]

            # Back-pressure: block (without spinning) until this server has
            # finished at least one request, then collect everything that is done.
            if len(in_flight) >= args.concurrency:
                done, _ = wait(in_flight, return_when=FIRST_COMPLETED)
                still_running = []
                for req_future in in_flight:
                    if req_future in done:
                        record(req_future.result())
                    else:
                        still_running.append(req_future)
                in_flight[:] = still_running

            # submit prompt to sglang
            in_flight.append(
                executor.submit(
                    call_sglang,
                    args,
                    server_address,
                    data,
                )
            )
            submitted += 1
            pbar.update(1)

        # deal with all the remaining requests
        for in_flight in waiting_queue.values():
            for req_future in in_flight:
                record(req_future.result())

    print("\nProcessing completed!")
    if success_samples > 0:
        avg_len = context_token_sum / success_samples
        print("Context length statistics (token count over conversations):")
        print(f"Number of successful examples: {success_samples}")
        print(f"Shortest context length: {context_token_min}")
        print(f"Longest context length: {context_token_max}")
        print(f"Average context length: {avg_len:.2f}")
    else:
        print("No successful examples to compute context length statistics.")

    total_processed = success_samples + error_samples
    if skip_lines > 0:
        print("\nResume processing completed!")
        print(f" Previously processed: {skip_lines}")
        print(
            f" Newly processed: {total_processed} ({success_samples} success, {error_samples} failed)"
        )
        print(f" Total: {skip_lines + total_processed}")
    else:
        print(
            f"\nProcessing completed! {success_samples} samples regenerated, {error_samples} samples failed."
        )
(local)", + ) + model_group.add_argument("--draft-config-path", type=str, default=None) + model_group.add_argument("--block-size", type=int, default=16) + model_group.add_argument("--num-draft-layers", type=int, default=1) + model_group.add_argument( + "--mask-token-id", + type=int, + default=None, + help="MASK token ID. If not provided, auto-detect from tokenizer.", + ) + model_group.add_argument( + "--attention-backend", + type=str, + default="flex_attention", + choices=["eager", "sdpa", "flex_attention"], + help="Attention backend for draft model.", + ) + model_group.add_argument( + "--trust-remote-code", action="store_true", help="Trust remote code" + ) + model_group.add_argument( + "--num-anchors", + type=int, + default=512, + help="Number of anchor positions per sequence", + ) + model_group.add_argument( + "--loss-decay-gamma", + type=float, + default=None, + help="Gamma for exponential loss decay weighting (paper Eq.4). " + "Suggested: 7 for block_size=16, 5 for 10, 4 for 8. None disables.", + ) + model_group.add_argument( + "--embedding-key", + type=str, + default=None, + help="Embedding weight key in the target model. " + "Default: 'model.embed_tokens.weight' for standard models, " + "'model.language_model.embed_tokens.weight' for multimodal models like Qwen3.5-A3B.", + ) + model_group.add_argument( + "--lm-head-key", + type=str, + default=None, + help="LM head weight key in the target model. 
Default: 'lm_head.weight'.", + ) + + dataset_group = parser.add_argument_group("dataset") + dataset_group.add_argument("--train-data-path", type=str, required=True) + dataset_group.add_argument("--eval-data-path", type=str, default=None) + dataset_group.add_argument("--chat-template", type=str, default="qwen") + dataset_group.add_argument("--is-preformatted", action="store_true") + dataset_group.add_argument("--dataloader-num-workers", type=int, default=8) + dataset_group.add_argument( + "--build-dataset-num-proc", + type=int, + default=int(os.environ.get("SPECFORGE_DATA_NUM_PROC", 8)), + ) + + training_group = parser.add_argument_group("training") + training_group.add_argument("--num-epochs", type=int, default=6) + training_group.add_argument("--batch-size", type=int, default=1) + training_group.add_argument("--learning-rate", type=float, default=6e-4) + training_group.add_argument("--max-length", type=int, default=3072) + training_group.add_argument("--warmup-ratio", type=float, default=0.04) + training_group.add_argument("--max-grad-norm", type=float, default=1.0) + training_group.add_argument("--accumulation-steps", type=int, default=1) + training_group.add_argument("--seed", type=int, default=42) + training_group.add_argument("--resume", action="store_true") + + output_group = parser.add_argument_group("output") + output_group.add_argument("--output-dir", type=str, required=True) + output_group.add_argument("--cache-dir", type=str, default="./cache") + output_group.add_argument("--log-interval", type=int, default=50) + output_group.add_argument("--eval-interval", type=int, default=1000) + output_group.add_argument("--save-interval", type=int, default=1000) + + optimization_group = parser.add_argument_group("optimization") + optimization_group.add_argument( + "--tp-size", + type=int, + default=1, + help="The size of the tensor parallel for the target model", + ) + + tracker_group = parser.add_argument_group("tracker") + TrackerArgs.add_args(tracker_group) + + 
def build_models(args) -> Tuple[DFlashTargetModel, DFlashDraftModel]:
    """Build target model (backend wrapper) and draft model.

    Args:
        args: Parsed command-line namespace. Reads ``target_model_path``,
            ``target_model_backend``, ``trust_remote_code``,
            ``draft_config_path``, ``block_size``, ``num_draft_layers`` and
            ``attention_backend`` (plus the SGLang backend options when the
            ``sglang`` backend is selected).

    Returns:
        A ``(target_model, draft_model)`` tuple. The draft model is moved to
        CUDA and cast to bfloat16, and the target model is told to capture
        the layers listed in ``draft_model.target_layer_ids``.
    """
    print_on_rank0(
        f"Loading target model from {args.target_model_path} using {args.target_model_backend} backend"
    )

    # Only the sglang backend takes extra constructor kwargs.
    target_model_kwargs = {}
    if args.target_model_backend == "sglang":
        target_model_kwargs = SGLangBackendArgs.from_args(args).to_kwargs()

    target_model = get_dflash_target_model(
        pretrained_model_name_or_path=args.target_model_path,
        backend=args.target_model_backend,
        torch_dtype=torch.bfloat16,
        device="cuda" if args.target_model_backend == "hf" else None,
        trust_remote_code=args.trust_remote_code,
        **target_model_kwargs,
    )

    if args.draft_config_path:
        # Honour --trust-remote-code here as well, so the config load agrees
        # with how the target model itself was loaded above.
        draft_config = AutoConfig.from_pretrained(
            args.draft_config_path, trust_remote_code=args.trust_remote_code
        )
        print_on_rank0(f"Loaded draft config from {args.draft_config_path}")
        # Warn if command-line args differ from config
        if (
            hasattr(draft_config, "block_size")
            and draft_config.block_size != args.block_size
        ):
            print_on_rank0(
                f"Warning: checkpoint block_size ({draft_config.block_size}) differs from "
                f"command-line arg ({args.block_size}). Using checkpoint value."
            )
    else:
        # Derive the draft config from the target config. A single load is
        # enough: record the target depth before shrinking the layer count.
        draft_config = AutoConfig.from_pretrained(
            args.target_model_path, trust_remote_code=args.trust_remote_code
        )
        draft_config.num_target_layers = draft_config.num_hidden_layers
        draft_config.num_hidden_layers = args.num_draft_layers
        draft_config.block_size = args.block_size
        print_on_rank0("Auto-generated draft config from target model")

    # Callers write into dflash_config later, so make sure it is a dict.
    if getattr(draft_config, "dflash_config", None) is None:
        draft_config.dflash_config = {}

    draft_config._attn_implementation = args.attention_backend
    print_on_rank0(f"Using attention backend: {args.attention_backend}")

    draft_model = DFlashDraftModel(draft_config).cuda().to(torch.bfloat16)

    target_model.set_capture_layers(draft_model.target_layer_ids)

    # A user-supplied config may lack the optional attributes (the hasattr
    # check above already allows for that), so never let this log line raise.
    print_on_rank0(
        f"Draft config: block_size={getattr(draft_config, 'block_size', None)}, "
        f"num_hidden_layers={draft_config.num_hidden_layers}, "
        f"num_target_layers={getattr(draft_config, 'num_target_layers', None)}"
    )
    print_on_rank0(
        f"Draft model parameters: {sum(p.numel() for p in draft_model.parameters()):,}"
    )

    return target_model, draft_model
num_proc=args.build_dataset_num_proc, + ) + + min_loss_tokens = 2 * args.block_size + original_size = len(train_eagle3_dataset) + train_eagle3_dataset = train_eagle3_dataset.filter( + lambda x: x["loss_mask"].sum() >= min_loss_tokens + ) + print_on_rank0( + f"Filtered train dataset: {original_size} -> {len(train_eagle3_dataset)} samples" + ) + + train_dataloader = prepare_dp_dataloaders( + train_eagle3_dataset, + args.batch_size, + num_workers=args.dataloader_num_workers, + shuffle=True, + process_group=get_dp_group(), + ) + + eval_dataloader = None + if args.eval_data_path: + eval_dataset = load_dataset("json", data_files=args.eval_data_path)["train"] + eval_eagle3_dataset = build_eagle3_dataset( + dataset=eval_dataset, + tokenizer=tokenizer, + chat_template=args.chat_template, + max_length=args.max_length, + is_preformatted=args.is_preformatted, + ) + eval_dataloader = prepare_dp_dataloaders( + eval_eagle3_dataset, + args.batch_size, + num_workers=args.dataloader_num_workers, + shuffle=False, + process_group=get_dp_group(), + ) + + return train_dataloader, eval_dataloader + + +def save_checkpoint(args, epoch, step, dflash_model, draft_model, optimizer): + """Save checkpoint.""" + save_dir = os.path.join(args.output_dir, f"epoch_{epoch}_step_{step}") + if dist.get_rank() == 0: + os.makedirs(save_dir, exist_ok=True) + dist.barrier() + + with FSDP.state_dict_type(dflash_model, StateDictType.FULL_STATE_DICT): + state_dict = dflash_model.state_dict() + draft_state_dict = { + k.replace("draft_model.", ""): v + for k, v in state_dict.items() + if "draft_model." 
def record_metrics(
    args,
    loss: float,
    accuracy: float,
    global_step: int,
    tracker,
    optimizer,
    train_dataloader=None,
    mode: str = "train",
) -> None:
    """Print a one-line progress summary and forward metrics to the tracker.

    Args:
        args: Parsed command-line namespace; ``num_epochs`` is read when a
            dataloader is supplied.
        loss: Scalar loss value to report.
        accuracy: Scalar accuracy value to report.
        global_step: Current step counter, used as the tracker step.
        tracker: Object exposing ``log(dict, step=...)``.
        optimizer: Object exposing ``get_learning_rate()``; may be ``None``,
            in which case no learning rate is logged.
        train_dataloader: Optional sized dataloader used to show the total
            number of batches. When ``None`` the total is omitted.
        mode: Metric namespace prefix, e.g. ``"train"`` or ``"eval"``.
    """
    logdict = {}

    if mode == "train" and optimizer is not None:
        logdict["train/lr"] = optimizer.get_learning_rate()

    logdict[f"{mode}/loss"] = loss
    logdict[f"{mode}/accuracy"] = accuracy

    # The dataloader is optional, so only show "[step/total]" when the total
    # can be computed. The training loop increments global_step once per
    # batch, so the matching total is the batch count over all epochs.
    if train_dataloader is not None:
        total_batches = args.num_epochs * len(train_dataloader)
        progress = f" [{global_step}/{total_batches}]"
    else:
        progress = ""

    print_on_rank0(
        f"{mode.capitalize()} - Step {global_step}{progress}, Loss: {loss:.4f}, Acc: {accuracy:.4f}"
    )

    tracker.log(logdict, step=global_step)
checkpoint to ensure consistency + if draft_model_last_checkpoint: + checkpoint_config_path = os.path.join( + draft_model_last_checkpoint, "config.json" + ) + if os.path.exists(checkpoint_config_path): + print(f"Loading draft config from checkpoint: {checkpoint_config_path}") + args.draft_config_path = checkpoint_config_path + + target_model, draft_model = build_models(args) + + resume_state = None + if draft_model_last_checkpoint: + loaded_model = DFlashDraftModel.from_pretrained( + draft_model_last_checkpoint, torch_dtype=torch.bfloat16 + ) + draft_model.load_state_dict(loaded_model.state_dict()) + del loaded_model + print("Loaded draft model weights from checkpoint") + + training_state_path = os.path.join( + draft_model_last_checkpoint, "training_state.pt" + ) + if os.path.exists(training_state_path): + resume_state = torch.load( + training_state_path, map_location="cpu", weights_only=False + ) + print( + f"Will resume from epoch {resume_state['epoch']}, " + f"step {resume_state['global_step']}" + ) + + tokenizer = AutoTokenizer.from_pretrained(args.target_model_path) + + if args.mask_token_id is not None: + mask_token_id = args.mask_token_id + elif tokenizer.mask_token_id is not None: + mask_token_id = tokenizer.mask_token_id + else: + tokenizer.add_special_tokens({"mask_token": "<|MASK|>"}) + mask_token_id = tokenizer.mask_token_id + print_on_rank0(f"Using mask_token_id: {mask_token_id}") + + draft_model.mask_token_id = mask_token_id + draft_model.config.dflash_config["mask_token_id"] = mask_token_id + draft_model.config.dflash_config["target_layer_ids"] = draft_model.target_layer_ids + print_on_rank0(f"dflash_config: {draft_model.config.dflash_config}") + + train_dataloader, eval_dataloader = build_dataloader(args, tokenizer) + + steps_per_epoch = math.ceil(len(train_dataloader) / args.accumulation_steps) + total_steps = args.num_epochs * steps_per_epoch + print_on_rank0(f"Total training steps: {total_steps}") + + print_on_rank0("Loading target embeddings and 
head...") + target_components = TargetEmbeddingsAndHead.from_pretrained( + args.target_model_path, + embed_key=args.embedding_key, + lm_head_key=args.lm_head_key, + device="cuda", + trust_remote_code=args.trust_remote_code, + ) + + dflash_model = OnlineDFlashModel( + draft_model=draft_model, + target_lm_head=target_components.lm_head, + target_embed_tokens=target_components.embed_tokens, + block_size=draft_model.block_size, + mask_token_id=mask_token_id, + attention_backend=args.attention_backend, + num_anchors=args.num_anchors, + loss_decay_gamma=args.loss_decay_gamma, + ) + + dflash_model = FSDP( + dflash_model, + use_orig_params=True, + mixed_precision=MixedPrecision( + param_dtype=torch.bfloat16, + buffer_dtype=torch.bfloat16, + ), + sharding_strategy=ShardingStrategy.SHARD_GRAD_OP, + ) + print_with_rank("Initialized FSDP") + + start_epoch = ckpt_info[0] + global_step = ckpt_info[1] + + optimizer = BF16Optimizer( + draft_model, + lr=args.learning_rate, + max_grad_norm=args.max_grad_norm, + warmup_ratio=args.warmup_ratio, + total_steps=total_steps, + ) + + if resume_state is not None: + optimizer.scheduler.load_state_dict(resume_state["scheduler_state_dict"]) + start_epoch = resume_state["epoch"] + global_step = resume_state["global_step"] + del resume_state + print_on_rank0( + f"Restored optimizer/scheduler state: " + f"epoch={start_epoch}, step={global_step}, " + f"lr={optimizer.get_learning_rate():.6f}" + ) + + skip_steps = global_step - start_epoch * len(train_dataloader) + + print_on_rank0(f"Initializing tracker (report_to={args.report_to})...") + tracker = create_tracker(args, args.output_dir) + print_on_rank0("Tracker initialized successfully.") + + last_time = time.time() + print_on_rank0(f"Starting training from epoch {start_epoch}, step {global_step}") + + for epoch in range(start_epoch, args.num_epochs): + train_dataloader.sampler.set_epoch(epoch) + draft_model.train() + + if dist.get_rank() == 0: + progress_bar = tqdm( + train_dataloader, 
desc=f"Training Epoch {epoch}", leave=True + ) + else: + progress_bar = train_dataloader + + for step_in_epoch, data in enumerate(progress_bar): + if epoch == start_epoch and step_in_epoch < skip_steps: + continue + global_step += 1 + + input_ids = data["input_ids"].cuda() + attention_mask = data["attention_mask"].cuda() + loss_mask = data["loss_mask"].cuda() + target_output = target_model.generate_dflash_data( + input_ids, attention_mask, loss_mask + ) + hidden_states = target_output.hidden_states.cuda() # Ensure on GPU + + loss, accuracy = dflash_model( + input_ids=input_ids, + hidden_states=hidden_states, + loss_mask=loss_mask, + ) + + (loss / args.accumulation_steps).backward() + + if global_step % args.accumulation_steps == 0: + optimizer.step() + + if global_step % args.log_interval == 0: + loss_log = loss.clone() + acc_log = accuracy.clone() + dist.all_reduce(loss_log) + dist.all_reduce(acc_log) + loss_log = loss_log / dist.get_world_size() + acc_log = acc_log / dist.get_world_size() + + record_metrics( + args, + loss_log.item(), + acc_log.item(), + global_step, + tracker, + optimizer, + train_dataloader, + mode="train", + ) + + if dist.get_rank() == 0: + elapsed = time.time() - last_time + last_time = time.time() + progress_bar.set_postfix( + { + "loss": f"{loss.item():.4f}", + "acc": f"{accuracy.item():.4f}", + "iter_time": f"{elapsed:.2f}s", + } + ) + + if global_step % args.save_interval == 0: + save_checkpoint( + args, epoch, global_step, dflash_model, draft_model, optimizer + ) + + save_checkpoint( + args, args.num_epochs, global_step, dflash_model, draft_model, optimizer + ) + + tracker.close() + destroy_distributed() + + +if __name__ == "__main__": + main() diff --git a/idea1/scripts/train_eagle3.py b/idea1/scripts/train_eagle3.py new file mode 100644 index 0000000000000000000000000000000000000000..0bd157b39fa3f49d693693b4aaaed941b5fc2de3 --- /dev/null +++ b/idea1/scripts/train_eagle3.py @@ -0,0 +1,1012 @@ +import argparse +import hashlib +import 
math +import os +import time +from argparse import ArgumentParser, Namespace +from typing import List, Optional, Tuple, Union + +import torch +import torch.distributed as dist +import torch.nn as nn +from accelerate.utils import set_seed +from torch.distributed.fsdp import FullyShardedDataParallel as FSDP +from torch.distributed.fsdp import MixedPrecision, ShardingStrategy, StateDictType +from torch.optim import Optimizer +from torch.utils.data import DataLoader +from tqdm import tqdm +from transformers import AutoProcessor, AutoTokenizer + +from datasets import Dataset +from specforge import ( + AutoDraftModelConfig, + AutoEagle3DraftModel, + OnlineEagle3Model, + QwenVLOnlineEagle3Model, +) +from specforge.args import SGLangBackendArgs, TrackerArgs +from specforge.data import ( + build_eagle3_dataset, + build_offline_eagle3_dataset, + generate_vocab_mapping_file, + prepare_dp_dataloaders, +) +from specforge.distributed import ( + destroy_distributed, + get_dp_group, + get_draft_dp_group, + get_tp_group, + init_distributed, +) +from specforge.modeling.target import ( + Eagle3TargetModel, + TargetHead, + get_eagle3_target_model, +) +from specforge.optimizer import BF16Optimizer +from specforge.tracker import Tracker, create_tracker, get_tracker_class +from specforge.utils import ( + create_draft_config_from_target, + get_last_checkpoint, + print_args_with_dots, + print_on_rank0, + print_with_rank, + rank_0_priority, + safe_conversations_generator, +) + + +def parse_args() -> Tuple[ArgumentParser, Namespace]: + """ + This function is used to parse the arguments for the training script. 
+ """ + parser = argparse.ArgumentParser(description="Train Eagle3 with online data") + + # add model-related arguments + model_group = parser.add_argument_group("model") + model_group.add_argument("--target-model-path", type=str, required=True) + model_group.add_argument( + "--trust-remote-code", action="store_true", help="Trust remote code" + ) + model_group.add_argument( + "--draft-model-config", + type=str, + required=False, + help="Draft model config path. If not provided, will auto-generate from target model.", + ) + model_group.add_argument( + "--embedding-key", + type=str, + default="model.embed_tokens.weight", + help="The key of the embedding weight to load from the target model", + ) + model_group.add_argument( + "--lm-head-key", + type=str, + default="lm_head.weight", + help="The key of the lm head weight to load from the target model, this is only required for offline training", + ) + model_group.add_argument( + "--is-vlm", action="store_true", help="Whether the target model is a VLM" + ) + model_group.add_argument( + "--target-model-backend", + type=str, + default="sglang", + choices=["sglang", "hf", "custom"], + help="The backend of the target model", + ) + + # dataset arguments + dataset_group = parser.add_argument_group("dataset") + dataset_group.add_argument("--train-data-path", type=str, required=True) + dataset_group.add_argument("--train-hidden-states-path", type=str, default=None) + dataset_group.add_argument("--eval-hidden-states-path", type=str, default=None) + dataset_group.add_argument("--eval-data-path", type=str, default=None) + dataset_group.add_argument("--chat-template", type=str, default="llama3") + dataset_group.add_argument( + "--is-preformatted", + action="store_true", + help="Whether the input data is preformatted text with the chat template already applied to the conversation messages.", + ) + dataset_group.add_argument( + "--train-only-last-turn", + action="store_true", + help="If set, only the last assistant turn in each 
conversation contributes to the loss. " + "Useful for thinking models where conversation history may lack thought processes.", + ) + dataset_group.add_argument("--build-dataset-num-proc", type=int, default=8) + dataset_group.add_argument( + "--dataloader-num-workers", + type=int, + default=4, + help="Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process.", + ) + # training hyper params + training_group = parser.add_argument_group("training") + training_group.add_argument("--num-epochs", type=int, default=10) + training_group.add_argument( + "--max-num-steps", + type=int, + default=None, + help="The maximum number of steps to train. If not provided, will be calculated as num_epochs * steps_per_epoch", + ) + training_group.add_argument("--batch-size", type=int, default=1) + training_group.add_argument("--learning-rate", type=float, default=1e-4) + training_group.add_argument("--max-length", type=int, default=2048) + training_group.add_argument("--warmup-ratio", type=float, default=0.015) + training_group.add_argument( + "--total-steps", + type=int, + default=None, + help="Total training steps. 
If not provided, will be calculated as num_epochs * steps_per_epoch", + ) + training_group.add_argument("--max-grad-norm", type=float, default=0.5) + training_group.add_argument( + "--ttt-length", + type=int, + default=7, + help="The length for Test-Time Training (TTT).", + ) + training_group.add_argument("--resume", action="store_true") + training_group.add_argument( + "--ckpt-dir", + type=str, + default=None, + help="directory includes the checkpoint to start training with", + ) + training_group.add_argument("--eval-interval", type=int, default=5000) + training_group.add_argument("--save-interval", type=int, default=5000) + training_group.add_argument( + "--log-interval", + type=int, + default=50, + help="Log training metrics every N steps", + ) + training_group.add_argument("--seed", type=int, default=0) + training_group.add_argument("--draft-accumulation-steps", type=int, default=1) + + # data processing type + optimization_group = parser.add_argument_group("optimization") + optimization_group.add_argument( + "--tp-size", + type=int, + default=1, + help="The size of the tensor parallel for the target model", + ) + # distributed training + optimization_group.add_argument("--sp-ulysses-size", type=int, default=1) + optimization_group.add_argument("--sp-ring-size", type=int, default=1) + optimization_group.add_argument( + "--attention-backend", + type=str, + default="flex_attention", + help="The attention backend for the draft model", + ) + + # other args + other_group = parser.add_argument_group("others") + other_group.add_argument("--cache-key", type=str, default=None) + other_group.add_argument("--cache-dir", type=str, default="./cache") + other_group.add_argument("--output-dir", type=str, required=True) + other_group.add_argument("--verbose", action="store_true") + other_group.add_argument( + "--dist-timeout", + type=int, + default=20, + help="Timeout for collective communication in minutes", + ) + other_group.add_argument( + "--model-download-dir", + 
def build_tracker(args: Namespace, parser: ArgumentParser) -> Tracker:
    """
    Create the experiment tracker selected by ``args.report_to``.

    The tracker class is looked up first so that its own argument validation
    can run; an unrecognised name is reported through ``parser.error``.

    Args:
        args: The arguments for the training script.
        parser: The parser for the training script, used for validation and
            error reporting.

    Returns:
        The tracker created for ``args.output_dir``.
    """
    selected_cls = get_tracker_class(args.report_to)
    if not selected_cls:
        parser.error(f"Unknown tracker: {args.report_to}")
    else:
        selected_cls.validate_args(parser, args)
    return create_tracker(args, args.output_dir)
def sanity_check(args: Namespace) -> None:
    """
    Derive parallelism settings and validate the arguments.

    Stores ``dp_size`` (world size divided by ``tp_size``) and
    ``target_batch_size`` (``tp_size`` times ``batch_size``) on ``args``, then
    runs the sequence-parallel checks when the attention backend is ``"usp"``.

    Args:
        args: The arguments for the training script; modified in place.

    Returns:
        None
    """
    world_size = dist.get_world_size()
    tensor_parallel = args.tp_size
    args.dp_size = world_size // tensor_parallel
    args.target_batch_size = args.tp_size * args.batch_size

    uses_usp = args.attention_backend == "usp"
    if uses_usp:
        sp_sanity_check(args)
def build_draft_model(
    args: Namespace,
) -> Tuple[AutoDraftModelConfig, nn.Module, Tuple[int, int], Optional[dict]]:
    """
    Build the draft model, optionally starting from a checkpoint.

    Checkpoint precedence: a checkpoint found in ``args.output_dir`` (only
    looked for when ``args.resume`` is set) wins over ``args.ckpt_dir``; with
    neither, the model is created from the draft config alone.

    Args:
        args: The arguments for the training script.

    Returns:
        A 4-tuple ``(draft_model_config, draft_model, ckpt_info, resume_state)``.
        ``ckpt_info`` is the ``(epoch, step)`` pair for the resumed checkpoint,
        or ``(0, 0)`` when nothing was resumed. ``resume_state`` is the content
        of ``training_state.pt`` from the resumed checkpoint, or ``None``.

    Raises:
        ValueError: If ``args.ckpt_dir`` is given but is not a directory.
    """
    # ckpt info(epoch, step)
    ckpt_info = (0, 0)

    # Handle draft model config
    if args.draft_model_config is None:
        # Auto-generate and save config file
        auto_config_path = create_draft_config_from_target(
            target_model_path=args.target_model_path, cache_dir=args.model_download_dir
        )
        draft_model_config = AutoDraftModelConfig.from_file(auto_config_path)
    else:
        # Use provided config file
        draft_model_config = AutoDraftModelConfig.from_file(args.draft_model_config)

    # Handle base ckpt, config file
    draft_model_last_checkpoint = None
    is_resume_checkpoint = False
    if args.ckpt_dir is not None:
        if not os.path.isdir(args.ckpt_dir):
            raise ValueError(
                f"Provided base model dir {args.ckpt_dir} is not a valid directory."
            )
        draft_model_config = AutoDraftModelConfig.from_file(
            os.path.join(args.ckpt_dir, "config.json")
        )
        draft_model_last_checkpoint = args.ckpt_dir
        print_on_rank0(f"Finetuning from base model: {draft_model_last_checkpoint}")

    # detecting last ckpt for draft model
    if args.resume and os.path.isdir(args.output_dir):
        print_on_rank0(args.output_dir)
        resumed_checkpoint, resumed_info = get_last_checkpoint(args.output_dir)
        print(f"Last checkpoint detected: {resumed_checkpoint}")
        # Only override the --ckpt-dir base when a checkpoint was actually
        # found; otherwise an output dir without checkpoints would silently
        # discard the fine-tuning base.
        if resumed_checkpoint:
            draft_model_last_checkpoint = resumed_checkpoint
            ckpt_info = resumed_info
            is_resume_checkpoint = True

    if draft_model_last_checkpoint:
        draft_model = AutoEagle3DraftModel.from_pretrained(
            draft_model_last_checkpoint,
            attention_backend=args.attention_backend,
            torch_dtype=torch.bfloat16,
        ).cuda()
    else:
        draft_model = AutoEagle3DraftModel.from_config(
            draft_model_config,
            attention_backend=args.attention_backend,
            torch_dtype=torch.bfloat16,
        ).cuda()

    # Load training state (optimizer, scheduler, epoch, step) for true resume.
    # A --ckpt-dir base is a fresh fine-tune, so its training state is ignored.
    resume_state = None
    if is_resume_checkpoint and draft_model_last_checkpoint:
        training_state_path = os.path.join(
            draft_model_last_checkpoint, "training_state.pt"
        )
        if os.path.exists(training_state_path):
            # NOTE(review): weights_only=False unpickles arbitrary objects;
            # only resume from checkpoints this script produced.
            resume_state = torch.load(
                training_state_path, map_location="cpu", weights_only=False
            )
            print_on_rank0(
                f"Loaded training state from {training_state_path}: "
                f"epoch={resume_state['epoch']}, step={resume_state['global_step']}"
            )

    draft_model.load_embedding(args.target_model_path, embedding_key=args.embedding_key)
    draft_model.freeze_embedding()
    return draft_model_config, draft_model, ckpt_info, resume_state
hashlib.md5(cache_params_string.encode()).hexdigest() + train_dataset = Dataset.from_generator( + generator=safe_conversations_generator, + gen_kwargs={"file_path": args.train_data_path}, + ) + is_online = ( + args.train_data_path is not None and args.train_hidden_states_path is None + ) + with rank_0_priority(): + train_eagle3_dataset = build_eagle3_dataset( + dataset=train_dataset, + tokenizer=tokenizer, + chat_template=args.chat_template, + max_length=args.max_length, + cache_dir=os.path.join(args.cache_dir, "processed_dataset"), + cache_key=cache_key, + is_vlm=args.is_vlm, + is_preformatted=args.is_preformatted, + processor=processor, + num_proc=args.build_dataset_num_proc, + train_only_last_turn=args.train_only_last_turn, + ) + vocab_mapping_path = generate_vocab_mapping_file( + dataset=train_eagle3_dataset, + target_vocab_size=draft_model_config.vocab_size, + draft_vocab_size=draft_model_config.draft_vocab_size, + cache_dir=os.path.join(args.cache_dir, "vocab_mapping"), + cache_key=cache_key, + ) + + if not is_online: + train_eagle3_dataset = build_offline_eagle3_dataset( + args.train_hidden_states_path, + args.max_length, + ttt_length=args.ttt_length, + use_usp_preprocess=(args.attention_backend == "usp"), + ) + + train_dataloader = prepare_dp_dataloaders( + train_eagle3_dataset, + args.target_batch_size, + num_workers=args.dataloader_num_workers, + shuffle=True, + process_group=( + get_draft_dp_group() + if args.attention_backend == "usp" and not is_online + else get_dp_group() + ), + is_vlm=args.is_vlm, + ) + if args.eval_data_path is not None or args.eval_hidden_states_path is not None: + if args.eval_data_path is not None: + eval_dataset = Dataset.from_generator( + generator=safe_conversations_generator, + gen_kwargs={"file_path": args.eval_data_path}, + ) + eval_eagle3_dataset = build_eagle3_dataset( + eval_dataset, + tokenizer, + args.chat_template, + args.max_length, + is_vlm=args.is_vlm, + processor=processor, + num_proc=args.build_dataset_num_proc, 
def save_checkpoints(
    args: Namespace,
    epoch: int,
    step: int,
    eagle3_model: nn.Module,
    optimizer: Optimizer,
):
    """
    Write a checkpoint to ``<args.output_dir>/epoch_<epoch>_step_<step>``.

    Every rank takes part in gathering the full FSDP state dict; only rank 0
    writes to disk. Two artifacts are produced:

    * ``training_state.pt`` -- ``epoch``, ``global_step``, ``args`` merged with
      ``optimizer.state_dict()``.
    * the draft model saved via ``save_pretrained``, using the ``draft_model.``
      entries of the full state dict with that prefix removed and any key
      containing "embed" (case-insensitive) left out.
    """
    on_rank0 = dist.get_rank() == 0
    ckpt_dir = os.path.join(args.output_dir, f"epoch_{epoch}_step_{step}")

    if on_rank0:
        os.makedirs(ckpt_dir, exist_ok=True)
    # Make sure the directory exists before any rank proceeds.
    dist.barrier()

    with FSDP.state_dict_type(eagle3_model, StateDictType.FULL_STATE_DICT):
        full_state = eagle3_model.state_dict()

        training_state = {"epoch": epoch, "global_step": step, "args": args}
        training_state.update(optimizer.state_dict())

        # Keep only the draft model's own weights, without the embedding.
        draft_weights = {}
        for key, value in full_state.items():
            if "draft_model." not in key:
                continue
            if "embed" in key.lower():
                continue
            draft_weights[key.replace("draft_model.", "")] = value

        if on_rank0:
            training_state_file = os.path.join(ckpt_dir, "training_state.pt")
            torch.save(training_state, training_state_file)
            print_on_rank0(
                f"Saved full training state to {ckpt_dir}/training_state.pt"
            )
            eagle3_model.draft_model.save_pretrained(
                ckpt_dir,
                state_dict=draft_weights,
            )
            print_on_rank0(f"Saved model configuration to {ckpt_dir}")
        # Hold the other ranks until rank 0 has finished writing.
        dist.barrier()
def record_metrcs(
    args: Namespace,
    accuracies: List[torch.Tensor],
    plosses: List[torch.Tensor],
    global_step: int,
    tracker: Tracker,
    optimizer: Optional[Optimizer] = None,
    mode: str = "train",
) -> None:
    """
    Average per-position accuracies and losses across ranks and log them.

    Args:
        args: Parsed arguments; ``args.ttt_length`` must equal the number of
            accuracy entries.
        accuracies: One scalar tensor per draft position.
        plosses: One scalar tensor per draft position.
        global_step: Step value passed to the tracker and shown in the console.
        tracker: Receives the metric dict via ``tracker.log(logdict, step=...)``.
        optimizer: When given and ``mode == "train"``, its learning rate is
            logged as ``train/lr``.
        mode: Metric namespace, used as the key prefix (``"<mode>/acc_<i>"``,
            ``"<mode>/ploss_<i>"``) and as the console label.

    The console lines are labelled with ``mode``; they used to be hard-coded as
    "Eval" even for training metrics and printed a ``[step + 1 / num_epochs]``
    fraction that compared steps with epochs.
    """
    logdict = {}

    if mode == "train" and optimizer is not None:
        logdict["train/lr"] = optimizer.get_learning_rate()

    accuracies = torch.stack(accuracies)
    plosses = torch.stack(plosses)
    log_prefix = f"{mode.capitalize()} - Step {global_step}"

    assert accuracies.shape[0] == args.ttt_length
    # all_reduce is a collective: every rank must reach this call.
    dist.all_reduce(accuracies, op=dist.ReduceOp.AVG)
    accuracies = accuracies.cpu().tolist()
    for i, acc in enumerate(accuracies):
        logdict[f"{mode}/acc_{i}"] = acc
        print_on_rank0(f"{log_prefix}, position {i}, Acc: {acc:.2f}")

    dist.all_reduce(plosses, op=dist.ReduceOp.AVG)
    plosses = plosses.cpu().tolist()
    for i, ploss in enumerate(plosses):
        logdict[f"{mode}/ploss_{i}"] = ploss
        print_on_rank0(f"{log_prefix}, position {i}, pLoss: {ploss}")
    tracker.log(logdict, step=global_step)
Build dataloader + # ================================================ + train_dataloader, vocab_mapping_path, eval_dataloader = build_dataloaders( + args, draft_model_config, processor + ) + + # we load the vocab mapping then + draft_model.load_vocab_mapping(vocab_mapping_path) + print_with_rank("Loaded vocab mapping") + + # Calculate total steps if not provided + if args.total_steps is None: + steps_per_epoch = math.ceil( + len(train_dataloader) / args.draft_accumulation_steps + ) + args.total_steps = args.num_epochs * steps_per_epoch + print_with_rank( + f"Auto-calculated total_steps: {args.total_steps} (num_epochs={args.num_epochs} * steps_per_epoch={steps_per_epoch})" + ) + else: + print_with_rank(f"Using provided total_steps: {args.total_steps}") + + # ================================================ + # 4. Build Eagle3 model + # ================================================ + if ( + args.is_vlm + and getattr(draft_model_config, "target_model_type", None) == "qwen2_5_vl" + and args.tp_size == 1 + and args.target_model_backend != "sglang" + ): + eagle3_model = QwenVLOnlineEagle3Model( + target_model=target_model, + draft_model=draft_model, + processor=processor, + length=args.ttt_length, + attention_backend=args.attention_backend, + ) + else: + if is_online: + eagle3_model = OnlineEagle3Model( + target_model=target_model, + draft_model=draft_model, + length=args.ttt_length, + attention_backend=args.attention_backend, + ) + else: + # offline: the target_model is TargetHead not a model + eagle3_model = OnlineEagle3Model( + draft_model=draft_model, + length=args.ttt_length, + attention_backend=args.attention_backend, + ) + eagle3_model = FSDP( + eagle3_model, + use_orig_params=True, + mixed_precision=MixedPrecision( + param_dtype=torch.bfloat16, + buffer_dtype=torch.bfloat16, + ), + sharding_strategy=ShardingStrategy.SHARD_GRAD_OP, + process_group=dist.group.WORLD, # the draft model should run dp for all processes + ) + print_with_rank("Initialized Eagle3 FSDP 
model") + + # ================================================ + # 5. Build optimizer and scheduler + # ================================================ + optimizer = BF16Optimizer( + draft_model, + lr=args.learning_rate, + max_grad_norm=args.max_grad_norm, + warmup_ratio=args.warmup_ratio, + total_steps=args.total_steps, + ) + print_with_rank("Initialized optimizer and scheduler") + + # Restore optimizer/scheduler state for true resume + if resume_state is not None: + optimizer.load_state_dict(resume_state) + start_epoch = resume_state["epoch"] + global_step = resume_state["global_step"] + print_on_rank0( + f"Restored optimizer/scheduler state: " + f"epoch={start_epoch}, step={global_step}, " + f"lr={optimizer.get_learning_rate():.6f}" + ) + del resume_state + else: + start_epoch = ckpt_info[0] + global_step = ckpt_info[1] + + # Calculate how many steps to skip in the current epoch (for dataloader fast-forward) + skip_steps = global_step - start_epoch * len(train_dataloader) + + # ================================================ + # 6. Build tracker + # ================================================ + tracker = build_tracker(args, parser) + dist.barrier() + + last_time = time.time() + + # ================================================ + # 7. 
Start training + # ================================================ + print_on_rank0( + f"Starting training from epoch:{start_epoch} step:{global_step}" + ) + + for epoch in range(start_epoch, args.num_epochs): + # Run training + train_dataloader.sampler.set_epoch(epoch + 1) + draft_model.train() + + if dist.get_rank() == 0: + progress_bar = tqdm( + train_dataloader, desc=f"Training Epoch {epoch}", leave=True + ) + else: + progress_bar = train_dataloader + + for step_in_epoch, data in enumerate(progress_bar): + # Skip steps already processed in the current epoch when resuming + if epoch == start_epoch and step_in_epoch < skip_steps: + continue + + global_step += 1 + + # ================================================ + # 7.0 Profiling + # ================================================ + if args.profile: + # we add the step by 1 to align with global step + if global_step == args.profile_start_step + 1: + print("Start profile") + torch_profiler = torch.profiler.profile( + activities=[ + torch.profiler.ProfilerActivity.CPU, + torch.profiler.ProfilerActivity.CUDA, + ], + with_stack=True, + record_shapes=args.profile_record_shapes, + ) + torch_profiler.start() + if global_step == args.profile_start_step + args.profile_num_steps + 1: + output_path = os.path.join( + args.output_dir, + f"profile_rank{torch.distributed.get_rank()}_{time.time()}.trace.json.gz", + ) + print(f"End profile {output_path=}") + torch_profiler.stop() + torch_profiler.export_chrome_trace(output_path) + + # ================================================ + # 7.1 Training Step + # ================================================ + plosses, acces = run_forward( + args, + eagle3_model, + data, + target_model, + is_online, + ) + run_backward_and_update(args, plosses, optimizer, global_step) + + # log training metrics + if global_step % (args.log_interval * args.draft_accumulation_steps) == 0: + record_metrcs( + args, + acces, + plosses, + global_step // args.draft_accumulation_steps, + tracker, + 
optimizer, + mode="train", + ) + + if dist.get_rank() == 0: + time_per_step = time.time() - last_time + last_time = time.time() + avg_loss = sum(pl for pl in plosses) / len(plosses) + avg_acc = sum(acces) / len(acces) + progress_bar.set_postfix( + { + "loss": f"{avg_loss:.2f}", + "acc": f"{avg_acc:.2f}", + "time": f"{time_per_step:.2f}s", + } + ) + + # ================================================ + # 7.2 Evaluation Step + # ================================================ + should_evaluate = ( + args.eval_data_path is not None + or args.eval_hidden_states_path is not None + ) + if ( + should_evaluate + and global_step % (args.eval_interval * args.draft_accumulation_steps) + == 0 + ): + # Run evaluation + draft_model.eval() + eval_acces = [[] for _ in range(eagle3_model.length)] + eval_plosses = [[] for _ in range(eagle3_model.length)] + + for data in tqdm(eval_dataloader, desc=f"Evaluating Epoch {epoch}"): + with torch.no_grad(): + plosses, acces = run_forward( + args, eagle3_model, data, target_model, is_online + ) + eval_acces = [ + eval_acces[i] + [acces[i]] for i in range(len(acces)) + ] + eval_plosses = [ + eval_plosses[i] + [plosses[i]] for i in range(len(plosses)) + ] + + # compute average over all minibatches + eval_acces = [torch.stack(acc).mean() for acc in eval_acces] + eval_plosses = [torch.stack(pl).mean() for pl in eval_plosses] + + record_metrcs( + args, + eval_acces, + eval_plosses, + global_step // args.draft_accumulation_steps, + tracker, + mode="eval", + ) + # ================================================ + # 7.3 Save Checkpoints + # ================================================ + if global_step % args.save_interval == 0: + # Save the model + save_checkpoints(args, epoch, global_step, eagle3_model, optimizer) + + if args.max_num_steps is not None and global_step >= args.max_num_steps: + break + + if args.max_num_steps is not None and global_step >= args.max_num_steps: + break + # Save final checkpoint if training ended without saving + 
if global_step % args.save_interval != 0: + print_on_rank0( + f"Training completed at step {global_step}, saving final checkpoint..." + ) + save_checkpoints(args, epoch, global_step, eagle3_model, optimizer) + + # Close the tracker + tracker.close() + destroy_distributed() + + +if __name__ == "__main__": + main() diff --git a/idea1/specforge.egg-info/PKG-INFO b/idea1/specforge.egg-info/PKG-INFO new file mode 100644 index 0000000000000000000000000000000000000000..499210115508c2ba447eb29d40c44faf3c0e6d31 --- /dev/null +++ b/idea1/specforge.egg-info/PKG-INFO @@ -0,0 +1,106 @@ +Metadata-Version: 2.4 +Name: specforge +Version: 0.2.0 +Summary: SpecForge: Speculative Decoding Training Framework +Author: SGLang Team +Project-URL: Homepage, https://github.com/sgl-project/SpecForge +Requires-Python: >=3.11 +Description-Content-Type: text/markdown +License-File: LICENSE +Requires-Dist: pre-commit +Requires-Dist: torch==2.9.1 +Requires-Dist: torchaudio==2.9.1 +Requires-Dist: torchvision==0.24.1 +Requires-Dist: transformers==4.57.1 +Requires-Dist: qwen-vl-utils==0.0.11 +Requires-Dist: datasets +Requires-Dist: setuptools +Requires-Dist: tqdm +Requires-Dist: wandb +Requires-Dist: psutil +Requires-Dist: numpy +Requires-Dist: accelerate +Requires-Dist: pydantic +Requires-Dist: sglang==0.5.9 +Requires-Dist: openai-harmony +Requires-Dist: ninja +Requires-Dist: packaging +Requires-Dist: yunchang +Requires-Dist: tensorboard +Provides-Extra: dev +Requires-Dist: pre-commit; extra == "dev" +Requires-Dist: unittest; extra == "dev" +Provides-Extra: fa +Requires-Dist: flash-attn; extra == "fa" +Dynamic: license-file + +
+logo + +[![documentation](https://img.shields.io/badge/📖-Documentation-red.svg?style=flat)](https://docs.sglang.ai/SpecForge/) +[![SpecBundle](https://img.shields.io/badge/🤗%20SpecBundle-yellow.svg?style=flat)](https://huggingface.co/collections/lmsys/specbundle) +[![DeepWiki](https://img.shields.io/badge/DeepWiki-SpecForge-blue.svg?logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAACwAAAAyCAYAAAAnWDnqAAAAAXNSR0IArs4c6QAAA05JREFUaEPtmUtyEzEQhtWTQyQLHNak2AB7ZnyXZMEjXMGeK/AIi+QuHrMnbChYY7MIh8g01fJoopFb0uhhEqqcbWTp06/uv1saEDv4O3n3dV60RfP947Mm9/SQc0ICFQgzfc4CYZoTPAswgSJCCUJUnAAoRHOAUOcATwbmVLWdGoH//PB8mnKqScAhsD0kYP3j/Yt5LPQe2KvcXmGvRHcDnpxfL2zOYJ1mFwrryWTz0advv1Ut4CJgf5uhDuDj5eUcAUoahrdY/56ebRWeraTjMt/00Sh3UDtjgHtQNHwcRGOC98BJEAEymycmYcWwOprTgcB6VZ5JK5TAJ+fXGLBm3FDAmn6oPPjR4rKCAoJCal2eAiQp2x0vxTPB3ALO2CRkwmDy5WohzBDwSEFKRwPbknEggCPB/imwrycgxX2NzoMCHhPkDwqYMr9tRcP5qNrMZHkVnOjRMWwLCcr8ohBVb1OMjxLwGCvjTikrsBOiA6fNyCrm8V1rP93iVPpwaE+gO0SsWmPiXB+jikdf6SizrT5qKasx5j8ABbHpFTx+vFXp9EnYQmLx02h1QTTrl6eDqxLnGjporxl3NL3agEvXdT0WmEost648sQOYAeJS9Q7bfUVoMGnjo4AZdUMQku50McDcMWcBPvr0SzbTAFDfvJqwLzgxwATnCgnp4wDl6Aa+Ax283gghmj+vj7feE2KBBRMW3FzOpLOADl0Isb5587h/U4gGvkt5v60Z1VLG8BhYjbzRwyQZemwAd6cCR5/XFWLYZRIMpX39AR0tjaGGiGzLVyhse5C9RKC6ai42ppWPKiBagOvaYk8lO7DajerabOZP46Lby5wKjw1HCRx7p9sVMOWGzb/vA1hwiWc6jm3MvQDTogQkiqIhJV0nBQBTU+3okKCFDy9WwferkHjtxib7t3xIUQtHxnIwtx4mpg26/HfwVNVDb4oI9RHmx5WGelRVlrtiw43zboCLaxv46AZeB3IlTkwouebTr1y2NjSpHz68WNFjHvupy3q8TFn3Hos2IAk4Ju5dCo8B3wP7VPr/FGaKiG+T+v+TQqIrOqMTL1VdWV1DdmcbO8KXBz6esmYWYKPwDL5b5FA1a0hwapHiom0r/cKaoqr+27/XcrS5UwSMbQAAAABJRU5ErkJggg==)](https://deepwiki.com/sgl-project/SpecForge) + +[![github badge](https://img.shields.io/badge/📃%20LMSYS-Blog-black.svg?style=flat)](https://lmsys.org/blog/2025-07-25-spec-forge/) +[![slack badge](https://img.shields.io/badge/Slack-join-blueviolet?logo=slack&)](https://sgl-fru7574.slack.com/archives/C09784E3EN6) +[![license](https://img.shields.io/badge/License-MIT%202.0-blue)](./LICENSE) + +
+ +## 📍 Overview + +SpecForge is an ecosystem project developed by the SGLang team. It is a framework for training speculative decoding models so that you can smoothly port them over to the SGLang serving framework to speed up your inference. + +We have seen many open-source projects for speculative decoding, but most of them are not well-maintained or not directly compatible with SGLang. We prepared this project because we want the open-source community to enjoy a speculative decoding framework that is +- regularly maintained by the SpecForge team: the code is runnable out-of-the-box +- directly compatible with SGLang: no additional effort is needed to port models to SGLang +- equipped with performant training capabilities: we provide online/offline/tensor-parallel/FSDP training to suit your needs + + +Check out [**our documentation**](https://docs.sglang.ai/SpecForge/) to get started. + + +## 🚀 Accelerate with SpecBundle + +SpecBundle is a collection of production-grade speculative decoding models that are released by the SpecForge team and our industry partners. They provide a higher acceptance rate than the existing open-source checkpoints over a wide range of domains. Together with SGLang, you can experience up to 4x speedup for inference. Check out our resources below: + + +| Item | Link | +| --- | --- | +| 📝 Documentation | [Link](https://docs.sglang.io/SpecForge/community_resources/specbundle.html) | +| 📊 Performance Dashboard | [Link](https://docs.sglang.io/SpecForge/SpecBundle/index.html) | +| 🤗 Hugging Face Collection | [Link](https://huggingface.co/collections/lmsys/specbundle) | + + +## 🎉 News + +- [2025-12] 🎉 Released SpecBundle (phase 1) and SpecForge v0.2. Check out our blog at [LMSYS.org](https://lmsys.org/blog/2025-12-23-spec-bundle-phase-1/) +- [2025-12] 🔔 Released the roadmap for 2026 Q1. +- [2025-08] 🔔 SpecForge is listed as a [flagship project](https://lmsys.org/about/) in LMSYS. Congratulations to the SpecForge team! 
+- [2025-08] 🔥 SpecForge powered the Eagle3 draft model for GPT-OSS. Check out the blog at [LMSYS.org](https://lmsys.org/blog/2025-08-27-gpt-oss/) +- [2025-07] 🔥 SpecForge was released together with Llama4-Eagle3 checkpoints. Check out our blog at [LMSYS.org](https://lmsys.org/blog/2025-07-25-spec-forge/) + +## ✨ Acknowledgements + +acknowledgements + +We would like to express our sincere gratitude to the official EAGLE team, especially Hongyang Zhang and Yuhui Li, for their invaluable contributions and support. Our thanks also go to the NVIDIA team—particularly Avery H and Izzy Putterman—and to the Google team, especially Ying Wang, for their insightful discussions and generous assistance throughout the project. + +We are especially grateful to Meituan for their strong backing and meaningful contributions, which played a vital role in driving this project forward. + +This project has also been inspired by many outstanding open-source projects from the LLM community, including [EAGLE](https://github.com/SafeAILab/EAGLE), [BaldEagle](https://github.com/NickL77/BaldEagle), [TensorRT-Model-Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer), and others. Their contributions and shared knowledge have greatly benefited our work. + +## 💡 Special Thanks to Voltage Park + +We would like to extend our sincere thanks to [Voltage Park](https://www.voltagepark.com/), our official infrastructure partner. As part of a formal collaboration with the SGLang team, Voltage Park provided critical GPU resources that empowered us to train and evaluate large-scale speculative decoding models efficiently and reliably. This partnership was instrumental in making SpecForge possible. We deeply appreciate Voltage Park’s mission to make cutting-edge AI infrastructure more accessible, and we look forward to continued collaboration as we push the boundaries of open-source LLM serving and optimization. 
+ +## 📃 Citation + +```bibtex +@misc{specforge2025, + title={SpecForge: Train speculative decoding models effortlessly}, + author={Shenggui Li, Yikai Zhu, Chao Wang, Fan Yin, Shuai Shi, Yubo Wang, Yi Zhang, Yingyi Huang, Haoshuai Zheng, Yineng Zhang}, + year={2025}, + publisher={GitHub}, + howpublished={\url{https://github.com/sgl-project/specforge}}, +} diff --git a/idea1/specforge.egg-info/SOURCES.txt b/idea1/specforge.egg-info/SOURCES.txt new file mode 100644 index 0000000000000000000000000000000000000000..5c3203fd7f160d8c3ff2d54a15752f2314af60ad --- /dev/null +++ b/idea1/specforge.egg-info/SOURCES.txt @@ -0,0 +1,82 @@ +LICENSE +MANIFEST.in +README.md +pyproject.toml +version.txt +benchmarks/__init__.py +benchmarks/bench_eagle3.py +benchmarks/benchmarker/__init__.py +benchmarks/benchmarker/aime.py +benchmarks/benchmarker/base.py +benchmarks/benchmarker/ceval.py +benchmarks/benchmarker/financeqa.py +benchmarks/benchmarker/gpqa.py +benchmarks/benchmarker/gsm8k.py +benchmarks/benchmarker/humaneval.py +benchmarks/benchmarker/livecodebench.py +benchmarks/benchmarker/math500.py +benchmarks/benchmarker/mmlu.py +benchmarks/benchmarker/mmstar.py +benchmarks/benchmarker/mtbench.py +benchmarks/benchmarker/registry.py +benchmarks/benchmarker/simpleqa.py +benchmarks/benchmarker/utils.py +docs/conf.py +docs/deploy.py +specforge/__init__.py +specforge/args.py +specforge/distributed.py +specforge/lr_scheduler.py +specforge/optimizer.py +specforge/tracker.py +specforge/utils.py +specforge.egg-info/PKG-INFO +specforge.egg-info/SOURCES.txt +specforge.egg-info/dependency_links.txt +specforge.egg-info/requires.txt +specforge.egg-info/top_level.txt +specforge/benchmarks/benchmark_flex_attention.py +specforge/benchmarks/benchmark_loss.py +specforge/core/__init__.py +specforge/core/dflash.py +specforge/core/eagle3.py +specforge/core/eagle3_adapters.py +specforge/core/loss.py +specforge/data/__init__.py +specforge/data/parse.py +specforge/data/preprocessing.py 
+specforge/data/template.py +specforge/data/utils.py +specforge/layers/__init__.py +specforge/layers/embedding.py +specforge/layers/linear.py +specforge/layers/lm_head.py +specforge/layers/ring/__init__.py +specforge/layers/ring/ring_flash_attn.py +specforge/layers/ring/utils.py +specforge/modeling/__init__.py +specforge/modeling/_mask_utils.py +specforge/modeling/auto.py +specforge/modeling/utils.py +specforge/modeling/draft/__init__.py +specforge/modeling/draft/base.py +specforge/modeling/draft/dflash.py +specforge/modeling/draft/flex_attention.py +specforge/modeling/draft/llama3_eagle.py +specforge/modeling/target/__init__.py +specforge/modeling/target/dflash_target_model.py +specforge/modeling/target/eagle3_target_model.py +specforge/modeling/target/target_head.py +specforge/modeling/target/target_utils.py +specforge/modeling/target/custom_backend/__init__.py +specforge/modeling/target/custom_backend/gpt_oss.py +specforge/modeling/target/custom_backend/llama.py +specforge/modeling/target/custom_backend/llama4.py +specforge/modeling/target/custom_backend/phi3.py +specforge/modeling/target/custom_backend/qwen2.py +specforge/modeling/target/custom_backend/qwen3.py +specforge/modeling/target/custom_backend/qwen3_moe.py +specforge/modeling/target/sglang_backend/__init__.py +specforge/modeling/target/sglang_backend/model_runner.py +specforge/modeling/target/sglang_backend/patch.py +specforge/modeling/target/sglang_backend/utils.py \ No newline at end of file diff --git a/idea1/specforge.egg-info/dependency_links.txt b/idea1/specforge.egg-info/dependency_links.txt new file mode 100644 index 0000000000000000000000000000000000000000..8b137891791fe96927ad78e64b0aad7bded08bdc --- /dev/null +++ b/idea1/specforge.egg-info/dependency_links.txt @@ -0,0 +1 @@ + diff --git a/idea1/specforge.egg-info/requires.txt b/idea1/specforge.egg-info/requires.txt new file mode 100644 index 0000000000000000000000000000000000000000..30b4f5651ecc9db4f8769ed2bf977b61ece9353b --- /dev/null +++ 
b/idea1/specforge.egg-info/requires.txt @@ -0,0 +1,27 @@ +pre-commit +torch==2.9.1 +torchaudio==2.9.1 +torchvision==0.24.1 +transformers==4.57.1 +qwen-vl-utils==0.0.11 +datasets +setuptools +tqdm +wandb +psutil +numpy +accelerate +pydantic +sglang==0.5.9 +openai-harmony +ninja +packaging +yunchang +tensorboard + +[dev] +pre-commit +unittest + +[fa] +flash-attn diff --git a/idea1/specforge.egg-info/top_level.txt b/idea1/specforge.egg-info/top_level.txt new file mode 100644 index 0000000000000000000000000000000000000000..5d626eec0b5e83f5f9a832244cfd9adce2cd52e9 --- /dev/null +++ b/idea1/specforge.egg-info/top_level.txt @@ -0,0 +1,6 @@ +assets +benchmarks +datasets +docs +examples +specforge diff --git a/idea1/specforge/__init__.py b/idea1/specforge/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b07280a0d9e106da207bd1b75de4a22a2de215b1 --- /dev/null +++ b/idea1/specforge/__init__.py @@ -0,0 +1,4 @@ +from .core import * # noqa +from .modeling import * # noqa + +__all__ = ["modeling", "core"] diff --git a/idea1/specforge/__pycache__/__init__.cpython-311.pyc b/idea1/specforge/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a575f9097db15f2881bfa93782129308c92478be Binary files /dev/null and b/idea1/specforge/__pycache__/__init__.cpython-311.pyc differ diff --git a/idea1/specforge/__pycache__/__init__.cpython-313.pyc b/idea1/specforge/__pycache__/__init__.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bf16ac048c68b1f04f74f259ef0c78645ee2585f Binary files /dev/null and b/idea1/specforge/__pycache__/__init__.cpython-313.pyc differ diff --git a/idea1/specforge/__pycache__/args.cpython-311.pyc b/idea1/specforge/__pycache__/args.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..18cf2f2e82d205210e18b7f7128a645eb0134b2c Binary files /dev/null and b/idea1/specforge/__pycache__/args.cpython-311.pyc differ diff --git 
a/idea1/specforge/__pycache__/distributed.cpython-311.pyc b/idea1/specforge/__pycache__/distributed.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2f3244f49b2f8f2753674896e9d0e10d2ea58017 Binary files /dev/null and b/idea1/specforge/__pycache__/distributed.cpython-311.pyc differ diff --git a/idea1/specforge/__pycache__/lr_scheduler.cpython-311.pyc b/idea1/specforge/__pycache__/lr_scheduler.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7c8950cbd186cf640f0141b92acefb485dc23a19 Binary files /dev/null and b/idea1/specforge/__pycache__/lr_scheduler.cpython-311.pyc differ diff --git a/idea1/specforge/__pycache__/optimizer.cpython-311.pyc b/idea1/specforge/__pycache__/optimizer.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..016174e30cb5a2d351e3ab36789c35c451c54fef Binary files /dev/null and b/idea1/specforge/__pycache__/optimizer.cpython-311.pyc differ diff --git a/idea1/specforge/__pycache__/tracker.cpython-311.pyc b/idea1/specforge/__pycache__/tracker.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..68d586fb803051ee638d7826a9270fd28509a77d Binary files /dev/null and b/idea1/specforge/__pycache__/tracker.cpython-311.pyc differ diff --git a/idea1/specforge/__pycache__/utils.cpython-311.pyc b/idea1/specforge/__pycache__/utils.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0b5c8a240a93086dd08f7d4051859cd81f9a58a2 Binary files /dev/null and b/idea1/specforge/__pycache__/utils.cpython-311.pyc differ diff --git a/idea1/specforge/args.py b/idea1/specforge/args.py new file mode 100644 index 0000000000000000000000000000000000000000..2cd5efc30f5500b8df7f93fe2963de0dd7e38162 --- /dev/null +++ b/idea1/specforge/args.py @@ -0,0 +1,219 @@ +import argparse +from dataclasses import dataclass +from typing import Any, Dict, List + +from sglang.srt.server_args import ATTENTION_BACKEND_CHOICES + + 
@dataclass
class TrackerArgs:
    """
    Options for the experiment tracker (wandb, tensorboard, swanlab or mlflow).

    ``add_args`` registers the matching command-line flags on a parser. All
    tracker-specific options default to ``None`` (or ``False`` for flags).
    """

    report_to: str = "none"
    wandb_project: str = None
    wandb_name: str = None
    wandb_key: str = None
    wandb_offline: bool = False
    wandb_dir: str = None
    swanlab_project: str = None
    swanlab_name: str = None
    swanlab_key: str = None
    mlflow_experiment_id: str = None
    mlflow_run_name: str = None
    mlflow_run_id: str = None
    mlflow_tracking_uri: str = None
    mlflow_registry_uri: str = None
    # Mirrors the --mlflow-experiment-name flag registered in add_args, which
    # previously had no corresponding field. Appended last so that positional
    # construction of the existing fields is unaffected.
    mlflow_experiment_name: str = None

    @staticmethod
    def add_args(parser: argparse.ArgumentParser) -> None:
        """Register the tracker command-line options on ``parser``."""
        parser.add_argument(
            "--report-to",
            type=str,
            default="none",
            choices=["wandb", "tensorboard", "swanlab", "mlflow", "none"],
            help="The integration to report results and logs to.",
        )
        # wandb-specific args
        parser.add_argument("--wandb-project", type=str, default=None)
        parser.add_argument("--wandb-name", type=str, default=None)
        parser.add_argument("--wandb-key", type=str, default=None, help="W&B API key.")
        parser.add_argument(
            "--wandb-offline",
            action="store_true",
            help="Enable W&B offline mode and store logs locally.",
        )
        parser.add_argument(
            "--wandb-dir",
            type=str,
            default=None,
            help="Directory to store W&B files. Defaults to './wandb' under the project root when using W&B.",
        )
        # swanlab-specific args
        parser.add_argument(
            "--swanlab-project",
            type=str,
            default=None,
            help="The project name for swanlab.",
        )
        parser.add_argument(
            "--swanlab-name",
            type=str,
            default=None,
            help="The experiment name for swanlab.",
        )
        parser.add_argument(
            "--swanlab-key",
            type=str,
            default=None,
            help="The API key for swanlab non-interactive login.",
        )
        # mlflow-specific args
        parser.add_argument(
            "--mlflow-tracking-uri",
            type=str,
            default=None,
            help="The MLflow tracking URI. If not set, uses MLFLOW_TRACKING_URI environment variable or defaults to local './mlruns'.",
        )
        parser.add_argument(
            "--mlflow-experiment-name",
            type=str,
            default=None,
            help="The MLflow experiment name. If not set, uses MLFLOW_EXPERIMENT_NAME environment variable.",
        )
        parser.add_argument(
            "--mlflow-run-name",
            type=str,
            default=None,
            help="The MLflow run name. If not set, MLflow will auto-generate one.",
        )
Use a smaller value if you see out-of-memory errors.", + ) + parser.add_argument( + "--sglang-context-length", + type=int, + default=None, + help="The context length of the SGLang backend", + ) + parser.add_argument( + "--sglang-enable-nccl-nvls", + action="store_true", + help="Enable NCCL NVLS for prefill heavy requests when available for SGLang backend", + ) + parser.add_argument( + "--sglang-enable-symm-mem", + action="store_true", + help="Enable NCCL symmetric memory for fast collectives for SGLang backend", + ) + parser.add_argument( + "--sglang-enable-torch-compile", + action="store_true", + help="Optimize the model with torch.compile for SGLang backend", + ) + parser.add_argument( + "--sglang-enable-dp-attention", + action="store_true", + help="Enable DP attention for SGLang backend", + ) + parser.add_argument( + "--sglang-enable-dp-lm-head", + action="store_true", + help="Enable piecewise CUDA graph for SGLang backend", + ) + parser.add_argument( + "--sglang-enable-piecewise-cuda-graph", + action="store_true", + help="Enable piecewise CUDA graph for SGLang backend's prefill", + ) + parser.add_argument( + "--sglang-piecewise-cuda-graph-max-tokens", + type=int, + default=4096, + help="Set the max tokens for piecewise CUDA graph for SGLang backend", + ) + parser.add_argument( + "--sglang-piecewise-cuda-graph-tokens", + type=int, + nargs="+", + default=None, + help="Set the list of tokens when using piecewise cuda graph for SGLang backend", + ) + parser.add_argument( + "--sglang-ep-size", + type=int, + default=1, + help="The ep size of the SGLang backend", + ) + + @staticmethod + def from_args(args: argparse.Namespace) -> "SGLangBackendArgs": + return SGLangBackendArgs( + sglang_attention_backend=args.sglang_attention_backend, + sglang_mem_fraction_static=args.sglang_mem_fraction_static, + sglang_context_length=args.sglang_context_length, + sglang_enable_nccl_nvls=args.sglang_enable_nccl_nvls, + sglang_enable_symm_mem=args.sglang_enable_symm_mem, + 
sglang_enable_torch_compile=args.sglang_enable_torch_compile, + sglang_enable_dp_attention=args.sglang_enable_dp_attention, + sglang_enable_dp_lm_head=args.sglang_enable_dp_lm_head, + sglang_enable_piecewise_cuda_graph=args.sglang_enable_piecewise_cuda_graph, + sglang_piecewise_cuda_graph_max_tokens=args.sglang_piecewise_cuda_graph_max_tokens, + sglang_piecewise_cuda_graph_tokens=args.sglang_piecewise_cuda_graph_tokens, + sglang_ep_size=args.sglang_ep_size, + sglang_max_running_requests=( + args.target_batch_size if hasattr(args, "target_batch_size") else None + ), + sglang_max_total_tokens=( + args.target_batch_size * args.max_length + if hasattr(args, "target_batch_size") and hasattr(args, "max_length") + else None + ), + ) + + def to_kwargs(self) -> Dict[str, Any]: + return dict( + attention_backend=self.sglang_attention_backend, + mem_fraction_static=self.sglang_mem_fraction_static, + context_length=self.sglang_context_length, + enable_nccl_nvls=self.sglang_enable_nccl_nvls, + enable_symm_mem=self.sglang_enable_symm_mem, + enable_torch_compile=self.sglang_enable_torch_compile, + enable_dp_attention=self.sglang_enable_dp_attention, + enable_dp_lm_head=self.sglang_enable_dp_lm_head, + enable_piecewise_cuda_graph=self.sglang_enable_piecewise_cuda_graph, + piecewise_cuda_graph_max_tokens=self.sglang_piecewise_cuda_graph_max_tokens, + piecewise_cuda_graph_tokens=self.sglang_piecewise_cuda_graph_tokens, + ep_size=self.sglang_ep_size, + max_running_requests=self.sglang_max_running_requests, + max_total_tokens=self.sglang_max_total_tokens, + ) diff --git a/idea1/specforge/benchmarks/benchmark_flex_attention.py b/idea1/specforge/benchmarks/benchmark_flex_attention.py new file mode 100644 index 0000000000000000000000000000000000000000..20f989565727ffe42ab112c8818ac32371b9313d --- /dev/null +++ b/idea1/specforge/benchmarks/benchmark_flex_attention.py @@ -0,0 +1,336 @@ +import argparse +import time + +import matplotlib.pyplot as plt +import numpy as np +import torch 
def run_attention(
    seq_len: int,
    hidden_states_list: list[torch.Tensor],
    attention_backend: str = "sdpa",
    enable_profile: bool = False,
):
    """Run ``TTT_LENGTH`` attention forward passes and one backward pass.

    Args:
        seq_len: Sequence length used for position ids and the attention mask.
        hidden_states_list: One input tensor per TTT step; the batch size is
            read from the first entry.
        attention_backend: ``"sdpa"`` (``LlamaAttention``) or
            ``"flex_attention"`` (``LlamaFlexAttention``).
        enable_profile: When true and the backend is ``"flex_attention"``,
            wrap the run in a ``torch.profiler`` session that writes to
            ``./profiler_logs/<backend>``.

    Raises:
        ValueError: If ``attention_backend`` is not one of the two names above.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    batch_size = hidden_states_list[0].shape[0]

    # Initialize cache and attention function based on backend
    if attention_backend == "sdpa":
        cache_hidden = [[], []]
        past_key_values = None
        attn_func = LlamaAttention(config).to(device).to(torch.bfloat16)
    elif attention_backend == "flex_attention":
        cache_hidden = None
        past_key_values = DynamicCache()
        attn_func = LlamaFlexAttention(config).to(device).to(torch.bfloat16)
    else:
        raise ValueError(f"Unknown attention backend: {attention_backend}")

    # Simulate inputs - move to device
    position_ids = torch.arange(seq_len).unsqueeze(0).repeat(batch_size, 1).to(device)
    input_embeds = torch.randn(batch_size, seq_len, config.hidden_size).to(device)
    attention_mask = torch.ones(batch_size, seq_len).to(device)
    decoder_attention_mask = prepare_decoder_attention_mask(
        attention_mask=attention_mask,
        input_shape=(batch_size, seq_len),
        inputs_embeds=input_embeds,
        past_key_values_length=0,
    )

    profiler = None
    if attention_backend == "flex_attention" and enable_profile:
        profiler = torch.profiler.profile(
            activities=[
                torch.profiler.ProfilerActivity.CPU,
                torch.profiler.ProfilerActivity.CUDA,
            ],
            on_trace_ready=torch.profiler.tensorboard_trace_handler(
                f"./profiler_logs/{attention_backend}"
            ),
            record_shapes=False,
            profile_memory=False,
            with_stack=True,
            with_modules=False,
        )
        profiler.start()

    try:
        loss_list = []
        for idx in range(TTT_LENGTH):
            hidden_states = hidden_states_list[idx]
            # Call attention function with appropriate parameters
            if attention_backend == "sdpa":
                output = attn_func(
                    hidden_states=hidden_states,
                    attention_mask=decoder_attention_mask,
                    position_ids=position_ids,
                    cache_hidden=cache_hidden,
                    output_attentions=False,
                    use_cache=True,
                )
            else:  # flex_attention
                output = attn_func(
                    hidden_states=hidden_states,
                    attention_mask=attention_mask,
                    position_ids=position_ids,
                    past_key_values=past_key_values,
                    output_attentions=False,
                    use_cache=True,
                )

            # Compute a simple loss for benchmarking
            loss_list.append(output[0].sum())

        # Compute mean loss and backward pass
        if loss_list:
            mean_loss = sum(loss_list) / len(loss_list)
            mean_loss.backward()
    finally:
        # Stop the profiler even if a step raised (e.g. OOM), so the session
        # is not left running.
        if profiler is not None:
            profiler.stop()


def _make_hidden_states(seq_len: int, device: torch.device) -> list:
    """Create ``TTT_LENGTH`` random bfloat16 inputs of shape (BATCH_SIZE, seq_len, HIDDEN_SIZE)."""
    return [
        torch.randn(
            BATCH_SIZE,
            seq_len,
            HIDDEN_SIZE,
            requires_grad=True,
            device=device,
            dtype=torch.bfloat16,
        )
        for _ in range(TTT_LENGTH)
    ]


def benchmark_function(
    attention_backend: str,
    seq_lengths: list,
    enable_profile: bool = False,
    enable_warmup: bool = True,
):
    """Benchmark a function for speed and GPU memory usage per sequence length.

    Args:
        attention_backend: Backend name passed through to ``run_attention``.
        seq_lengths: Sequence lengths to measure, in order.
        enable_profile: Profile the first sequence length only.
        enable_warmup: Run two untimed passes per sequence length first.

    Returns:
        One dict per sequence length with keys ``seq_len``, ``time`` (seconds),
        ``peak_memory`` and ``memory_increase`` (bytes; both 0 without CUDA).
    """
    print(f"\n=== Benchmarking {attention_backend} ===")

    use_cuda = torch.cuda.is_available()
    # Inputs were previously created on "cuda" unconditionally, which crashed
    # on CPU-only hosts despite the availability guards everywhere else.
    device = torch.device("cuda" if use_cuda else "cpu")

    results_per_seq_len = []

    for seq_len in seq_lengths:
        print(f"\nTesting sequence length: {seq_len}")

        # Clear GPU cache
        if use_cuda:
            torch.cuda.empty_cache()
            torch.cuda.reset_peak_memory_stats()

        # Warm up runs for this sequence length
        if enable_warmup:
            print("Warming up...")
            for _ in range(2):
                hidden_states = _make_hidden_states(seq_len, device)
                run_attention(seq_len, hidden_states, attention_backend)
            # Clear cache again after warmup
            if use_cuda:
                torch.cuda.empty_cache()
                torch.cuda.reset_peak_memory_stats()

        # Record initial memory
        initial_memory = torch.cuda.memory_allocated() if use_cuda else 0
        hidden_states = _make_hidden_states(seq_len, device)

        # Drain pending kernels so they are not charged to the timed region.
        if use_cuda:
            torch.cuda.synchronize()
        start_time = time.perf_counter()
        run_attention(
            seq_len,
            hidden_states,
            attention_backend,
            enable_profile and seq_len == seq_lengths[0],
        )
        if use_cuda:
            torch.cuda.synchronize()
        elapsed = time.perf_counter() - start_time

        # Record memory usage
        peak_memory = torch.cuda.max_memory_allocated() if use_cuda else 0
        current_memory = torch.cuda.memory_allocated() if use_cuda else 0
        memory_increase = current_memory - initial_memory
        results_per_seq_len.append(
            {
                "seq_len": seq_len,
                "time": elapsed,
                "peak_memory": peak_memory,
                "memory_increase": memory_increase,
            }
        )

        print(f" Time: {elapsed:.3f}s")
        print(f" Peak memory: {peak_memory / 1024**3:.3f} GB")
        print(f" Memory increase: {memory_increase / 1024**3:.3f} GB")

    return results_per_seq_len


def plot_results(eagle_results, flex_results, seq_lengths):
    """Plot speed and memory comparison between Eagle and Flex attention.

    Saves the figure to ``attention_benchmark_comparison.png``, shows it, and
    prints per-length Eagle/Flex time and memory ratios. Both result lists
    must have one entry per value in ``seq_lengths``.
    """

    # Extract data for plotting
    eagle_times = [r["time"] for r in eagle_results]
    flex_times = [r["time"] for r in flex_results]
    eagle_memory = [r["peak_memory"] / 1024**3 for r in eagle_results]  # Convert to GB
    flex_memory = [r["peak_memory"] / 1024**3 for r in flex_results]  # Convert to GB

    # Create subplots
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

    # Speed comparison plot
    ax1.plot(
        seq_lengths, eagle_times, "b-o", label="Eagle (SDPA)", linewidth=2, markersize=8
    )
    ax1.plot(
        seq_lengths,
        flex_times,
        "r-s",
        label="Flex Attention",
        linewidth=2,
        markersize=8,
    )
    ax1.set_xlabel("Sequence Length")
    ax1.set_ylabel("Time (seconds)")
    ax1.set_title("Speed Comparison: Eagle vs Flex Attention")
    ax1.legend()
    ax1.grid(True, alpha=0.3)
    ax1.set_xscale("linear")
    ax1.set_yscale("log")

    # Memory comparison plot
    ax2.plot(
        seq_lengths,
        eagle_memory,
        "b-o",
        label="Eagle (SDPA)",
        linewidth=2,
        markersize=8,
    )
    ax2.plot(
        seq_lengths,
        flex_memory,
        "r-s",
        label="Flex Attention",
        linewidth=2,
        markersize=8,
    )
    ax2.set_xlabel("Sequence Length")
    ax2.set_ylabel("Peak Memory (GB)")
    ax2.set_title("Memory Usage Comparison: Eagle vs Flex Attention")
    ax2.legend()
    ax2.grid(True, alpha=0.3)

    # Set y-axis ticks every 10GB
    max_memory = max(max(eagle_memory), max(flex_memory))
    ax2.set_yticks(np.arange(0, max_memory + 10, 10))

    plt.tight_layout()
    plt.savefig("attention_benchmark_comparison.png", dpi=300, bbox_inches="tight")
    plt.show()

    # Print summary statistics
    print("\n=== Performance Summary ===")
    print(f"Sequence lengths tested: {seq_lengths}")
    print("\nSpeed ratios (Eagle/Flex):")
    for seq_len, eagle_t, flex_t in zip(seq_lengths, eagle_times, flex_times):
        ratio = eagle_t / flex_t if flex_t > 0 else float("inf")
        print(f" {seq_len:4d}: {ratio:.2f}x")

    print("\nMemory ratios (Eagle/Flex):")
    for seq_len, eagle_m, flex_m in zip(seq_lengths, eagle_memory, flex_memory):
        ratio = eagle_m / flex_m if flex_m > 0 else float("inf")
        print(f" {seq_len:4d}: {ratio:.2f}x")
def benchmark_loss_method(
    loss_method: str,
    test_configs: list,
    num_runs: int = 1,
):
    """Benchmark a loss computation method for speed and GPU memory usage.

    Args:
        loss_method: ``"triton"`` selects ``LogSoftmaxLoss.apply``; any other
            value selects ``_compute_loss``.
        test_configs: Iterable of ``(B, T, V)`` tuples.
        num_runs: Number of timed forward+backward repetitions per config; the
            reported time is their mean. Defaults to 1.

    Returns:
        One dict per config with keys ``B``, ``T``, ``V``, ``time_total``
        (mean seconds per run) and ``peak_memory`` (bytes; 0 without CUDA).

    Raises:
        ValueError: If ``num_runs`` is less than 1.
    """
    if num_runs < 1:
        raise ValueError(f"num_runs must be >= 1, got {num_runs}")

    print(f"\n=== Benchmarking {loss_method} Loss ===")

    use_cuda = torch.cuda.is_available()
    # Tensors were previously created on "cuda" unconditionally and one
    # synchronize() call was unguarded, so CPU-only hosts crashed.
    device = torch.device("cuda" if use_cuda else "cpu")

    results = []

    for B, T, V in test_configs:
        print(f"\nTesting config: B={B}, T={T}, V={V}")

        # Clear GPU cache
        if use_cuda:
            torch.cuda.empty_cache()
            torch.cuda.reset_peak_memory_stats()

        # Create tensors outside timing measurement
        target = torch.softmax(
            torch.randn(B, T, V, device=device, dtype=torch.float32), dim=-1
        )
        position_mask = torch.ones((B, T, 1), dtype=torch.bool, device=device)

        run_times = []
        logits_list = []
        for _ in range(num_runs):
            # Release the previous run's logits (and their grads) before
            # allocating fresh ones, so repeats do not inflate peak memory.
            logits_list.clear()
            logits_list.extend(
                torch.randn(
                    B, T, V, device=device, requires_grad=True, dtype=torch.float32
                )
                for _ in range(TTT_LENGTH)
            )

            if use_cuda:
                torch.cuda.synchronize()  # Ensure all operations are complete
            start_time = time.perf_counter()

            plosses = []
            for logits in logits_list:
                if loss_method == "triton":
                    loss = LogSoftmaxLoss.apply(logits, target, position_mask)
                else:
                    loss = _compute_loss(logits, target, position_mask)
                plosses.append(loss)

            ploss = (
                sum(0.8**i * step_loss for i, step_loss in enumerate(plosses))
                / TTT_LENGTH
            )
            ploss.backward()

            if use_cuda:
                torch.cuda.synchronize()
            run_times.append(time.perf_counter() - start_time)

            # Drop graph references so the logits can be freed next iteration.
            del plosses, ploss

        total_time = sum(run_times) / num_runs

        # Record memory usage
        peak_memory = torch.cuda.max_memory_allocated() if use_cuda else 0

        results.append(
            {
                "B": B,
                "T": T,
                "V": V,
                "time_total": total_time,
                "peak_memory": peak_memory,
            }
        )

        print(f" Total time (forward + backward): {total_time*1000:.3f}ms")
        print(f" Peak memory: {peak_memory / 1024**3:.3f} GB")

    return results


def main():
    """Benchmark the PyTorch and Triton loss paths and print a comparison table."""
    parser = argparse.ArgumentParser(description="Benchmark loss computation methods")
    parser.add_argument(
        "--num-runs", type=int, default=5, help="Number of runs for averaging"
    )
    args = parser.parse_args()

    print("PyTorch version:", torch.__version__)
    if torch.cuda.is_available():
        print("CUDA available:", torch.cuda.is_available())
        print("GPU:", torch.cuda.get_device_name())
        print(
            "GPU memory:",
            torch.cuda.get_device_properties(0).total_memory / 1024**3,
            "GB",
        )
    else:
        print("CUDA not available - running on CPU")

    # Define test configurations (B, T, V)
    test_configs = [
        (1, 1024, 32000),
        (1, 1024, 64000),
        (1, 4096, 32000),
        (1, 4096, 64000),
        (1, 8192, 32000),
        (1, 8192, 64000),
        (1, 16384, 32000),
    ]

    print(f"Testing configurations: {test_configs}")

    # Run benchmarks; --num-runs was previously parsed but never used.
    print("\n" + "=" * 60)
    pytorch_results = benchmark_loss_method(
        "pytorch", test_configs, num_runs=args.num_runs
    )

    print("\n" + "=" * 60)
    triton_results = benchmark_loss_method(
        "triton", test_configs, num_runs=args.num_runs
    )

    # Print results summary
    print("\n=== Performance Summary ===")
    print(f"Configurations tested: {len(test_configs)}")

    # Print detailed results table
    print(
        f"\n{'Config (B,T,V)':<15} {'PyTorch (ms)':<15} {'Triton (ms)':<15} {'Speedup':<10} {'PyTorch Mem (GB)':<18} {'Triton Mem (GB)':<15} {'Memory Save':<12}"
    )
    print("-" * 115)

    # Both result lists hold one entry per config, in test_configs order.
    for pytorch_result, triton_result in zip(pytorch_results, triton_results):
        config_str = f"({pytorch_result['B']},{pytorch_result['T']},{pytorch_result['V']})"

        pytorch_time_str = f"{pytorch_result['time_total']*1000:.2f}"
        pytorch_mem_str = f"{pytorch_result['peak_memory']/1024**3:.2f}"

        triton_time_str = f"{triton_result['time_total']*1000:.2f}"
        triton_mem_str = f"{triton_result['peak_memory']/1024**3:.2f}"

        if triton_result["time_total"] > 0:
            speedup = pytorch_result["time_total"] / triton_result["time_total"]
            speedup_str = f"{speedup:.2f}x"
        else:
            speedup_str = "N/A"

        # Calculate memory savings percentage
        if pytorch_result["peak_memory"] > 0:
            memory_save_pct = (
                (pytorch_result["peak_memory"] - triton_result["peak_memory"])
                / pytorch_result["peak_memory"]
            ) * 100
            memory_save_str = f"{memory_save_pct:.1f}%"
        else:
            memory_save_str = "N/A"

        print(
            f"{config_str:<15} {pytorch_time_str:<15} {triton_time_str:<15} {speedup_str:<10} {pytorch_mem_str:<18} {triton_mem_str:<15} {memory_save_str:<12}"
        )


if __name__ == "__main__":
    main()
def create_dflash_sdpa_mask(anchor_positions, block_keep_mask, S, block_size, device):
    """Build the dense boolean DFlash attention mask for the SDPA path.

    Args:
        anchor_positions: (B, N) integer anchor index per draft block.
        block_keep_mask: (B, N) boolean validity flag per draft block.
        S: Number of context positions at the front of the KV axis.
        block_size: Number of tokens in each draft block.
        device: Device on which the index tensors are created.

    Returns:
        Boolean tensor of shape (B, 1, N * block_size, S + N * block_size);
        True marks a key a query may attend to. A query sees context keys
        strictly before its block's anchor plus every key in its own draft
        block; rows belonging to blocks with ``block_keep_mask=False`` are
        all False.
    """
    B, N = anchor_positions.shape
    Q_LEN = N * block_size
    KV_LEN = S + N * block_size

    q_indices = torch.arange(Q_LEN, device=device).view(1, 1, -1, 1)  # (1, 1, Q_LEN, 1)
    kv_indices = torch.arange(KV_LEN, device=device).view(
        1, 1, 1, -1
    )  # (1, 1, 1, KV_LEN)

    q_block_ids = q_indices // block_size

    # One anchor per query row: repeat each block's anchor block_size times.
    anchor_expanded = anchor_positions.view(B, 1, N, 1).repeat_interleave(
        block_size, dim=2
    )

    mask_context = (kv_indices < S) & (kv_indices < anchor_expanded)

    is_draft = kv_indices >= S
    kv_block_ids = (kv_indices - S) // block_size
    mask_draft = is_draft & (q_block_ids == kv_block_ids)

    valid_block = block_keep_mask.view(B, 1, N, 1).repeat_interleave(block_size, dim=2)

    return (mask_context | mask_draft) & valid_block


def create_dflash_block_mask(
    anchor_positions: torch.Tensor,
    block_keep_mask: torch.Tensor,
    S: int,
    block_size: int,
    device: torch.device,
):
    """Construct Flex Attention BlockMask for DFlash training.

    KV: [Context (S tokens) | Block_0 | Block_1 | ... | Block_{n-1}]
    Q:  [Block_0 | Block_1 | ... | Block_{n-1}]

    Rules:
      1. Each block sees context strictly before its anchor (kv_idx < anchor_pos).
      2. Intra-block attention is bidirectional.
      3. Different blocks are invisible to each other.
      4. Invalid blocks (block_keep_mask=False) see nothing.

    Raises:
        RuntimeError: If ``torch.nn.attention.flex_attention`` could not be
            imported (the module-level fallback sets ``create_block_mask`` to
            ``None``).
    """
    # The import at module top is allowed to fail; without this check the call
    # below surfaced as "'NoneType' object is not callable".
    if create_block_mask is None:
        raise RuntimeError(
            "torch.nn.attention.flex_attention could not be imported; "
            "create_dflash_block_mask requires it (use create_dflash_sdpa_mask instead)."
        )

    B, N = anchor_positions.shape
    Q_LEN = N * block_size
    KV_LEN = S + N * block_size

    def dflash_mask_mod(b, h, q_idx, kv_idx):
        q_block_id = q_idx // block_size
        # Clamp before indexing so a q_idx beyond Q_LEN cannot index out of
        # range; such positions are rejected again by in_bounds below.
        safe_q_block_id = q_block_id.clamp(max=N - 1)
        anchor_pos = anchor_positions[b, safe_q_block_id]

        is_context = kv_idx < S
        # Strictly less than: matches inference where target_hidden[anchor_pos]
        # is not available as context.
        mask_context = is_context & (kv_idx < anchor_pos)

        is_draft = kv_idx >= S
        kv_block_id = (kv_idx - S) // block_size
        mask_draft = is_draft & (q_block_id == kv_block_id)

        is_valid_block = block_keep_mask[b, safe_q_block_id]
        in_bounds = q_block_id < N
        return (mask_context | mask_draft) & is_valid_block & in_bounds

    return create_block_mask(
        dflash_mask_mod, B=B, H=None, Q_LEN=Q_LEN, KV_LEN=KV_LEN, device=device
    )


class OnlineDFlashModel(nn.Module):
    """DFlash online training wrapper with block-wise CE loss."""

    def __init__(
        self,
        draft_model: "DFlashDraftModel",
        target_lm_head: nn.Module,
        target_embed_tokens: nn.Module,
        mask_token_id: int,
        block_size: int = 16,
        attention_backend: str = "flex_attention",
        num_anchors: int = 512,
        loss_decay_gamma: Optional[float] = None,
    ):
        """
        Args:
            draft_model: The draft model being trained.
            target_lm_head: Module applied to the draft output to get logits.
            target_embed_tokens: Embedding module applied to the noise token ids.
            mask_token_id: Token id used for every non-anchor draft position.
            block_size: Number of tokens per draft block.
            attention_backend: ``"flex_attention"`` builds a BlockMask; any
                other value builds the dense SDPA mask.
            num_anchors: Upper bound on anchors sampled per sequence.
            loss_decay_gamma: If set and > 0, per-position loss weights decay
                as exp(-(k-1)/gamma) inside each block.
        """
        super().__init__()
        self.draft_model = draft_model
        self.lm_head = target_lm_head
        self.embed_tokens = target_embed_tokens
        self.block_size = block_size
        self.mask_token_id = mask_token_id
        self.attention_backend = attention_backend
        self.num_anchors = num_anchors
        self.loss_decay_gamma = loss_decay_gamma

        self._cached_block_mask: Optional[BlockMask] = None
        self._cached_seq_len: Optional[int] = None
        self._cached_bsz: Optional[int] = None

    def _sample_anchor_positions(
        self, seq_len: int, loss_mask: torch.Tensor, device: torch.device
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Randomly sample anchor positions per sample; returns (anchors, keep_mask).

        Candidates are positions in ``[0, seq_len - block_size]`` whose
        ``loss_mask`` value exceeds 0.5. Anchors are returned sorted per row;
        slots without a valid anchor hold 0 and are False in ``keep_mask``.

        Raises:
            ValueError: If no anchors can be sampled for the batch.
        """
        bs = self.block_size
        bsz = loss_mask.shape[0]
        max_anchor = max(seq_len - bs, 0)

        valid = loss_mask[:, : max_anchor + 1] > 0.5
        valid_counts = valid.sum(dim=1)
        max_valid = int(valid_counts.max().item())
        max_n = min(self.num_anchors, max_valid - 1)

        if max_n <= 0:
            raise ValueError(
                f"No anchors can be sampled (num_anchors={self.num_anchors}, "
                f"max valid anchor candidates per sample={max_valid}); "
                "should preprocess the data."
            )

        indices = (
            torch.arange(max_anchor + 1, device=device).unsqueeze(0).expand(bsz, -1)
        )
        # Invalid candidates get an out-of-range index so they sort last.
        masked_indices = torch.where(
            valid, indices, torch.tensor(seq_len + 1, device=device)
        )

        # Random keys in [0, 1) for valid candidates, 2.0 for invalid ones:
        # sorting the keys yields a random permutation with valid entries first.
        random_vals = torch.rand(bsz, max_anchor + 1, device=device)
        random_vals = torch.where(valid, random_vals, torch.tensor(2.0, device=device))

        _, sorted_idx = random_vals.sort(dim=1)
        gathered = torch.gather(masked_indices, 1, sorted_idx)
        anchors = gathered[:, :max_n].sort(dim=1).values

        keep_mask = torch.arange(max_n, device=device).unsqueeze(
            0
        ) < valid_counts.unsqueeze(1).clamp(max=max_n)
        anchors = torch.where(
            keep_mask, anchors, torch.tensor(0, dtype=torch.long, device=device)
        )

        return anchors, keep_mask

    def prepare_noise_input(
        self, input_ids: torch.Tensor, block_ids: Optional[torch.Tensor] = None
    ) -> torch.Tensor:
        """Prepare noise input: first token of each block is real, rest are MASK.

        With ``block_ids`` a block starts wherever the id differs from the
        previous position (position 0 always starts a block); otherwise blocks
        start every ``block_size`` positions.
        """
        bsz, seq_len = input_ids.shape
        device = input_ids.device

        if block_ids is not None:
            is_block_start = torch.ones(bsz, seq_len, dtype=torch.bool, device=device)
            is_block_start[:, 1:] = block_ids[:, 1:] != block_ids[:, :-1]
        else:
            positions = torch.arange(seq_len, device=device)
            is_block_start = (positions % self.block_size) == 0
            is_block_start = is_block_start.unsqueeze(0).expand(bsz, -1)

        noise_input_ids = torch.full_like(input_ids, self.mask_token_id)
        noise_input_ids[is_block_start] = input_ids[is_block_start]
        return noise_input_ids

    def _create_position_ids(self, anchor_positions: torch.Tensor) -> torch.Tensor:
        """Create absolute position IDs for parallel draft blocks.

        Block ``j`` gets positions ``anchor[j] .. anchor[j] + block_size - 1``;
        the result is flattened to (bsz, n_blocks * block_size).
        """
        bsz, n_blocks = anchor_positions.shape
        device = anchor_positions.device
        offsets = torch.arange(self.block_size, device=device).view(1, 1, -1)
        pos_ids = anchor_positions.unsqueeze(-1) + offsets
        return pos_ids.view(bsz, -1)

    def _create_noise_embed(self, input_ids, anchor_positions, block_keep_mask):
        """Embed the draft-side input: anchor token first, MASK for the rest.

        Blocks with ``block_keep_mask=False`` are filled entirely with MASK.
        """
        bsz, seq_len = input_ids.shape
        n = anchor_positions.shape[1]
        bs = self.block_size
        device = input_ids.device

        noise_ids = torch.full(
            (bsz, n * bs), self.mask_token_id, dtype=torch.long, device=device
        )

        block_starts = torch.arange(n, device=device) * bs
        block_starts = block_starts.unsqueeze(0).expand(bsz, -1)

        valid_anchor_positions = anchor_positions.clamp(0, seq_len - 1)
        anchor_tokens = torch.gather(input_ids, 1, valid_anchor_positions)

        flat_batch_idx = torch.arange(bsz, device=device).unsqueeze(1).expand(bsz, n)
        noise_ids[flat_batch_idx, block_starts] = torch.where(
            block_keep_mask,
            anchor_tokens,
            torch.tensor(self.mask_token_id, dtype=torch.long, device=device),
        )

        return self.embed_tokens(noise_ids)

    def forward(
        self,
        input_ids: torch.Tensor,
        hidden_states: torch.Tensor,
        loss_mask: torch.Tensor,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Parallel block-wise training forward pass.

        Args:
            input_ids: (bsz, seq_len) token ids.
            hidden_states: Passed to the draft model as ``target_hidden``.
            loss_mask: (bsz, seq_len) mask; values > 0.5 mark candidate anchors
                and it also weights the per-token loss.

        Returns:
            ``(loss, accuracy)``: the weighted mean cross-entropy and the
            fraction of counted tokens whose argmax matches the label.
        """
        bsz, seq_len = input_ids.shape
        device = input_ids.device

        anchor_positions, block_keep_mask = self._sample_anchor_positions(
            seq_len, loss_mask, device
        )

        noise_embedding = self._create_noise_embed(
            input_ids, anchor_positions, block_keep_mask
        )

        context_position_ids = (
            torch.arange(seq_len, device=device).unsqueeze(0).expand(bsz, -1)
        )
        draft_position_ids = self._create_position_ids(anchor_positions)
        full_position_ids = torch.cat([context_position_ids, draft_position_ids], dim=1)

        if self.attention_backend == "flex_attention":
            dflash_attn_mask = create_dflash_block_mask(
                anchor_positions=anchor_positions,
                block_keep_mask=block_keep_mask,
                S=seq_len,
                block_size=self.block_size,
                device=device,
            )
        else:
            dflash_attn_mask = create_dflash_sdpa_mask(
                anchor_positions=anchor_positions,
                block_keep_mask=block_keep_mask,
                S=seq_len,
                block_size=self.block_size,
                device=device,
            )

        output_hidden = self.draft_model(
            position_ids=full_position_ids,
            noise_embedding=noise_embedding,
            target_hidden=hidden_states,
            attention_mask=dflash_attn_mask,
        )

        logits = self.lm_head(output_hidden)

        # --- Labels: same-position prediction (position k predicts token anchor+k) ---
        label_offsets = torch.arange(0, self.block_size, device=device).view(1, 1, -1)
        label_indices = anchor_positions.unsqueeze(-1) + label_offsets
        valid_label_mask = label_indices < seq_len
        safe_label_indices = label_indices.clamp(max=seq_len - 1)

        target_ids = torch.gather(
            input_ids.unsqueeze(1).expand(-1, anchor_positions.size(1), -1),
            2,
            safe_label_indices,
        )

        # --- Weight mask: block validity * bounds * exclude anchor (pos 0) * loss_mask ---
        weight_mask = (
            block_keep_mask.unsqueeze(-1).expand(-1, -1, self.block_size).float()
        )
        weight_mask = weight_mask * valid_label_mask.float()

        pos_in_block = torch.arange(self.block_size, device=device).view(1, 1, -1)
        weight_mask = weight_mask * (pos_in_block > 0).float()

        original_loss_mask_gathered = torch.gather(
            loss_mask.unsqueeze(1).expand(-1, anchor_positions.size(1), -1),
            2,
            safe_label_indices,
        )
        weight_mask = weight_mask * original_loss_mask_gathered

        # Captured before decay is applied so accuracy counts each token once.
        binary_eval_mask = weight_mask.reshape(-1)

        # --- Loss decay: exp(-(k-1)/γ) so k=1 (1st prediction) gets weight 1.0 ---
        if self.loss_decay_gamma is not None and self.loss_decay_gamma > 0:
            k = torch.arange(self.block_size, device=device).view(1, 1, -1)
            decay_weights = torch.exp(
                -(k - 1).clamp(min=0).float() / self.loss_decay_gamma
            )
            weight_mask = weight_mask * decay_weights

        # --- Cross entropy ---
        # reshape (not view) so a non-contiguous lm_head output is accepted.
        flat_logits = logits.reshape(-1, logits.size(-1))
        flat_targets = target_ids.reshape(-1)
        flat_weights = weight_mask.reshape(-1)

        loss_per_token = F.cross_entropy(flat_logits, flat_targets, reduction="none")
        valid_token_count = flat_weights.sum() + 1e-6
        loss = (loss_per_token * flat_weights).sum() / valid_token_count

        # --- Accuracy ---
        with torch.no_grad():
            pred_ids = torch.argmax(flat_logits, dim=-1)
            correct = (pred_ids == flat_targets) & (binary_eval_mask > 0.5)
            actual_token_count = binary_eval_mask.sum() + 1e-6
            accuracy = correct.sum().float() / actual_token_count

        return loss, accuracy
+ Eagle3 using test time training technique (TTT) to train the draft model. + 1. We first extract the hidden states from the target model. + 2. Then concatenate the hidden states from 3 aux layers (layer 1, layer num_layers//2, layer num_layers-4). + 3. We project the concatenated hidden states to the target hidden size. from (batch, seq_len, 3*hidden_size) to (batch, seq_len, hidden_size) + 4. We concat the projected hidden states and embedding output as the input for the draft model. + 5. finally, we run TTT to train the draft model. input size is (batch, seq_len, hidden_size * 2) + """ + + def __init__( + self, + draft_model: Eagle3DraftModel, + length: int = 7, + attention_backend="sdpa", + target_model: Optional[Eagle3Model] = None, + ): + """ + Args: + target_model: the target model to extract hidden states. + draft_model: the draft model to be trained. + length: TTT length, it means how many turns to unroll during TTT. + """ + super().__init__() + self.draft_model = draft_model + self.length = length + self.attention_backend = attention_backend + self.target_model = target_model + + def _make_adapter(self) -> BackendAdapter: + if self.attention_backend == "usp": + return UspAdapter(self) + return SdpaLikeAdapter(self) + + def _acc_and_loss( + self, + *, + logits: torch.Tensor, + target_p: torch.Tensor, + position_mask: torch.Tensor, + loss_mask: torch.Tensor, + adapter: BackendAdapter, + ) -> Tuple[torch.Tensor, torch.Tensor]: + with torch.no_grad(): + local_correct = ( + (logits.argmax(-1) == target_p.argmax(-1)) * position_mask.squeeze(-1) + ).sum() + local_denom = loss_mask.sum().clamp_min(1e-6) + local_correct, local_denom = adapter.reduce_metrics( + local_correct=local_correct, local_denom=local_denom + ) + acc = local_correct / local_denom + + loss = LogSoftmaxLoss.apply(logits, target_p, position_mask) + loss = adapter.reduce_loss(loss) + return acc, loss + + def _prepare_position_ids( + self, + position_ids: Optional[torch.Tensor], + *, + seq_length: 
int, + past_key_values_length: int, + device: torch.device, + is_vlm: bool, + input_ids: torch.Tensor, + image_grid_thw: Optional[torch.Tensor], + ) -> torch.Tensor: + if self.attention_backend == "usp": + return position_ids + if position_ids is None: + if is_vlm: + mrope_positions_ids, _ = self.target_model.get_rope_index( + input_ids=input_ids, image_grid_thw=image_grid_thw + ) + return mrope_positions_ids + return ( + torch.arange( + past_key_values_length, + seq_length + past_key_values_length, + dtype=torch.long, + device=device, + ) + .unsqueeze(0) + .view(-1, seq_length) + ) + + position_ids = position_ids.long() + return position_ids.view(-1, seq_length) + + def forward( + self, + input_ids: torch.Tensor, + attention_mask: torch.Tensor, + target: torch.Tensor, + loss_mask: torch.Tensor, + hidden_states: torch.Tensor, + past_key_values: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, + position_ids: Optional[torch.Tensor] = None, + image_grid_thw: Optional[torch.Tensor] = None, + is_vlm: bool = False, + **kwargs, + ) -> Tuple[List[torch.Tensor], List[torch.Tensor], List[torch.Tensor]]: + """ + Online eagle model trainer, modified from: https://github.com/SafeAILab/EAGLE/blob/main/eagle/traineagle3/cnets.py#L711 + + Args: + input_ids: (batch, seq_len) + attention_mask: (batch, seq_len) + loss_mask: (batch, seq_len) + past_key_values: We dont use this past_key_values in eagle3, but keep it for compatibility. We control kvcache by cache_hidden. 
+ position_ids: (batch, seq_len) + """ + # Step 1: handle vocab size + target_p_padded, position_mask = _compute_target_p_padded( + target=target, + t2d=self.draft_model.t2d, + loss_mask=loss_mask, + length=self.length, + ) + del target + torch.cuda.empty_cache() + + # basic info + batch_size, seq_length, _ = hidden_states.shape + seq_length_with_past = seq_length + past_key_values_length = 0 + + # Step 2: project the concatenated hidden states to the target hidden size + hidden_states = self.draft_model.project_hidden_states(hidden_states) + + # Step 3: process kv cache, position ids and position ids + if past_key_values is not None: + past_key_values_length = past_key_values[0][0].shape[2] + seq_length_with_past = seq_length_with_past + past_key_values_length + position_ids = self._prepare_position_ids( + position_ids=position_ids, + seq_length=seq_length, + past_key_values_length=past_key_values_length, + device=hidden_states.device, + is_vlm=is_vlm, + input_ids=input_ids, + image_grid_thw=image_grid_thw, + ) + + # Step 4: handle attention mask + if attention_mask is None: + attention_mask = torch.ones( + (batch_size, seq_length_with_past), + dtype=torch.bool, + device=hidden_states.device, + ) + if self.attention_backend == "sdpa": + attention_mask = self.draft_model.prepare_decoder_attention_mask( + attention_mask=attention_mask, + hidden_states=hidden_states, + batch_size=batch_size, + seq_length=seq_length, + past_key_values_length=past_key_values_length, + ) + + # Step 5: run TTT + plosses = [] + vlosses = [] + acces = [] + adapter = self._make_adapter() + # for sequence paralle, position mask and input ids will split by sequence dim, need to keep origin for ttt shift + global_input_ids = input_ids + if self.attention_backend in ["sdpa", "fa", "usp"]: + cache_hidden = [[], []] + past_key_values = None + elif self.attention_backend == "flex_attention": + cache_hidden = None + past_key_values = DynamicCache() + else: + raise ValueError(f"Unknown attention 
backend: {self.attention_backend}") + + for idx in range(self.length): + state = adapter.step_view( + idx=idx, + ttt_length=self.length, + global_input_ids=global_input_ids, + attention_mask=attention_mask, + loss_mask=loss_mask, + position_ids=position_ids, + hidden_states=hidden_states, + target_p_padded=target_p_padded, + position_mask=position_mask, + seq_length=seq_length, + ) + is_last = idx == self.length - 1 + + # Step 5.1: embed the input ids + inputs_embeds = self.draft_model.embed_input_ids(state.input_ids) + inputs_embeds = inputs_embeds.to(hidden_states.dtype) + + # Step 5.2: run the draft model backbone + hidden_states_out = self.draft_model.backbone( + input_embeds=inputs_embeds, + hidden_states=state.hidden_states, + cache_hidden=cache_hidden, + attention_mask=state.attention_mask, + position_ids=state.position_ids, + past_key_values=past_key_values, + use_cache=True, + ) + + # update hidden states for next step + hidden_states = hidden_states_out + + # Step 5.4: get logits + logits = self.draft_model.compute_logits(hidden_states) + + # Step 5.5 + 5.6: metric and loss + acc, loss = self._acc_and_loss( + logits=logits, + target_p=state.target_p, + position_mask=state.position_mask, + loss_mask=state.loss_mask, + adapter=adapter, + ) + acces.append(acc) + plosses.append(loss) + + if not is_last: + # Step 5.7: we need to update the loss mask + global_input_ids = padding(global_input_ids, left=False) + position_mask = padding(position_mask, left=False) + loss_mask = padding(loss_mask, left=False) + # Flex attention mask shirnking is handled inside attention module + return plosses, vlosses, acces + + +class QwenVLOnlineEagle3Model(Eagle3Model): + """ + In sgl-spec, we implement offline/online training. + Online training means we have the target hidden_states available during training. + Eagle3 using test time training technique (TTT) to train the draft model. + 1. We first extract the hidden states from the target model. + 2. 
Then concatenate the hidden states from 3 aux layers (layer 1, layer num_layers//2, layer num_layers-4). + 3. We project the concatenated hidden states to the target hidden size. from (batch, seq_len, 3*hidden_size) to (batch, seq_len, hidden_size) + 4. We concat the projected hidden states and embedding output as the input for the draft model. + 5. finally, we run TTT to train the draft model. input size is (batch, seq_len, hidden_size * 2) + """ + + def __init__( + self, + target_model, + draft_model: Eagle3DraftModel, + processor, + length: int = 7, + attention_backend: str = "sdpa", + ): + """ + Args: + target_model: the target model to extract hidden states. + draft_model: the draft model to be trained. + length: TTT length, it means how many turns to unroll during TTT. + """ + super().__init__() + self.target_model = target_model + self.draft_model = draft_model + self.processor = processor + self.length = length + self.attention_backend = attention_backend + + @torch.no_grad() + def _prepare_data( + self, + input_ids: torch.Tensor, + attention_mask: torch.Tensor, + loss_mask: torch.Tensor, + pixel_values: Optional[torch.Tensor] = None, + image_grid_thw: Optional[torch.Tensor] = None, + device: Optional[torch.device] = None, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + """ + modified from: https://github.com/SafeAILab/EAGLE/blob/main/eagle/traineagle3/cnets.py#L692 + Extract the hidden states from the target model outputs. 
+ + Args: + input_ids: (batch, seq_len) + attention_mask: (batch, seq_len) + loss_mask: (batch, seq_len) + device: the device to run the target model, if None, use the input_ids device + pixel_values: image pixel values, used for VLM models + image_grid_thw: image grid thw, used for VLM models + + Returns: + hidden_states: (batch, seq_len, 3*hidden_size) + target: (batch, seq_len, vocab_size) + loss_mask: (batch, seq_len) + input_ids: (batch, seq_len) + """ + + if device is None: + device = input_ids.device + + # run the target model to get the hidden states + outputs = self.target_model( + input_ids=input_ids, + attention_mask=attention_mask, + pixel_values=pixel_values, + image_grid_thw=image_grid_thw, + output_hidden_states=True, + use_cache=False, + ) + + # extract the aux hidden states + # output_hidden_states = True will return the embedding output as well + # so we have an offset of 1 + num_hidden_states = len(outputs.hidden_states) + offset = 1 + num_layers = num_hidden_states - 1 + + # Eagle3 uses 3 aux layers from layer 1, num_layers//2, num_layers-4 + low_aux_layer = 1 + offset + mid_aux_layer = num_layers // 2 - 1 + offset + last_aux_layer = num_layers - 4 + offset + + hidden_states0 = outputs.hidden_states[low_aux_layer] + hidden_states1 = outputs.hidden_states[mid_aux_layer] + hidden_states2 = outputs.hidden_states[last_aux_layer] + + hidden_states = torch.cat( + (hidden_states0, hidden_states1, hidden_states2), dim=-1 + ) + + # apply pading + target = outputs.logits + target = padding(target, left=False) + input_ids = padding(input_ids, left=False) + + if target is not None: + target = target.to(device) + loss_mask = loss_mask[..., None] + loss_mask = loss_mask.to(device) + + return hidden_states, target, loss_mask, input_ids + + @torch.no_grad() + def _get_input_embeds( + self, + input_ids: torch.Tensor, + pixel_values: torch.Tensor, + image_grid_thw: torch.Tensor, + ) -> torch.Tensor: + # get input embeding with image + # inputs_embeds = 
self.target_model.model.get_input_embeddings()(input_ids) + inputs_embeds = self.draft_model.embed_input_ids(input_ids) + image_embeds = self.target_model.model.get_image_features( + pixel_values, image_grid_thw + ) + image_embeds = torch.cat(image_embeds, dim=0) + n_image_tokens = ( + input_ids == self.target_model.model.config.image_token_id + ).sum() + n_image_features = image_embeds.shape[0] + if n_image_tokens != n_image_features: + raise ValueError( + f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}" + ) + + mask = input_ids == self.target_model.model.config.image_token_id + mask_unsqueezed = mask.unsqueeze(-1) + mask_expanded = mask_unsqueezed.expand_as(inputs_embeds) + image_mask = mask_expanded.to(inputs_embeds.device) + + image_embeds = image_embeds.to(inputs_embeds.device, inputs_embeds.dtype) + inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_embeds) + return inputs_embeds + + def forward( + self, + input_ids: torch.Tensor, + attention_mask: torch.Tensor, + loss_mask: torch.Tensor, + past_key_values: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, + position_ids: Optional[torch.Tensor] = None, + pixel_values: Optional[torch.Tensor] = None, + image_grid_thw: Optional[torch.Tensor] = None, + ) -> Tuple[List[torch.Tensor], List[torch.Tensor], List[torch.Tensor]]: + """ + Online eagle model trainer, modified from: https://github.com/SafeAILab/EAGLE/blob/main/eagle/traineagle3/cnets.py#L711 + + Args: + input_ids: (batch, seq_len) + attention_mask: (batch, seq_len) + loss_mask: (batch, seq_len) + past_key_values: We dont use this past_key_values in eagle3, but keep it for compatibility. We control kvcache by cache_hidden. 
+ position_ids: (batch, seq_len) + pixel_values: batch image pixel values, used for VLM models + image_grid_thw: (batch, 3), image grid thw, used for VLM models + """ + # Step 0: prepare data with the target model + hidden_states, target, loss_mask, input_ids = self._prepare_data( + input_ids, attention_mask, loss_mask, pixel_values, image_grid_thw + ) + + # Step 1: handle vocab size + target_p_padded, position_mask = _compute_target_p_padded( + target=target, + t2d=self.draft_model.t2d, + loss_mask=loss_mask, + length=self.length, + ) + del target + + # basic info + batch_size, seq_length, _ = hidden_states.shape + seq_length_with_past = seq_length + past_key_values_length = 0 + + # Step 2: project the concatenated hidden states to the target hidden size + hidden_states = self.draft_model.project_hidden_states(hidden_states) + + # Step 3: process kv cache, position ids and position ids + if past_key_values is not None: + past_key_values_length = past_key_values[0][0].shape[2] + seq_length_with_past = seq_length_with_past + past_key_values_length + + if position_ids is None: + attention_mask_tensor = ( + attention_mask + if not isinstance(attention_mask, dict) + else attention_mask["full_attention"] + ) + if attention_mask_tensor is not None and attention_mask_tensor.ndim == 4: + attention_mask_tensor = torch.diagonal( + attention_mask_tensor[:, 0], dim1=1, dim2=2 + ) + attention_mask_tensor = ( + attention_mask_tensor / torch.finfo(attention_mask_tensor.dtype).min + ) + attention_mask_tensor = (1.0 - attention_mask_tensor).int() + + position_ids, rope_deltas = self.target_model.model.get_rope_index( + input_ids, + image_grid_thw, + None, + second_per_grid_ts=None, + attention_mask=attention_mask_tensor, + ) + self.rope_deltas = rope_deltas + else: + position_ids = position_ids + + # Step 4: handle attention mask + if attention_mask is None: + attention_mask = torch.ones( + (batch_size, seq_length_with_past), + dtype=torch.bool, + device=hidden_states.device, + ) + 
if self.attention_backend == "sdpa": + attention_mask = self.draft_model.prepare_decoder_attention_mask( + attention_mask=attention_mask, + hidden_states=hidden_states, + batch_size=batch_size, + seq_length=seq_length, + past_key_values_length=past_key_values_length, + ) + + # Step 5: run TTT + plosses = [] + vlosses = [] + acces = [] + if self.attention_backend in ["sdpa", "fa"]: + cache_hidden = [[], []] + past_key_values = None + elif self.attention_backend == "flex_attention": + cache_hidden = None + past_key_values = DynamicCache() + else: + raise ValueError(f"Unknown attention backend: {self.attention_backend}") + + for idx in range(self.length): + target_p = target_p_padded[:, idx : idx + seq_length, :].contiguous() + is_last = idx == self.length - 1 + + # Step 5.1: embed the input ids + # inputs_embeds = self._get_input_embeds(input_ids, pixel_values, image_grid_thw) + inputs_embeds = self.draft_model.embed_input_ids(input_ids) + inputs_embeds = inputs_embeds.to(hidden_states.dtype) + + # Step 5.2: run the draft model backbone + hidden_states_out = self.draft_model.backbone( + input_embeds=inputs_embeds, + hidden_states=hidden_states, + cache_hidden=cache_hidden, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + use_cache=True, + ) + + # update hidden states for next step + hidden_states = hidden_states_out + + # Step 5.4: get logits + logits = self.draft_model.compute_logits(hidden_states) + + # Step 5.5: record metrics first as we in-place modify logits + with torch.no_grad(): + acces.append( + _compute_metric_acc( + logits=logits, + target_p=target_p, + position_mask=position_mask, + loss_mask=loss_mask, + ) + ) + + # Step 5.6: calculate loss, in-place modifies logits! 
+ loss = LogSoftmaxLoss.apply(logits, target_p, position_mask) + plosses.append(loss) + + if not is_last: + # Step 5.7: we need to update the loss mask + input_ids = padding(input_ids, left=False) + position_mask = padding(position_mask, left=False) + loss_mask = padding(loss_mask, left=False) + # Flex attention mask shirnking is handled inside attention module + return plosses, vlosses, acces + + +def _compute_target_p_padded(target, t2d, loss_mask, length): + with torch.no_grad(): + target_p, position_mask = _compute_target_p( + target=target, + t2d=t2d, + loss_mask=loss_mask, + ) + + assert len(target_p.shape) == 3 + target_p_padded = F.pad( + target_p, + pad=(0, 0, 0, length), + mode="constant", + # For bitwise equality with previous code + value=1 / target_p.shape[-1], + ) + + return target_p_padded, position_mask + + +@torch.compile(dynamic=None) +def _compute_target_p(target, t2d, loss_mask): + target_head = target + target_max_token = target_head.argmax(-1) + target_mask = t2d[target_max_token] + target_mask = target_mask[..., None].int() + position_mask = target_mask * loss_mask + target_head = target_head[..., t2d] + target_head = target_head.float() + target_p = nn.Softmax(dim=2)(target_head) + target_p = target_p.detach() + return target_p, position_mask + + +@torch.compile(dynamic=None) +def _compute_metric_acc(logits, target_p, position_mask, loss_mask): + return ( + (logits.argmax(-1) == target_p.argmax(-1)) * position_mask.squeeze(-1) + ).sum() / loss_mask.sum().clamp_min(1e-6) diff --git a/idea1/specforge/core/eagle3_adapters.py b/idea1/specforge/core/eagle3_adapters.py new file mode 100644 index 0000000000000000000000000000000000000000..555c16efc3b6b894caaa4a560456533a87f3be2e --- /dev/null +++ b/idea1/specforge/core/eagle3_adapters.py @@ -0,0 +1,133 @@ +from __future__ import annotations + +from dataclasses import dataclass +from typing import Tuple + +import torch +import torch.distributed as dist +import torch.distributed.nn.functional as 
dist_nn + +from specforge.distributed import get_draft_sp_group, get_sp_ulysses_group + + +@dataclass +class StepState: + input_ids: torch.Tensor + hidden_states: torch.Tensor + position_ids: torch.Tensor + attention_mask: torch.Tensor + target_p: torch.Tensor + position_mask: torch.Tensor + loss_mask: torch.Tensor + + +class BackendAdapter: + def __init__(self, model: "OnlineEagle3Model"): + self.m = model + + def step_view( + self, + *, + idx: int, + ttt_length: int, + global_input_ids: torch.Tensor, + attention_mask: torch.Tensor, + loss_mask: torch.Tensor, + position_ids: torch.Tensor, + hidden_states: torch.Tensor, + target_p_padded: torch.Tensor, + position_mask: torch.Tensor, + seq_length: int, + ) -> StepState: + raise NotImplementedError + + def reduce_metrics( + self, *, local_correct: torch.Tensor, local_denom: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor]: + return local_correct, local_denom + + def reduce_loss(self, loss: torch.Tensor) -> torch.Tensor: + return loss + + +class SdpaLikeAdapter(BackendAdapter): + def step_view( + self, + *, + idx: int, + ttt_length: int, + global_input_ids: torch.Tensor, + attention_mask: torch.Tensor, + loss_mask: torch.Tensor, + position_ids: torch.Tensor, + hidden_states: torch.Tensor, + target_p_padded: torch.Tensor, + position_mask: torch.Tensor, + seq_length: int, + ) -> StepState: + target_p = target_p_padded[:, idx : idx + seq_length, :].contiguous() + return StepState( + input_ids=global_input_ids, + hidden_states=hidden_states, + position_ids=position_ids, + attention_mask=attention_mask, + target_p=target_p, + position_mask=position_mask, + loss_mask=loss_mask, + ) + + +class UspAdapter(BackendAdapter): + def __init__(self, model: "OnlineEagle3Model"): + super().__init__(model) + self.sp_group = get_draft_sp_group() + self.sp_world_size = dist.get_world_size(self.sp_group) + self.ulysses_pg = get_sp_ulysses_group() + self.sp_ulysses_degree = dist.get_world_size(self.ulysses_pg) + + def step_view( + 
self, + *, + idx: int, + ttt_length: int, + global_input_ids: torch.Tensor, + attention_mask: torch.Tensor, + loss_mask: torch.Tensor, + position_ids: torch.Tensor, + hidden_states: torch.Tensor, + target_p_padded: torch.Tensor, + position_mask: torch.Tensor, + seq_length: int, + ) -> StepState: + usp_chunk_size = seq_length - ttt_length + if usp_chunk_size <= 0: + raise ValueError( + f"USP local seq_length ({seq_length}) must be larger than " + f"ttt_length ({ttt_length})" + ) + target_p = target_p_padded[:, idx : idx + usp_chunk_size, :] + return StepState( + input_ids=global_input_ids[:, :usp_chunk_size], + hidden_states=hidden_states[:, :usp_chunk_size, :], + position_ids=position_ids[:, : usp_chunk_size * self.sp_ulysses_degree], + attention_mask=attention_mask[:, :usp_chunk_size], + target_p=target_p, + position_mask=position_mask[:, :usp_chunk_size, :], + loss_mask=loss_mask[:, :usp_chunk_size, :], + ) + + def reduce_metrics( + self, *, local_correct: torch.Tensor, local_denom: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor]: + local_correct = dist_nn.all_reduce( + local_correct, op=dist.ReduceOp.SUM, group=self.sp_group + ) + local_denom = dist_nn.all_reduce( + local_denom, op=dist.ReduceOp.SUM, group=self.sp_group + ) + return local_correct, local_denom + + def reduce_loss(self, loss: torch.Tensor) -> torch.Tensor: + loss = dist_nn.all_reduce(loss, op=dist.ReduceOp.SUM, group=self.sp_group) + loss = loss / self.sp_world_size + return loss diff --git a/idea1/specforge/core/loss.py b/idea1/specforge/core/loss.py new file mode 100644 index 0000000000000000000000000000000000000000..30e7fba7dd49cce3706ec9c740d9e597f0eade47 --- /dev/null +++ b/idea1/specforge/core/loss.py @@ -0,0 +1,244 @@ +""" +This file incorporates code from Unsloth licensed under the Apache License, Version 2.0. +See the original Unsloth repository at https://github.com/unslothai/unsloth. +The idea of in-place backward pass is from Liger-Kernel. 
+See the original Liger-Kernel repository at https://github.com/linkedin/Liger-Kernel. +""" + +import torch +import torch.nn as nn +import triton +import triton.language as tl + + +# Reference implementation +@torch.compile(dynamic=None) +def _compute_loss(logits, target_p, position_mask): + logits = logits.float() + out_logp = nn.LogSoftmax(dim=2)(logits) + plogp = target_p * out_logp + loss = -torch.sum(position_mask * plogp, 2).mean() + return loss + + +def _calculate_settings(n): + # reference: https://github.com/unslothai/unsloth/blob/fd753fed99ed5f10ef8a9b7139588d9de9ddecfb/unsloth/kernels/utils.py#L43 + + MAX_FUSED_SIZE = 131072 + BLOCK_SIZE = triton.next_power_of_2(n) + if BLOCK_SIZE > MAX_FUSED_SIZE: + raise RuntimeError( + f"Cannot launch Triton kernel since n = {n} exceeds the recommended Triton blocksize = {MAX_FUSED_SIZE}." + ) + + num_warps = 4 + if BLOCK_SIZE >= 32768: + num_warps = 32 + elif BLOCK_SIZE >= 8192: + num_warps = 16 + elif BLOCK_SIZE >= 2048: + num_warps = 8 + + # AMD GPU (ROCm) + if hasattr(torch.version, "hip") and torch.version.hip is not None: + num_warps //= 2 + + return BLOCK_SIZE, num_warps + + +@triton.jit +def log_softmax_forward_kernel( + logits_ptr, + logits_stride, + target_ptr, + target_stride, + position_mask_ptr, + position_mask_stride, + loss_ptr, + loss_stride, + m_ptr, + d_ptr, + n_cols, + BLOCK_SIZE: tl.constexpr, +): + program_id = tl.program_id(0).to(tl.int64) + logits_ptr += program_id * logits_stride + target_ptr += program_id * target_stride + position_mask_ptr += program_id * position_mask_stride + position_mask = tl.load(position_mask_ptr) + if position_mask == 0: + return + + m = float("-inf") + d = 0.0 + + for i in range(0, n_cols, BLOCK_SIZE): + offsets = i + tl.arange(0, BLOCK_SIZE) + mask = offsets < n_cols + logits_block = tl.load( + logits_ptr + offsets, mask=mask, other=float("-inf") + ).cast(tl.float32) + block_max = tl.max(tl.where(mask, logits_block, float("-inf"))) + m_new = tl.maximum(m, block_max) 
+ d = d * tl.exp(m - m_new) + tl.sum( + tl.where(mask, tl.exp(logits_block - m_new), 0.0) + ) + m = m_new + + loss = 0.0 + for i in range(0, n_cols, BLOCK_SIZE): + offsets = i + tl.arange(0, BLOCK_SIZE) + mask = offsets < n_cols + logits_block = tl.load(logits_ptr + offsets, mask=mask, other=0.0).cast( + tl.float32 + ) + target_block = tl.load(target_ptr + offsets, mask=mask, other=0.0).cast( + tl.float32 + ) + # log-softmax: log(exp(x - max) / sum) = (x - max) - log(sum) + normalized_logits = logits_block - m + log_normalizer = tl.log(d) + log_softmax_logits = normalized_logits - log_normalizer + weighted_log_prob = target_block * log_softmax_logits + loss += tl.sum(tl.where(mask, weighted_log_prob, 0.0)) + + loss_ptr += program_id * loss_stride + m_ptr += program_id + d_ptr += program_id + tl.store(loss_ptr, -loss) + tl.store(m_ptr, m.to(tl.float32)) + tl.store(d_ptr, d.to(tl.float32)) + + +@triton.jit +def log_softmax_backward_kernel( + logits_ptr, + logits_stride, + target_ptr, + target_stride, + position_mask_ptr, + grad_output_ptr, + scaling_factor, + m_ptr, + d_ptr, + n_cols, + BLOCK_SIZE: tl.constexpr, +): + program_id = tl.program_id(0).to(tl.int64) + logits_ptr += program_id * logits_stride + target_ptr += program_id * target_stride + position_mask_ptr += program_id + + position_mask = tl.load(position_mask_ptr) + if position_mask == 0: + for i in range(0, n_cols, BLOCK_SIZE): + offsets = i + tl.arange(0, BLOCK_SIZE) + mask = offsets < n_cols + tl.store(logits_ptr + offsets, 0.0, mask=mask) + return + + m_ptr += program_id + d_ptr += program_id + m = tl.load(m_ptr).to(tl.float32) + d = tl.load(d_ptr).to(tl.float32) + grad_output = tl.load(grad_output_ptr).to(tl.float32) + grad_output = grad_output * scaling_factor + + # First pass: compute sum of (target * grad_output) + target_grad_sum = 0.0 + for i in range(0, n_cols, BLOCK_SIZE): + offsets = i + tl.arange(0, BLOCK_SIZE) + mask = offsets < n_cols + target_block = tl.load(target_ptr + offsets, mask=mask, 
other=0.0).cast( + tl.float32 + ) + target_grad_sum += tl.sum(tl.where(mask, target_block * grad_output, 0.0)) + + # Second pass: compute log-softmax gradients + for i in range(0, n_cols, BLOCK_SIZE): + offsets = i + tl.arange(0, BLOCK_SIZE) + mask = offsets < n_cols + logits_block = tl.load(logits_ptr + offsets, mask=mask, other=0.0).cast( + tl.float32 + ) + target_block = tl.load(target_ptr + offsets, mask=mask, other=0.0).cast( + tl.float32 + ) + softmax_prob = tl.exp(logits_block - m) / d + normalized_grad = softmax_prob * target_grad_sum + grad_block = -(target_block * grad_output - normalized_grad) + tl.store(logits_ptr + offsets, grad_block.to(tl.float32), mask=mask) + + +class LogSoftmaxLoss(torch.autograd.Function): + @staticmethod + def forward(ctx, logits, target, position_mask): + B, T, V = logits.shape + loss = torch.zeros((B * T, 1), device=logits.device) + logits_flat = logits.contiguous().view(B * T, V) + target_flat = target.contiguous().view(B * T, V) + position_mask_flat = position_mask.contiguous().view(B * T, 1).bool() + grid = (B * T,) + m = torch.zeros((B * T,), device=logits.device, dtype=torch.float32) + d = torch.zeros((B * T,), device=logits.device, dtype=torch.float32) + BLOCK_SIZE, num_warps = _calculate_settings(V) + log_softmax_forward_kernel[grid]( + logits_flat, + logits_flat.stride(0), + target_flat, + target_flat.stride(0), + position_mask_flat, + position_mask_flat.stride(0), + loss, + loss.stride(0), + m, + d, + V, + BLOCK_SIZE=BLOCK_SIZE, + num_warps=num_warps, + ) + ctx.save_for_backward(logits.detach(), target, position_mask, m, d) + return loss.squeeze(1).mean() + + @staticmethod + def backward(ctx, grad_output): + logits, target, position_mask, m, d = ctx.saved_tensors + B, T, V = logits.shape + scaling_factor = 1.0 / (B * T) + logits = logits.contiguous().view(B * T, V) + target = target.contiguous().view(B * T, V) + position_mask = position_mask.contiguous().view(B * T, 1).bool() + grid = (B * T,) + BLOCK_SIZE, num_warps 
= _calculate_settings(V) + log_softmax_backward_kernel[grid]( + logits, + logits.stride(0), + target, + target.stride(0), + position_mask, + grad_output, + scaling_factor, + m, + d, + V, + BLOCK_SIZE=BLOCK_SIZE, + num_warps=num_warps, + ) + logits = logits.view(B, T, V) + return logits, None, None, None, None + + +if __name__ == "__main__": + device = "cuda" + B, T, V = 1, 1024, 16000 + logits = torch.randn(B, T, V, device=device, requires_grad=True) + logits2 = logits.clone().detach().requires_grad_(True) + target = torch.randn(B, T, V, device=device) + position_mask = torch.randint(0, 2, (B, T, 1), dtype=torch.bool, device=device) + position_mask = torch.ones((B, T, 1), dtype=torch.bool, device=device) + output1 = LogSoftmaxLoss.apply(logits, target, position_mask) + output2 = _compute_loss(logits2, target, position_mask) + torch.testing.assert_close(output1, output2, rtol=1e-4, atol=1e-4) + output1.backward() + output2.backward() + torch.testing.assert_close(logits.grad, logits2.grad, rtol=1e-4, atol=1e-4) diff --git a/idea1/specforge/data/__init__.py b/idea1/specforge/data/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..2d18e5099e18d55c4d96a378787592e9443dd25d --- /dev/null +++ b/idea1/specforge/data/__init__.py @@ -0,0 +1,17 @@ +from .preprocessing import ( + build_eagle3_dataset, + build_offline_eagle3_dataset, + generate_vocab_mapping_file, + preprocess_conversations, +) +from .template import ChatTemplate +from .utils import prepare_dp_dataloaders + +__all__ = [ + "build_eagle3_dataset", + "build_offline_eagle3_dataset", + "generate_vocab_mapping_file", + "preprocess_conversations", + "prepare_dp_dataloaders", + "ChatTemplate", +] diff --git a/idea1/specforge/data/__pycache__/__init__.cpython-311.pyc b/idea1/specforge/data/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c51f79c01c58ed6161c9bdc726b4541e892efa5b Binary files /dev/null and 
class Parser(ABC):
    """Base class for conversation parsers.

    Stores the tokenizer and chat template used by concrete parsers and
    provides `_sanitize_message` for normalizing tool-calling messages.
    """

    def __init__(
        self,
        tokenizer: PreTrainedTokenizer,
        chat_template: ChatTemplate,
    ):
        self.tokenizer = tokenizer
        self.chat_template = chat_template
        # Only these message keys are kept by `_sanitize_message`.
        self.standard_keys = {"role", "content", "tool_calls"}

    @abstractmethod
    def parse(
        self, conversation: "Conversation", max_length: int
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Parse the conversation into token ids and a loss mask.

        Args:
            conversation: The conversation to parse.
            max_length: The maximum length of the tokenized input.

        Returns:
            A tuple of tensors: (input_ids, loss_mask)
        """

    def _sanitize_message(self, message: dict) -> dict:
        """
        Clean up an individual message, handling the following issues:
        1. Keys outside `self.standard_keys` → Removed
        2. `tool_calls` is a JSON string → Parsed; removed (with a warning) when
           it is not valid JSON or does not decode to a list
        3. `tool_calls[].function.arguments` is a string → Parsed as JSON
           (replaced by `{}` with a warning on failure)
        4. Non-standard fields (extra, etc.) in `tool_calls[]` → Removed

        Args:
            message: The raw message dictionary.

        Returns:
            A new dictionary; the input message is not modified.
        """
        cleaned = {k: v for k, v in message.items() if k in self.standard_keys}

        if "tool_calls" not in cleaned:
            return cleaned

        tool_calls = cleaned["tool_calls"]

        # tool_calls is a string → Parsing
        if isinstance(tool_calls, str):
            try:
                tool_calls = json.loads(tool_calls)
            except json.JSONDecodeError:
                warnings.warn(
                    "Failed to parse tool_calls JSON string, removing tool_calls"
                )
                del cleaned["tool_calls"]
                return cleaned
            if not isinstance(tool_calls, list):
                # Valid JSON but not a list of calls: keeping the raw string
                # would hand an unparsed value to the chat template.
                warnings.warn(
                    "tool_calls JSON string did not decode to a list, removing tool_calls"
                )
                del cleaned["tool_calls"]
                return cleaned

        if not isinstance(tool_calls, list):
            # Non-string, non-list values (e.g. None) are passed through as-is.
            return cleaned

        sanitized_tool_calls = []
        for tc in tool_calls:
            if not isinstance(tc, dict):
                continue

            # Only retain the standard fields: id, type, function
            clean_tc = {
                "id": tc.get("id", ""),
                "type": tc.get("type", "function"),
            }

            func = tc.get("function", {})
            if isinstance(func, dict):
                clean_func = {"name": func.get("name", "")}

                arguments = func.get("arguments", {})
                if isinstance(arguments, str):
                    try:
                        arguments = json.loads(arguments)
                    except json.JSONDecodeError:
                        warnings.warn(
                            f"Failed to parse arguments for tool '{clean_func['name']}': "
                            f"{arguments[:100]}..."
                        )
                        arguments = {}

                clean_func["arguments"] = arguments
                clean_tc["function"] = clean_func

            sanitized_tool_calls.append(clean_tc)

        cleaned["tool_calls"] = sanitized_tool_calls
        return cleaned
self.system_prompt}) + + for j, sentence in enumerate(conversation): + role = sentence["role"] + if j == 0: + if role != "user": + warnings.warn( + f"Conversation must start with a 'user' role, but found '{role}'. Conversation truncated." + ) + break + else: + prev_role = conversation[j - 1]["role"] + if role == "tool" and prev_role not in ["assistant", "tool"]: + warnings.warn( + f"A 'tool' message must follow an 'assistant' or 'tool' message, but was preceded by '{prev_role}'. Conversation truncated." + ) + break + if role == "assistant" and prev_role not in ["user", "tool"]: + warnings.warn( + f"An 'assistant' message must follow a 'user' or 'tool' message, but was preceded by '{prev_role}'. Conversation truncated." + ) + break + sentence = self._sanitize_message(sentence) + messages.append(sentence) + try: + conversation = self.apply_chat_template(messages, tool=tool, **kwargs) + except (ValueError, TypeError): + # Fallback rendering for tokenizers without built-in chat_template + warnings.warn( + "Tokenizer does not have a chat_template, using fallback rendering." 
+ ) + parts = [] + bos_token = getattr(self.tokenizer, "bos_token", None) + user_header = self.chat_template.user_header or "" + assistant_header = self.chat_template.assistant_header or "" + end_of_turn = self.chat_template.end_of_turn_token or "" + + # Add BOS token at the start + if bos_token: + parts.append(bos_token) + + for msg in messages: + if msg["role"] == "system": + parts.append(msg["content"]) + elif msg["role"] == "user": + parts.append(f"{user_header}{msg['content']}") + elif msg["role"] == "assistant": + parts.append(f"{assistant_header}{msg['content']}{end_of_turn}") + conversation = "".join(parts) + + if not self.tokenizer.pad_token_id: + self.tokenizer.pad_token_id = self.tokenizer.unk_token_id + + # get input_ids + encoding = self.tokenizer( + conversation, + max_length=max_length, + truncation=True, + return_tensors="pt", + add_special_tokens=False, + ) + input_ids = encoding.input_ids[0] + loss_mask = torch.zeros(len(input_ids), dtype=torch.long) + + matches = list(re.finditer(self.assistant_pattern, conversation, re.DOTALL)) + if train_only_last_turn and matches: + matches = [matches[-1]] # Only keep the last match + + for match in matches: + content_start_char = match.start(1) + content_end_char = match.end(1) + + # --- Core Alternative Operation: Calculate Token Index Based on Prefix String Length --- + # Encode the text "assistant start", the length of which is the position of the starting token. + prefix_ids = self.tokenizer.encode( + conversation[:content_start_char], + add_special_tokens=False, + truncation=True, + max_length=max_length, + ) + # Encodes the text "assistant end", the length of which is the position of the end token. 
class HarmonyParser(Parser):
    """Parser for conversations rendered with `<|start|>…<|end|>` Harmony markers."""

    # Captures everything after `<|start|>assistant` up to (not including) the
    # next `<|start|>user<|message|>` marker, or the end of the string.
    _ASSISTANT_SPAN = re.compile(
        r"<\|start\|>assistant([\s\S]*?)(?=<\|start\|>user<\|message\|>|$)"
    )

    def __init__(self, tokenizer: PreTrainedTokenizer, chat_template: ChatTemplate):
        super().__init__(tokenizer, chat_template)
        self.reasoning_levels = ["low", "medium", "high"]
        self.default_reasoning_level = "low"

    def build_single_turn_prompt(
        self,
        prompt_text: str,
        role: str,
        content: str,
    ) -> str:
        """Embed one message into the prompt text.

        Args:
            prompt_text: The prompt built so far.
            role: One of "system", "assistant_reasoning_effort", "user",
                "assistant_analysis", "assistant_commentary", "assistant_final".
            content: The message content.

        Returns:
            The updated prompt. Note that "system" and
            "assistant_reasoning_effort" REPLACE `prompt_text`, while every
            other role appends to it.

        Raises:
            ValueError: If `role` is not one of the roles listed above.
        """
        if role == "system":
            prompt_text = f"<|start|>system<|message|>{content}<|end|>"
        elif role == "assistant_reasoning_effort":
            prompt_text = (
                "<|start|>system<|message|>You are ChatGPT, a large language model trained by OpenAI.\n"
                "Knowledge cutoff: 2024-06\nCurrent date: 2025-06-28\n\n"
                f"Reasoning: {content.lower()}\n\n"
                "# Valid channels: analysis, commentary, final. "
                "Channel must be included for every message.<|end|>"
            )
        elif role == "user":
            prompt_text += f"<|start|>user<|message|>{content}<|end|>"
        elif role == "assistant_analysis":
            prompt_text += (
                f"<|start|>assistant<|channel|>analysis<|message|>{content}<|end|>"
            )
        elif role == "assistant_commentary":
            prompt_text += (
                f"<|start|>assistant<|channel|>commentary<|message|>{content}<|end|>"
            )
        elif role == "assistant_final":
            prompt_text += (
                f"<|start|>assistant<|channel|>final<|message|>{content}<|end|>"
            )
        else:
            raise ValueError(f"Unknown role: {role}")
        return prompt_text

    def parse(
        self,
        conversation: "Conversation",
        max_length: int,
        preformatted: bool = False,
        train_only_last_turn: bool = False,
        tool: List[Dict] = [],  # unused here and never mutated
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Tokenize a conversation and mark assistant tokens in the loss mask.

        Args:
            conversation: A list of message dicts with "role" and "content", or
                the already rendered text when `preformatted` is True.
            max_length: Truncation length passed to the tokenizer.
            preformatted: Whether `conversation` is already rendered text.
            train_only_last_turn: If True, only the last assistant span is marked.
            tool: Accepted for signature parity with other parsers; not used.

        Returns:
            A tuple (input_ids, loss_mask) of 1-D tensors of equal length.
        """
        if not preformatted:
            prompt_text = ""
            for j, message in enumerate(conversation):
                # Prepend the default reasoning-effort system prompt only when
                # the conversation does not open with its own system message.
                if j == 0 and message["role"] not in (
                    "system",
                    "assistant_reasoning_effort",
                ):
                    prompt_text = self.build_single_turn_prompt(
                        prompt_text,
                        "assistant_reasoning_effort",
                        self.default_reasoning_level,
                    )
                prompt_text = self.build_single_turn_prompt(
                    prompt_text, message["role"], message["content"]
                )
            conversation = prompt_text

        # `0` is a valid token id, so only fall back when no pad id is set.
        if self.tokenizer.pad_token_id is None:
            self.tokenizer.pad_token_id = self.tokenizer.unk_token_id

        encoding = self.tokenizer(
            conversation,
            return_offsets_mapping=True,
            max_length=max_length,
            truncation=True,
            return_tensors="pt",
            add_special_tokens=False,
        )
        input_ids = encoding.input_ids[0]
        offsets = encoding.offset_mapping[0]
        loss_mask = torch.zeros(len(input_ids), dtype=torch.long)

        # Find all assistant segments in the rendered text.
        matches = list(self._ASSISTANT_SPAN.finditer(conversation))
        if train_only_last_turn and matches:
            matches = [matches[-1]]  # Only keep the last match

        if matches and len(input_ids) > 0:
            token_starts = offsets[:, 0]
            token_ends = offsets[:, 1]
            for match in matches:
                # Group 1 is the content after `<|start|>assistant`.
                start_char, end_char = match.span(1)
                # Mark a token only if its character range falls entirely
                # within the content area.
                inside = (token_starts >= start_char) & (token_ends <= end_char)
                loss_mask[inside] = 1

        return input_ids, loss_mask
+ """ + + def __init__( + self, + tokenizer: PreTrainedTokenizer, + chat_template: ChatTemplate, + ): + super().__init__(tokenizer, chat_template) + self.standard_keys = {"role", "content", "tool_calls", "reasoning_content"} + + def apply_chat_template(self, messages, tool, **kwargs) -> str: + """Apply chat template to all messages, handling reasoning_content and tool_calls.""" + conversation = self.tokenizer.apply_chat_template( + messages, + tokenize=False, + add_generation_prompt=False, + add_special_tokens=False, + tools=tool, + **kwargs, + ) + return conversation + + def parse( + self, + conversation: "Conversation", + max_length: int, + preformatted: bool = False, + train_only_last_turn: bool = False, + tool: List[Dict] = [], + **kwargs, + ) -> Dict[str, List[torch.Tensor]]: + """Parse conversation, processing all assistant turns for loss mask.""" + if self.chat_template.enable_thinking: + kwargs["enable_thinking"] = True + return super().parse( + conversation, max_length, preformatted, train_only_last_turn, tool, **kwargs + ) diff --git a/idea1/specforge/data/preprocessing.py b/idea1/specforge/data/preprocessing.py new file mode 100644 index 0000000000000000000000000000000000000000..d5af9479e5b2c4707a4b2724be176942cb2ac488 --- /dev/null +++ b/idea1/specforge/data/preprocessing.py @@ -0,0 +1,784 @@ +# coding=utf-8 +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in HuggingFace Transformers. +# Portions of this code are adapted from: +# - https://github.com/EleutherAI/gpt-neox (Apache License 2.0) +# - https://github.com/huggingface/transformers (Apache License 2.0) +# - https://github.com/SafeAILab/EAGLE (Apache License 2.0) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import gzip +import io +import json +import os +import re +import warnings +from collections import Counter +from typing import Dict, List, Optional, Tuple, Union + +import torch +import torch.nn.functional as F +from tqdm import tqdm +from transformers import ImageProcessingMixin, PreTrainedTokenizer + +from datasets import Dataset as HFDataset + +from ..distributed import get_draft_sp_group, get_sp_ring_group + +try: + from qwen_vl_utils import process_vision_info + + HAS_QWEN_VL_UTILS = True +except ImportError: + HAS_QWEN_VL_UTILS = False + process_vision_info = None + + +from .parse import GeneralParser, HarmonyParser, ThinkingParser +from .template import TEMPLATE_REGISTRY, ChatTemplate + +# define a type called conversation +Conversation = List[Dict[str, str]] + + +# ============================== +# This file is for preprocessing the data +# ============================== + + +def _apply_loss_mask_from_chat_template( + text: str, + offsets: torch.Tensor, + chat_template: ChatTemplate, +) -> torch.Tensor: + """ + Apply loss mask to identify assistant response spans using chat template. + + Args: + text: The formatted conversation text. + offsets: Token offset mapping from tokenizer. + chat_template: The chat template to use for identifying assistant spans. + + Returns: + A tensor indicating which tokens should contribute to the loss (1) or not (0). 
# Copied from https://github.com/SafeAILab/EAGLE/blob/main/eagle/traineagle3/cnets.py
def preprocess_conversations(
    tokenizer: PreTrainedTokenizer,
    conversations: Union[List[Conversation], List[str]],
    chat_template: ChatTemplate,
    max_length: int = 2048,
    is_preformatted: bool = False,
    train_only_last_turn: bool = False,
    tools: Optional[List[List[Dict]]] = None,
    **kwargs,
) -> Dict[str, List[torch.Tensor]]:
    """
    Preprocess a batch of ShareGPT style conversations or pre-formatted text.

    Args:
        tokenizer: The tokenizer to use for tokenization.
        conversations: A list of conversations (if is_preformatted=False) or
                    a list of pre-formatted text strings (if is_preformatted=True).
        chat_template: The chat template to use for formatting/identifying spans.
        max_length: The maximum length of the tokenized input.
        is_preformatted: Whether the input is already formatted text strings.
        train_only_last_turn: If True, only the last assistant turn contributes to the loss.
        tools: Optional list of tools information, one entry per conversation, used
            for tool-use conversations. When omitted, every conversation gets an
            empty tool list.
        **kwargs: Extra per-conversation values. Each value is a sequence indexed
            like `conversations`; element i is forwarded to the parser for
            conversation i.

    Returns:
        A dictionary containing:
            - input_ids: List of tokenized input IDs.
            - loss_mask: List of loss masks indicating which tokens should contribute to the loss.
            - attention_mask: List of attention masks.

    Raises:
        ValueError: If the parser type of `chat_template` is unknown, or if
            `tools` is given with a length different from `conversations`.
    """
    # prepare result
    results = {"input_ids": [], "loss_mask": [], "attention_mask": []}

    parser_type = chat_template.parser_type
    if parser_type == "general":
        parser = GeneralParser(tokenizer, chat_template)
    elif parser_type == "thinking":
        parser = ThinkingParser(tokenizer, chat_template)
    elif parser_type == "openai-harmony":
        parser = HarmonyParser(tokenizer, chat_template)
    else:
        raise ValueError(f"Invalid parser type: {parser_type}")

    num_conversations = len(conversations)
    if tools is None:
        # One empty tool list per conversation. A single shared `[[]]` would make
        # the zip below stop after the first conversation.
        tools = [[] for _ in range(num_conversations)]
    elif len(tools) != num_conversations:
        raise ValueError(
            f"tools has {len(tools)} entries but there are "
            f"{num_conversations} conversations"
        )

    # Transpose column-wise kwargs into one kwargs dict per conversation.
    kwargs_list = [{} for _ in range(num_conversations)]
    for key, value_list in kwargs.items():
        for i, value in enumerate(value_list):
            kwargs_list[i][key] = value

    for source, tool, kwargs_item in zip(conversations, tools, kwargs_list):
        if not source:
            # if the source is None (or empty), skip it
            continue
        input_ids, loss_mask = parser.parse(
            source,
            max_length,
            preformatted=is_preformatted,
            train_only_last_turn=train_only_last_turn,
            tool=tool,
            **kwargs_item,
        )
        results["input_ids"].append(input_ids[None, :])
        results["loss_mask"].append(loss_mask[None, :])
        results["attention_mask"].append(torch.ones_like(loss_mask)[None, :])
    return results
+ + Args: + processor: The image processor to use for processing images. + examples: A list of examples, where each example is a dictionary containing: + - image: The image in the conversation. + - conversations: A list of conversations, where each conversation is a list of messages. + chat_template: The chat template to use for formatting the conversations. + max_length: The maximum length of the tokenized input. + + Returns: + A dictionary containing: + - input_ids: List of tokenized input IDs. + - loss_mask: List of loss masks indicating which tokens should contribute to the loss. + - attention_mask: List of attention masks. + - pixel_values: List of pixel values for images in the examples. + - image_grid_thw: List of image grid tensors. + """ + system_prompt = chat_template.system_prompt + + # prepare result + results = { + "input_ids": [], + "loss_mask": [], + "attention_mask": [], + "pixel_values": [], + "image_grid_thw": [], + } + + # Note: currently, we assume that each example has only one image + for i, image in enumerate(examples["image"]): + source = examples["conversations"][i] + messages = [{"role": "system", "content": system_prompt}] + if not source: + # if the source is None, skip it + continue + + if source[0]["role"] != "user": + # if the first message is not from user, skip it + source = source[1:] + + convroles = ["user", "assistant"] + for j, sentence in enumerate(source): + role = sentence["role"] + assert role == convroles[j % 2], f"unexpected role {role}" + if role == "user": + # if the message is from user and has image, process the image + messages.append( + { + "role": role, + "content": [ + { + "type": "image", + "image": image, + }, + {"type": "text", "text": sentence["content"]}, + ], + } + ) + else: + messages.append({"role": role, "content": sentence["content"]}) + + conversation = processor.apply_chat_template( + messages, + tokenize=False, + add_generation_prompt=False, + ) + # get vision infor use qwen_vl_utils + if not 
def _parse_tools_column(tools_raw) -> List:
    """Normalize a raw `tools` batch column into one tools entry per example."""
    tools = []
    for tool_item in tools_raw:
        if isinstance(tool_item, str):
            # JSON strings are decoded; only strings may go through json.loads.
            try:
                tools.append(json.loads(tool_item))
            except json.JSONDecodeError:
                warnings.warn(
                    f"Failed to parse tools JSON string: {tool_item[:100]}..."
                )
                tools.append([])
        elif isinstance(tool_item, list):
            tools.append(tool_item)
        elif tool_item is None:
            tools.append([])
        else:
            warnings.warn(
                f"Unexpected tools type: {type(tool_item)}, using empty list"
            )
            tools.append([])
    return tools


def build_eagle3_dataset(
    dataset: HFDataset,
    tokenizer: PreTrainedTokenizer,
    chat_template: Optional[str] = None,
    max_length: Optional[int] = 2048,
    shuffle_seed: Optional[int] = 42,
    num_proc: Optional[int] = 8,
    cache_dir: Optional[str] = None,
    cache_key: Optional[str] = None,
    is_vlm: Optional[bool] = False,
    processor: Optional[ImageProcessingMixin] = None,
    is_preformatted: Optional[bool] = False,
    train_only_last_turn: Optional[bool] = False,
) -> HFDataset:
    """
    build eagle3 dataset

    Args:
        dataset: HF dataset to process.
        tokenizer: The tokenizer to use for tokenization.
        chat_template: The chat template to use for formatting conversations.
                        This includes the system prompt and user/assistant tokens
                        required to delineate different parts of the conversation
                        for loss mask generation.
        max_length: The maximum length of the tokenized input.
        shuffle_seed: The seed for shuffling the dataset.
        num_proc: The number of processes to use for multiprocessing.
        cache_dir: The directory to use for caching the processed dataset.
        cache_key: The key to use for caching the processed dataset.
                        Caching is only enabled when both cache_dir and cache_key
                        are given; if only one is given a warning is emitted and
                        the dataset is processed without the named cache file.
        is_vlm: Whether the dataset is for VLM models.
        processor: The image processor to use for processing images.
        is_preformatted: Whether the dataset contains preformatted text of the conversation
                        (e.g. includes system prompt, user and assistant start and end tokens)
                        and doesn't need to have the chat template applied.
                        Note that the chat_template still needs to be specified to determine
                        the assistant spans for loss mask generation.
                        If True, expects "text" column with ready-to-train text.
                        If False, expects "conversations" column with ShareGPT format.
        train_only_last_turn: If True, only the last assistant turn contributes to the loss.
                        Useful for thinking models where history may not contain thoughts.

    Returns:
        The processed HF dataset.

    Raises:
        ValueError: If chat_template is None, or if the expected "text" /
            "conversations" column is missing from a batch.
        AssertionError: If is_vlm is True without a processor, or if the chat
            template name is not registered.
    """
    if is_vlm:
        assert processor is not None, "processor must be provided when is_vlm is True"

    # Validate chat_template requirement
    if chat_template is None:
        raise ValueError("chat_template must be provided for all dataset types")

    assert (
        chat_template in TEMPLATE_REGISTRY.get_all_template_names()
    ), f"Chat template {chat_template} not found in TEMPLATE_REGISTRY, you may need to register it first"

    template: ChatTemplate = TEMPLATE_REGISTRY.get(chat_template)

    dataset = dataset.shuffle(seed=shuffle_seed)
    original_cols = dataset.column_names

    def preprocess_function(examples):
        # Handle different dataset formats
        if is_vlm:
            return preprocess_vlm_conversations(
                processor,
                examples,
                template,
                max_length,
            )

        if is_preformatted:
            # Handle pre-formatted text (should be in "text" column)
            if "text" not in examples:
                raise ValueError(
                    f"Expected 'text' column for is_preformatted=True, but found columns: {list(examples.keys())}"
                )
            texts = examples["text"]
            return preprocess_conversations(
                tokenizer,
                texts,
                template,
                max_length,
                is_preformatted=True,
                train_only_last_turn=train_only_last_turn,
                # One (empty) tool list per text so every row in the batch is
                # paired with a tools entry and processed.
                tools=[[] for _ in range(len(texts))],
            )

        # Handle ShareGPT conversations
        if "conversations" not in examples:
            raise ValueError(
                f"Expected 'conversations' column for is_preformatted=False, but found columns: {list(examples.keys())}"
            )
        conversations = examples.pop("conversations")
        if "id" in examples:
            examples.pop("id")
        if "tools" in examples:
            # Parse tools: handle JSON strings from safe_conversations_generator
            tools = _parse_tools_column(examples.pop("tools"))
        else:
            tools = [[] for _ in range(len(conversations))]
        # Remaining columns are forwarded as per-conversation keyword arguments.
        return preprocess_conversations(
            tokenizer,
            conversations,
            template,
            max_length,
            is_preformatted=False,
            train_only_last_turn=train_only_last_turn,
            tools=tools,
            **examples,
        )

    # Process dataset only once. Defaults are bound up front so that a partial
    # cache configuration (only one of cache_dir / cache_key) still leaves both
    # names defined for the map() call below.
    load_from_cache_file = False
    cache_file_name = None
    if cache_dir and cache_key:
        load_from_cache_file = True
        os.makedirs(cache_dir, exist_ok=True)
        cache_file_name = os.path.join(cache_dir, f"{cache_key}.pkl")
        print(f"dataset is cached at {cache_file_name}")
    elif cache_dir is None and cache_key is None:
        print("dataset is not cached")
    else:
        warnings.warn(
            "cache_dir and cache_key must be provided together to make caching work"
        )

    # Disable tokenizers internal parallelism when using multiprocessing to avoid
    # deadlocks caused by forked Rust threads (see huggingface/tokenizers#1391).
    if num_proc is not None and num_proc > 1:
        os.environ["TOKENIZERS_PARALLELISM"] = "false"

    # adjust batch size based on dataset type
    if is_vlm:
        # reduce batch size for VLM datasets to avoid PyArrow offset overflow
        batch_size = 200
    else:
        batch_size = 1000  # default for conversations

    dataset = dataset.map(
        preprocess_function,
        batched=True,
        num_proc=num_proc,
        batch_size=batch_size,
        remove_columns=original_cols,
        # keep_in_memory=True,
        load_from_cache_file=load_from_cache_file,
        cache_file_name=cache_file_name,
    )

    dataset.set_format(type="torch")
    return dataset
+ """ + self.datapaths = datapath + self.transform = transform + self._epoch = 0 + self.max_len = max_len + self.ttt_length = ttt_length + self.use_usp_preprocess = use_usp_preprocess + if use_usp_preprocess: + sp_group = get_draft_sp_group() + self.sp_rank = torch.distributed.get_rank(sp_group) + self.sp_size = torch.distributed.get_world_size(sp_group) + ring_group = get_sp_ring_group() + self.ring_rank = torch.distributed.get_rank(ring_group) + self.sp_ring_size = torch.distributed.get_world_size(ring_group) + + @staticmethod + def process_data(data, max_len, transform=None): + new_data = {} + # Squeeze due to our data generation script adding a batch dimension + hidden_state = data["aux_hidden_state"].squeeze(0)[:max_len][None, :] + target = data["hidden_state"].squeeze(0)[:max_len][None, :] + + input_ids = data["input_ids"][:max_len][None, :] + loss_mask = data["loss_mask"][:max_len][None, :] + loss_mask[0, -1] = 0 + + new_data["attention_mask"] = torch.ones_like(loss_mask, dtype=torch.long) + new_data["loss_mask"] = loss_mask + new_data["target"] = target + new_data["hidden_state"] = hidden_state + new_data["input_ids"] = input_ids + if transform: + new_data = transform(new_data) + return new_data + + @staticmethod + def process_data_usp( + data, + max_len, + ttt_length=1, + transform=None, + sp_rank=0, + sp_size=1, + ring_rank=0, + sp_ring_size=1, + ): + """ + USP preprocess: shard all sequences by sp_rank and add TTT overlap. + Each local sequence length = ceil(max_len / sp_size) + ttt_length. 
+ """ + new_data = {} + + input_ids = data["input_ids"] + if input_ids.ndim == 1: + input_ids = input_ids.unsqueeze(0) + + global_len = min(max_len, input_ids.shape[1]) + chunk_size = (global_len + sp_size - 1) // sp_size + start = sp_rank * chunk_size + local_len = chunk_size + ttt_length + + end = min(start + local_len, global_len) + + def _slice_and_pad(tensor): + if tensor.ndim == 1: + tensor = tensor.unsqueeze(0) + tensor = tensor[:, :global_len] + sliced = tensor[:, start : min(end, tensor.shape[1])] + valid_len = sliced.shape[1] + if valid_len < local_len: + pad_len = local_len - valid_len + if tensor.ndim == 2: + sliced = F.pad(sliced, (0, pad_len)) + else: + sliced = F.pad(sliced, (0, 0, 0, pad_len)) + return sliced.contiguous(), valid_len + + if "aux_hidden_state" not in data or data["aux_hidden_state"] is None: + raise KeyError("aux_hidden_state is required for OfflineEagle3Dataset") + new_data["hidden_state"], _ = _slice_and_pad(data["aux_hidden_state"]) + new_data["target"], _ = _slice_and_pad(data["hidden_state"]) + + new_data["input_ids"], valid_len = _slice_and_pad(input_ids) + + full_loss_mask = data["loss_mask"] + if full_loss_mask.ndim == 1: + full_loss_mask = full_loss_mask.unsqueeze(0) + + full_loss_mask = full_loss_mask[:, :global_len].clone() + if full_loss_mask.numel() > 0: + full_loss_mask[0, -1] = 0 + new_data["loss_mask"], _ = _slice_and_pad(full_loss_mask) + + local_len = new_data["input_ids"].shape[1] + attention_mask = torch.zeros((1, local_len), dtype=torch.long) + attention_mask[:, :valid_len] = 1 + new_data["attention_mask"] = attention_mask + + # Position ids should align with Ulysses all2all-expanded sequence length. + # Local seq_len (per sp_rank) = local_len; attention uses (local_len - ttt_length). 
+ sp_ulysses_size = max(1, sp_size // sp_ring_size) + usp_chunk_size = max(local_len - ttt_length, 0) + ring_chunk = usp_chunk_size * sp_ulysses_size + ring_start = ring_rank * ring_chunk + new_data["position_ids"] = torch.arange( + ring_start, ring_start + ring_chunk, dtype=torch.long + ).unsqueeze(0) + + if transform: + new_data = transform(new_data) + + return new_data + + def __len__(self): + return len(self.datapaths) + + def _open_file(self, index): + """ + Opens the file with memory mapping. + This operation is virtually instant and consumes negligible RAM + because no data is actually read from disk yet. + """ + data_path = self.datapaths[index] + if data_path.endswith(".gz"): + with gzip.open(data_path, "rb") as f: + return torch.load(io.BytesIO(f.read()), weights_only=False) + return torch.load(data_path, weights_only=False, mmap=True) + + def __getitem__(self, index): + try: + data = self._open_file(index) + except Exception as e: + print(f"ERROR Failed to load {self.datapaths[index]} with error {e}") + data = self._open_file(0) + + # 2. 
def build_offline_eagle3_dataset(
    hidden_states_path: str,
    max_len: int = 2048,
    ttt_length: int = 1,
    use_usp_preprocess: bool = False,
) -> torch.utils.data.Dataset:
    """Create an ``OfflineEagle3Dataset`` over the files found under a directory.

    Args:
        hidden_states_path: Directory searched (via ``list_local_files``) for
            the stored hidden-state files.
        max_len: Maximum sequence length forwarded to the dataset.
        ttt_length: TTT overlap length forwarded to the dataset.
        use_usp_preprocess: Forwarded to the dataset to enable USP sharding.

    Returns:
        The constructed dataset.
    """
    checkpoint_files = list_local_files(hidden_states_path)
    dataset_options = {
        "max_len": max_len,
        "ttt_length": ttt_length,
        "use_usp_preprocess": use_usp_preprocess,
    }
    return OfflineEagle3Dataset(checkpoint_files, **dataset_options)
def process_token_dict_to_mappings(
    token_dict: Counter,
    draft_vocab_size: int,
    target_vocab_size: int,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Process token_dict to create the d2t and t2d mappings.

    The ``draft_vocab_size`` most frequent tokens form the draft vocabulary.
    If ``token_dict`` holds fewer entries than that, it is padded **in place**
    with zero-frequency ids taken from ``range(draft_vocab_size)``.

    Args:
        token_dict: A Counter object mapping token ids to their frequencies.
        draft_vocab_size: The size of the draft vocabulary.
        target_vocab_size: The size of the target vocabulary.

    Returns:
        A tuple containing:
            - d2t: For each draft id ``i`` (draft ids follow ascending target
              id), the offset ``target_id - i``.
            - t2d: A boolean tensor of length ``target_vocab_size`` that is
              True where the target token id belongs to the draft vocabulary.
    """
    if len(token_dict) < draft_vocab_size:
        existing_tokens = set(token_dict.keys())
        missing_tokens = set(range(draft_vocab_size)) - existing_tokens
        for token in missing_tokens:
            token_dict[token] = 0
            if len(token_dict) >= draft_vocab_size:
                break
    print(f"Added missing tokens to reach draft vocab size: {draft_vocab_size}")
    print(f"Total tokens after addition: {len(token_dict)}")
    total_frequency = sum(token_dict.values())
    top_N = token_dict.most_common(draft_vocab_size)
    top_N_frequency_sum = sum(freq for _, freq in top_N)

    if total_frequency == 0:
        print(
            "Warning: Total token frequency is zero. All tokens will have zero ratio."
        )
        top_N_ratio = 0.0
    else:
        top_N_ratio = top_N_frequency_sum / total_frequency

    print(f"top {draft_vocab_size} token frequency ratio: {top_N_ratio:.2%}")
    used_tokens = sorted(key for key, _ in top_N)
    # Membership is tested once per target id; a set keeps that O(1) instead
    # of scanning the whole draft vocabulary list every time.
    used_token_set = set(used_tokens)

    d2t = torch.tensor([token - idx for idx, token in enumerate(used_tokens)])
    t2d = torch.tensor([i in used_token_set for i in range(target_vocab_size)])

    return d2t, t2d
class TemplateRegistry:
    """
    This is a registry for the chat template. Sgl-spec will register some common chat templates here.
    If you have a custom chat template, you can register it via the example below.

    Example:
    ```python
        from specforge.data.template import TEMPLATE_REGISTRY, ChatTemplate
        TEMPLATE_REGISTRY.register(
            name="custom",
            template=ChatTemplate(
                assistant_header="<|start_header_id|>assistant<|end_header_id|>\n\n",
                user_header="<|start_header_id|>user<|end_header_id|>",
                system_prompt="You are a helpful assistant.",
                end_of_turn_token="<|eot_id|>"
            )
        )
    ```
    """

    def __init__(self):
        # Maps template name -> ChatTemplate.
        self.templates = {}

    def register(self, name: str, template: ChatTemplate, override: bool = False):
        """
        Register a chat template for a model type.

        Args:
            name(str): The name of the chat template.
            template(ChatTemplate): The chat template.
            override(bool): Whether to override the existing template, default to False

        Raises:
            AssertionError: If ``name`` is already registered and ``override`` is False.
        """
        # A duplicate name is only an error when the caller did not ask to
        # replace it; ``override=True`` must always be accepted.
        assert (
            override or name not in self.templates
        ), f"Chat template for the model type {name} has already been registered"
        self.templates[name] = template

    def get(self, name: str) -> ChatTemplate:
        """
        Get the chat template for a model type.

        Args:
            name(str): The name of the chat template.

        Returns:
            ChatTemplate: The chat template.

        Raises:
            KeyError: If no template was registered under ``name``.
        """
        return self.templates[name]

    def get_all_template_names(self) -> List[str]:
        """
        Get all the template names.

        Returns:
            List[str]: The list of template names, in registration order.
        """
        return list(self.templates.keys())
assistant_header="<|im_start|>assistant<|im_sep|>", + user_header="<|im_start|>user<|im_sep|>", + system_prompt="You are a helpful assistant.", + end_of_turn_token="<|im_end|>", + ), +) + +TEMPLATE_REGISTRY.register( + name="phi4-mini", + template=ChatTemplate( + assistant_header="<|assistant|>", + user_header="<|user|>", + system_prompt="You are a helpful assistant.", + end_of_turn_token="<|end|>", + ), +) + +TEMPLATE_REGISTRY.register( + name="gpt-oss-naive", + template=ChatTemplate( + assistant_header="<|start|>assistant<|channel|>analysis<|message|>", + user_header="<|start|>user<|message|>", + system_prompt=None, + end_of_turn_token="<|end|>", + ), +) + + +TEMPLATE_REGISTRY.register( + name="gpt-oss", + template=ChatTemplate( + assistant_header=None, # the headers are not applicable to openai-harmony's channel tags + user_header=None, + system_prompt=None, + end_of_turn_token=None, + parser_type="openai-harmony", + ), +) + +TEMPLATE_REGISTRY.register( + name="deepseek-r1-distill", + template=ChatTemplate( + assistant_header="<|Assistant|>", + user_header="<|User|>", + end_of_turn_token=None, + system_prompt=None, + ), +) + +TEMPLATE_REGISTRY.register( + name="qwen3-thinking", + template=ChatTemplate( + assistant_header="<|im_start|>assistant\n", + user_header="<|im_start|>user\n", + system_prompt="You are a helpful assistant.", + end_of_turn_token="<|im_end|>\n", + parser_type="thinking", + enable_thinking=True, + ), +) + + +TEMPLATE_REGISTRY.register( + name="qwen3-instruct", + template=ChatTemplate( + assistant_header="<|im_start|>assistant\n", + user_header="<|im_start|>user\n", + system_prompt="You are a helpful assistant.", + end_of_turn_token="<|im_end|>\n", + ignore_token=["\n\n\n\n"], + ), +) + +TEMPLATE_REGISTRY.register( + name="qwen3-next-thinking", + template=ChatTemplate( + assistant_header="<|im_start|>assistant\n\n", + user_header="<|im_start|>user\n", + system_prompt="You are a helpful assistant.", + end_of_turn_token="<|im_end|>\n", + 
parser_type="thinking", + enable_thinking=True, + ), +) + +TEMPLATE_REGISTRY.register( + name="kimi-k2-thinking", + template=ChatTemplate( + assistant_header="<|im_assistant|>assistant<|im_middle|>", + user_header="<|im_start|>user\n", + system_prompt="You are a helpful assistant.", + end_of_turn_token="<|im_end|>", + parser_type="thinking", + enable_thinking=True, + ), +) + +TEMPLATE_REGISTRY.register( + name="kimi-k2-instruct", + template=ChatTemplate( + assistant_header="<|im_assistant|>assistant<|im_middle|>", + user_header="<|im_start|>user\n", + system_prompt="You are a helpful assistant.", + end_of_turn_token="<|im_end|>", + ), +) + +TEMPLATE_REGISTRY.register( + name="deepseek-v3", + template=ChatTemplate( + assistant_header="<|Assistant|>", + user_header="<|User|>", + system_prompt="You are a helpful assistant.", + end_of_turn_token="<|end▁of▁sentence|>", + ), +) + +TEMPLATE_REGISTRY.register( + name="ling-flash-2.0", + template=ChatTemplate( + assistant_header="ASSISTANT", + user_header="HUMAN", + system_prompt="You are a helpful assistant.", + end_of_turn_token="<|role_end|>", + ), +) + +TEMPLATE_REGISTRY.register( + name="deepseek-v32", + template=ChatTemplate( + assistant_header="<|Assistant|>", + user_header="<|User|>", + system_prompt="", + end_of_turn_token="<|end▁of▁sentence|>", + parser_type="thinking", + enable_thinking=True, + ), +) + +TEMPLATE_REGISTRY.register( + name="gemma", + template=ChatTemplate( + assistant_header="model\n", + user_header="user\n", + system_prompt="You are a helpful assistant.", + end_of_turn_token="\n", + ), +) + +TEMPLATE_REGISTRY.register( + name="longcat", + template=ChatTemplate( + assistant_header=" ASSISTANT:", + user_header=" USER:", + system_prompt="You are a helpful assistant.", + end_of_turn_token="", + assistant_pattern_type="longcat", + ), +) + +TEMPLATE_REGISTRY.register( + name="longcat_xml", + template=ChatTemplate( + assistant_header="", + user_header="", + system_prompt="You are a helpful assistant.", + 
end_of_turn_token="", + ), +) + + +TEMPLATE_REGISTRY.register( + name="qwen3.5", + template=ChatTemplate( + assistant_header="<|im_start|>assistant\n\n", + user_header="<|im_start|>user\n", + system_prompt="", + end_of_turn_token="<|im_end|>\n", + parser_type="thinking", + enable_thinking=True, + ), +) diff --git a/idea1/specforge/data/utils.py b/idea1/specforge/data/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..93fd6f58a6adf06051a14a58e5999bc35dad4dd4 --- /dev/null +++ b/idea1/specforge/data/utils.py @@ -0,0 +1,347 @@ +# coding=utf-8 +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in HuggingFace Transformers. +# Portions of this code are adapted from: +# - https://github.com/SafeAILab/EAGLE (Apache License 2.0) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import re +from typing import Any, Dict, List, Optional + +import torch +import torch.distributed as dist +from torch.utils.data import DataLoader, DistributedSampler + +from datasets import Dataset +from specforge.distributed import get_draft_sp_group, get_sp_ulysses_group + + +class DataCollatorWithPadding: + """ + Datacollator that will dynamically pad the inputs for batching. 
+ """ + + def __init__(self): + self.sp_degree = torch.distributed.get_world_size(get_draft_sp_group()) + self.ulysses_degree = torch.distributed.get_world_size(get_sp_ulysses_group()) + + def paddingtensor(self, intensors: torch.Tensor, N: int) -> torch.Tensor: + """ + Pad to the longest sequence in the batch. + + Args: + intensors: (B, n, S) + N: the length to pad to, N >= n + + Returns: + outtensors: (B, N, S) + """ + B, n, S = intensors.shape + padding_tensor = torch.zeros( + B, N - n, S, dtype=intensors.dtype, device=intensors.device + ) + outtensors = torch.cat((intensors, padding_tensor), dim=1) + return outtensors + + def paddingtensor2D(self, intensors: torch.Tensor, N: int) -> torch.Tensor: + """ + Pad 2D tensor to the longest sequence in the batch. + + Args: + intensors: (B, n) + N: the length to pad to, N >= n + + Returns: + outtensors: (B, N) + """ + B, n = intensors.shape + padding_tensor = torch.zeros( + B, N - n, dtype=intensors.dtype, device=intensors.device + ) + outtensors = torch.cat((intensors, padding_tensor), dim=1) + return outtensors + + def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]: + """ + Collate a batch of features. 
+ + Args: + features: A list of features, where each feature is a dictionary containing: + - input_ids: torch.Tensor of shape (n,) + - attention_mask: torch.Tensor of shape (n,) + - loss_mask: torch.Tensor of shape (n,) + + Returns: + A dictionary containing: + - input_ids: torch.Tensor of shape (B, N) + - attention_mask: torch.Tensor of shape (B, N) + - loss_mask: torch.Tensor of shape (B, N) + """ + max_length = max(item["input_ids"].shape[1] for item in features) + + # pad for sequence parrel + max_length = ( + (max_length + self.sp_degree - 1) // self.sp_degree + ) * self.sp_degree + # position max len, ulysses do not need chuck position ids + position_max_len = max_length * self.ulysses_degree + + batch_input_ids = torch.cat( + [self.paddingtensor2D(item["input_ids"], max_length) for item in features] + ) + batch_attention_mask = torch.cat( + [ + self.paddingtensor2D(item["attention_mask"], max_length) + for item in features + ] + ) + batch_loss_mask = torch.cat( + [self.paddingtensor2D(item["loss_mask"], max_length) for item in features] + ) + if "position_ids" in features[0]: + batch_position_ids = torch.cat( + [ + self.paddingtensor2D(item["position_ids"], position_max_len) + for item in features + ] + ) + else: + batch_position_ids = None + batch = { + "input_ids": batch_input_ids, + "attention_mask": batch_attention_mask, + "loss_mask": batch_loss_mask, + "hidden_state": None, + "target": None, + } + if batch_position_ids is not None: + batch["position_ids"] = batch_position_ids + if all("hidden_state" in item for item in features): + assert all( + "target" in item for item in features + ), "target is required when hidden_state is provided" + if self.sp_degree > 1: # USP mode + batch["hidden_state"] = torch.cat( + [item["hidden_state"] for item in features] + ) + else: + batch["hidden_state"] = torch.cat( + [ + self.paddingtensor(item["hidden_state"], max_length) + for item in features + ] + ) + batch["target"] = torch.cat( + 
class VlmDataCollatorWithPadding:
    """
    Collator for vision-language samples that pads every sequence to the
    longest one in the batch and concatenates the image tensors.
    """

    def paddingtensor(self, intensors: torch.Tensor, N: int) -> torch.Tensor:
        """
        Zero-pad a 3D tensor along dim 1.

        Args:
            intensors: (B, n, S)
            N: the length to pad to, N >= n

        Returns:
            outtensors: (B, N, S)
        """
        batch, cur_len, width = intensors.shape
        filler = torch.zeros(batch, N - cur_len, width, dtype=intensors.dtype)
        return torch.cat((intensors, filler), dim=1)

    def paddingtensor2D(self, intensors: torch.Tensor, N: int) -> torch.Tensor:
        """
        Zero-pad a 2D tensor along dim 1.

        Args:
            intensors: (B, n)
            N: the length to pad to, N >= n

        Returns:
            outtensors: (B, N)
        """
        batch, cur_len = intensors.shape
        filler = torch.zeros(batch, N - cur_len, dtype=intensors.dtype)
        return torch.cat((intensors, filler), dim=1)

    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
        """
        Collate a batch of features.

        Args:
            features: A list of per-sample dictionaries. Each one provides
                ``input_ids``, ``attention_mask`` and ``loss_mask`` (indexed
                here as 2D ``(1, n)`` tensors), plus ``pixel_values`` and
                ``image_grid_thw``; ``hidden_state`` / ``target`` are optional.

        Returns:
            A dictionary with ``input_ids``, ``attention_mask`` and
            ``loss_mask`` padded to the longest sequence and concatenated
            along dim 0, ``pixel_values`` and ``image_grid_thw`` concatenated
            along dim 0, and ``hidden_state`` / ``target`` which are padded
            tensors when every sample provides ``hidden_state`` and ``None``
            otherwise.
        """
        max_length = max(item["input_ids"].shape[1] for item in features)

        def _pad_and_stack_2d(key):
            padded = [self.paddingtensor2D(item[key], max_length) for item in features]
            return torch.cat(padded)

        def _pad_and_stack_3d(key):
            padded = [self.paddingtensor(item[key], max_length) for item in features]
            return torch.cat(padded)

        batch = {
            "input_ids": _pad_and_stack_2d("input_ids"),
            "attention_mask": _pad_and_stack_2d("attention_mask"),
            "loss_mask": _pad_and_stack_2d("loss_mask"),
            "pixel_values": torch.cat(
                [item["pixel_values"] for item in features], dim=0
            ),
            "image_grid_thw": torch.cat(
                [item["image_grid_thw"] for item in features], dim=0
            ),
            "hidden_state": None,
            "target": None,
        }

        has_hidden_states = all("hidden_state" in item for item in features)
        if not has_hidden_states:
            return batch

        assert all(
            "target" in item for item in features
        ), "target is required when hidden_state is provided"
        batch["hidden_state"] = _pad_and_stack_3d("hidden_state")
        batch["target"] = _pad_and_stack_3d("target")
        return batch
def parse_harmony_message_content(content):
    """
    Split a message body into its Harmony channel segments.

    Every ``<|channel|>NAME<|message|>BODY<|end|>`` span found in ``content``
    becomes one segment.

    Args:
        content: The raw message text.

    Returns:
        A list of ``{"channel": ..., "content": ...}`` dicts, one per Harmony
        span with both values stripped of surrounding whitespace. If no span
        is found, a single entry holding the unmodified text under the
        default channel ``"text"``.
    """
    # Matches <|channel|>xxx<|message|>yyy<|end|>. Every "|" is escaped: an
    # unescaped one is a regex alternation and would make a bare ">" match.
    pattern = r"<\|channel\|>(.*?)<\|message\|>(.*?)<\|end\|>"
    # DOTALL lets a message body span several lines.
    matches = re.findall(pattern, content, re.DOTALL)

    if not matches:
        # No Harmony tags: treat the whole content as plain text.
        return [{"channel": "text", "content": content}]

    return [
        {"channel": channel.strip(), "content": msg_body.strip()}
        for channel, msg_body in matches
    ]
处理传入的 list[list[dict]] 结构 + """ + new_conversation = [] + for msg in conversation: + role = msg.get("role") + original_content = msg.get("content", "") + + # 解析 content 中的 Harmony 结构 + segments = parse_harmony_message_content(original_content) + + # 为每个解析出的通道生成一个新的消息字典 + for seg in segments: + new_msg = { + "role": role, + "channel": seg["channel"], # 新增字段标识通道 + "content": seg["content"], + } + new_conversation.append(new_msg) + + return new_conversation diff --git a/idea1/specforge/distributed.py b/idea1/specforge/distributed.py new file mode 100644 index 0000000000000000000000000000000000000000..fb5e882c4d69bc2cf8e03afe4fc05f3d60bdc3c6 --- /dev/null +++ b/idea1/specforge/distributed.py @@ -0,0 +1,245 @@ +from datetime import timedelta +from typing import Any, Optional + +import torch +import torch.distributed as dist +from yunchang.globals import PROCESS_GROUP, set_seq_parallel_pg + +from specforge.utils import print_with_rank + +_DEVICE_MESH = None +_TP_DEVICE_MESH = None +_TP_GROUP = None +_DP_DEVICE_MESH = None +_DP_GROUP = None +_DRAFT_DP_GROUP = None +_DRAFT_SP_GROUP = None +_SP_ULYSSES_GROUP = None +_SP_RING_GROUP = None + + +def get_tp_group(): + global _TP_GROUP + return _TP_GROUP + + +def get_dp_group(): + global _DP_GROUP + return _DP_GROUP + + +def get_draft_dp_group(): + global _DRAFT_DP_GROUP + return _DRAFT_DP_GROUP + + +def get_draft_sp_group(): + global _DRAFT_SP_GROUP + return _DRAFT_SP_GROUP + + +def get_device_mesh(): + global _DEVICE_MESH + return _DEVICE_MESH + + +def get_tp_device_mesh(): + global _TP_DEVICE_MESH + return _TP_DEVICE_MESH + + +def get_dp_device_mesh(): + global _DP_DEVICE_MESH + return _DP_DEVICE_MESH + + +def get_sp_ulysses_group(): + global _SP_ULYSSES_GROUP + return _SP_ULYSSES_GROUP + + +def get_sp_ring_group(): + global _SP_RING_GROUP + return _SP_RING_GROUP + + +def init_distributed( + timeout: int = 10, tp_size: int = 1, sp_ulysses_size: int = 1, sp_ring_size: int = 1 +): + """Initialize distributed training. 
+ + Args: + timeout(int): Timeout for collective communication in minutes + tp_size(int): The degree of tensor parallelism + """ + dist.init_process_group(backend="nccl", timeout=timedelta(minutes=timeout)) + local_rank = dist.get_rank() % torch.cuda.device_count() + torch.cuda.set_device(local_rank) + print_with_rank(f"bind to device {local_rank}") + + world_size = dist.get_world_size() + dp_size = world_size // tp_size + assert ( + world_size == tp_size * dp_size + ), f"world size must be divisible by tp size, now {world_size=}, {(tp_size * dp_size)=} " + + device_mesh = dist.device_mesh.init_device_mesh( + "cuda", (dp_size, tp_size), mesh_dim_names=("dp", "tp") + ) + + assert ( + world_size % (sp_ulysses_size * sp_ring_size) == 0 + ), f"World size ({world_size}) cannot be evenly divided by total SP size ({sp_ulysses_size*sp_ring_size})" + + draft_dp_size = world_size // (sp_ulysses_size * sp_ring_size) + draft_device_mesh = dist.device_mesh.init_device_mesh( + "cuda", + (draft_dp_size, sp_ulysses_size * sp_ring_size), + mesh_dim_names=("draft_dp", "sp"), + ) + set_seq_parallel_pg(sp_ulysses_size, sp_ring_size, dist.get_rank(), world_size) + + print_with_rank(f"device mesh: {device_mesh}") + tp_group = device_mesh.get_group("tp") + dp_group = device_mesh.get_group("dp") + + sp_ulysses_group = PROCESS_GROUP.ULYSSES_PG + sp_ring_group = PROCESS_GROUP.RING_PG + # we need to create a 1D submesh + tp_device_mesh = dist.DeviceMesh.from_group(tp_group, device_type="cuda") + + global _TP_GROUP, _DP_GROUP, _DEVICE_MESH, _TP_DEVICE_MESH, _DP_DEVICE_MESH, _SP_RING_GROUP, _SP_ULYSSES_GROUP, _DRAFT_DP_GROUP, _DRAFT_SP_GROUP + _DEVICE_MESH = device_mesh + _TP_GROUP = tp_group + _TP_DEVICE_MESH = tp_device_mesh + _SP_ULYSSES_GROUP = sp_ulysses_group + _SP_RING_GROUP = sp_ring_group + _DP_GROUP = dp_group + _DRAFT_DP_GROUP = draft_device_mesh.get_group("draft_dp") + _DRAFT_SP_GROUP = draft_device_mesh.get_group("sp") + _DP_DEVICE_MESH = dist.DeviceMesh.from_group(dp_group, 
def destroy_distributed():
    """Tear down the process groups tracked by this module.

    Each tracked sub-group (TP, DP, Ulysses SP, ring SP, draft DP, draft SP)
    is destroyed first, then the default process group. The module-level
    group and device-mesh handles are reset to ``None`` afterwards so the
    getters no longer hand out handles to destroyed groups.
    """
    global _DEVICE_MESH, _TP_DEVICE_MESH, _DP_DEVICE_MESH
    global _TP_GROUP, _DP_GROUP, _SP_ULYSSES_GROUP, _SP_RING_GROUP
    global _DRAFT_DP_GROUP, _DRAFT_SP_GROUP

    sub_groups = (
        _TP_GROUP,
        _DP_GROUP,
        _SP_ULYSSES_GROUP,
        _SP_RING_GROUP,
        _DRAFT_DP_GROUP,
        _DRAFT_SP_GROUP,
    )
    for group in sub_groups:
        # ``destroy_process_group(None)`` destroys the default group (and all
        # others), so an unset handle must be skipped rather than passed on.
        if group is not None:
            dist.destroy_process_group(group)
    dist.destroy_process_group()

    _TP_GROUP = _DP_GROUP = None
    _SP_ULYSSES_GROUP = _SP_RING_GROUP = None
    _DRAFT_DP_GROUP = _DRAFT_SP_GROUP = None
    _DEVICE_MESH = _TP_DEVICE_MESH = _DP_DEVICE_MESH = None
def gather_outputs_and_unpad(
    x: torch.Tensor,
    gather_dim: int,
    grad_scaler: bool = True,
    group: Optional[dist.ProcessGroup] = None,
):
    """
    Gather a tensor from every rank of a process group along ``gather_dim``.

    Despite the name, this function performs no unpadding; it only gathers.

    Args:
        x (Tensor): Local tensor to gather.
        gather_dim (int): Dimension along which the per-rank tensors are
            concatenated.
        grad_scaler (bool): Forwarded to ``Gather``; when True the incoming
            gradient is multiplied by the group size in the backward pass.
            Defaults to True.
        group (ProcessGroup, optional): Process group to gather over. If None,
            ``get_draft_sp_group()`` is used.

    Returns:
        Tensor: ``x`` unchanged when the group has a single rank, otherwise the
        gathered tensor produced by ``Gather.apply``.
    """
    if group is None:
        group = get_draft_sp_group()
    # Nothing to gather on a single-rank group; skip the collective entirely.
    if dist.get_world_size(group) == 1:
        return x
    return Gather.apply(group, x, gather_dim, grad_scaler)
b/idea1/specforge/layers/__pycache__/lm_head.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..560606e64981bd3cbcfe7ec076b19ba4af5aaf23 Binary files /dev/null and b/idea1/specforge/layers/__pycache__/lm_head.cpython-311.pyc differ diff --git a/idea1/specforge/layers/embedding.py b/idea1/specforge/layers/embedding.py new file mode 100644 index 0000000000000000000000000000000000000000..336d5776c5d1585a0a8ec0f3a24d2458d77703aa --- /dev/null +++ b/idea1/specforge/layers/embedding.py @@ -0,0 +1,132 @@ +import math +from typing import Optional + +import torch +import torch.distributed as dist +import torch.nn as nn +import torch.nn.functional as F + +from specforge.distributed import get_tp_group, shard_tensor + + +class VocabParallelEmbedding(nn.Module): + + def __init__( + self, + num_embeddings: int, + embedding_dim: int, + padding_idx: Optional[int] = None, + max_norm: Optional[float] = None, + norm_type: float = 2.0, + scale_grad_by_freq: bool = False, + sparse: bool = False, + device=None, + dtype=None, + ): + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__() + + self.num_embeddings = num_embeddings + self.embedding_dim = embedding_dim + self.max_norm = max_norm + self.norm_type = norm_type + self.scale_grad_by_freq = scale_grad_by_freq + self.sparse = sparse + + if padding_idx is not None: + if padding_idx > 0: + assert ( + padding_idx < self.num_embeddings + ), "Padding_idx must be within num_embeddings" + elif padding_idx < 0: + assert ( + padding_idx >= -self.num_embeddings + ), "Padding_idx must be within num_embeddings" + padding_idx = self.num_embeddings + padding_idx + + # tp-realted + self.tp_group = get_tp_group() + self.tp_rank = dist.get_rank(self.tp_group) + self.tp_size = dist.get_world_size(self.tp_group) + + # deal with the case where the embedding is not divisible by the TP size + self.num_embeddings_per_shard = math.ceil(num_embeddings / self.tp_size) + self.padded_num_embeddings = ( + 
self.num_embeddings_per_shard * self.tp_size - self.num_embeddings + ) + self.vocab_start_index = self.tp_rank * self.num_embeddings_per_shard + self.vocab_end_index = min( + self.vocab_start_index + self.num_embeddings_per_shard, + self.num_embeddings, + ) + + if ( + padding_idx is not None + and padding_idx >= self.vocab_start_index + and padding_idx < self.vocab_end_index + ): + self.padding_idx = padding_idx - self.vocab_start_index + else: + self.padding_idx = None + + self.weight = nn.Parameter( + torch.empty( + (self.num_embeddings_per_shard, self.embedding_dim), **factory_kwargs + ), + requires_grad=True, + ) + self.reset_parameters() + self._register_load_state_dict_pre_hook(self.shard_state_dict) + + def shard_state_dict(self, state_dict, *args): + if "weight" in state_dict: + value = state_dict["weight"] + + # pad this value if it is not divisible by the TP size + if value.shape[0] % self.tp_size != 0: + padding_size = self.tp_size - value.shape[0] % self.tp_size + value = F.pad(value, (0, 0, 0, padding_size)) + state_dict["weight"] = shard_tensor(value, self.tp_group, 0) + + def reset_parameters(self) -> None: + torch.nn.init.normal_(self.weight) + self._fill_padding_idx_with_zero() + + def _fill_padding_idx_with_zero(self) -> None: + if self.padding_idx is not None: + with torch.no_grad(): + self.weight[self.padding_idx].fill_(0) + + def generate_mask(self, input_): + # generate the mask for the vocab which is only owned by the current rank + mask = (input_ >= self.vocab_start_index) & (input_ < self.vocab_end_index) + return mask + + def forward(self, input_): + if self.tp_size > 1: + # Build the mask. 
class RowParallelLinear(nn.Module):
    """Linear layer whose weight is sharded along the input-feature axis.

    Each tensor-parallel rank holds an ``(out_features, in_features_per_shard)``
    slice of the weight. ``forward`` applies only the local slice and issues no
    collective.
    NOTE(review): presumably the caller sums the partial outputs across the TP
    group (the bias is zeroed on non-zero ranks at load time) -- confirm.
    """

    def __init__(
        self,
        in_features,
        out_features,
        bias=True,
        device=None,
        dtype=None,
        kv_head_replicas=False,
        layout_type: str = "normal",
    ):
        super().__init__()
        tensor_opts = {"device": device, "dtype": dtype}
        self.layout_type = layout_type
        self.tp_group = get_tp_group()
        self.tp_size = dist.get_world_size(self.tp_group)
        self.tp_rank = dist.get_rank(self.tp_group)

        self.in_features = in_features
        self.out_features = out_features

        # With kv-head replication the input axis is kept whole on every rank.
        self.in_features_per_shard = (
            in_features if kv_head_replicas else in_features // self.tp_size
        )

        local_weight = torch.empty(
            self.out_features, self.in_features_per_shard, **tensor_opts
        )
        self.weight = nn.Parameter(local_weight)
        if not bias:
            self.register_parameter("bias", None)
        else:
            self.bias = nn.Parameter(torch.empty(self.out_features, **tensor_opts))
        self.reset_parameters()

        # Shard full checkpoints on the fly while they are being loaded.
        self._register_load_state_dict_pre_hook(self.shard_state_dict)

    def shard_state_dict(self, state_dict, *args):
        """
        Pre-load hook: rewrite ``state_dict`` in place so it holds this rank's
        shard. Only the "normal" layout is supported.
        """
        if self.layout_type != "normal":
            raise ValueError(f"Invalid layout type: {self.layout_type}")
        self.handle_normal_layout(state_dict, *args)

    def handle_normal_layout(self, state_dict, *args):
        """Keep this rank's slice of the weight's last dim; zero the bias off rank 0."""
        if "weight" in state_dict:
            full_weight = state_dict["weight"]
            state_dict["weight"] = shard_tensor(full_weight, self.tp_group, -1)

        # Only TP rank 0 keeps the real bias values.
        if self.tp_rank != 0 and "bias" in state_dict:
            state_dict["bias"] = torch.zeros_like(state_dict["bias"])

    def forward(self, x):
        """Apply the local weight shard (and bias, if any) to ``x``."""
        return F.linear(x, self.weight, self.bias)

    def reset_parameters(self):
        """Xavier-normal init for the weight, zeros for the bias."""
        nn.init.xavier_normal_(self.weight)
        if self.bias is None:
            return
        nn.init.zeros_(self.bias)

    def __repr__(self):
        return f"RowParallelLinear(in_features={self.in_features_per_shard}, out_features={self.out_features}, tp_size={self.tp_size}, tp_rank={self.tp_rank})"
torch.empty(self.out_features_per_shard, self.in_features, **factory_kwargs) + ) + if bias: + self.bias = nn.Parameter( + torch.empty(self.out_features_per_shard, **factory_kwargs) + ) + else: + self.register_parameter("bias", None) + self.reset_parameters() + + self._register_load_state_dict_pre_hook(self.shard_state_dict) + + def shard_state_dict(self, state_dict, *args): + """ + This is a state dict hook to be triggered before loading the state dict. This will shard the weights and biases according to the layout type. + """ + if self.kv_head_replicas: + assert self.kv_head_idx is not None + assert self.layout_type == "normal" + self.handle_kv_head_replicas(state_dict, *args) + else: + if self.layout_type == "normal": + self.handle_normal_layout(state_dict, *args) + elif self.layout_type == "merged_qkv": + self.handle_merged_qkv(state_dict, *args) + elif self.layout_type == "gate_up": + self.handle_gate_up_layout(state_dict, *args) + else: + raise ValueError(f"Invalid layout type: {self.layout_type}") + + def handle_kv_head_replicas(self, state_dict, *args): + """ + This is a special case for GQA where the key/value are split according to the number of kv heads and the head which belongs to this rank. + As the TP size is larger than the number of kv heads, we only keep one kv head per rank. + """ + if "weight" in state_dict: + state_dict["weight"] = state_dict["weight"].chunk( + self.total_num_kv_heads, dim=0 + )[self.kv_head_idx] + if "bias" in state_dict and state_dict["bias"] is not None: + state_dict["bias"] = state_dict["bias"].chunk( + self.total_num_kv_heads, dim=0 + )[self.kv_head_idx] + + def handle_normal_layout(self, state_dict, *args): + """ + This shards the weights and biases along the column dimension. 
+ """ + # shard the weights + if "weight" in state_dict: + state_dict["weight"] = shard_tensor(state_dict["weight"], self.tp_group, 0) + + if "bias" in state_dict and state_dict["bias"] is not None: + state_dict["bias"] = shard_tensor(state_dict["bias"], self.tp_group, 0) + + def handle_gate_up_layout(self, state_dict, *args): + """ + This handles the gate_up layout where the gate and up weights are concatenated along the column dimension. + """ + if "weight" in state_dict: + gate, up = state_dict["weight"].chunk(2, dim=0) + gate = shard_tensor(gate, self.tp_group, 0) + up = shard_tensor(up, self.tp_group, 0) + state_dict["weight"] = torch.cat((gate, up), dim=0) + + if "bias" in state_dict and state_dict["bias"] is not None: + gate, up = state_dict["bias"].chunk(2, dim=0) + gate = shard_tensor(gate, self.tp_group, 0) + up = shard_tensor(up, self.tp_group, 0) + state_dict["bias"] = torch.cat((gate, up), dim=0) + + def handle_merged_qkv(self, state_dict, *args): + """ + This handles the merged QKV layout where the q, k, v weights are concatenated along the column dimension. 
class ParallelLMHead(nn.Module):
    """Output projection whose rows (output features) are sharded over the TP group.

    Every rank owns ``ceil(out_features / tp_size)`` rows; when the division is
    not exact the checkpoint tensors are zero-padded at load time so that all
    shards have the same size.
    """

    def __init__(
        self,
        in_features: int,
        out_features: int,
        *,
        bias: bool = True,
        device: Optional[torch.device] = None,
        dtype: Optional[torch.dtype] = None,
    ):
        super().__init__()
        tensor_opts = {"device": device, "dtype": dtype}
        self.in_features = in_features
        self.out_features = out_features
        self.tp_group = get_tp_group()
        self.tp_size = dist.get_world_size(self.tp_group)
        self.tp_rank = dist.get_rank(self.tp_group)

        # Ceil-divide so every rank gets an equally sized shard.
        self.out_features_per_shard = math.ceil(out_features / self.tp_size)
        padded_total = self.out_features_per_shard * self.tp_size
        self.padded_out_features = padded_total - out_features
        assert (
            self.out_features_per_shard * self.tp_size
            == out_features + self.padded_out_features
        )

        self.weight = nn.Parameter(
            torch.empty(self.out_features_per_shard, self.in_features, **tensor_opts)
        )
        if bias:
            self.bias = nn.Parameter(
                torch.zeros(self.out_features_per_shard, **tensor_opts)
            )
        else:
            self.bias = None

        self.reset_parameters()

        # Shard full checkpoints on the fly while they are being loaded.
        self._register_load_state_dict_pre_hook(self.shard_state_dict)

    def shard_state_dict(self, state_dict, *args):
        """Pre-load hook: pad dim 0 to a multiple of ``tp_size`` and keep this rank's chunk."""
        # F.pad lists pads from the last dim backwards: the weight leaves its
        # last dim untouched, (0, 0), before padding rows; the bias has only dim 0.
        for key, leading_pad in (("weight", (0, 0)), ("bias", ())):
            if key not in state_dict:
                continue
            tensor = state_dict[key]
            remainder = tensor.shape[0] % self.tp_size
            if remainder != 0:
                tensor = F.pad(tensor, leading_pad + (0, self.tp_size - remainder))
            state_dict[key] = shard_tensor(tensor, self.tp_group, 0)

    def forward(self, hidden: torch.Tensor, gather_output: bool = False):
        """
        Project ``hidden`` (last dim ``in_features``) onto the vocabulary shard.

        returns:
          - if gather_output=False (or tp_size == 1): local logits [*, out_features_per_shard]
          - if gather_output=True: full logits [*, out_features], all-gathered
            over the TP group with the ceil-div padding trimmed
        """
        leading_dims = hidden.shape[:-1]
        flat_hidden = hidden.reshape(-1, self.in_features)  # [N, H]

        logits = torch.matmul(flat_hidden, self.weight.T)  # [N, local_vocab]
        if self.bias is not None:
            logits = logits + self.bias

        if self.tp_size == 1 or not gather_output:
            return logits.view(*leading_dims, self.out_features_per_shard).contiguous()

        # Collect every rank's shard and stitch them along the vocab dim.
        shards = [torch.empty_like(logits) for _ in range(self.tp_size)]
        dist.all_gather(shards, logits, group=self.tp_group)
        stitched = torch.cat(shards, dim=-1)
        stitched = stitched[:, : self.out_features]  # trim padding from ceil-div
        return stitched.view(*leading_dims, self.out_features).contiguous()

    def reset_parameters(self):
        """Xavier-normal init for the weight, zeros for the bias."""
        nn.init.xavier_normal_(self.weight)
        if self.bias is None:
            return
        nn.init.zeros_(self.bias)

    def __repr__(self):
        return f"ParallelLMHead(in_features={self.in_features}, out_features={self.out_features_per_shard}, tp_size={self.tp_size}, tp_rank={self.tp_rank})"
a/idea1/specforge/layers/ring/__pycache__/utils.cpython-311.pyc b/idea1/specforge/layers/ring/__pycache__/utils.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..78652bf2465576baf865fe15b737608cc2ef9107 Binary files /dev/null and b/idea1/specforge/layers/ring/__pycache__/utils.cpython-311.pyc differ diff --git a/idea1/specforge/layers/ring/ring_flash_attn.py b/idea1/specforge/layers/ring/ring_flash_attn.py new file mode 100644 index 0000000000000000000000000000000000000000..3c89b7e4a33431b83d4abe7bb64fd7b52a396523 --- /dev/null +++ b/idea1/specforge/layers/ring/ring_flash_attn.py @@ -0,0 +1,336 @@ +import torch +from yunchang.kernels import AttnType, select_flash_attn_impl + +from .utils import RingComm, update_out_and_lse + + +def ring_flash_attn_forward( + process_group, + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + softmax_scale, + dropout_p=0, + causal=True, + window_size=(-1, -1), + softcap=0.0, + alibi_slopes=None, + deterministic=False, + attn_type: AttnType = AttnType.FA, + attn_processor=None, +): + comm = RingComm(process_group) + + out = None + lse = None + + next_k, next_v = None, None + + for step in range(comm.world_size): + if step + 1 != comm.world_size: + next_k: torch.Tensor = comm.send_recv(k) + next_v: torch.Tensor = comm.send_recv(v) + comm.commit() + + if not causal or step <= comm.rank: + fn = select_flash_attn_impl( + attn_type, stage="fwd-only", attn_processor=attn_processor + ) + block_out, block_lse = fn( + q, + k, + v, + dropout_p=dropout_p, + softmax_scale=softmax_scale, + causal=causal and step == 0, + window_size=window_size, + softcap=softcap, + alibi_slopes=alibi_slopes, + return_softmax=True and dropout_p > 0, + ) + if attn_type == AttnType.SPARSE_SAGE: + out, lse = block_out, block_lse + else: + out, lse = update_out_and_lse(out, lse, block_out, block_lse) + + if step + 1 != comm.world_size: + comm.wait() + k = next_k + v = next_v + + out = out.to(q.dtype) + if attn_type != 
class RingFlashAttnFunc(torch.autograd.Function):
    """Autograd wrapper around ``ring_flash_attn_forward`` / ``ring_flash_attn_backward``.

    ``attn_processor`` defaults to ``None`` so that callers which invoke
    ``apply`` with only the first thirteen arguments (the qkv-packed and
    kv-packed wrappers in this file) no longer fail with a ``TypeError``.
    """

    @staticmethod
    def forward(
        ctx,
        q,
        k,
        v,
        dropout_p,
        softmax_scale,
        causal,
        window_size,
        softcap,
        alibi_slopes,
        deterministic,
        return_softmax,
        group,
        attn_type,
        attn_processor=None,
    ):
        """Run the ring forward pass and stash what the backward pass needs.

        Returns ``out`` alone, or ``(out, softmax_lse, None)`` when
        ``return_softmax`` is truthy.
        """
        if softmax_scale is None:
            # Default scaling: 1 / sqrt(last dim of q).
            softmax_scale = q.shape[-1] ** (-0.5)

        assert alibi_slopes is None
        k = k.contiguous()
        v = v.contiguous()
        out, softmax_lse = ring_flash_attn_forward(
            group,
            q,
            k,
            v,
            softmax_scale=softmax_scale,
            dropout_p=dropout_p,
            causal=causal,
            window_size=window_size,
            softcap=softcap,
            alibi_slopes=alibi_slopes,
            # The forward helper in this file never reads `deterministic`; the
            # caller's value is kept on ctx and only used by the backward pass.
            deterministic=False,
            attn_type=attn_type,
            attn_processor=attn_processor,
        )
        # NOTE(review): the original comment here said "this should be
        # out_padded" -- confirm whether a padded output must be saved instead.
        ctx.save_for_backward(q, k, v, out, softmax_lse)
        ctx.dropout_p = dropout_p
        ctx.softmax_scale = softmax_scale
        ctx.causal = causal
        ctx.window_size = window_size
        ctx.softcap = softcap
        ctx.alibi_slopes = alibi_slopes
        ctx.deterministic = deterministic
        ctx.group = group
        ctx.attn_type = attn_type
        ctx.attn_processor = attn_processor
        return out if not return_softmax else (out, softmax_lse, None)

    @staticmethod
    def backward(ctx, dout, *args):
        """Return gradients for q, k, v and ``None`` for every non-tensor input."""
        q, k, v, out, softmax_lse = ctx.saved_tensors
        dq, dk, dv = ring_flash_attn_backward(
            ctx.group,
            dout,
            q,
            k,
            v,
            out,
            softmax_lse,
            softmax_scale=ctx.softmax_scale,
            dropout_p=ctx.dropout_p,
            causal=ctx.causal,
            window_size=ctx.window_size,
            softcap=ctx.softcap,
            alibi_slopes=ctx.alibi_slopes,
            deterministic=ctx.deterministic,
            attn_type=ctx.attn_type,
        )
        # One slot per forward() input after ctx: 3 tensor grads + 11 Nones.
        return (dq, dk, dv) + (None,) * 11
def ring_flash_attn_kvpacked_func(
    q,
    kv,
    dropout_p=0.0,
    softmax_scale=None,
    causal=False,
    window_size=(-1, -1),
    softcap=0.0,
    alibi_slopes=None,
    deterministic=False,
    return_attn_probs=False,
    group=None,
    attn_type: AttnType = AttnType.FA,
    attn_processor=None,
):
    """Ring flash attention for a separate ``q`` and a packed ``kv`` tensor.

    ``kv`` is unpacked by indexing its third dimension: index 0 is used as the
    key, index 1 as the value.
    NOTE(review): assumes kv is laid out as (batch, seqlen, 2, heads, dim) --
    confirm against callers.

    ``attn_processor`` is forwarded to ``RingFlashAttnFunc``. It is always
    passed (``None`` by default) because ``RingFlashAttnFunc.forward`` takes it
    as its last argument; omitting it made this wrapper raise ``TypeError``.

    Returns whatever ``RingFlashAttnFunc.apply`` returns: the attention output,
    or a tuple including the softmax LSE when ``return_attn_probs`` is truthy.
    """
    return RingFlashAttnFunc.apply(
        q,
        kv[:, :, 0],
        kv[:, :, 1],
        dropout_p,
        softmax_scale,
        causal,
        window_size,
        softcap,
        alibi_slopes,
        deterministic,
        return_attn_probs,
        group,
        attn_type,
        attn_processor,
    )
def update_out_and_lse(
    out: Optional[torch.Tensor],
    lse: Optional[torch.Tensor],
    block_out: torch.Tensor,
    block_lse: torch.Tensor,
    slice_=None,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Fold one attention block's result into the running ``(out, lse)`` pair.

    On the first call (``out is None``) the block result becomes the running
    state: ``block_out`` cast to float32 and ``block_lse`` with its last two
    dims swapped and a trailing singleton dim appended. On later calls the
    block is merged via ``_update_out_and_lse``, either into the whole running
    state or, when ``slice_`` is given, only into ``out[slice_]`` /
    ``lse[slice_]`` (written back in place).

    Raises:
        RuntimeError: if ``slice_`` is passed on the first call.
    """
    if out is None:
        if slice_ is not None:
            raise RuntimeError("first update_out_and_lse should not pass slice_ args")
        first_out = block_out.to(torch.float32)
        first_lse = block_lse.transpose(-2, -1).unsqueeze(dim=-1)
        return first_out, first_lse

    if slice_ is None:
        merged_out, merged_lse = _update_out_and_lse(out, lse, block_out, block_lse)
        return merged_out, merged_lse

    # Merge into the selected region only, then write the region back.
    region_out, region_lse = _update_out_and_lse(
        out[slice_], lse[slice_], block_out, block_lse
    )
    out[slice_] = region_out
    lse[slice_] = region_lse
    return out, lse
class TwoStageScheduler(_LRScheduler):
    """Base class for schedulers that hand over to ``after_scheduler`` at some point.

    Subclasses implement ``get_lr`` / ``step`` and flip ``self.finished`` once
    the second stage has started. This class only handles (de)serialisation of
    the combined state.
    """

    def __init__(self, optimizer, after_scheduler: _LRScheduler, last_epoch=-1):
        # Both attributes must exist before the base constructor runs.
        self.after_scheduler = after_scheduler
        self.finished = False
        super().__init__(optimizer, last_epoch)

    def state_dict(self):
        """Return the scheduler state without the optimizer.

        The wrapped scheduler object is replaced by two entries:
        ``after_scheduler_type`` (its class name) and ``after_scheduler_dict``
        (its own ``state_dict()``).

        Raises:
            NotImplementedError: if ``after_scheduler`` is not an ``LRScheduler``.
        """
        # Exact-name comparison: `key not in "optimizer"` was a substring test
        # and silently dropped any attribute whose name is part of that word.
        state_dict = {
            key: value for key, value in self.__dict__.items() if key != "optimizer"
        }
        after_scheduler = state_dict["after_scheduler"]
        if not isinstance(after_scheduler, _LRScheduler):
            raise NotImplementedError()
        state_dict["after_scheduler_type"] = type(after_scheduler).__name__
        state_dict["after_scheduler_dict"] = after_scheduler.state_dict()
        del state_dict["after_scheduler"]
        return state_dict

    def load_state_dict(self, state_dict):
        """Restore this scheduler, the wrapped scheduler, and the optimizer's lr.

        A missing ``after_scheduler_dict`` only triggers a warning; the wrapped
        scheduler is then left as it is.
        """
        # Read the last lr up front; it is re-applied to the optimizer below.
        last_lr = state_dict.get("_last_lr", None)

        if "after_scheduler_dict" not in state_dict:
            warn(
                "after_scheduler_dict is not found, skip loading after_scheduler. This may cause unexpected behavior."
            )
        else:
            self.after_scheduler.load_state_dict(state_dict["after_scheduler_dict"])

        # The after_scheduler_* entries are not attributes of this object.
        own_state = {
            key: value
            for key, value in state_dict.items()
            if key not in ("after_scheduler_type", "after_scheduler_dict")
        }
        super().load_state_dict(own_state)

        # Restore optimizer's lr from _last_lr to ensure consistency.
        # This is critical because PyTorch's CosineAnnealingLR.get_lr() uses
        # group["lr"] to compute the next lr, but load_state_dict doesn't
        # update the optimizer's lr automatically.
        if last_lr is not None:
            for param_group, lr in zip(self.optimizer.param_groups, last_lr):
                param_group["lr"] = lr
class DelayerScheduler(TwoStageScheduler):
    """Keep the base lr flat for ``delay_epochs`` epochs, then delegate to
    ``after_scheduler`` (for example: ReduceLROnPlateau).

    Args:
        optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
        delay_epochs (int): Number of epochs to keep the initial lr before the
            wrapped scheduler is applied.
        after_scheduler (:class:`torch.optim.lr_scheduler`): Scheduler used once
            the delay is over.
        last_epoch (int, optional): The index of last epoch, defaults to -1,
            which starts the schedule from the beginning.

    Raises:
        ValueError: If ``delay_epochs`` is negative.
    """

    def __init__(self, optimizer, delay_epochs, after_scheduler, last_epoch=-1):
        if delay_epochs < 0:
            raise ValueError(f"delay_epochs must >= 0, got {delay_epochs}")
        self.delay_epochs = delay_epochs
        super().__init__(optimizer, after_scheduler, last_epoch)

    def get_lr(self):
        # Still inside the delay window: lr stays at its base value.
        if self.last_epoch < self.delay_epochs:
            return self.base_lrs

        # First call past the window: hand the base lrs to the second stage.
        if not self.finished:
            self.after_scheduler.base_lrs = self.base_lrs
            self.finished = True
        with _enable_get_lr_call(self.after_scheduler):
            return self.after_scheduler.get_lr()

    def step(self, epoch=None):
        if not self.finished:
            return super().step(epoch)

        # Second stage: an explicit epoch is shifted so the wrapped scheduler
        # counts from the end of the delay.
        shifted_epoch = None if epoch is None else epoch - self.delay_epochs
        self.after_scheduler.step(shifted_epoch)
        self._last_lr = self.after_scheduler.get_last_lr()
class WarmupScheduler(TwoStageScheduler):
    """Linearly warm the lr up for ``warmup_epochs`` epochs, then delegate to
    ``after_scheduler`` (for example: ReduceLROnPlateau).

    During warmup the lr of each group is
    ``(last_epoch + 1) / warmup_epochs * base_lr``.

    Args:
        optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
        warmup_epochs (int): Number of epochs of linear warmup; coerced with ``int()``.
        after_scheduler (:class:`torch.optim.lr_scheduler`): Scheduler used once
            warmup is over.
        last_epoch (int, optional): The index of last epoch, defaults to -1,
            which starts the schedule from the beginning.
    """

    def __init__(self, optimizer, warmup_epochs, after_scheduler, last_epoch=-1):
        self.warmup_epochs = int(warmup_epochs)
        super().__init__(optimizer, after_scheduler, last_epoch)

    def get_lr(self):
        if self.last_epoch >= self.warmup_epochs:
            if not self.finished:
                # Hand-over: the second stage starts from this scheduler's base lrs.
                self.after_scheduler.base_lrs = self.base_lrs
                self.finished = True
            # Same guard as the sibling two-stage schedulers: mark the call as
            # coming from inside step() so the wrapped scheduler does not emit
            # its "use get_last_lr()" warning.
            with _enable_get_lr_call(self.after_scheduler):
                return self.after_scheduler.get_lr()

        return [(self.last_epoch + 1) / self.warmup_epochs * lr for lr in self.base_lrs]

    def step(self, epoch=None):
        if self.finished:
            if epoch is None:
                self.after_scheduler.step(None)
                self._last_lr = self.after_scheduler.get_last_lr()
            else:
                # Shift an explicit epoch so the second stage counts from the end of warmup.
                self.after_scheduler.step(epoch - self.warmup_epochs)
                self._last_lr = self.after_scheduler.get_last_lr()
        else:
            return super().step(epoch)
class WarmupDelayerScheduler(TwoStageScheduler):
    """Linear warmup for ``warmup_epochs`` epochs, a flat lr for another
    ``delay_epochs`` epochs, then delegate to ``after_scheduler``
    (for example: ReduceLROnPlateau).

    Args:
        optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
        warmup_epochs (int): Number of epochs of linear warmup.
        delay_epochs (int): Number of epochs to hold the base lr after warmup.
        after_scheduler (:class:`torch.optim.lr_scheduler`): Scheduler used once
            warmup and delay are over.
        last_epoch (int, optional): The index of last epoch, defaults to -1,
            which starts the schedule from the beginning.

    Raises:
        ValueError: If ``delay_epochs`` or ``warmup_epochs`` is negative.
    """

    def __init__(
        self, optimizer, warmup_epochs, delay_epochs, after_scheduler, last_epoch=-1
    ):
        if delay_epochs < 0:
            raise ValueError(f"delay_epochs must >= 0, got {delay_epochs}")
        if warmup_epochs < 0:
            raise ValueError(f"warmup_epochs must >= 0, got {warmup_epochs}")
        self.warmup_epochs = warmup_epochs
        self.delay_epochs = delay_epochs
        super().__init__(optimizer, after_scheduler, last_epoch)

    def get_lr(self):
        if self.last_epoch >= self.warmup_epochs + self.delay_epochs:
            if not self.finished:
                self.after_scheduler.base_lrs = self.base_lrs
                # reset lr to base_lr
                for group, base_lr in zip(self.optimizer.param_groups, self.base_lrs):
                    group["lr"] = base_lr
                self.finished = True
            # NOTE(review): the guard is applied on every delegated call here;
            # confirm this matches the intended nesting of the ``with`` block.
            with _enable_get_lr_call(self.after_scheduler):
                return self.after_scheduler.get_lr()
        elif self.last_epoch >= self.warmup_epochs:
            # Delay window: hold the base lr.
            return self.base_lrs

        # Warmup window: linear ramp towards the base lr.
        return [(self.last_epoch + 1) / self.warmup_epochs * lr for lr in self.base_lrs]

    def step(self, epoch=None):
        if not self.finished:
            return super().step(epoch)

        if epoch is None:
            self.after_scheduler.step(None)
        else:
            # The second stage starts after warmup AND delay (see get_lr), so
            # an explicit epoch must be shifted by both.
            self.after_scheduler.step(epoch - self.warmup_epochs - self.delay_epochs)
        self._last_lr = self.after_scheduler.get_last_lr()
class CosineAnnealingLR(_CosineAnnealingLR):
    r"""Set the learning rate of each parameter group using a cosine annealing
    schedule, where :math:`\eta_{max}` is set to the initial lr and
    :math:`T_{cur}` is the number of epochs since the last restart in SGDR:

    .. math::
        \begin{aligned}
            \eta_t & = \eta_{min} + \frac{1}{2}(\eta_{max} - \eta_{min})\left(1
            + \cos\left(\frac{T_{cur}}{T_{max}}\pi\right)\right),
            & T_{cur} \neq (2k+1)T_{max}; \\
            \eta_{t+1} & = \eta_{t} + \frac{1}{2}(\eta_{max} - \eta_{min})
            \left(1 - \cos\left(\frac{1}{T_{max}}\pi\right)\right),
            & T_{cur} = (2k+1)T_{max}.
        \end{aligned}

    When last_epoch=-1, sets initial lr as lr. Notice that because the schedule
    is defined recursively, the learning rate can be simultaneously modified
    outside this scheduler by other operators. If the learning rate is set
    solely by this scheduler, the learning rate at each step becomes:

    .. math::
        \eta_t = \eta_{min} + \frac{1}{2}(\eta_{max} - \eta_{min})\left(1 +
        \cos\left(\frac{T_{cur}}{T_{max}}\pi\right)\right)

    It has been proposed in
    `SGDR: Stochastic Gradient Descent with Warm Restarts`_. Note that this only
    implements the cosine annealing part of SGDR, and not the restarts.

    .. _SGDR\: Stochastic Gradient Descent with Warm Restarts:
        https://arxiv.org/abs/1608.03983

    Args:
        optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
        total_steps (int): Number of total training steps; passed to the parent
            class as its second positional argument (``T_max``).
        eta_min (float, optional): Minimum learning rate, defaults to 0.
        last_epoch (int, optional): The index of last epoch, defaults to -1,
            which starts the schedule from the beginning.
        **kwargs: Accepted and ignored; presumably so callers can pass one
            uniform set of scheduler options.
    """

    def __init__(
        self,
        optimizer,
        total_steps: int,
        eta_min: float = 0,
        last_epoch: int = -1,
        **kwargs,
    ):
        super().__init__(optimizer, total_steps, eta_min=eta_min, last_epoch=last_epoch)
class CosineAnnealingWarmupLR(WarmupScheduler):
    """Linear warmup followed by cosine annealing.

    The cosine stage is a torch ``CosineAnnealingLR`` whose period is
    ``total_steps - warmup_steps``.

    Args:
        optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
        total_steps (int): Number of total training steps.
        warmup_steps (int, optional): Number of warmup steps, defaults to 0.
        eta_min (float, optional): Minimum learning rate, defaults to 0.
        last_epoch (int, optional): The index of last epoch, defaults to -1,
            which starts the schedule from the beginning.
    """

    def __init__(
        self,
        optimizer,
        total_steps: int,
        warmup_steps: int = 0,
        eta_min: float = 0.0,
        last_epoch: int = -1,
    ):
        # Steps left for the cosine stage once warmup is done.
        annealing_steps = total_steps - warmup_steps
        cosine_stage = _CosineAnnealingLR(
            optimizer,
            annealing_steps,
            eta_min=eta_min,
            last_epoch=last_epoch,
        )
        super().__init__(optimizer, warmup_steps, cosine_stage, last_epoch=last_epoch)
b/idea1/specforge/modeling/__pycache__/_mask_utils.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6b8adfbd039a3d7ae3c2832dfbc5699b35d515f5 Binary files /dev/null and b/idea1/specforge/modeling/__pycache__/_mask_utils.cpython-311.pyc differ diff --git a/idea1/specforge/modeling/__pycache__/_mask_utils.cpython-313.pyc b/idea1/specforge/modeling/__pycache__/_mask_utils.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c367a4cf5422e9c0e50ea31795e6669d286c0ebd Binary files /dev/null and b/idea1/specforge/modeling/__pycache__/_mask_utils.cpython-313.pyc differ diff --git a/idea1/specforge/modeling/__pycache__/auto.cpython-311.pyc b/idea1/specforge/modeling/__pycache__/auto.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1343b662f2d9090ca7c145cea2cd0bd8bc8a1363 Binary files /dev/null and b/idea1/specforge/modeling/__pycache__/auto.cpython-311.pyc differ diff --git a/idea1/specforge/modeling/__pycache__/auto.cpython-313.pyc b/idea1/specforge/modeling/__pycache__/auto.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d80c30f4fbdbd5fb9f0641488c19fe41f5bc4c72 Binary files /dev/null and b/idea1/specforge/modeling/__pycache__/auto.cpython-313.pyc differ diff --git a/idea1/specforge/modeling/_mask_utils.py b/idea1/specforge/modeling/_mask_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..bedb200299e24ecf531c117618e55837c49facbe --- /dev/null +++ b/idea1/specforge/modeling/_mask_utils.py @@ -0,0 +1,73 @@ +# coding=utf-8 +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in HuggingFace Transformers. 
# Copied from transformers.models.bart.modeling_bart._make_causal_mask
def _make_causal_mask(
    input_ids_shape: torch.Size,
    dtype: torch.dtype,
    device: torch.device,
    past_key_values_length: int = 0,
):
    """
    Build an additive causal attention mask.

    Position ``i`` may attend to positions ``j <= i`` (value 0); all later
    positions hold ``torch.finfo(dtype).min``. When ``past_key_values_length``
    is positive, that many all-zero columns are prepended.

    Returns a tensor expanded to
    ``[bsz, 1, tgt_len, tgt_len + past_key_values_length]``.
    """
    bsz, tgt_len = input_ids_shape
    fill_value = torch.finfo(dtype).min

    # Start fully masked (in the default dtype), then open the lower triangle.
    causal = torch.full((tgt_len, tgt_len), fill_value, device=device)
    col_idx = torch.arange(causal.size(-1), device=device)
    row_idx = col_idx.view(causal.size(-1), 1)
    causal.masked_fill_(col_idx <= row_idx, 0)
    causal = causal.to(dtype)

    if past_key_values_length > 0:
        past_columns = torch.zeros(
            tgt_len, past_key_values_length, dtype=dtype, device=device
        )
        causal = torch.cat([past_columns, causal], dim=-1)

    total_len = tgt_len + past_key_values_length
    return causal[None, None, :, :].expand(bsz, 1, tgt_len, total_len)
# Copied from transformers.models.bart.modeling_bart._expand_mask
def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
    """
    Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.

    The result is additive: entries where ``mask`` is 1 become 0 and entries
    where ``mask`` is 0 become ``torch.finfo(dtype).min``. ``tgt_len`` defaults
    to the source length.
    """
    bsz, src_len = mask.size()
    if tgt_len is None:
        tgt_len = src_len

    broadcast = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)

    # After inversion, 1.0 marks the positions that must be masked out.
    to_mask = 1.0 - broadcast
    return to_mask.masked_fill(to_mask.to(torch.bool), torch.finfo(dtype).min)
class AutoEagle3DraftModel(AutoModelForCausalLMBase):
    """Factory for Eagle3 draft models, keyed by the type of the config object."""

    # the model mapping is currently hardcoded, we should support lazy model mapping via registry
    _model_mapping = {
        LlamaConfig: LlamaForCausalLMEagle3,
    }

    @classmethod
    def from_config(cls, config: PretrainedConfig, torch_dtype=None, **config_kwargs):
        """
        This class method takes a configuration object and create its model based on the
        _model_mapping class variable.

        Args:
            config (PretrainedConfig): A configuration object.
            torch_dtype (optional): If given, the model is converted with
                ``model.to(dtype=torch_dtype)`` after construction.
            **config_kwargs: Forwarded to the model class constructor.

        Returns:
            A model instance.

        Raises:
            KeyError: If ``type(config)`` is not in ``_model_mapping``.
        """
        # get the model class from the mapping, keyed by the exact config type
        _model_cls = cls._model_mapping[type(config)]
        model = _model_cls(config, **config_kwargs)

        # Convert model to specified dtype if provided
        if torch_dtype is not None:
            model = model.to(dtype=torch_dtype)
        return model

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path: Union[str, os.PathLike[str]],
        *model_args,
        **kwargs,
    ):
        """Load a pretrained draft model while hiding one known-noisy warning.

        While the parent ``from_pretrained`` runs, ``modeling_utils.logger.warning``
        is replaced by a filter that drops messages mentioning both
        ``embed_tokens.weight`` and ``initialized``. The original method is
        restored afterwards, also when loading fails.
        """
        original_warn = modeling_utils.logger.warning

        def filtered_warning(msg, *log_args, **log_kwargs):
            # Logger.warning is called as warning(msg, *args, **kwargs); the
            # replacement must accept the same arguments or lazy-formatted
            # calls would raise TypeError.
            if "embed_tokens.weight" in str(msg) and "initialized" in str(msg):
                return
            original_warn(msg, *log_args, **log_kwargs)

        modeling_utils.logger.warning = filtered_warning

        try:
            model = super().from_pretrained(
                pretrained_model_name_or_path, *model_args, **kwargs
            )
        finally:
            modeling_utils.logger.warning = original_warn

        return model


class AutoDistributedTargetModel(AutoModelForCausalLMBase):
    """Factory for target models, keyed by the type of the loaded config."""

    # the model mapping is currently hardcoded, we should support lazy model mapping via registry
    _model_mapping = {
        Llama4TextConfig: [Llama4ForCausalLM],
        Qwen3MoeConfig: [Qwen3MoeForCausalLM],
        Qwen2Config: [Qwen2ForCausalLM],
        LlamaConfig: [LlamaForCausalLM],
        Qwen3Config: [Qwen3ForCausalLM],
        Phi3Config: [Phi3ForCausalLM],
        GptOssConfig: [GptOssForCausalLM],
    }

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path: Union[str, os.PathLike[str]],
        torch_dtype: torch.dtype = None,
        device: str = None,
        cache_dir: Optional[str] = None,
        **config_kwargs,
    ):
        """Load the target model matching the checkpoint's config type.

        Args:
            pretrained_model_name_or_path: Model identifier or local path.
            torch_dtype: Forwarded to the model's ``from_pretrained``.
            device: Device to move the model to; when ``None`` the model is
                moved with ``.cuda()``.
            cache_dir: Cache directory used for both the config and the weights.
            **config_kwargs: Forwarded to the model's ``from_pretrained``.

        Returns:
            The loaded model on the requested device.

        Raises:
            AssertionError: If the config type has no entry in ``_model_mapping``.
        """
        # Use the same cache directory for the config as for the weights.
        config = AutoConfig.from_pretrained(
            pretrained_model_name_or_path,
            cache_dir=cache_dir,
        )

        # A Llama4 config wraps the text config that the mapping is keyed on.
        if isinstance(config, Llama4Config):
            config = config.text_config

        assert (
            type(config) in cls._model_mapping
        ), f"Unsupported config type: {type(config)}"
        model_cls = cls._model_mapping[type(config)][0]
        model = model_cls.from_pretrained(
            pretrained_model_name_or_path,
            torch_dtype=torch_dtype,
            cache_dir=cache_dir,
            **config_kwargs,
        )

        if device is not None:
            model = model.to(device)
        else:
            model = model.cuda()
        return model
class AutoDraftModelConfig:
    """Build a draft-model config object from a JSON file.

    The config class is chosen by the single entry of the file's
    ``architectures`` list.
    """

    _config_mapping = {
        "LlamaForCausalLMEagle3": LlamaConfig,
    }

    @classmethod
    def from_file(cls, config_path: str):
        """
        This class method takes a configuration file path and create its configuration object based on the
        _config_mapping class variable.

        Args:
            config_path (str): A path to a configuration file.

        Returns:
            A configuration object.

        Raises:
            ValueError: If ``architectures`` is missing, does not contain exactly
                one entry, or names an unsupported architecture.
        """
        # JSON is UTF-8 by specification; do not rely on the locale's default encoding.
        with open(config_path, "r", encoding="utf-8") as f:
            config = json.load(f)

        # When the key is present it is always forced to False for the draft model.
        if "tie_word_embeddings" in config:
            print("Set draft model tie_word_embeddings to False")
            config["tie_word_embeddings"] = False

        # check for architectures
        architectures = config.get("architectures", None)

        if architectures is None:
            raise ValueError("No architectures found in the config file")

        if len(architectures) != 1:
            raise ValueError("Only one architecture is supported")

        architecture = architectures[0]

        if architecture not in cls._config_mapping:
            raise ValueError(f"Architecture {architecture} not supported")

        # If draft_vocab_size is not in config or is None, set draft_vocab_size to vocab_size
        if "draft_vocab_size" not in config or config["draft_vocab_size"] is None:
            config["draft_vocab_size"] = config.get("vocab_size", None)

        return cls._config_mapping[architecture].from_dict(config)
a/idea1/specforge/modeling/draft/__pycache__/__init__.cpython-311.pyc b/idea1/specforge/modeling/draft/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..57047b8ed57243e1c4b8e43f1ff71e93354e20c4 Binary files /dev/null and b/idea1/specforge/modeling/draft/__pycache__/__init__.cpython-311.pyc differ diff --git a/idea1/specforge/modeling/draft/__pycache__/__init__.cpython-313.pyc b/idea1/specforge/modeling/draft/__pycache__/__init__.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..81b391da7d196e2f99c220b49116a104ba0e0df1 Binary files /dev/null and b/idea1/specforge/modeling/draft/__pycache__/__init__.cpython-313.pyc differ diff --git a/idea1/specforge/modeling/draft/__pycache__/base.cpython-311.pyc b/idea1/specforge/modeling/draft/__pycache__/base.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c9d26a5272e06fb80810d001fb88fdd7fdac9c06 Binary files /dev/null and b/idea1/specforge/modeling/draft/__pycache__/base.cpython-311.pyc differ diff --git a/idea1/specforge/modeling/draft/__pycache__/base.cpython-313.pyc b/idea1/specforge/modeling/draft/__pycache__/base.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a539327d39a72eb152fe0e0119c7a937a50604f0 Binary files /dev/null and b/idea1/specforge/modeling/draft/__pycache__/base.cpython-313.pyc differ diff --git a/idea1/specforge/modeling/draft/__pycache__/dflash.cpython-311.pyc b/idea1/specforge/modeling/draft/__pycache__/dflash.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..98e80258c717000369b3ccaf88dbb8c73099d510 Binary files /dev/null and b/idea1/specforge/modeling/draft/__pycache__/dflash.cpython-311.pyc differ diff --git a/idea1/specforge/modeling/draft/__pycache__/dflash.cpython-313.pyc b/idea1/specforge/modeling/draft/__pycache__/dflash.cpython-313.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..68379eda84d58f4d14bee4b289f4d7ab0ba39c54 Binary files /dev/null and b/idea1/specforge/modeling/draft/__pycache__/dflash.cpython-313.pyc differ diff --git a/idea1/specforge/modeling/draft/__pycache__/flex_attention.cpython-311.pyc b/idea1/specforge/modeling/draft/__pycache__/flex_attention.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..894c1ab830edc1c259f10ad861cefb94eba60cd7 Binary files /dev/null and b/idea1/specforge/modeling/draft/__pycache__/flex_attention.cpython-311.pyc differ diff --git a/idea1/specforge/modeling/draft/__pycache__/llama3_eagle.cpython-311.pyc b/idea1/specforge/modeling/draft/__pycache__/llama3_eagle.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f961925629e55e76f9874de5eb9ee4086b55e845 Binary files /dev/null and b/idea1/specforge/modeling/draft/__pycache__/llama3_eagle.cpython-311.pyc differ diff --git a/idea1/specforge/modeling/draft/__pycache__/llama3_eagle.cpython-313.pyc b/idea1/specforge/modeling/draft/__pycache__/llama3_eagle.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1ad6c9194277750af7b1cd5346602dd4a1b4d349 Binary files /dev/null and b/idea1/specforge/modeling/draft/__pycache__/llama3_eagle.cpython-313.pyc differ diff --git a/idea1/specforge/modeling/draft/base.py b/idea1/specforge/modeling/draft/base.py new file mode 100644 index 0000000000000000000000000000000000000000..b5584a759d78a072903e0e76999b1674a62f0a88 --- /dev/null +++ b/idea1/specforge/modeling/draft/base.py @@ -0,0 +1,189 @@ +# coding=utf-8 +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in HuggingFace Transformers. 
+# Portions of this code are adapted from: +# - https://github.com/EleutherAI/gpt-neox (Apache License 2.0) +# - https://github.com/huggingface/transformers (Apache License 2.0) +# - https://github.com/SafeAILab/EAGLE (Apache License 2.0) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import glob +import json +import os +from abc import ABC, abstractmethod +from typing import Optional + +import torch +from huggingface_hub import snapshot_download +from safetensors import safe_open +from transformers.cache_utils import Cache +from transformers.modeling_utils import PreTrainedModel + +from specforge.modeling._mask_utils import _expand_mask, _make_causal_mask + + +class Eagle3DraftModel(PreTrainedModel, ABC): + """ + This is the base class for the Eagle3 draft model implementation. The child class needs to implement + the abstract methods to support training with TTT. + """ + + @abstractmethod + def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor: + """ + Embed the input ids. + """ + + @abstractmethod + def project_hidden_states(self, hidden_states: torch.Tensor) -> torch.Tensor: + """ + Project the concatenated hidden states from the high, medium and low layers to the target hidden size. + """ + + @abstractmethod + def compute_logits(self, hidden_states: torch.Tensor) -> torch.Tensor: + """ + Compute the logits of the draft model. 
def prepare_decoder_attention_mask(
    self,
    attention_mask: torch.Tensor,
    hidden_states: torch.Tensor,
    batch_size: int,
    seq_length: int,
    past_key_values_length: int,
) -> torch.Tensor:
    """
    Prepare the attention mask of the draft model.

    A causal mask is built only when ``seq_length > 1``. If ``attention_mask``
    is given, it is expanded and added to the causal mask (or returned alone
    when no causal mask was built). Returns ``None`` when neither applies.
    """
    dtype = hidden_states.dtype
    device = hidden_states.device

    # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
    causal_mask = None
    if seq_length > 1:
        causal_mask = _make_causal_mask(
            (batch_size, seq_length),
            dtype,
            device=device,
            past_key_values_length=past_key_values_length,
        )

    if attention_mask is None:
        return causal_mask

    # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
    padding_mask = _expand_mask(attention_mask, dtype, tgt_len=seq_length).to(device)
    if causal_mask is None:
        return padding_mask
    return padding_mask + causal_mask
@torch.no_grad()
def load_embedding(
    self, model_path: str, embedding_key: str = "model.embed_tokens.weight"
) -> None:
    """
    Load the embedding of the draft model.

    The tensor stored under ``embedding_key`` is copied in place into
    ``self.embed_tokens.weight``. For a local directory, a ``*.index.json``
    file is used to locate the shard holding the key; without one,
    ``model.safetensors`` and then ``pytorch_model.bin`` are tried.

    Args:
        model_path (str): Path to the target model. Can be either a Hugging Face
            repository ID or a local directory path containing the model files.
        embedding_key (str): Name of the embedding tensor inside the checkpoint.

    Raises:
        FileNotFoundError: If no usable checkpoint file is found, or if more
            than one ``*.index.json`` file exists in the directory.
    """
    if os.path.exists(model_path):
        # model_path is a local directory
        # check if there is file ending with index.json
        glob_path = os.path.join(model_path, "*.index.json")
        index_json_path = glob.glob(glob_path)

        if len(index_json_path) == 0:
            # No index.json found, look for single model file
            safetensors_path = os.path.join(model_path, "model.safetensors")
            if os.path.exists(safetensors_path):
                with safe_open(safetensors_path, framework="pt") as f:
                    self.embed_tokens.weight.copy_(f.get_tensor(embedding_key))
                return

            pytorch_model_path = os.path.join(model_path, "pytorch_model.bin")
            if os.path.exists(pytorch_model_path):
                state_dict = torch.load(pytorch_model_path, map_location="cpu")
                self.embed_tokens.weight.copy_(state_dict[embedding_key])
                return

            raise FileNotFoundError(
                f"No index.json, model.safetensors or pytorch_model.bin found in {model_path}"
            )
        if len(index_json_path) > 1:
            raise FileNotFoundError(
                f"Multiple index.json files found in {model_path}"
            )
        index_json_path = index_json_path[0]

        with open(index_json_path, "r", encoding="utf-8") as f:
            index_json = json.load(f)
        ckpt_file = index_json["weight_map"][embedding_key]

        if ckpt_file.endswith(".safetensors"):
            with safe_open(
                os.path.join(model_path, ckpt_file), framework="pt"
            ) as f:
                emb_tokens = f.get_tensor(embedding_key)
        else:
            # Load on CPU like the single-file branch above; copy_ below moves
            # the data to wherever the embedding weight lives.
            state_dict = torch.load(
                os.path.join(model_path, ckpt_file), map_location="cpu"
            )
            emb_tokens = state_dict[embedding_key]
        self.embed_tokens.weight.copy_(emb_tokens)
    else:
        # this is the case where model_path is a huggingface repository
        # we first need to locate its local cache
        local_cache_path = snapshot_download(repo_id=model_path)
        self.load_embedding(local_cache_path, embedding_key)


def load_vocab_mapping(self, file_path: str) -> None:
    """
    Load the vocab buffers of the draft model.

    Copies the ``t2d`` and ``d2t`` entries of the saved mapping into the
    buffers of the same name and sets ``self.vocab_mapping_loaded`` to True.

    Args:
        file_path (str): The path to the vocab mapping file.

    Raises:
        AssertionError: If the model has no ``t2d`` or ``d2t`` attribute.
    """
    assert hasattr(self, "t2d") and hasattr(
        self, "d2t"
    ), "t2d and d2t buffers are not found in the draft model, please check your draft model implementation"
    # Load on CPU; copy_ transfers to the device of the target buffers.
    vocab_mapping = torch.load(file_path, map_location="cpu")
    self.t2d.copy_(vocab_mapping["t2d"])
    self.d2t.copy_(vocab_mapping["d2t"])
    self.vocab_mapping_loaded = True
def sample(logits: torch.Tensor, temperature: float = 0.0) -> torch.Tensor:
    """Pick one token id per position from ``logits``.

    With ``temperature`` below ``1e-5`` this is a plain argmax over the last
    dimension. Otherwise ``logits`` must be 3-D ``(bsz, seq_len, vocab_size)``;
    it is divided by ``temperature``, soft-maxed, and one id per position is
    drawn with ``torch.multinomial``.
    """
    greedy = temperature < 1e-5
    if greedy:
        return torch.argmax(logits, dim=-1)

    bsz, seq_len, vocab_size = logits.shape
    # Flatten batch and sequence so multinomial sees one distribution per row.
    scaled = logits.view(-1, vocab_size) / temperature
    probs = torch.softmax(scaled, dim=-1)
    drawn = torch.multinomial(probs, num_samples=1)
    return drawn.view(bsz, seq_len)
def apply_rotary_pos_emb(
    q,
    k,
    q_cos,
    q_sin,
    k_cos=None,
    k_sin=None,
    position_ids=None,
    unsqueeze_dim=1,
):
    """Apply rotary position embeddings to ``q`` and ``k``.

    ``q`` is rotated with only the last ``q.size(-2)`` positions of
    ``q_cos``/``q_sin``, while ``k`` is rotated with the full ``k_cos``/``k_sin``.
    When ``k_cos`` is ``None`` the query tables are reused for the keys (any
    ``k_sin`` passed is then ignored). ``position_ids`` is accepted but unused.

    Returns:
        Tuple ``(q_embed, k_embed)``.
    """
    q_cos = q_cos.unsqueeze(unsqueeze_dim)
    q_sin = q_sin.unsqueeze(unsqueeze_dim)

    if k_cos is not None:
        k_cos = k_cos.unsqueeze(unsqueeze_dim)
        k_sin = k_sin.unsqueeze(unsqueeze_dim)
    else:
        k_cos, k_sin = q_cos, q_sin

    # Only the trailing q_len positions of the query tables belong to q.
    q_len = q.size(-2)
    q_cos_tail = q_cos[..., -q_len:, :]
    q_sin_tail = q_sin[..., -q_len:, :]

    q_embed = (q * q_cos_tail) + (rotate_half(q) * q_sin_tail)
    k_embed = (k * k_cos) + (rotate_half(k) * k_sin)
    return q_embed, k_embed
class Qwen3DFlashAttention(nn.Module):
    """Multi-headed attention whose keys/values cover context plus draft tokens.

    Queries are projected from ``hidden_states`` only. Keys and values are
    projected from ``target_hidden`` and from ``kv_hidden_states`` and
    concatenated along the sequence dimension (context first).
    """

    def __init__(self, config: Qwen3Config, layer_idx: int):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        # Falls back to hidden_size // num_attention_heads when the config has no head_dim.
        self.head_dim = getattr(
            config, "head_dim", config.hidden_size // config.num_attention_heads
        )
        self.num_key_value_groups = (
            config.num_attention_heads // config.num_key_value_heads
        )
        self.scaling = self.head_dim**-0.5
        self.attention_dropout = config.attention_dropout
        # NOTE(review): presumably read by the attention backend; masking is
        # supplied through ``attention_mask`` in forward — confirm.
        self.is_causal = False
        self.q_proj = nn.Linear(
            config.hidden_size,
            config.num_attention_heads * self.head_dim,
            bias=config.attention_bias,
        )
        self.k_proj = nn.Linear(
            config.hidden_size,
            config.num_key_value_heads * self.head_dim,
            bias=config.attention_bias,
        )
        self.v_proj = nn.Linear(
            config.hidden_size,
            config.num_key_value_heads * self.head_dim,
            bias=config.attention_bias,
        )
        self.o_proj = nn.Linear(
            config.num_attention_heads * self.head_dim,
            config.hidden_size,
            bias=config.attention_bias,
        )
        # Norms are sized head_dim and applied per head before the rotary embedding.
        self.q_norm = Qwen3RMSNorm(self.head_dim, eps=config.rms_norm_eps)
        self.k_norm = Qwen3RMSNorm(self.head_dim, eps=config.rms_norm_eps)
        # Sliding window is only enabled for layers typed "sliding_attention".
        self.sliding_window = (
            config.sliding_window
            if config.layer_types[layer_idx] == "sliding_attention"
            else None
        )

    def forward(
        self,
        hidden_states: torch.Tensor,
        target_hidden: torch.Tensor,
        position_embeddings: tuple[
            tuple[torch.Tensor, torch.Tensor], tuple[torch.Tensor, torch.Tensor]
        ],
        attention_mask: Optional[torch.Tensor],
        kv_hidden_states: Optional[torch.Tensor] = None,
        past_key_values: Optional[Cache] = None,
        update_kv_cache: bool = True,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
        """Attend from the draft tokens to context plus draft tokens.

        Args:
            hidden_states: Source of the queries; unpacked as ``(bsz, q_len, ...)``.
            target_hidden: Context states; its dim 1 is the context length.
            position_embeddings: ``((q_cos, q_sin), (k_cos, k_sin))``.
            attention_mask: Passed through to the attention function.
            kv_hidden_states: Source of the non-context keys/values; defaults to
                ``hidden_states``.
            past_key_values: Optional cache. Updated when ``update_kv_cache`` is
                true, otherwise only read.
            update_kv_cache: Whether to write this step's keys/values into the cache.
            cache_position: Forwarded to the cache update.
            **kwargs: Forwarded to the attention function.

        Returns:
            ``(attn_output, attn_weights)`` where ``attn_output`` has been passed
            through ``o_proj``.
        """
        bsz, q_len = hidden_states.shape[:-1]
        ctx_len = target_hidden.shape[1]
        if kv_hidden_states is None:
            kv_hidden_states = hidden_states
        q = self.q_proj(hidden_states)
        q = q.view(bsz, q_len, -1, self.head_dim)
        q = self.q_norm(q).transpose(1, 2)
        k_ctx = self.k_proj(target_hidden)
        k_noise = self.k_proj(kv_hidden_states)
        v_ctx = self.v_proj(target_hidden)
        v_noise = self.v_proj(kv_hidden_states)
        # NOTE(review): the view uses ctx_len + q_len, so kv_hidden_states is
        # assumed to have the same sequence length as hidden_states — confirm.
        k = torch.cat([k_ctx, k_noise], dim=1).view(
            bsz, ctx_len + q_len, -1, self.head_dim
        )
        v = torch.cat([v_ctx, v_noise], dim=1).view(
            bsz, ctx_len + q_len, -1, self.head_dim
        )
        k = self.k_norm(k).transpose(1, 2)
        v = v.transpose(1, 2)
        (q_cos, q_sin), (k_cos, k_sin) = position_embeddings
        q, k = apply_rotary_pos_emb(q, k, q_cos, q_sin, k_cos, k_sin)
        if past_key_values is not None:
            if update_kv_cache:
                cache_kwargs = {
                    "sin": k_sin,
                    "cos": k_cos,
                    "cache_position": cache_position,
                }
                k, v = past_key_values.update(k, v, self.layer_idx, cache_kwargs)
            elif self.layer_idx < len(past_key_values.layers):
                # Read-only use of the cache: prepend the stored keys/values
                # without writing the new ones back.
                cache_layer = past_key_values.layers[self.layer_idx]
                if cache_layer.get_seq_length() > 0:
                    k = torch.cat([cache_layer.keys, k], dim=-2)
                    v = torch.cat([cache_layer.values, v], dim=-2)
        # Eager attention unless the config selects another registered implementation.
        attn_fn: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            attn_fn = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
        attn_output, attn_weights = attn_fn(
            self,
            q,
            k,
            v,
            attention_mask,
            dropout=0.0 if not self.training else self.attention_dropout,
            scaling=self.scaling,
            sliding_window=self.sliding_window,
            **kwargs,
        )
        attn_output = attn_output.reshape(bsz, q_len, -1)
        attn_output = self.o_proj(attn_output)
        return attn_output, attn_weights
def build_target_layer_ids(num_target_layers: int, num_draft_layers: int):
    """Pick one target-model layer index per draft layer.

    With a single draft layer the middle target layer
    (``num_target_layers // 2``) is returned.  Otherwise the indices are
    spread evenly between ``1`` and ``num_target_layers - 3`` (both
    inclusive) and rounded with Python's built-in ``round``.

    Returns:
        A list of ``num_draft_layers`` integer layer indices.
    """
    if num_draft_layers == 1:
        return [num_target_layers // 2]

    first = 1
    last = num_target_layers - 3
    width = last - first
    steps = num_draft_layers - 1

    layer_ids = []
    for idx in range(num_draft_layers):
        # Same arithmetic order as a linear interpolation from first to last.
        layer_ids.append(int(round(first + (idx * width) / steps)))
    return layer_ids
def forward(
    self,
    position_ids: Optional[torch.LongTensor] = None,
    attention_mask: Optional[torch.Tensor] = None,
    noise_embedding: Optional[torch.Tensor] = None,
    kv_noise_embedding: Optional[torch.Tensor] = None,
    target_hidden: Optional[torch.Tensor] = None,
    noise_position_ids: Optional[torch.LongTensor] = None,
    kv_position_ids: Optional[torch.LongTensor] = None,
    past_key_values: Optional[Cache] = None,
    use_cache: bool = False,
    **kwargs,
) -> torch.Tensor:
    """Run the draft decoder stack and return the normalised hidden states.

    Args:
        position_ids: Optional fallback used by ``_resolve_position_ids``
            when the two explicit position tensors are not given.
        attention_mask: Forwarded unchanged to every layer.
        noise_embedding: Required.  Input of the query ("noise") stream.
        kv_noise_embedding: Optional second stream.  When given, each layer
            is run twice: once on the noise stream reading (but not
            updating) the cache with this stream as key/value input, then
            once on this stream itself, which is the call that may update
            the cache.
        target_hidden: Required.  Passed through ``fc`` and ``hidden_norm``
            before being handed to the layers as context.
        noise_position_ids: Positions for the query stream.
        kv_position_ids: Positions for the context + key/value stream.
        past_key_values: Optional cache shared by all layers.
        use_cache: Forwarded to the layers; also decides whether the cache
            is updated.
        **kwargs: Forwarded to every layer call.

    Returns:
        ``self.norm`` applied to the final noise-stream hidden states.

    Raises:
        ValueError: If ``noise_embedding`` or ``target_hidden`` is missing,
            or if the position ids cannot be resolved.
    """
    # Both inputs are required even though the signature defaults them to
    # None; fail with a clear message instead of an error deep inside a layer.
    if noise_embedding is None:
        raise ValueError("DFlash forward requires noise_embedding.")
    if target_hidden is None:
        raise ValueError("DFlash forward requires target_hidden.")

    hidden_states = noise_embedding
    kv_hidden_states = kv_noise_embedding
    target_hidden = self.hidden_norm(self.fc(target_hidden))
    noise_position_ids, kv_position_ids = self._resolve_position_ids(
        position_ids=position_ids,
        noise_position_ids=noise_position_ids,
        kv_position_ids=kv_position_ids,
        noise_len=hidden_states.shape[1],
        ctx_len=target_hidden.shape[1],
    )
    # (query-side, key-side) rotary tables.
    position_embeddings = (
        self.rotary_emb(hidden_states, noise_position_ids),
        self.rotary_emb(hidden_states, kv_position_ids),
    )
    # Arguments that are identical for every layer call below.
    shared = dict(
        target_hidden=target_hidden,
        attention_mask=attention_mask,
        position_ids=kv_position_ids,
        past_key_value=past_key_values,
        use_cache=use_cache,
        position_embeddings=position_embeddings,
        **kwargs,
    )
    for layer in self.layers:
        if kv_hidden_states is None:
            hidden_states = layer(
                hidden_states=hidden_states,
                update_kv_cache=use_cache,
                **shared,
            )
            continue
        # Order matters: the noise stream must read the cache before the
        # key/value stream (possibly) writes this block into it.
        hidden_states = layer(
            hidden_states=hidden_states,
            kv_hidden_states=kv_hidden_states,
            update_kv_cache=False,
            **shared,
        )
        kv_hidden_states = layer(
            hidden_states=kv_hidden_states,
            update_kv_cache=use_cache,
            **shared,
        )
    return self.norm(hidden_states)
start + block_size].clone() + block_position_ids = position_ids[:, start : start + block_size] + draft_cache_prefix_len = past_key_values_draft.get_seq_length() + draft_kv_position_ids = position_ids[ + :, draft_cache_prefix_len : start + block_size + ] + mask_noise_embedding = target.model.embed_tokens(block_output_ids) + + # Multi-step denoising loop + for denoise_step in range(num_denoise_steps): + noise_embedding = mask_noise_embedding + if denoise_step > 0: + pred_noise_embedding = target.model.embed_tokens(block_output_ids) + mix_weight = denoise_step / num_denoise_steps + noise_embedding = torch.lerp( + mask_noise_embedding, pred_noise_embedding, mix_weight + ) + draft_hidden = self( + target_hidden=target_hidden, + noise_embedding=noise_embedding, + noise_position_ids=block_position_ids, + kv_position_ids=draft_kv_position_ids, + past_key_values=past_key_values_draft, + use_cache=True, + is_causal=False, + )[:, -block_size + 1 :, :] + draft_logits = target.lm_head(draft_hidden) + block_output_ids[:, 1:] = sample(draft_logits) + if denoise_step + 1 < num_denoise_steps: + # Reuse the accepted-prefix cache, but rebuild the current block on the next denoise step. 
+ past_key_values_draft.crop(draft_cache_prefix_len) + past_key_values_draft.crop(start) + + output = target( + block_output_ids, + position_ids=block_position_ids, + past_key_values=past_key_values_target, + use_cache=True, + output_hidden_states=True, + ) + + posterior = sample(output.logits, temperature) + acceptance_length = ( + (block_output_ids[:, 1:] == posterior[:, :-1]) + .cumprod(dim=1) + .sum(dim=1)[0] + .item() + ) + output_ids[:, start : start + acceptance_length + 1] = block_output_ids[ + :, : acceptance_length + 1 + ] + output_ids[:, start + acceptance_length + 1] = posterior[ + :, acceptance_length + ] + start += acceptance_length + 1 + past_key_values_target.crop(start) + target_hidden = extract_context_feature( + output.hidden_states, self.target_layer_ids + )[:, : acceptance_length + 1, :] + acceptance_lengths.append(acceptance_length + 1) + if stop_token_ids is not None and any( + stop_token_id in output_ids[:, num_input_tokens:] + for stop_token_id in stop_token_ids + ): + break + output_ids = output_ids[:, :max_length] + output_ids = output_ids[:, output_ids[0] != self.mask_token_id] + if stop_token_ids is not None: + stop_token_ids = torch.tensor(stop_token_ids, device=output_ids.device) + stop_token_indices = torch.isin( + output_ids[0][num_input_tokens:], stop_token_ids + ).nonzero(as_tuple=True)[0] + if stop_token_indices.numel() > 0: + output_ids = output_ids[ + :, : num_input_tokens + stop_token_indices[0] + 1 + ] + + return output_ids, acceptance_lengths diff --git a/idea1/specforge/modeling/draft/flex_attention.py b/idea1/specforge/modeling/draft/flex_attention.py new file mode 100644 index 0000000000000000000000000000000000000000..50ca5f54dc658106c22d7a8a95553bf346b33525 --- /dev/null +++ b/idea1/specforge/modeling/draft/flex_attention.py @@ -0,0 +1,127 @@ +import torch +import torch._dynamo as dynamo +from torch.nn.attention.flex_attention import ( + create_block_mask, + flex_attention, + or_masks, +) +from transformers.utils import 
is_torchdynamo_compiling + +dynamo.config.recompile_limit = 64 + + +# Reference Implementation https://github.com/huggingface/transformers/blob/main/src/transformers/integrations/flex_attention.py +class WrappedFlexAttention: + """ + We are doing a singleton class so that flex attention is compiled once when it's first called. + """ + + _instance = None + _is_flex_compiled = False + _compiled_flex_attention = None + + def __new__(cls, *args, **kwargs): + if cls._instance is None: + # Create a new instance if one doesn't already exist + cls._instance = super().__new__(cls) + return cls._instance + + @torch.compiler.disable(recursive=False) + def __init__(self): + """ + Initialize or update the singleton instance. + """ + if not self._is_flex_compiled: + # Enable dynamic shapes to handle different input sizes + self._compiled_flex_attention = torch.compile( + flex_attention, + # mode="max-autotune-no-cudagraphs", + ) + self._is_flex_compiled = True + + def __call__(self): + return self._compiled_flex_attention + + +def compile_friendly_flex_attention( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + **kwargs, +) -> torch.Tensor: + # First call initialise singleton wrapper object, second call invokes the object method to return compiled flex attention + # Do not use compiled version if already compiling forward (it raises issues) + flex_attention_compiled = ( + WrappedFlexAttention()() if not is_torchdynamo_compiling() else flex_attention + ) + return flex_attention_compiled( + query, + key, + value, + **kwargs, + ) + + +class WrappedCreateBlockMask: + _instance = None + _is_create_block_mask_compiled = False + _compiled_create_block_mask = None + + def __new__(cls, *args, **kwargs): + if cls._instance is None: + cls._instance = super().__new__(cls) + return cls._instance + + @torch.compiler.disable(recursive=False) + def __init__(self): + if not self._is_create_block_mask_compiled: + self._compiled_create_block_mask = 
torch.compile(create_block_mask) + self._is_create_block_mask_compiled = True + + def __call__(self): + return self._compiled_create_block_mask + + +def compile_friendly_create_block_mask( + mask_mod, + B, + H, + Q_LEN, + KV_LEN, + device, +): + create_block_mask_compiled = ( + WrappedCreateBlockMask()() + if not is_torchdynamo_compiling() + else create_block_mask + ) + return create_block_mask_compiled( + mask_mod, + B, + H, + Q_LEN, + KV_LEN, + device, + ) + + +def generate_eagle3_mask( + seq_lengths: torch.Tensor, Q_LEN: int, KV_LEN: int, lck: int = 0 +): + + def causal_mask(b, h, q_idx, kv_idx): + # Causal will keep shrinking by 1 diagnol due to appended suffix + # Shirnk the causal by diagnol + causal_mask = q_idx >= kv_idx + padding_mask = (kv_idx < seq_lengths[b]) & (q_idx < seq_lengths[b]) + return causal_mask & padding_mask + + def suffix_mask(b, h, q_idx, kv_idx): + suffix_mask = kv_idx >= Q_LEN + padding_mask = kv_idx % Q_LEN < seq_lengths[b] + diagnol_mask = (kv_idx - q_idx) % Q_LEN == 0 + return suffix_mask & padding_mask & diagnol_mask + + mask_mod = or_masks(causal_mask, suffix_mask) + mask_mod.__name__ = f"eagle3_mask_Q_{Q_LEN}_KV_{KV_LEN}_lck_{lck}" + return mask_mod diff --git a/idea1/specforge/modeling/draft/llama3_eagle.py b/idea1/specforge/modeling/draft/llama3_eagle.py new file mode 100644 index 0000000000000000000000000000000000000000..268142c0c64e8c35336892c56bc5f52102ad800d --- /dev/null +++ b/idea1/specforge/modeling/draft/llama3_eagle.py @@ -0,0 +1,1436 @@ +import math +import warnings +from typing import List, Optional, Tuple + +import torch +import torch.distributed as dist +import torch.nn as nn +import torch.nn.functional as F +from torch.nn.attention.flex_attention import create_block_mask, flex_attention +from transformers.activations import ACT2FN +from transformers.cache_utils import Cache +from transformers.models.llama.configuration_llama import LlamaConfig +from yunchang.comm import SeqAllToAll4D + +from 
specforge.modeling.draft.flex_attention import ( + compile_friendly_create_block_mask, + compile_friendly_flex_attention, + generate_eagle3_mask, +) +from specforge.utils import print_with_rank + +from ...distributed import get_sp_ring_group, get_sp_ulysses_group +from ...layers.ring import ring_flash_attn_func +from .base import Eagle3DraftModel + +try: + from flash_attn import flash_attn_func +except ImportError: + warnings.warn( + "flash_attn is not found, falling back to flex_attention. " + "Please install flash_attn if you want to use the flash attention backend." + ) + flash_attn_func = None + + +# Copied from transformers.models.bart.modeling_bart._make_causal_mask +def _make_causal_mask( + input_ids_shape: torch.Size, + dtype: torch.dtype, + device: torch.device, + past_key_values_length: int = 0, +): + """ + Make causal mask used for bi-directional self-attention. + """ + bsz, tgt_len = input_ids_shape + mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min, device=device) + mask_cond = torch.arange(mask.size(-1), device=device) + mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0) + mask = mask.to(dtype) + + if past_key_values_length > 0: + mask = torch.cat( + [ + torch.zeros( + tgt_len, past_key_values_length, dtype=dtype, device=device + ), + mask, + ], + dim=-1, + ) + return mask[None, None, :, :].expand( + bsz, 1, tgt_len, tgt_len + past_key_values_length + ) + + +# Copied from transformers.models.bart.modeling_bart._expand_mask +def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): + """ + Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. 
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """Repeat each key/value head ``n_rep`` times along the head axis.

    Equivalent to ``torch.repeat_interleave(x, dim=1, repeats=n_rep)``: the
    input goes from ``(batch, num_key_value_heads, seqlen, head_dim)`` to
    ``(batch, num_key_value_heads * n_rep, seqlen, head_dim)``.  The input is
    returned as-is when ``n_rep == 1``.
    """
    batch, kv_heads, seq_len, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    # Insert a repeat axis next to the head axis, broadcast it, then fold it
    # into the head axis.
    tiled = hidden_states.unsqueeze(2).expand(batch, kv_heads, n_rep, seq_len, head_dim)
    return tiled.reshape(batch, kv_heads * n_rep, seq_len, head_dim)
The input embedding + sequence contains vision (images / videos) embedding and text embedding or just contains text embedding. For + vision embedding part, we apply rotary position embedding on temporal, height and width dimension separately. + Here we split the channel dimension to 3 chunks for the temporal, height and width rotary position embedding. + For text embedding part, we just apply 1D rotary position embedding. The three rotary position index (temporal, + height and width) of text embedding is always the same, so the text embedding rotary position embedding has no + difference with modern LLMs. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`): + The position indices of the tokens corresponding to the query and key tensors. For example, this can be + used to pass offsetted position ids when working with a KV-cache. + mrope_section(`List(int)`): + Multimodal rope section is for channel dimension of temporal, height and width in rope calculation. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. 
class LlamaRotaryEmbedding(torch.nn.Module):
    """Rotary position embedding with precomputed cos/sin tables.

    The inverse frequencies are ``1 / base ** (2i / dim)``.  When all four of
    ``scaling_factor``, ``low_freq_factor``, ``high_freq_factor`` and
    ``orig_max_position`` are given, the frequencies are rescaled per band
    (Llama3-style) before the tables are built.
    """

    def __init__(
        self,
        dim,
        max_position_embeddings=2048,
        base=10000,
        device=None,
        scaling_factor=None,
        low_freq_factor=None,
        high_freq_factor=None,
        orig_max_position=None,
    ):
        super().__init__()

        self.dim = dim
        self.max_position_embeddings = max_position_embeddings
        self.base = base
        exponents = torch.arange(0, self.dim, 2).float().to(device) / self.dim
        inv_freq = 1.0 / (self.base**exponents)

        # Llama3-style frequency scaling is applied only when every one of
        # its parameters was supplied.
        llama3_args = (
            scaling_factor,
            low_freq_factor,
            high_freq_factor,
            orig_max_position,
        )
        if not any(arg is None for arg in llama3_args):
            print_with_rank(
                f"Using Llama3 style rotary embedding with scaling_factor={scaling_factor}, low_freq_factor={low_freq_factor}, high_freq_factor={high_freq_factor}, orig_max_position={orig_max_position}"
            )
            self.scaling_factor = scaling_factor
            self.low_freq_factor = low_freq_factor
            self.high_freq_factor = high_freq_factor
            self.orig_max_position = orig_max_position

            low_freq_wavelen = orig_max_position / low_freq_factor
            high_freq_wavelen = orig_max_position / high_freq_factor
            wave_len = 2 * math.pi / inv_freq

            # Interpolation weight for the band between the two wavelengths.
            smooth = 0
            if low_freq_factor != high_freq_factor:
                smooth = (orig_max_position / wave_len - low_freq_factor) / (
                    high_freq_factor - low_freq_factor
                )

            scaled = inv_freq / self.scaling_factor
            blended = (1 - smooth) * inv_freq / self.scaling_factor + smooth * inv_freq
            beyond_high = torch.where(wave_len > low_freq_wavelen, scaled, blended)
            inv_freq = torch.where(wave_len < high_freq_wavelen, inv_freq, beyond_high)

        self.register_buffer("inv_freq", inv_freq, persistent=False)

        # Build here to make `torch.jit.trace` work.
        self._set_cos_sin_cache(
            seq_len=max_position_embeddings + 20,
            device=self.inv_freq.device,
            dtype=torch.get_default_dtype(),
        )

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        """(Re)build the ``cos_cached`` / ``sin_cached`` buffers for ``seq_len`` positions."""
        self.max_seq_len_cached = seq_len
        positions = torch.arange(
            self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype
        )
        angles = torch.einsum("i,j->ij", positions, self.inv_freq)
        # Different from paper, but it uses a different permutation in order
        # to obtain the same calculation.
        emb = torch.cat((angles, angles), dim=-1)
        for buffer_name, table in (("cos_cached", emb.cos()), ("sin_cached", emb.sin())):
            self.register_buffer(
                buffer_name, table[None, None, :, :].to(dtype), persistent=False
            )

    @torch.compile(dynamic=True)
    def forward(self, x, seq_len=None):
        """Return the first ``seq_len`` rows of the cos/sin tables in ``x.dtype``.

        The tables are rebuilt first when ``seq_len`` exceeds the cached length.
        """
        # x: [bs, num_attention_heads, seq_len, head_size]
        if seq_len and seq_len > self.max_seq_len_cached:
            self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)

        cos = self.cos_cached[:, :, :seq_len, ...]
        sin = self.sin_cached[:, :, :seq_len, ...]
        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
Credits to the Reddit users /u/bloc97 and /u/emozilla""" + + def __init__( + self, + dim, + max_position_embeddings=2048, + base=10000, + device=None, + scaling_factor=1.0, + ): + self.scaling_factor = scaling_factor + super().__init__(dim, max_position_embeddings, base, device) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + + if seq_len > self.max_position_embeddings: + base = self.base * ( + (self.scaling_factor * seq_len / self.max_position_embeddings) + - (self.scaling_factor - 1) + ) ** (self.dim / (self.dim - 2)) + inv_freq = 1.0 / ( + base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim) + ) + self.register_buffer("inv_freq", inv_freq, persistent=False) + + t = torch.arange( + self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype + ) + + freqs = torch.einsum("i,j->ij", t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer( + "cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False + ) + self.register_buffer( + "sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False + ) + + +class LlamaMutiRotaryEmbedding(LlamaRotaryEmbedding): + def __init__( + self, + dim, + max_position_embeddings=2048, + base=10000, + device=None, + scaling_factor=1.0, + ): + super().__init__(dim, max_position_embeddings, base, device) + self.scaling_factor = scaling_factor + + def forward(self, x, position_ids): + # In contrast to other models, Qwen2_5_VL has different position ids for the grids + # So we expand the inv_freq to shape (3, ...) 
# Inverse dim formula to find dim based on number of rotations
def yarn_find_correction_dim(
    num_rotations, dim, base=10000, max_position_embeddings=2048
):
    """Return the (fractional) dimension index matching ``num_rotations`` rotations."""
    scaled_log = dim * math.log(max_position_embeddings / (num_rotations * 2 * math.pi))
    return scaled_log / (2 * math.log(base))


# Find dim range bounds based on rotations
def yarn_find_correction_range(
    low_rot, high_rot, dim, base=10000, max_position_embeddings=2048
):
    """Return ``(low, high)`` dimension bounds clamped to ``[0, dim - 1]``.

    ``low`` is the floor of the correction dim for ``low_rot`` and ``high``
    the ceiling of the correction dim for ``high_rot``.
    """
    lower = math.floor(
        yarn_find_correction_dim(low_rot, dim, base, max_position_embeddings)
    )
    upper = math.ceil(
        yarn_find_correction_dim(high_rot, dim, base, max_position_embeddings)
    )
    # Clamp values just in case
    return max(lower, 0), min(upper, dim - 1)


def yarn_get_mscale(scale=1, mscale=1):
    """Return ``1.0`` for ``scale <= 1``, else ``0.1 * mscale * ln(scale) + 1.0``."""
    return 1.0 if scale <= 1 else 0.1 * mscale * math.log(scale) + 1.0


def yarn_linear_ramp_mask(min_val, max_val, dim):
    """Return a float32 ramp of length ``dim`` rising from 0 at ``min_val`` to 1 at ``max_val``."""
    if min_val == max_val:
        max_val += 0.001  # Prevent singularity
    indices = torch.arange(dim, dtype=torch.float32)
    ramp = (indices - min_val) / (max_val - min_val)
    return ramp.clamp(0, 1)
beta_fast=32, + beta_slow=1, + mscale=1, + mscale_all_dim=0, + ): + self.scaling_factor = scaling_factor + self.original_max_position_embeddings = original_max_position_embeddings + self.beta_fast = beta_fast + self.beta_slow = beta_slow + self.mscale = mscale + self.mscale_all_dim = mscale_all_dim + super().__init__(dim, max_position_embeddings, base, device) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + dim = self.dim + + freq_extra = 1.0 / ( + self.base + ** (torch.arange(0, dim, 2, dtype=torch.float32, device=device) / dim) + ) + freq_inter = 1.0 / ( + self.scaling_factor + * self.base + ** (torch.arange(0, dim, 2, dtype=torch.float32, device=device) / dim) + ) + + low, high = yarn_find_correction_range( + self.beta_fast, + self.beta_slow, + dim, + self.base, + self.original_max_position_embeddings, + ) + inv_freq_mask = 1.0 - yarn_linear_ramp_mask(low, high, dim // 2).to( + device=device, dtype=torch.float32 + ) + inv_freq = freq_inter * (1 - inv_freq_mask) + freq_extra * inv_freq_mask + self.register_buffer("inv_freq", inv_freq, persistent=False) + + t = torch.arange(seq_len, device=device, dtype=torch.float32) + + freqs = torch.outer(t, inv_freq) + + _mscale = float( + yarn_get_mscale(self.scaling_factor, self.mscale) + / yarn_get_mscale(self.scaling_factor, self.mscale_all_dim) + ) + + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer( + "cos_cached", + (emb.cos() * _mscale)[None, None, :, :].to(dtype), + persistent=False, + ) + self.register_buffer( + "sin_cached", + (emb.sin() * _mscale)[None, None, :, :].to(dtype), + persistent=False, + ) + + +class LlamaAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + if hasattr(config, "head_dim"): + self.head_dim = config.head_dim + else: + 
def _init_rope(self):
    """Create ``self.rotary_emb`` according to ``config.rope_scaling``.

    Without ``rope_scaling`` a plain ``LlamaRotaryEmbedding`` is built.
    Otherwise the scaling type is read from ``rope_type`` (falling back to
    ``type``) and selects the embedding class.  ``rope_scaling`` may be a
    dict or an object with attributes.

    Raises:
        ValueError: If the scaling type is unknown, or if a linear, dynamic
            or YaRN configuration has no ``factor``.
    """
    max_positions = self.max_position_embeddings
    rope_theta = getattr(self.config, "rope_theta", 10000)

    rope_scaling = self.config.rope_scaling
    if rope_scaling is None:
        self.rotary_emb = LlamaRotaryEmbedding(
            self.head_dim,
            max_position_embeddings=max_positions,
            base=rope_theta,
        )
        return

    def rope_get(key, default=None):
        if isinstance(rope_scaling, dict):
            return rope_scaling.get(key, default)
        return getattr(rope_scaling, key, default)

    scaling_type = rope_get("rope_type", rope_get("type"))
    scaling_factor = rope_get("factor")

    # NOTE(review): only the "default" and "llama3" branches pass
    # ``rope_theta`` as ``base``; the other branches use the embedding
    # class's own default base.  Left unchanged on purpose because altering
    # it would change the numerics of already-trained draft models --
    # confirm whether this is intended.
    if scaling_type == "default":
        self.rotary_emb = LlamaRotaryEmbedding(
            self.head_dim,
            max_position_embeddings=max_positions,
            base=rope_theta,
        )
    elif scaling_type == "linear":
        if scaling_factor is None:
            raise ValueError(
                "Linear RoPE scaling requires 'factor' in rope_scaling config."
            )
        self.rotary_emb = LlamaLinearScalingRotaryEmbedding(
            self.head_dim,
            max_position_embeddings=max_positions,
            scaling_factor=scaling_factor,
        )
    elif scaling_type == "dynamic":
        if scaling_factor is None:
            raise ValueError(
                "Dynamic RoPE scaling requires 'factor' in rope_scaling config."
            )
        self.rotary_emb = LlamaDynamicNTKScalingRotaryEmbedding(
            self.head_dim,
            max_position_embeddings=max_positions,
            scaling_factor=scaling_factor,
        )
    elif scaling_type == "llama3":
        # for nv type
        self.rotary_emb = LlamaRotaryEmbedding(
            self.head_dim,
            max_position_embeddings=max_positions,
            base=rope_theta,
            scaling_factor=(scaling_factor if scaling_factor is not None else 1.0),
            low_freq_factor=rope_get("low_freq_factor"),
            high_freq_factor=rope_get("high_freq_factor"),
            orig_max_position=rope_get("original_max_position_embeddings"),
        )
    elif scaling_type == "mrope":
        self.rotary_emb = LlamaMutiRotaryEmbedding(
            self.head_dim, max_position_embeddings=max_positions
        )
    elif scaling_type == "yarn":
        if scaling_factor is None:
            raise ValueError(
                "YaRN RoPE scaling requires 'factor' in rope_scaling config."
            )
        # Forward only the keys that are actually configured.  Passing None
        # for a missing key would override the constructor default and fail
        # later inside the YaRN arithmetic.
        yarn_kwargs = {}
        for key in (
            "original_max_position_embeddings",
            "beta_fast",
            "beta_slow",
            "mscale",
            "mscale_all_dim",
        ):
            value = rope_get(key)
            if value is not None:
                yarn_kwargs[key] = value
        self.rotary_emb = LlamaYarnRotaryEmbedding(
            self.head_dim,
            max_position_embeddings=max_positions,
            scaling_factor=scaling_factor,
            **yarn_kwargs,
        )
    else:
        raise ValueError(f"Unknown RoPE scaling type {scaling_type}")
self.v_proj(hidden_states) + + query_states = query_states.view( + bsz, q_len, self.num_heads, self.head_dim + ).transpose(1, 2) + key_states = key_states.view( + bsz, q_len, self.num_key_value_heads, self.head_dim + ).transpose(1, 2) + value_states = value_states.view( + bsz, q_len, self.num_key_value_heads, self.head_dim + ).transpose(1, 2) + + if cache_hidden is None: + if isinstance(self.rotary_emb, LlamaMutiRotaryEmbedding): + cos, sin = self.rotary_emb(query_states, position_ids) + cos, sin = cos.to(query_states.device), sin.to(query_states.device) + query_states, key_states = apply_multimodal_rotary_pos_emb( + query_states, + key_states, + cos, + sin, + self.config.rope_scaling["mrope_section"], + ) + else: + cos, sin = self.rotary_emb(query_states, seq_len=q_len) + cos, sin = cos.to(query_states.device), sin.to(query_states.device) + query_states, key_states = apply_rotary_pos_emb( + query_states, key_states, cos, sin, position_ids + ) + + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=attention_mask, + is_causal=attention_mask is None, + dropout_p=0.0, + ) + + else: + lck = len(cache_hidden[0]) + if isinstance(self.rotary_emb, LlamaMutiRotaryEmbedding): + cos, sin = self.rotary_emb(query_states, position_ids + lck) + cos, sin = cos.to(query_states.device), sin.to(query_states.device) + query_states, key_states = apply_multimodal_rotary_pos_emb( + query_states, + key_states, + cos, + sin, + self.config.rope_scaling["mrope_section"], + ) + else: + cos, sin = self.rotary_emb(query_states, seq_len=q_len + lck) + cos, sin = cos.to(query_states.device), sin.to(query_states.device) + query_states, key_states = apply_rotary_pos_emb( + query_states, key_states, cos, sin, position_ids + lck + ) + + key_states = repeat_kv(key_states, self.num_key_value_groups) + 
value_states = repeat_kv(value_states, self.num_key_value_groups) + + cache_hidden[0] = cache_hidden[0] + [key_states] + cache_hidden[1] = cache_hidden[1] + [value_states] + + cache_k = cache_hidden[0] + cache_v = cache_hidden[1] + + k0 = cache_k[0] + v0 = cache_v[0] + + # causal + attn_weights = torch.matmul(query_states, k0.transpose(2, 3)) / math.sqrt( + self.head_dim + ) + lck = len(cache_k) + + attn_weights = attn_weights + attention_mask + + for i in range(1, lck): + ki = cache_k[i] + qi = query_states + kiq = ki + + attn_weightsi = (qi * kiq).sum(-1) / math.sqrt(self.head_dim) + attn_weights = torch.cat( + (attn_weights, attn_weightsi[..., None]), dim=-1 + ) + + # upcast attention to fp32 + attn_weights = nn.functional.softmax( + attn_weights, dim=-1, dtype=torch.float32 + ).to(query_states.dtype) + attn_weights0 = attn_weights[..., :q_len] + + attn_output = torch.matmul(attn_weights0, v0) + + for i in range(1, lck): + vi = cache_v[i] + attn_weightsi = attn_weights[..., q_len + i - 1] + attn_outputi = attn_weightsi[..., None] * vi + attn_output = attn_output + attn_outputi + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(bsz, q_len, self.head_dim * self.num_heads) + + attn_output = self.o_proj(attn_output) + + return attn_output + + +class LlamaFlexAttention(LlamaAttention): + """ + Attention layer implemented with flex attention. We keep the parameters consistent with LlamaAttention. + The used parameters are: + - hidden_states: input hidden states + - attention_mask: attention mask not expanded, straight from data loader. + - position_ids: position ids + - past_key_values: dynamic cache used for storing past key and value states. 
+ """ + + def forward( + self, + hidden_states: torch.Tensor, + cache_hidden: Optional[List[torch.Tensor]] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + bsz, q_len, _ = hidden_states.size() + + past_seen_tokens = ( + past_key_values.get_seq_length() if past_key_values is not None else 0 + ) + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.view( + bsz, q_len, self.num_heads, self.head_dim + ).transpose(1, 2) + key_states = key_states.view( + bsz, q_len, self.num_key_value_heads, self.head_dim + ).transpose(1, 2) + value_states = value_states.view( + bsz, q_len, self.num_key_value_heads, self.head_dim + ).transpose(1, 2) + + lck = past_seen_tokens // q_len + if isinstance(self.rotary_emb, LlamaMutiRotaryEmbedding): + cos, sin = self.rotary_emb(query_states, position_ids + lck) + cos, sin = cos.to(query_states.device), sin.to(query_states.device) + query_states, key_states = apply_multimodal_rotary_pos_emb( + query_states, + key_states, + cos, + sin, + self.config.rope_scaling["mrope_section"], + ) + else: + cos, sin = self.rotary_emb(query_states, seq_len=q_len + lck) + cos, sin = cos.to(query_states.device), sin.to(query_states.device) + # Keep positions ids aligned when padding so the KV cache is unaffected. 
+ query_states, key_states = apply_rotary_pos_emb( + query_states, key_states, cos, sin, position_ids + lck + ) + + cache_position: torch.Tensor = torch.arange( + past_seen_tokens, past_seen_tokens + q_len, device=hidden_states.device + ) + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} + + key_cache, value_cache = past_key_values.update( + key_states, + value_states, + layer_idx=0, # TODO: support multiple layers + cache_kwargs=cache_kwargs, + ) + + seq_lengths = attention_mask.sum(dim=-1) + # Shrink the attention mask to align with the padding to the right. + # This is equivalent to the shrinking logic in eagle3.py + seq_lengths -= lck + # TODO: Remove the usage of uncompiled create_block_mask after + # https://github.com/pytorch/pytorch/issues/160018 + if q_len <= 128: + create_block_mask_func = create_block_mask + flex_attention_func = flex_attention + else: + create_block_mask_func = compile_friendly_create_block_mask + flex_attention_func = compile_friendly_flex_attention + + block_mask = create_block_mask_func( + mask_mod=generate_eagle3_mask( + seq_lengths=seq_lengths, + Q_LEN=q_len, + KV_LEN=key_cache.shape[-2], + lck=lck, + ), + B=bsz, + H=1, # Rely on broadcast + Q_LEN=q_len, + KV_LEN=key_cache.shape[-2], + device=query_states.device, + ) + attn_output = flex_attention_func( + query=query_states, + key=key_cache.contiguous(), + value=value_cache.contiguous(), + block_mask=block_mask, + enable_gqa=True, + ) + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(bsz, q_len, self.head_dim * self.num_heads) + attn_output = self.o_proj(attn_output) + return attn_output + + +class LlamaFlashAttention(LlamaAttention): + """ + Attention layer implemented with flash attention. We keep the parameters consistent with LlamaAttention. 
+ The used parameters are: + - hidden_states: input hidden states + - position_ids: position ids + - cache_hidden: manual cache used for storing past key and value states + """ + + def forward( + self, + hidden_states: torch.Tensor, + cache_hidden: Optional[List[torch.Tensor]] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim) + key_states = key_states.view( + bsz, q_len, self.num_key_value_heads, self.head_dim + ) + value_states = value_states.view( + bsz, q_len, self.num_key_value_heads, self.head_dim + ) + + lck = 0 if cache_hidden is None else len(cache_hidden[0]) + if isinstance(self.rotary_emb, LlamaMutiRotaryEmbedding): + cos, sin = self.rotary_emb(query_states, position_ids + lck) + cos, sin = cos.to(query_states.device), sin.to(query_states.device) + query_states, key_states = apply_multimodal_rotary_pos_emb( + query_states, + key_states, + cos, + sin, + self.config.rope_scaling["mrope_section"], + unsqueeze_dim=2, + ) + else: + cos, sin = self.rotary_emb(query_states, seq_len=q_len + lck) + cos, sin = cos.to(query_states.device), sin.to(query_states.device) + query_states, key_states = apply_rotary_pos_emb( + query_states, key_states, cos, sin, position_ids + lck, unsqueeze_dim=2 + ) + + if cache_hidden is not None: + cache_hidden[0] = cache_hidden[0] + [key_states] + cache_hidden[1] = cache_hidden[1] + [value_states] + + cache_k = cache_hidden[0] + cache_v = cache_hidden[1] + else: + cache_k = [key_states] + cache_v = [value_states] + + k0 = cache_k[0] + v0 = 
cache_v[0] + + assert ( + flash_attn_func is not None + ), "flash_attn is not installed, please install flash_attn if you want to use the flash attention backend" + attn_output, lse, _ = flash_attn_func( + query_states, + k0, + v0, + dropout_p=0.0, + softmax_scale=1.0 / math.sqrt(self.head_dim), + causal=True, + return_attn_probs=True, + ) + lse = lse.transpose(1, 2) + + lck = len(cache_k) + if lck > 1: + q_shape_expanded = ( + bsz, + q_len, + self.num_key_value_heads, + self.num_key_value_groups, + self.head_dim, + ) + attn_outputs = [attn_output.view(q_shape_expanded)] + lses = [lse.view(q_shape_expanded[:-1])] + + for i in range(1, lck): + ki = cache_k[i].unsqueeze(-2) + qi = query_states.view(q_shape_expanded) + vi = cache_v[i].unsqueeze(-2) + + attn_outputs.append(vi) + lses.append((qi * ki).sum(-1) / math.sqrt(self.head_dim)) + + lse = torch.logsumexp(torch.stack(lses, dim=-1), dim=-1) + attn_output = sum( + attn_outputi * torch.exp(lsei - lse).unsqueeze(-1) + for attn_outputi, lsei in zip(attn_outputs, lses) + ) + # lse is fp32, downcast attn_output back + attn_output = attn_output.to(self.o_proj.weight.dtype) + + attn_output = attn_output.reshape(bsz, q_len, self.head_dim * self.num_heads) + + attn_output = self.o_proj(attn_output) + + return attn_output + + +class LlamaUSPFlashAttention(LlamaAttention): + """ + LlamaUSPFlashAttention with Trainable Ring Attention & Correct Eagle3 Branch Merging. + """ + + def __init__(self, config): + super().__init__(config) + assert ( + dist.is_initialized() + ), f"LlamaUSPAttention requires torch.distributed; call init_distributed first." + if isinstance(self.rotary_emb, LlamaMutiRotaryEmbedding): + raise NotImplementedError( + f"LlamaMutiRotaryEmbedding is currently not supported for LlamaUSPFlashAttention." 
+ ) + self.ring_pg = get_sp_ring_group() + self.ulysses_pg = get_sp_ulysses_group() + self.sp_ring_degree = torch.distributed.get_world_size(self.ring_pg) + self.sp_ulysses_degree = torch.distributed.get_world_size(self.ulysses_pg) + self.ring_rank = torch.distributed.get_rank(self.ring_pg) + + self.scatter_idx = 2 + self.gather_idx = 1 + self.use_sync = False + + def forward( + self, + hidden_states: torch.Tensor, + cache_hidden: Optional[List[torch.Tensor]] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + + bsz, q_len, _ = hidden_states.size() + local_q_len = q_len + + # ============================================================= + # 1. Projections & Ulysses Scatter + # ============================================================= + query_states = self.q_proj(hidden_states) + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim) + query_states = SeqAllToAll4D.apply( + self.ulysses_pg, + query_states, + self.scatter_idx, + self.gather_idx, + self.use_sync, + ) + + key_states = self.k_proj(hidden_states) + key_states = key_states.view( + bsz, q_len, self.num_key_value_heads, self.head_dim + ) + key_states = SeqAllToAll4D.apply( + self.ulysses_pg, + key_states, + self.scatter_idx, + self.gather_idx, + self.use_sync, + ) + + value_states = self.v_proj(hidden_states) + value_states = value_states.view( + bsz, q_len, self.num_key_value_heads, self.head_dim + ) + value_states = SeqAllToAll4D.apply( + self.ulysses_pg, + value_states, + self.scatter_idx, + self.gather_idx, + self.use_sync, + ) + + current_q_len = query_states.shape[1] + local_num_heads = query_states.shape[2] + + # Global length calculation (for RoPE) + global_q_len = q_len * self.sp_ring_degree * self.sp_ulysses_degree + # 
============================================================= + # 2. RoPE & Cache Management + # ============================================================= + lck = 0 if cache_hidden is None else len(cache_hidden[0]) + + cos, sin = self.rotary_emb(query_states, seq_len=global_q_len + lck) + cos, sin = cos.to(query_states.device), sin.to(query_states.device) + query_states, key_states = apply_rotary_pos_emb( + query_states, key_states, cos, sin, position_ids + lck, unsqueeze_dim=2 + ) + + # Update Cache (Eagle3 Logic: Cache is a list of tensors for tree branches) + if cache_hidden is not None: + cache_hidden[0] = cache_hidden[0] + [key_states] + cache_hidden[1] = cache_hidden[1] + [value_states] + cache_k = cache_hidden[0] + cache_v = cache_hidden[1] + else: + cache_k = [key_states] + cache_v = [value_states] + + # ============================================================= + # 3. Hybrid Attention Computation + # ============================================================= + + # 3.1 Main Sequence (Ring Attention) + out_ring, lse_ring, _ = ring_flash_attn_func( + query_states, + cache_k[0], + cache_v[0], + dropout_p=0.0, + softmax_scale=1.0 / math.sqrt(self.head_dim), + causal=True, + window_size=(-1, -1), + alibi_slopes=None, + deterministic=False, + return_attn_probs=True, + group=self.ring_pg, + ) + + if lse_ring.dim() == 3 and lse_ring.shape[1] == local_num_heads: + acc_lse = lse_ring.transpose(1, 2).contiguous() # -> [B, S, H] + else: + acc_lse = lse_ring + + assert ( + acc_lse.shape[1] == current_q_len + ), f"LSE seq_len {acc_lse.shape[1]} mismatch with Query seq_len {current_q_len}" + + acc_out = out_ring + + # 3.2 Extras Branches (Eagle3 Point-wise Update) + if len(cache_k) > 1: + num_kv_heads_local = cache_k[0].shape[2] + local_groups = local_num_heads // num_kv_heads_local + + q_shape_expanded = ( + bsz, + current_q_len, + num_kv_heads_local, + local_groups, + self.head_dim, + ) + qi_reshaped = query_states.view(q_shape_expanded) # [B, S, KV, G, D] + + 
for i in range(1, len(cache_k)): + ki = cache_k[i] # [B, S, KV, D] + vi = cache_v[i] # [B, S, KV, D] + + ki_expanded = ki.unsqueeze(-2) # [B, S, KV, 1, D] + + # Dot Product: [B, S, KV, G] + score_i = (qi_reshaped * ki_expanded).sum(-1) / math.sqrt(self.head_dim) + + # Flatten back to [B, S, H_local] + step_lse = score_i.view(bsz, current_q_len, -1) + + vi_expanded = vi.unsqueeze(-2) + step_out = vi_expanded.expand(q_shape_expanded).reshape(acc_out.shape) + + # Online Softmax Update + new_lse = torch.logaddexp(acc_lse, step_lse) + + acc_out = acc_out * torch.exp(acc_lse - new_lse).unsqueeze( + -1 + ) + step_out * torch.exp(step_lse - new_lse).unsqueeze(-1) + + acc_lse = new_lse + + attn_output = acc_out.to(query_states.dtype) + + # ============================================================= + # 4. Ulysses Gather & Output Projection + # ============================================================= + attn_output = SeqAllToAll4D.apply( + self.ulysses_pg, + attn_output, + self.gather_idx, # Scatter idx: 1 (Seq) + self.scatter_idx, # Gather idx: 2 (Heads) + self.use_sync, + ) + + attn_output = attn_output.reshape( + bsz, local_q_len, self.head_dim * self.num_heads + ) + attn_output = self.o_proj(attn_output) + + return attn_output + + +class LlamaMLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) + self.act_fn = ACT2FN[config.hidden_act] + + def forward(self, x): + if self.config.pretraining_tp > 1: + slice = self.intermediate_size // self.config.pretraining_tp + gate_proj_slices = self.gate_proj.weight.split(slice, dim=0) + up_proj_slices = self.up_proj.weight.split(slice, dim=0) + 
down_proj_slices = self.down_proj.weight.split(slice, dim=1) + + gate_proj = torch.cat( + [ + F.linear(x, gate_proj_slices[i]) + for i in range(self.config.pretraining_tp) + ], + dim=-1, + ) + up_proj = torch.cat( + [ + F.linear(x, up_proj_slices[i]) + for i in range(self.config.pretraining_tp) + ], + dim=-1, + ) + + intermediate_states = (self.act_fn(gate_proj) * up_proj).split(slice, dim=2) + down_proj = [ + F.linear(intermediate_states[i], down_proj_slices[i]) + for i in range(self.config.pretraining_tp) + ] + down_proj = sum(down_proj) + else: + down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + + return down_proj + + +class LlamaRMSNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + """ + LlamaRMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + @torch.compile(dynamic=True) + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + return self.weight * hidden_states.to(input_dtype) + + +class LlamaDecoderLayer(nn.Module): + def __init__(self, config, attention_backend: str = "sdpa"): + super().__init__() + self.hidden_size = config.hidden_size + + if attention_backend == "sdpa": + self.self_attn = LlamaAttention(config=config) + elif attention_backend == "flex_attention": + print_with_rank("Using flex attention on draft model training!") + self.self_attn = LlamaFlexAttention(config=config) + elif attention_backend == "fa": + self.self_attn = LlamaFlashAttention(config=config) + elif attention_backend == "usp": + self.self_attn = LlamaUSPFlashAttention(config=config) + else: + raise ValueError(f"Unknown attention backend {attention_backend}") + + self.attention_backend = attention_backend + self.mlp = LlamaMLP(config) + # 
self.fc = nn.Linear(config.hidden_size * 2, config.hidden_size) + self.hidden_norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.input_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + # if self.index!=0: + + self.post_attention_layernorm = LlamaRMSNorm( + config.hidden_size, eps=config.rms_norm_eps + ) + + def forward( + self, + input_emb: torch.Tensor, + hidden_states: torch.Tensor, + cache_hidden: List[List[torch.Tensor]] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + ) -> Tuple[ + torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]] + ]: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`, *optional*): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). 
+ past_key_values (`Cache`, *optional*): cached past key and value projection states + """ + + residual = hidden_states + + hidden_states = self.hidden_norm(hidden_states) + input_emb = self.input_layernorm(input_emb) + + hidden_states = torch.cat((input_emb, hidden_states), dim=-1) + # Self Attention + hidden_states = self.self_attn( + cache_hidden=cache_hidden, + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + ) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + # outputs = (hidden_states, return_hidden) + return hidden_states + + +class LlamaForCausalLMEagle3(Eagle3DraftModel): + + config_class = LlamaConfig + + def __init__(self, config, quant_config=None, attention_backend="sdpa") -> None: + super().__init__(config) + self.config = config + self.quant_config = quant_config + + self.vocab_size = config.vocab_size + self.draft_vocab_size = config.draft_vocab_size + self.embed_tokens = nn.Embedding( + config.vocab_size, config.hidden_size, config.pad_token_id + ) + self.midlayer = LlamaDecoderLayer(config, attention_backend=attention_backend) + + if hasattr(config, "target_hidden_size"): + self.fc = torch.nn.Linear( + config.target_hidden_size * 3, config.hidden_size, bias=False + ) + else: + self.fc = torch.nn.Linear( + config.hidden_size * 3, config.hidden_size, bias=False + ) + + self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.lm_head = nn.Linear( + config.hidden_size, config.draft_vocab_size, bias=False + ) + + # create vocab buffers + t2d = torch.ones(self.vocab_size, dtype=torch.bool) + d2t = torch.zeros(self.draft_vocab_size, dtype=torch.int64) + self.register_buffer("t2d", t2d) + 
self.register_buffer("d2t", d2t) + + def forward( + self, + hidden_states: torch.Tensor, + inputs_embeds: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + ttt_length: int = 1, + ): + """ + Arguments: + hidden_states (`torch.FloatTensor`): input to the layer, cat low, mid high hidden_states of shape `(batch, seq_len, hidden_states * 3)` + input_ids (`torch.LongTensor`): input ids of shape `(batch, seq_len)` + attention_mask (`torch.FloatTensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + position_ids (`torch.LongTensor`, *optional*): position ids of shape `(batch, seq_len)` + """ + if ttt_length == 1: + print_with_rank("using ttt_length 1, no need to cache hidden states") + cache_hidden = None + else: + print_with_rank(f"using ttt_length {ttt_length}, caching hidden states") + cache_hidden = [[], []] + + batch_size, seq_length, _ = hidden_states.size() + + # make position ids + device = hidden_states.device + position_ids = torch.arange(0, seq_length, dtype=torch.long, device=device) + position_ids = position_ids.unsqueeze(0).view(-1, seq_length) + + # make attention mask + if attention_mask is None: + attention_mask = torch.ones( + (batch_size, seq_length), dtype=torch.bool, device=hidden_states.device + ) + attention_mask = prepare_decoder_attention_mask( + attention_mask, (batch_size, seq_length), hidden_states, 0 + ) + + # fc + hidden_states = self.fc(hidden_states) + hidden_states = self.midlayer( + input_emb=inputs_embeds, + hidden_states=hidden_states, + cache_hidden=cache_hidden, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=None, + output_attentions=False, + use_cache=False, + ) + + # norm + hidden_states = self.norm(hidden_states) + + return hidden_states + + def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + + def project_hidden_states(self, hidden_states: 
torch.Tensor) -> torch.Tensor: + # eagle 3 requires hidden states from 3 layers + assert hidden_states.size(-1) == self.config.hidden_size * 3 + return self.fc(hidden_states) + + def compute_logits(self, hidden_states: torch.Tensor) -> torch.Tensor: + norm_hidden_states = self.norm(hidden_states) + return self.lm_head(norm_hidden_states) + + def backbone( + self, + input_embeds: torch.Tensor, + hidden_states: torch.Tensor, + cache_hidden: torch.Tensor, + attention_mask: torch.Tensor, + position_ids: torch.Tensor, + past_key_values: Optional[Cache] = None, + use_cache: bool = True, + ) -> torch.Tensor: + return self.midlayer( + input_emb=input_embeds, + hidden_states=hidden_states, + cache_hidden=cache_hidden, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + output_attentions=False, + use_cache=False, + ) diff --git a/idea1/specforge/modeling/target/__init__.py b/idea1/specforge/modeling/target/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..0f70b3b740d055ae72dfebacc5b9f7434f3eed0e --- /dev/null +++ b/idea1/specforge/modeling/target/__init__.py @@ -0,0 +1,17 @@ +from .eagle3_target_model import ( + CustomEagle3TargetModel, + Eagle3TargetModel, + HFEagle3TargetModel, + SGLangEagle3TargetModel, + get_eagle3_target_model, +) +from .target_head import TargetHead + +__all__ = [ + "Eagle3TargetModel", + "SGLangEagle3TargetModel", + "HFEagle3TargetModel", + "CustomEagle3TargetModel", + "get_eagle3_target_model", + "TargetHead", +] diff --git a/idea1/specforge/modeling/target/__pycache__/__init__.cpython-311.pyc b/idea1/specforge/modeling/target/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e1774e193777987914d78ac6f6b2d79a11923971 Binary files /dev/null and b/idea1/specforge/modeling/target/__pycache__/__init__.cpython-311.pyc differ diff --git a/idea1/specforge/modeling/target/__pycache__/dflash_target_model.cpython-311.pyc 
b/idea1/specforge/modeling/target/__pycache__/dflash_target_model.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ff34c86f739b2aadcc5d340f2844d9651a7e956b Binary files /dev/null and b/idea1/specforge/modeling/target/__pycache__/dflash_target_model.cpython-311.pyc differ diff --git a/idea1/specforge/modeling/target/__pycache__/eagle3_target_model.cpython-311.pyc b/idea1/specforge/modeling/target/__pycache__/eagle3_target_model.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5cf7012c770a032075b7f4856c76ca83a71a149f Binary files /dev/null and b/idea1/specforge/modeling/target/__pycache__/eagle3_target_model.cpython-311.pyc differ diff --git a/idea1/specforge/modeling/target/__pycache__/target_head.cpython-311.pyc b/idea1/specforge/modeling/target/__pycache__/target_head.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7f9da7be383c6623bc5c79910f9cc54b3a6ad5c7 Binary files /dev/null and b/idea1/specforge/modeling/target/__pycache__/target_head.cpython-311.pyc differ diff --git a/idea1/specforge/modeling/target/__pycache__/target_utils.cpython-311.pyc b/idea1/specforge/modeling/target/__pycache__/target_utils.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ae04c556faedae83ce6109904f243c8b4317d29e Binary files /dev/null and b/idea1/specforge/modeling/target/__pycache__/target_utils.cpython-311.pyc differ diff --git a/idea1/specforge/modeling/target/custom_backend/__init__.py b/idea1/specforge/modeling/target/custom_backend/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..5465d15a8e84c788c43e5df709c33f4efb0bd43d --- /dev/null +++ b/idea1/specforge/modeling/target/custom_backend/__init__.py @@ -0,0 +1,17 @@ +from .gpt_oss import GptOssForCausalLM +from .llama import LlamaForCausalLM +from .llama4 import Llama4ForCausalLM +from .phi3 import Phi3ForCausalLM +from .qwen2 import Qwen2ForCausalLM +from 
.qwen3 import Qwen3ForCausalLM +from .qwen3_moe import Qwen3MoeForCausalLM + +__all__ = [ + "GptOssForCausalLM", + "LlamaForCausalLM", + "Llama4ForCausalLM", + "Phi3ForCausalLM", + "Qwen2ForCausalLM", + "Qwen3ForCausalLM", + "Qwen3MoeForCausalLM", +] diff --git a/idea1/specforge/modeling/target/custom_backend/__pycache__/__init__.cpython-311.pyc b/idea1/specforge/modeling/target/custom_backend/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ecf231670ec5cfa136a4c4f8043c5837cb6f559e Binary files /dev/null and b/idea1/specforge/modeling/target/custom_backend/__pycache__/__init__.cpython-311.pyc differ diff --git a/idea1/specforge/modeling/target/custom_backend/__pycache__/gpt_oss.cpython-311.pyc b/idea1/specforge/modeling/target/custom_backend/__pycache__/gpt_oss.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f70e92c2e58ffe2f73dae39103299ace18ba2b78 Binary files /dev/null and b/idea1/specforge/modeling/target/custom_backend/__pycache__/gpt_oss.cpython-311.pyc differ diff --git a/idea1/specforge/modeling/target/custom_backend/__pycache__/llama.cpython-311.pyc b/idea1/specforge/modeling/target/custom_backend/__pycache__/llama.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..157b801e9775c266194bda357c5ea1a01c759879 Binary files /dev/null and b/idea1/specforge/modeling/target/custom_backend/__pycache__/llama.cpython-311.pyc differ diff --git a/idea1/specforge/modeling/target/custom_backend/__pycache__/llama4.cpython-311.pyc b/idea1/specforge/modeling/target/custom_backend/__pycache__/llama4.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..153b7790f3383ffcb8cc047b91e76cd702192a6b Binary files /dev/null and b/idea1/specforge/modeling/target/custom_backend/__pycache__/llama4.cpython-311.pyc differ diff --git a/idea1/specforge/modeling/target/custom_backend/__pycache__/phi3.cpython-311.pyc 
b/idea1/specforge/modeling/target/custom_backend/__pycache__/phi3.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b45673b729a892951eb494b49ae0254d1c94ade7 Binary files /dev/null and b/idea1/specforge/modeling/target/custom_backend/__pycache__/phi3.cpython-311.pyc differ diff --git a/idea1/specforge/modeling/target/custom_backend/__pycache__/qwen2.cpython-311.pyc b/idea1/specforge/modeling/target/custom_backend/__pycache__/qwen2.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..78beed7a77cb7ea5d1483f73d64fcd2bf2d3b90f Binary files /dev/null and b/idea1/specforge/modeling/target/custom_backend/__pycache__/qwen2.cpython-311.pyc differ diff --git a/idea1/specforge/modeling/target/custom_backend/__pycache__/qwen3.cpython-311.pyc b/idea1/specforge/modeling/target/custom_backend/__pycache__/qwen3.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c240db0ef36f18bcd2541e55db267750d643e87d Binary files /dev/null and b/idea1/specforge/modeling/target/custom_backend/__pycache__/qwen3.cpython-311.pyc differ diff --git a/idea1/specforge/modeling/target/custom_backend/__pycache__/qwen3_moe.cpython-311.pyc b/idea1/specforge/modeling/target/custom_backend/__pycache__/qwen3_moe.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8fa45d23531946bbe5974e75b36efd82384bd9fe Binary files /dev/null and b/idea1/specforge/modeling/target/custom_backend/__pycache__/qwen3_moe.cpython-311.pyc differ diff --git a/idea1/specforge/modeling/target/custom_backend/gpt_oss.py b/idea1/specforge/modeling/target/custom_backend/gpt_oss.py new file mode 100644 index 0000000000000000000000000000000000000000..b3b4a79723f48dd2b3e7db077e030f88288a3145 --- /dev/null +++ b/idea1/specforge/modeling/target/custom_backend/gpt_oss.py @@ -0,0 +1,879 @@ +# coding=utf-8 +# Copyright 2025 The HuggingFace Team. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Callable, List, Optional, Union + +import torch +import torch.distributed as dist +from torch import nn +from torch.nn import functional as F +from transformers.cache_utils import Cache, DynamicCache +from transformers.generation import GenerationMixin +from transformers.integrations.hub_kernels import use_kernel_forward_from_hub +from transformers.masking_utils import ( + create_causal_mask, + create_sliding_window_causal_mask, +) +from transformers.modeling_layers import GradientCheckpointingLayer +from transformers.modeling_outputs import ( + MoeCausalLMOutputWithPast, + MoeModelOutputWithPast, +) +from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update +from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel +from transformers.models.gpt_oss.configuration_gpt_oss import GptOssConfig +from transformers.models.gpt_oss.modeling_gpt_oss import GptOssRMSNorm +from transformers.processing_utils import Unpack +from transformers.utils import TransformersKwargs, auto_docstring, can_return_tuple +from transformers.utils.generic import check_model_inputs + +from specforge.distributed import get_tp_group, shard_tensor +from specforge.layers import ( + ColumnParallelLinear, + ParallelLMHead, + RowParallelLinear, + VocabParallelEmbedding, +) + + +class GptOssExperts(nn.Module): + def __init__(self, config): + super().__init__() + self.intermediate_size = 
config.intermediate_size + self.num_experts = config.num_local_experts + self.hidden_size = config.hidden_size + self.expert_dim = self.intermediate_size + + # apply tp + self.tp_group = get_tp_group() + self.tp_size = dist.get_world_size(self.tp_group) + self.expert_dim_per_shard = self.expert_dim // self.tp_size + self.gate_up_proj = nn.Parameter( + torch.empty( + self.num_experts, self.hidden_size, 2 * self.expert_dim_per_shard + ) + ) + self.gate_up_proj_bias = nn.Parameter( + torch.empty(self.num_experts, 2 * self.expert_dim_per_shard) + ) + self.down_proj = nn.Parameter( + torch.empty((self.num_experts, self.expert_dim_per_shard, self.hidden_size)) + ) + self.down_proj_bias = nn.Parameter( + torch.empty(self.num_experts, self.hidden_size) + ) + + self.alpha = 1.702 + self.limit = 7.0 + + self._register_load_state_dict_pre_hook(self.shard_state_dict) + + def shard_state_dict(self, state_dict, *args): + if "down_proj" in state_dict: + # columnwise splitting + value = state_dict["down_proj"] + state_dict["down_proj"] = shard_tensor(value, self.tp_group, 1) + + if "down_proj_bias" in state_dict: + value = state_dict["down_proj_bias"] + if dist.get_rank(self.tp_group) != 0: + value.zero_() + + if "gate_up_proj_bias" in state_dict: + value = state_dict["gate_up_proj_bias"] + state_dict["gate_up_proj_bias"] = shard_tensor(value, self.tp_group, 1) + + if "gate_up_proj" in state_dict: + value = state_dict["gate_up_proj"] + gate, up = value[..., ::2], value[..., 1::2] + gate = shard_tensor(gate, self.tp_group, 2) + up = shard_tensor(up, self.tp_group, 2) + new_value = torch.zeros_like(self.gate_up_proj, device=value.device) + new_value[..., ::2] = gate + new_value[..., 1::2] = up + state_dict["gate_up_proj"] = new_value + + def forward( + self, hidden_states: torch.Tensor, router_indices=None, routing_weights=None + ) -> torch.Tensor: + """ + When training is is more efficient to just loop over the experts and compute the output for each expert + as otherwise the 
memory would explode. + + For inference we can sacrifice some memory and compute the output for all experts at once. By repeating the inputs. + + Args: + hidden_states (torch.Tensor): (batch_size, seq_len, hidden_size) + selected_experts (torch.Tensor): (batch_size * token_num, top_k) + routing_weights (torch.Tensor): (batch_size * token_num, num_experts) + Returns: + torch.Tensor + """ + batch_size = hidden_states.shape[0] + hidden_states = hidden_states.reshape( + -1, self.hidden_size + ) # (num_tokens, hidden_size) + num_experts = routing_weights.shape[1] + if self.training: + next_states = torch.zeros_like( + hidden_states, dtype=hidden_states.dtype, device=hidden_states.device + ) + with torch.no_grad(): + expert_mask = torch.nn.functional.one_hot( + router_indices, num_classes=num_experts + ) + expert_mask = expert_mask.permute(2, 1, 0) + # we sum on the top_k and on the sequence lenght to get which experts + # are hit this time around + expert_hitted = torch.greater( + expert_mask.sum(dim=(-1, -2)), 0 + ).nonzero() + for expert_idx in expert_hitted[:]: + with torch.no_grad(): + _, token_idx = torch.where(expert_mask[expert_idx[0]]) + current_state = hidden_states[token_idx] + gate_up = ( + current_state @ self.gate_up_proj[expert_idx] + + self.gate_up_proj_bias[expert_idx] + ) + gate, up = gate_up[..., ::2], gate_up[..., 1::2] + gate = gate.clamp(min=None, max=self.limit) + up = up.clamp(min=-self.limit, max=self.limit) + glu = gate * torch.sigmoid(gate * self.alpha) + gated_output = (up + 1) * glu + out = ( + gated_output @ self.down_proj[expert_idx] + + self.down_proj_bias[expert_idx] + ) + weighted_output = out[0] * routing_weights[token_idx, expert_idx, None] + next_states.index_add_( + 0, token_idx, weighted_output.to(hidden_states.dtype) + ) + next_states = next_states.view(batch_size, -1, self.hidden_size) + else: + hidden_states = hidden_states.repeat(num_experts, 1) + hidden_states = hidden_states.view(num_experts, -1, self.hidden_size) + gate_up = 
class GptOssTopKRouter(nn.Module):
    """Linear router that keeps the ``top_k`` highest-scoring experts per token.

    The router weight and bias are allocated uninitialised (``torch.empty``);
    they must be filled by weight initialisation or checkpoint loading before
    the module is used.
    """

    def __init__(self, config):
        super().__init__()
        self.top_k = config.num_experts_per_tok
        self.num_experts = config.num_local_experts
        self.hidden_dim = config.hidden_size
        self.weight = nn.Parameter(torch.empty(self.num_experts, self.hidden_dim))
        self.bias = nn.Parameter(torch.empty(self.num_experts))

    def forward(self, hidden_states):
        """Score every token against every expert.

        Args:
            hidden_states: tensor whose last dimension is ``hidden_dim``; all
                leading dimensions are flattened into one token axis.

        Returns:
            A ``(router_scores, router_indices)`` pair. ``router_scores`` has
            shape ``(num_tokens, num_experts)`` and is zero everywhere except
            at the selected experts, where it holds the softmax taken over the
            selected logits only. ``router_indices`` has shape
            ``(num_tokens, top_k)``.
        """
        flat_tokens = hidden_states.reshape(-1, self.hidden_dim)
        logits = F.linear(flat_tokens, self.weight, self.bias)
        top_logits, router_indices = logits.topk(self.top_k, dim=-1)
        # Normalise over the chosen experts only, not over all experts.
        top_probs = F.softmax(top_logits, dim=1, dtype=top_logits.dtype)
        router_scores = torch.zeros_like(logits)
        router_scores.scatter_(1, router_indices, top_probs)
        return router_scores, router_indices
class GptOssRotaryEmbedding(nn.Module):
    """Produce the rotary-embedding ``cos``/``sin`` tables for given position ids."""

    def __init__(self, config: GptOssConfig, device=None):
        """Select the RoPE variant from ``config`` and build its inverse frequencies.

        Args:
            config: model configuration; ``rope_scaling`` (when it is a dict)
                selects the RoPE type, ``max_position_embeddings`` seeds the
                cached sequence length.
            device: forwarded unchanged to the RoPE init function.
        """
        super().__init__()
        # BC: "rope_type" was originally "type"
        if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict):
            self.rope_type = config.rope_scaling.get(
                "rope_type", config.rope_scaling.get("type")
            )
        else:
            self.rope_type = "default"
        self.max_seq_len_cached = config.max_position_embeddings
        self.original_max_seq_len = config.max_position_embeddings

        self.config = config
        # NOTE(review): if `rope_scaling` is a dict with neither "rope_type" nor
        # "type", `self.rope_type` is None and this lookup raises KeyError.
        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]

        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
        # Non-persistent: the buffer is not written to the state dict.
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        self.original_inv_freq = self.inv_freq

    @torch.no_grad()
    @dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
    def forward(self, x, position_ids):
        """Return ``(cos, sin)`` cast to ``x.dtype``.

        Args:
            x: only its ``device`` and ``dtype`` are read.
            position_ids: indexed as a 2-D ``(batch, seq_len)`` tensor.

        Returns:
            Two tensors of shape ``(batch, seq_len, len(inv_freq))``, each
            multiplied by ``attention_scaling``.
        """
        inv_freq_expanded = (
            self.inv_freq[None, :, None]
            .float()
            .expand(position_ids.shape[0], -1, 1)
            .to(x.device)
        )
        position_ids_expanded = position_ids[:, None, :].float()

        # Any non-string or "mps" device type falls back to "cpu" for autocast.
        device_type = (
            x.device.type
            if isinstance(x.device.type, str) and x.device.type != "mps"
            else "cpu"
        )
        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
            # (batch, n_freq, 1) @ (batch, 1, seq_len) -> transposed to (batch, seq_len, n_freq)
            freqs = (
                inv_freq_expanded.float() @ position_ids_expanded.float()
            ).transpose(1, 2)
            # The frequencies are used as-is (not concatenated with themselves).
            emb = freqs
            cos = emb.cos() * self.attention_scaling
            sin = emb.sin() * self.attention_scaling

        return cos.to(x.dtype), sin.to(x.dtype)
def _apply_rotary_emb(
    x: torch.Tensor,
    cos: torch.Tensor,
    sin: torch.Tensor,
) -> torch.Tensor:
    """Rotate the two halves of the last dimension of ``x`` by ``cos``/``sin``.

    ``x`` is split into a lower and an upper half along its last axis; the
    pair ``(lower, upper)`` is rotated and the halves are concatenated back in
    the same order.
    """
    lower, upper = x.chunk(2, dim=-1)
    rotated_lower = lower * cos - upper * sin
    rotated_upper = upper * cos + lower * sin
    return torch.cat([rotated_lower, rotated_upper], dim=-1)


def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
    """Apply the same rotary embedding to the query and key tensors.

    Args:
        q: query tensor.
        k: key tensor.
        cos: cosine table; a singleton axis is inserted at ``unsqueeze_dim``
            so it broadcasts against ``q`` and ``k``.
        sin: sine table, treated like ``cos``.
        position_ids: accepted but not used by this function.
        unsqueeze_dim: axis at which ``cos``/``sin`` are unsqueezed.

    Returns:
        ``(q_embed, k_embed)`` with the rotation applied.
    """
    cos_b = cos.unsqueeze(unsqueeze_dim)
    sin_b = sin.unsqueeze(unsqueeze_dim)
    return _apply_rotary_emb(q, cos_b, sin_b), _apply_rotary_emb(k, cos_b, sin_b)
class GptOssAttention(nn.Module):
    """Tensor-parallel GPT-OSS self-attention with a learned per-head sink logit.

    The q/k/v projections are column-parallel and the output projection is
    row-parallel; the output is summed across the tensor-parallel group at the
    end of ``forward``.
    """

    def __init__(self, config: GptOssConfig, layer_idx: int):
        """Build the sharded projections and the per-shard ``sinks`` parameter.

        Args:
            config: model configuration (head counts, hidden size, dropout,
                ``layer_types``, ``sliding_window``, ``attention_bias``).
            layer_idx: index of this layer; selects the layer type and is
                passed to the KV cache on update.
        """
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        self.head_dim = getattr(
            config, "head_dim", config.hidden_size // config.num_attention_heads
        )
        self.num_key_value_groups = (
            config.num_attention_heads // config.num_key_value_heads
        )
        self.scaling = self.head_dim**-0.5
        self.attention_dropout = config.attention_dropout
        self.is_causal = True
        # The single-device layout (plain nn.Linear q/k/v/o projections and a
        # `sinks` parameter with one entry per attention head) is replaced by
        # the tensor-parallel layers below.

        self.tp_group = get_tp_group()
        self.tp_size = dist.get_world_size(self.tp_group)
        self.q_proj = ColumnParallelLinear(
            config.hidden_size,
            config.num_attention_heads * self.head_dim,
            bias=config.attention_bias,
        )
        self.k_proj = ColumnParallelLinear(
            config.hidden_size,
            config.num_key_value_heads * self.head_dim,
            bias=config.attention_bias,
        )
        self.v_proj = ColumnParallelLinear(
            config.hidden_size,
            config.num_key_value_heads * self.head_dim,
            bias=config.attention_bias,
        )
        self.o_proj = RowParallelLinear(
            config.num_attention_heads * self.head_dim,
            config.hidden_size,
            bias=config.attention_bias,
        )
        # NOTE(review): integer division — assumes num_attention_heads is
        # divisible by the TP world size; confirm this is validated upstream.
        self.num_attention_heads_per_shard = config.num_attention_heads // self.tp_size
        self.sliding_window = (
            config.sliding_window
            if config.layer_types[layer_idx] == "sliding_attention"
            else None
        )
        # One sink logit per locally held attention head (uninitialised here).
        self.sinks = nn.Parameter(torch.empty(self.num_attention_heads_per_shard))

        self._register_load_state_dict_pre_hook(self.shard_state_dict)

    def shard_state_dict(self, state_dict, *args):
        """Pre-load hook: replace the full ``sinks`` entry with this rank's shard.

        NOTE(review): the lookup uses the bare key "sinks"; confirm the loader
        hands this hook a state dict whose keys are local to this module.
        """
        if "sinks" in state_dict:
            value = state_dict["sinks"]
            state_dict["sinks"] = shard_tensor(value, self.tp_group, 0)

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor],
        past_key_value: Optional[Cache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """Run attention over ``hidden_states`` and all-reduce the projected output.

        Args:
            hidden_states: input whose last axis is the hidden size; the
                leading axes are preserved in the output.
            position_embeddings: ``(cos, sin)`` pair applied to queries and keys.
            attention_mask: passed through to the attention implementation.
            past_key_value: optional cache; when given, keys/values are
                replaced by the result of its ``update`` call.
            cache_position: forwarded to the cache inside ``cache_kwargs``.
            **kwargs: forwarded to the attention implementation.

        Returns:
            ``(attn_output, attn_weights)`` as returned by the attention
            implementation, with ``attn_output`` projected by ``o_proj`` and
            summed across the tensor-parallel group.
        """
        input_shape = hidden_states.shape[:-1]
        # -1 lets the head count follow the (sharded) projection width.
        hidden_shape = (*input_shape, -1, self.head_dim)

        query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
        key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)

        cos, sin = position_embeddings
        query_states, key_states = apply_rotary_pos_emb(
            query_states, key_states, cos, sin
        )

        if past_key_value is not None:
            cache_kwargs = {"cache_position": cache_position}
            key_states, value_states = past_key_value.update(
                key_states, value_states, self.layer_idx, cache_kwargs
            )

        # Eager is the default; any other configured name is looked up in the
        # registry of attention implementations.
        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            attention_interface = ALL_ATTENTION_FUNCTIONS[
                self.config._attn_implementation
            ]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=0.0 if not self.training else self.attention_dropout,
            scaling=self.scaling,
            sliding_window=self.sliding_window,
            s_aux=self.sinks,  # diff with Llama
            **kwargs,
        )

        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
        attn_output = self.o_proj(attn_output)
        # NOTE(review): if RowParallelLinear adds its bias on every rank, this
        # SUM would count the bias tp_size times — confirm against the layer.
        dist.all_reduce(attn_output, op=dist.ReduceOp.SUM, group=self.tp_group)
        return attn_output, attn_weights
@auto_docstring
class GptOssPreTrainedModel(PreTrainedModel):
    # Common base for the GPT-OSS models in this file: declares the capability
    # flags read by the PreTrainedModel machinery and the weight initialiser.
    config: GptOssConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["GptOssDecoderLayer"]
    _skip_keys_device_placement = ["past_key_values"]
    _supports_flash_attn = True
    _supports_sdpa = False
    _supports_flex_attn = True

    _can_compile_fullgraph = True
    _supports_attention_backend = True
    _can_record_outputs = {}
    _keep_in_fp32_modules = ["post_attention_layernorm", "input_layernorm", "norm"]
    _supports_flash_attention = False
    _supports_flex_attention = False

    def _init_weights(self, module):
        """Initialise ``module`` in place according to its type.

        Weights are drawn from a normal distribution with standard deviation
        ``config.initializer_range``; biases are zeroed, except the router
        bias, which is also drawn from the normal distribution. RMSNorm
        weights are set to one. Modules of any other type are left untouched.
        """
        std = self.config.initializer_range

        def draw_normal(tensor):
            tensor.data.normal_(mean=0.0, std=std)

        if isinstance(module, nn.Linear):
            draw_normal(module.weight)
            if module.bias is not None:
                module.bias.data.zero_()
            return
        if isinstance(module, nn.Parameter):
            draw_normal(module)
            return
        if isinstance(module, nn.Embedding):
            draw_normal(module.weight)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
            return
        if isinstance(module, GptOssRMSNorm):
            module.weight.data.fill_(1.0)
            return
        if isinstance(module, GptOssExperts):
            draw_normal(module.gate_up_proj)
            module.gate_up_proj_bias.data.zero_()
            draw_normal(module.down_proj)
            module.down_proj_bias.data.zero_()
            return
        if isinstance(module, GptOssAttention):
            draw_normal(module.sinks)
            return
        if isinstance(module, GptOssTopKRouter):
            draw_normal(module.weight)
            draw_normal(module.bias)
config: GptOssConfig): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = VocabParallelEmbedding( + config.vocab_size, config.hidden_size, self.padding_idx + ) + self.layers = nn.ModuleList( + [ + GptOssDecoderLayer(config, layer_idx) + for layer_idx in range(config.num_hidden_layers) + ] + ) + self.norm = GptOssRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.rotary_emb = GptOssRotaryEmbedding(config=config) + self.gradient_checkpointing = False + + # Initialize weights and apply final processing + self.post_init() + + @check_model_inputs + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[list[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> MoeModelOutputWithPast: + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError( + "You must specify exactly one of input_ids or inputs_embeds" + ) + + layers_to_output_hidden_states: Optional[List[int]] = kwargs.pop( + "layers_to_output_hidden_states", None + ) + + if use_cache and past_key_values is None: + past_key_values = DynamicCache() + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + if cache_position is None: + past_seen_tokens = ( + past_key_values.get_seq_length() if past_key_values is not None else 0 + ) + cache_position = torch.arange( + past_seen_tokens, + past_seen_tokens + inputs_embeds.shape[1], + device=inputs_embeds.device, + ) + if position_ids is None: + position_ids = cache_position.unsqueeze(0) + + # It may already have been prepared by e.g. 
`generate` + if not isinstance(causal_mask_mapping := attention_mask, dict): + mask_kwargs = { + "config": self.config, + "input_embeds": inputs_embeds, + "attention_mask": attention_mask, + "cache_position": cache_position, + "past_key_values": past_key_values, + } + causal_mask_mapping = { + "full_attention": create_causal_mask(**mask_kwargs), + "sliding_attention": create_sliding_window_causal_mask(**mask_kwargs), + } + + hidden_states = inputs_embeds + position_embeddings = self.rotary_emb(hidden_states, position_ids) + + all_hidden_states = () + for idx, decoder_layer in enumerate(self.layers): + hidden_states = decoder_layer( + hidden_states, + attention_mask=causal_mask_mapping[decoder_layer.attention_type], + position_ids=position_ids, + past_key_value=past_key_values, + use_cache=use_cache, + cache_position=cache_position, + position_embeddings=position_embeddings, + **kwargs, + ) + if ( + layers_to_output_hidden_states is None + or idx in layers_to_output_hidden_states + ): + all_hidden_states += (hidden_states,) + + hidden_states = self.norm(hidden_states) + + return MoeModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=past_key_values, + hidden_states=all_hidden_states, + ) + + +def load_balancing_loss_func( + gate_logits: Union[torch.Tensor, tuple[torch.Tensor], None], + num_experts: Optional[int] = None, + top_k=2, + attention_mask: Optional[torch.Tensor] = None, +) -> Union[torch.Tensor, int]: + r""" + Computes auxiliary load balancing loss as in Switch Transformer - implemented in Pytorch. + + See Switch Transformer (https://huggingface.co/papers/2101.03961) for more details. This function implements the loss + function presented in equations (4) - (6) of the paper. It aims at penalizing cases where the routing between + experts is too unbalanced. + + Args: + gate_logits: + Logits from the `gate`, should be a tuple of model.config.num_hidden_layers tensors of + shape [batch_size X sequence_length, num_experts]. 
+ num_experts: + Number of experts + top_k: + The number of experts to route per-token, can be also interpreted as the `top-k` routing + parameter. + attention_mask (`torch.Tensor`, *optional*): + The attention_mask used in forward function + shape [batch_size X sequence_length] if not None. + + Returns: + The auxiliary loss. + """ + if gate_logits is None or not isinstance(gate_logits, tuple): + return 0 + + if isinstance(gate_logits, tuple): + compute_device = gate_logits[0].device + concatenated_gate_logits = torch.cat( + [layer_gate.to(compute_device) for layer_gate in gate_logits], dim=0 + ) + + routing_weights = torch.nn.functional.softmax(concatenated_gate_logits, dim=-1) + + _, selected_experts = torch.topk(routing_weights, top_k, dim=-1) + + expert_mask = torch.nn.functional.one_hot(selected_experts, num_experts) + + if attention_mask is None: + # Compute the percentage of tokens routed to each experts + tokens_per_expert = torch.mean(expert_mask.float(), dim=0) + + # Compute the average probability of routing to these experts + router_prob_per_expert = torch.mean(routing_weights, dim=0) + else: + batch_size, sequence_length = attention_mask.shape + num_hidden_layers = concatenated_gate_logits.shape[0] // ( + batch_size * sequence_length + ) + + # Compute the mask that masks all padding tokens as 0 with the same shape of expert_mask + expert_attention_mask = ( + attention_mask[None, :, :, None, None] + .expand( + (num_hidden_layers, batch_size, sequence_length, top_k, num_experts) + ) + .reshape(-1, top_k, num_experts) + .to(compute_device) + ) + + # Compute the percentage of tokens routed to each experts + tokens_per_expert = torch.sum( + expert_mask.float() * expert_attention_mask, dim=0 + ) / torch.sum(expert_attention_mask, dim=0) + + # Compute the mask that masks all padding tokens as 0 with the same shape of tokens_per_expert + router_per_expert_attention_mask = ( + attention_mask[None, :, :, None] + .expand((num_hidden_layers, batch_size, 
@auto_docstring
class GptOssForCausalLM(GptOssPreTrainedModel, GenerationMixin):
    # Causal-LM wrapper: GptOssModel backbone plus a vocabulary-parallel LM head
    # and the optional router load-balancing auxiliary loss.
    _tied_weights_keys = ["lm_head.weight"]
    _tp_plan = {"lm_head": "colwise_rep"}
    _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}

    def __init__(self, config):
        super().__init__(config)
        self.model = GptOssModel(config)
        self.vocab_size = config.vocab_size
        self.lm_head = ParallelLMHead(config.hidden_size, config.vocab_size, bias=False)
        self.router_aux_loss_coef = config.router_aux_loss_coef
        self.num_experts = config.num_local_experts
        self.num_experts_per_tok = config.num_experts_per_tok

        # Initialize weights and apply final processing
        self.post_init()

    def set_decoder(self, decoder):
        # Replace the backbone model.
        self.model = decoder

    def get_decoder(self):
        # Return the backbone model.
        return self.model

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_router_logits: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        **kwargs: Unpack[TransformersKwargs],
    ) -> MoeCausalLMOutputWithPast:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, GptOssForCausalLM

        >>> model = GptOssForCausalLM.from_pretrained("openai/gpt-oss-20b")
        >>> tokenizer = AutoTokenizer.from_pretrained("openai/gpt-oss-20b")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```"""

        # An explicit argument wins; otherwise fall back to the config default.
        output_router_logits = (
            output_router_logits
            if output_router_logits is not None
            else self.config.output_router_logits
        )

        outputs: MoeModelOutputWithPast = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_router_logits=output_router_logits,
            cache_position=cache_position,
            **kwargs,
        )

        hidden_states = outputs.last_hidden_state
        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
        # (an int `logits_to_keep` of 0 yields slice(0, None), i.e. every position).
        slice_indices = (
            slice(-logits_to_keep, None)
            if isinstance(logits_to_keep, int)
            else logits_to_keep
        )
        logits = self.lm_head(hidden_states[:, slice_indices, :], gather_output=True)

        loss = None
        if labels is not None:
            loss = self.loss_function(logits, labels, self.vocab_size, **kwargs)

        aux_loss = None
        if output_router_logits:
            aux_loss = load_balancing_loss_func(
                outputs.router_logits,
                self.num_experts,
                self.num_experts_per_tok,
                attention_mask,
            )
            # load_balancing_loss_func returns the plain int 0 when the router
            # logits are missing or not a tuple; an int has no `.to()`, and
            # adding zero would be a no-op, so only tensors are folded in.
            # NOTE(review): GptOssPreTrainedModel sets `_can_record_outputs = {}`,
            # so `outputs.router_logits` may never be populated — confirm.
            if labels is not None and isinstance(aux_loss, torch.Tensor):
                loss += self.router_aux_loss_coef * aux_loss.to(
                    loss.device
                )  # make sure to reside in the same device

        return MoeCausalLMOutputWithPast(
            loss=loss,
            aux_loss=aux_loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            router_logits=outputs.router_logits,
        )
"""Tensor-parallel Llama target model.

Mirrors the Hugging Face Llama modules but builds the linear, embedding and
LM-head layers from ``specforge.layers`` and sums partial results across the
tensor-parallel process group returned by ``get_tp_group()``.
"""

from typing import Callable, List, Optional, Union

import torch
import torch.distributed as dist
from torch import nn
from transformers.activations import ACT2FN
from transformers.cache_utils import Cache, DynamicCache
from transformers.generation import GenerationMixin
from transformers.masking_utils import create_causal_mask
from transformers.modeling_layers import GradientCheckpointingLayer
from transformers.modeling_outputs import (
    BaseModelOutputWithPast,
    CausalLMOutputWithPast,
)
from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from transformers.models.llama.configuration_llama import LlamaConfig
from transformers.models.llama.modeling_llama import (
    LlamaRMSNorm,
    LlamaRotaryEmbedding,
    apply_rotary_pos_emb,
    eager_attention_forward,
)
from transformers.processing_utils import Unpack
from transformers.utils import TransformersKwargs, logging
from transformers.utils.generic import check_model_inputs

from specforge.distributed import get_tp_group
from specforge.layers import (
    ColumnParallelLinear,
    ParallelLMHead,
    RowParallelLinear,
    VocabParallelEmbedding,
)

logger = logging.get_logger(__name__)


class TensorParallelLlamaMLP(nn.Module):
    """Gated MLP whose projections are tensor-parallel layers.

    Computes ``down_proj(act_fn(gate_proj(x)) * up_proj(x))`` and then sums the
    result over the tensor-parallel group.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.intermediate_size = config.intermediate_size

        # These replace the plain nn.Linear gate/up/down projections; the sizes
        # passed in are the full model dimensions.
        # NOTE(review): how each layer splits its weight across ranks is decided
        # inside specforge.layers and is not visible from this file.
        self.tp_group = get_tp_group()
        self.gate_proj = ColumnParallelLinear(
            self.hidden_size, self.intermediate_size, bias=config.mlp_bias
        )
        self.up_proj = ColumnParallelLinear(
            self.hidden_size, self.intermediate_size, bias=config.mlp_bias
        )
        self.down_proj = RowParallelLinear(
            self.intermediate_size, self.hidden_size, bias=config.mlp_bias
        )
        self.act_fn = ACT2FN[config.hidden_act]

    def forward(self, x):
        """Apply the gated MLP to ``x`` and all-reduce (SUM) the output in place."""
        down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
        # Each rank holds a partial result; summing over the group completes it.
        dist.all_reduce(down_proj, op=dist.ReduceOp.SUM, group=self.tp_group)
        return down_proj


class TensorParallelLlamaAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper.

    The q/k/v/o projections are tensor-parallel layers and the output of
    ``o_proj`` is summed over the tensor-parallel group.
    """

    def __init__(self, config: LlamaConfig, layer_idx: int):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        self.head_dim = getattr(
            config, "head_dim", config.hidden_size // config.num_attention_heads
        )
        self.num_key_value_groups = (
            config.num_attention_heads // config.num_key_value_heads
        )
        self.scaling = self.head_dim**-0.5
        self.attention_dropout = config.attention_dropout
        self.is_causal = True

        # Distributed replacements for the plain nn.Linear q/k/v/o projections;
        # the sizes passed in are the full model dimensions.
        self.tp_group = get_tp_group()
        self.q_proj = ColumnParallelLinear(
            config.hidden_size,
            config.num_attention_heads * self.head_dim,
            bias=config.attention_bias,
        )
        self.k_proj = ColumnParallelLinear(
            config.hidden_size,
            config.num_key_value_heads * self.head_dim,
            bias=config.attention_bias,
        )
        self.v_proj = ColumnParallelLinear(
            config.hidden_size,
            config.num_key_value_heads * self.head_dim,
            bias=config.attention_bias,
        )
        self.o_proj = RowParallelLinear(
            config.num_attention_heads * self.head_dim,
            config.hidden_size,
            bias=config.attention_bias,
        )

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor],
        past_key_values: Optional[Cache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
        """Run self-attention on ``hidden_states``.

        Args:
            hidden_states: Input activations; the last dimension is projected
                by q/k/v.
            position_embeddings: ``(cos, sin)`` pair passed to
                ``apply_rotary_pos_emb``.
            attention_mask: Mask forwarded unchanged to the attention backend.
            past_key_values: Optional cache; when given it is updated with this
                layer's key/value states.
            cache_position: Forwarded to the cache update.
            **kwargs: Extra arguments forwarded to the attention backend.

        Returns:
            ``(attn_output, attn_weights)`` where ``attn_output`` has already
            been all-reduced over the tensor-parallel group. ``attn_weights``
            is whatever the selected backend returns.
        """
        input_shape = hidden_states.shape[:-1]
        # The head count is inferred (-1) from the projection output width.
        hidden_shape = (*input_shape, -1, self.head_dim)

        query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
        key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)

        cos, sin = position_embeddings
        query_states, key_states = apply_rotary_pos_emb(
            query_states, key_states, cos, sin
        )

        if past_key_values is not None:
            # sin and cos are specific to RoPE models; cache_position needed for the static cache
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
            key_states, value_states = past_key_values.update(
                key_states, value_states, self.layer_idx, cache_kwargs
            )

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            attention_interface = ALL_ATTENTION_FUNCTIONS[
                self.config._attn_implementation
            ]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=0.0 if not self.training else self.attention_dropout,
            scaling=self.scaling,
            **kwargs,
        )

        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
        attn_output = self.o_proj(attn_output)
        # Sum the per-rank partial outputs of o_proj.
        dist.all_reduce(attn_output, op=dist.ReduceOp.SUM, group=self.tp_group)
        return attn_output, attn_weights


class TensorParallelLlamaDecoderLayer(GradientCheckpointingLayer):
    """Pre-norm decoder layer: attention and MLP, each wrapped in a residual."""

    def __init__(self, config: LlamaConfig, layer_idx: int):
        super().__init__()
        self.hidden_size = config.hidden_size

        self.self_attn = TensorParallelLlamaAttention(
            config=config, layer_idx=layer_idx
        )

        self.mlp = TensorParallelLlamaMLP(config)
        self.input_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.post_attention_layernorm = LlamaRMSNorm(
            config.hidden_size, eps=config.rms_norm_eps
        )

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[
            tuple[torch.Tensor, torch.Tensor]
        ] = None,  # necessary, but kept here for BC
        **kwargs: Unpack[TransformersKwargs],
    ) -> torch.Tensor:
        """Return the layer output for ``hidden_states``.

        All arguments other than ``hidden_states`` are forwarded to the
        attention module; the attention weights it returns are discarded.
        """
        residual = hidden_states
        hidden_states = self.input_layernorm(hidden_states)
        # Self Attention
        hidden_states, _ = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            use_cache=use_cache,
            cache_position=cache_position,
            position_embeddings=position_embeddings,
            **kwargs,
        )
        hidden_states = residual + hidden_states

        # Fully Connected
        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states
        return hidden_states


class LlamaPreTrainedModel(PreTrainedModel):
    """Shared ``PreTrainedModel`` configuration flags for the Llama classes below."""

    config: LlamaConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["TensorParallelLlamaDecoderLayer"]
    _skip_keys_device_placement = ["past_key_values"]
    _supports_flash_attn = True
    _supports_sdpa = True
    _supports_flex_attn = True

    _can_compile_fullgraph = True
    _supports_attention_backend = True
    _can_record_outputs = {}


class LlamaModel(LlamaPreTrainedModel):
    """Decoder stack: parallel token embedding, decoder layers and a final norm."""

    def __init__(self, config: LlamaConfig):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

        self.embed_tokens = VocabParallelEmbedding(
            config.vocab_size, config.hidden_size, self.padding_idx
        )
        self.layers = nn.ModuleList(
            [
                TensorParallelLlamaDecoderLayer(config, layer_idx)
                for layer_idx in range(config.num_hidden_layers)
            ]
        )
        self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.rotary_emb = LlamaRotaryEmbedding(config=config)
        self.gradient_checkpointing = False

        # Initialize weights and apply final processing
        self.post_init()

    @check_model_inputs
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        cache_position: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> BaseModelOutputWithPast:
        """Run the decoder stack.

        Exactly one of ``input_ids`` and ``inputs_embeds`` must be given.

        Besides the standard arguments, the optional keyword
        ``layers_to_output_hidden_states`` (a list of layer indices) is popped
        from ``kwargs``. When it is ``None`` the output of every decoder layer
        is collected; otherwise only the outputs of the listed layers are.

        Returns:
            ``BaseModelOutputWithPast`` whose ``hidden_states`` is the tuple of
            collected per-layer outputs (taken before the final norm) and whose
            ``last_hidden_state`` is the normed output of the last layer.

        Raises:
            ValueError: If both or neither of ``input_ids`` / ``inputs_embeds``
                are provided.
        """
        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError(
                "You must specify exactly one of input_ids or inputs_embeds"
            )

        layers_to_output_hidden_states: Optional[List[int]] = kwargs.pop(
            "layers_to_output_hidden_states", None
        )

        if inputs_embeds is None:
            inputs_embeds: torch.Tensor = self.embed_tokens(input_ids)

        if use_cache and past_key_values is None:
            past_key_values = DynamicCache(config=self.config)

        if cache_position is None:
            # Continue numbering after whatever the cache already holds.
            past_seen_tokens = (
                past_key_values.get_seq_length() if past_key_values is not None else 0
            )
            cache_position: torch.Tensor = torch.arange(
                past_seen_tokens,
                past_seen_tokens + inputs_embeds.shape[1],
                device=inputs_embeds.device,
            )

        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        causal_mask = create_causal_mask(
            config=self.config,
            input_embeds=inputs_embeds,
            attention_mask=attention_mask,
            cache_position=cache_position,
            past_key_values=past_key_values,
            position_ids=position_ids,
        )

        hidden_states = inputs_embeds
        # Computed once and shared by every decoder layer.
        position_embeddings = self.rotary_emb(hidden_states, position_ids)

        all_hidden_states = ()
        for idx, decoder_layer in enumerate(self.layers):
            hidden_states = decoder_layer(
                hidden_states,
                attention_mask=causal_mask,
                position_ids=position_ids,
                past_key_values=past_key_values,
                cache_position=cache_position,
                position_embeddings=position_embeddings,
                **kwargs,
            )
            if (
                layers_to_output_hidden_states is None
                or idx in layers_to_output_hidden_states
            ):
                all_hidden_states += (hidden_states,)

        hidden_states = self.norm(hidden_states)
        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values,
            hidden_states=all_hidden_states,
        )


class LlamaForCausalLM(LlamaPreTrainedModel, GenerationMixin):
    """``LlamaModel`` followed by a ``ParallelLMHead`` producing vocabulary logits."""

    _tied_weights_keys = ["lm_head.weight"]
    _tp_plan = {"lm_head": "colwise_rep"}
    _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}

    def __init__(self, config):
        super().__init__(config)
        self.model = LlamaModel(config)
        self.vocab_size = config.vocab_size

        # distributed the lm head
        self.lm_head = ParallelLMHead(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        """Return the token embedding module of the wrapped model."""
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        """Replace the token embedding module of the wrapped model."""
        self.model.embed_tokens = value

    def get_output_embeddings(self):
        """Return the LM head."""
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        """Replace the LM head."""
        self.lm_head = new_embeddings

    def set_decoder(self, decoder):
        """Replace the wrapped decoder model."""
        self.model = decoder

    def get_decoder(self):
        """Return the wrapped decoder model."""
        return self.model

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        **kwargs: Unpack[TransformersKwargs],
    ) -> CausalLMOutputWithPast:
        r"""
        Run the decoder and project its last hidden state to logits.

        labels (`torch.LongTensor`, *optional*):
            When given, `self.loss_function` is called with the logits, these labels and
            `config.vocab_size`, and the result is returned as `loss`.
        logits_to_keep (`int` or `torch.Tensor`, defaults to `0`):
            If an `int`, logits are computed for `hidden_states[:, -logits_to_keep:, :]` (with `0`
            this slice is the whole sequence). Otherwise the value is used directly as the index
            along the sequence dimension.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, LlamaForCausalLM

        >>> model = LlamaForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf")
        >>> tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```"""

        outputs: BaseModelOutputWithPast = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            cache_position=cache_position,
            **kwargs,
        )

        hidden_states = outputs.last_hidden_state
        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
        slice_indices = (
            slice(-logits_to_keep, None)
            if isinstance(logits_to_keep, int)
            else logits_to_keep
        )
        logits = self.lm_head(hidden_states[:, slice_indices, :], gather_output=True)

        loss = None
        if labels is not None:
            loss = self.loss_function(
                logits=logits,
                labels=labels,
                vocab_size=self.config.vocab_size,
                **kwargs,
            )

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
        )


# LlamaPreTrainedModel is exported as well so this module's public API matches
# the sibling custom-backend modules that export their *PreTrainedModel class.
__all__ = [
    "LlamaForCausalLM",
    "LlamaModel",
    "LlamaPreTrainedModel",
]
# coding=utf-8
# Copyright 2025 The LLAMA4 and HuggingFace Inc. team. All rights reserved.
#
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# File: idea1/specforge/modeling/target/custom_backend/llama4.py
"""Tensor-parallel Llama4 text target model.

Mirrors the Hugging Face Llama4 text modules but builds the linear, embedding
and LM-head layers from ``specforge.layers``, shards the expert weights on
load, and sums partial results across the tensor-parallel process group
returned by ``get_tp_group()``.
"""

from typing import Callable, List, Optional, Union

import torch
import torch.distributed as dist
import torch.nn as nn
from transformers.activations import ACT2FN
from transformers.cache_utils import Cache, DynamicCache
from transformers.generation import GenerationMixin
from transformers.integrations.hub_kernels import use_kernel_forward_from_hub
from transformers.masking_utils import create_causal_mask, create_chunked_causal_mask
from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
from transformers.modeling_layers import GradientCheckpointingLayer
from transformers.modeling_outputs import (
    BaseModelOutputWithPast,
    CausalLMOutputWithPast,
)
from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from transformers.models.llama4.configuration_llama4 import (
    Llama4Config,
    Llama4TextConfig,
)
from transformers.models.llama4.modeling_llama4 import (
    Llama4Router,
    Llama4TextL2Norm,
    Llama4TextRMSNorm,
    Llama4TextRotaryEmbedding,
    Llama4VisionModel,
    apply_rotary_emb,
    eager_attention_forward,
)
from transformers.processing_utils import Unpack
from transformers.utils import (
    TransformersKwargs,
    auto_docstring,
    can_return_tuple,
    logging,
)
from transformers.utils.deprecation import deprecate_kwarg
from transformers.utils.generic import check_model_inputs

# [MODIFIED] Tensor-parallel helpers and layers come from specforge rather than
# from the transformers library.
from specforge.distributed import get_tp_group, shard_tensor
from specforge.layers import (
    ColumnParallelLinear,
    ParallelLMHead,
    RowParallelLinear,
    VocabParallelEmbedding,
)

logger = logging.get_logger(__name__)


class Llama4TextExperts(nn.Module):
    """Batched expert MLPs whose intermediate dimension is split per TP rank.

    ``gate_up_proj`` has shape ``(num_experts, hidden_size, 2 * expert_dim_per_shard)``
    and ``down_proj`` has shape ``(num_experts, expert_dim_per_shard, hidden_size)``,
    where ``expert_dim_per_shard = intermediate_size // tp_size``.
    """

    def __init__(self, config: Llama4TextConfig):
        super().__init__()
        self.num_experts = config.num_local_experts
        self.intermediate_size = config.intermediate_size
        self.hidden_size = config.hidden_size
        self.expert_dim = self.intermediate_size

        self.tp_group = get_tp_group()
        self.tp_size = dist.get_world_size(self.tp_group)
        # NOTE(review): integer division; presumably intermediate_size is
        # expected to be divisible by tp_size — confirm.
        self.expert_dim_per_shard = self.expert_dim // self.tp_size
        self.gate_up_proj = nn.Parameter(
            torch.empty(
                self.num_experts, self.hidden_size, 2 * self.expert_dim_per_shard
            )
        )
        self.down_proj = nn.Parameter(
            torch.empty((self.num_experts, self.expert_dim_per_shard, self.hidden_size))
        )
        self.act_fn = ACT2FN[config.hidden_act]

        # deal with weight loading and sharding
        self._register_load_state_dict_pre_hook(self.shard_state_dict)

    @staticmethod
    def _resolve_state_key(state_dict, prefix: str, name: str) -> Optional[str]:
        """Return the key under which parameter ``name`` is stored, or ``None``.

        The prefixed key is tried first (the module is being loaded as part of
        a parent module); the bare name is the fallback (the module is loaded
        on its own).
        """
        prefixed = f"{prefix}{name}"
        if prefixed in state_dict:
            return prefixed
        if name in state_dict:
            return name
        return None

    def shard_state_dict(self, state_dict, *args):
        """Load-state-dict pre-hook that shards the expert weights in place.

        ``down_proj`` is sharded along dim 1. ``gate_up_proj`` is first split
        into its gate and up halves along the last dim, each half is sharded
        along the last dim, and the two shards are concatenated again so the
        local tensor keeps the ``[gate | up]`` layout that ``forward`` chunks.

        Args:
            state_dict: The state dict being loaded; modified in place.
            *args: Remaining pre-hook arguments. PyTorch passes the module's
                key prefix first; when present it is used to locate the keys.
        """
        prefix = args[0] if args and isinstance(args[0], str) else ""

        down_key = self._resolve_state_key(state_dict, prefix, "down_proj")
        if down_key is not None:
            state_dict[down_key] = shard_tensor(state_dict[down_key], self.tp_group, 1)

        gate_up_key = self._resolve_state_key(state_dict, prefix, "gate_up_proj")
        if gate_up_key is not None:
            gate, up = state_dict[gate_up_key].chunk(2, dim=-1)
            gate = shard_tensor(gate, self.tp_group, -1)
            up = shard_tensor(up, self.tp_group, -1)
            state_dict[gate_up_key] = torch.cat((gate, up), dim=-1)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """
        This should really not be run on a single machine, as we are reaching compute bound:
        - the inputs are expected to be "sorted" per expert already.
        - the weights are viewed with another dim, to match num_expert, 1, shape * num_tokens, shape

        Args:
            hidden_states (torch.Tensor): reshaped here to
                ``(num_experts, -1, hidden_size)``, so its leading dimension
                must be a multiple of ``num_experts``.
        Returns:
            torch.Tensor: ``(-1, hidden_size)`` expert outputs, already summed
            over the tensor-parallel group.
        """
        hidden_states = hidden_states.view(
            self.gate_up_proj.shape[0], -1, self.hidden_size
        )
        gate_up = torch.bmm(hidden_states, self.gate_up_proj)
        gate, up = gate_up.chunk(2, dim=-1)  # not supported for DTensors
        next_states = torch.bmm((up * self.act_fn(gate)), self.down_proj)
        # Each rank only holds a slice of the intermediate dim; sum the partials.
        dist.all_reduce(next_states, op=dist.ReduceOp.SUM, group=self.tp_group)
        next_states = next_states.view(-1, self.hidden_size)
        return next_states


class Llama4TextMLP(nn.Module):
    """Gated MLP built from tensor-parallel layers (no biases)."""

    def __init__(self, config, intermediate_size=None):
        super().__init__()

        if intermediate_size is None:
            intermediate_size = config.intermediate_size

        self.config = config
        self.tp_group = get_tp_group()
        self.gate_proj = ColumnParallelLinear(
            config.hidden_size, intermediate_size, bias=False
        )
        self.up_proj = ColumnParallelLinear(
            config.hidden_size, intermediate_size, bias=False
        )
        self.down_proj = RowParallelLinear(
            intermediate_size, config.hidden_size, bias=False
        )
        self.activation_fn = ACT2FN[config.hidden_act]

    def forward(self, x):
        """Return ``down_proj(act(gate_proj(x)) * up_proj(x))`` summed over the TP group."""
        down_proj = self.activation_fn(self.gate_proj(x)) * self.up_proj(x)
        out = self.down_proj(down_proj)
        dist.all_reduce(out, op=dist.ReduceOp.SUM, group=self.tp_group)
        return out


class Llama4TextAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper.

    The q/k/v/o projections are tensor-parallel layers and the output of
    ``o_proj`` is summed over the tensor-parallel group.
    """

    def __init__(self, config: Llama4TextConfig, layer_idx):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        self.head_dim = getattr(
            config, "head_dim", config.hidden_size // config.num_attention_heads
        )
        self.num_attention_heads = config.num_attention_heads
        self.num_key_value_groups = (
            config.num_attention_heads // config.num_key_value_heads
        )
        self.num_key_value_heads = config.num_key_value_heads
        self.scaling = self.head_dim**-0.5
        self.attn_scale = config.attn_scale
        self.floor_scale = config.floor_scale
        self.attn_temperature_tuning = config.attn_temperature_tuning
        self.attention_dropout = config.attention_dropout
        self.is_causal = True
        # Truthy entries of config.no_rope_layers mark layers that DO apply RoPE.
        self.use_rope = config.no_rope_layers[layer_idx]

        self.tp_group = get_tp_group()
        self.q_proj = ColumnParallelLinear(
            config.hidden_size,
            config.num_attention_heads * self.head_dim,
            bias=config.attention_bias,
        )
        self.k_proj = ColumnParallelLinear(
            config.hidden_size,
            config.num_key_value_heads * self.head_dim,
            bias=config.attention_bias,
        )
        self.v_proj = ColumnParallelLinear(
            config.hidden_size,
            config.num_key_value_heads * self.head_dim,
            bias=config.attention_bias,
        )
        self.o_proj = RowParallelLinear(
            config.num_attention_heads * self.head_dim,
            config.hidden_size,
            bias=config.attention_bias,
        )
        if self.config.use_qk_norm and self.use_rope:
            self.qk_norm = Llama4TextL2Norm(config.rms_norm_eps)

    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: torch.Tensor,
        attention_mask: Optional[torch.Tensor],
        past_key_values: Optional[Cache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
        """Run self-attention on ``hidden_states``.

        Args:
            hidden_states: Input activations; the last dimension is projected
                by q/k/v.
            position_embeddings: Rotary frequencies; moved to the query device
                and passed to ``apply_rotary_emb`` on layers that use RoPE.
            attention_mask: Mask forwarded unchanged to the attention backend.
            past_key_values: Optional cache; when given it is updated with this
                layer's key/value states.
            cache_position: Used for temperature tuning on layers without RoPE
                and forwarded to the cache update.
            **kwargs: Extra arguments forwarded to the attention backend.

        Returns:
            ``(attn_output, attn_weights)`` where ``attn_output`` has already
            been all-reduced over the tensor-parallel group. ``attn_weights``
            is whatever the selected backend returns.
        """
        input_shape = hidden_states.shape[:-1]
        # The head count is inferred (-1) from the projection output width.
        hidden_shape = (*input_shape, -1, self.head_dim)

        # Queries and keys stay un-transposed until RoPE, norm and scaling are done.
        query_states = self.q_proj(hidden_states).view(hidden_shape)
        key_states = self.k_proj(hidden_states).view(*input_shape, -1, self.head_dim)
        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)

        if self.use_rope:  # the 16E model skips rope for long context on certain layers
            query_states, key_states = apply_rotary_emb(
                query_states, key_states, position_embeddings.to(query_states.device)
            )

        if hasattr(self, "qk_norm"):  # the 128E model does not use qk_norm
            query_states = self.qk_norm(query_states)
            key_states = self.qk_norm(key_states)

        # Use temperature tuning from https://huggingface.co/papers/2501.19399) to NoROPE layers
        if self.attn_temperature_tuning and not self.use_rope:
            attn_scales = (
                torch.log1p(
                    torch.floor((cache_position.float() + 1.0) / self.floor_scale)
                )
                * self.attn_scale
                + 1.0
            )
            attn_scales = attn_scales.view((1, input_shape[-1], 1, 1)).expand(
                (*input_shape, 1, 1)
            )  # batch size > 1
            query_states = (query_states * attn_scales).to(query_states.dtype)

        query_states = query_states.transpose(1, 2)
        key_states = key_states.transpose(1, 2)

        if past_key_values is not None:
            # cache_position is needed for the static cache
            cache_kwargs = {"cache_position": cache_position}
            key_states, value_states = past_key_values.update(
                key_states, value_states, self.layer_idx, cache_kwargs
            )

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            attention_interface = ALL_ATTENTION_FUNCTIONS[
                self.config._attn_implementation
            ]
        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=0.0 if not self.training else self.attention_dropout,
            scaling=self.scaling,
            **kwargs,
        )

        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
        attn_output = self.o_proj(attn_output)
        # Sum the per-rank partial outputs of o_proj.
        dist.all_reduce(attn_output, op=dist.ReduceOp.SUM, group=self.tp_group)
        return attn_output, attn_weights


@use_kernel_forward_from_hub("Llama4TextMoe")
class Llama4TextMoe(nn.Module):
    """Mixture-of-experts block: routed experts plus an always-on shared expert."""

    def __init__(self, config):
        super().__init__()
        self.top_k = config.num_experts_per_tok
        self.hidden_dim = config.hidden_size
        self.num_experts = config.num_local_experts
        self.experts = Llama4TextExperts(config)
        self.router = Llama4Router(config)
        self.shared_expert = Llama4TextMLP(config)

    def forward(self, hidden_states):
        """Return ``(output, router_logits)`` for ``hidden_states``.

        The input is flattened to ``(-1, hidden_dim)``, repeated once per
        column of the router scores, weighted by those scores and sent through
        the experts. The expert outputs are summed over that repeat axis and
        added in place to the shared expert's output.
        """
        hidden_states = hidden_states.reshape(-1, self.hidden_dim)
        router_scores, router_logits = self.router(hidden_states)
        routed_in = hidden_states.repeat(router_scores.shape[1], 1)
        routed_in = routed_in * router_scores.transpose(0, 1).reshape(-1, 1)
        routed_out = self.experts(routed_in)
        out = self.shared_expert(hidden_states)
        out.add_(
            routed_out.reshape(router_scores.shape[1], -1, routed_out.shape[-1]).sum(
                dim=0
            )
        )
        return out, router_logits


class Llama4TextDecoderLayer(GradientCheckpointingLayer):
    """Pre-norm decoder layer with either a dense MLP or an MoE feed-forward."""

    def __init__(self, config, layer_idx):
        super().__init__()
        self.hidden_size = config.hidden_size
        self.layer_idx = layer_idx
        self.attention_type = config.layer_types[layer_idx]
        self.self_attn = Llama4TextAttention(config, layer_idx)
        self.is_moe_layer = layer_idx in config.moe_layers
        if self.is_moe_layer:  # the 128E model interleaves dense / sparse
            self.feed_forward = Llama4TextMoe(config)
        else:
            self.feed_forward = Llama4TextMLP(
                config, intermediate_size=config.intermediate_size_mlp
            )

        self.input_layernorm = Llama4TextRMSNorm(
            config.hidden_size, eps=config.rms_norm_eps
        )
        self.post_attention_layernorm = Llama4TextRMSNorm(
            config.hidden_size, eps=config.rms_norm_eps
        )

    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[torch.Tensor] = None,  # necessary, but kept here for BC
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> torch.Tensor:
        """Return the layer output for ``hidden_states``.

        ``position_ids`` is accepted but not used by this layer. Attention
        weights and, on MoE layers, router logits are discarded.
        """
        residual = hidden_states

        hidden_states = self.input_layernorm(hidden_states)

        # Self Attention
        attention_states, _ = self.self_attn(
            hidden_states=hidden_states,
            position_embeddings=position_embeddings,
            attention_mask=attention_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            cache_position=cache_position,
            **kwargs,
        )
        hidden_states = residual + attention_states

        # Fully Connected
        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = self.feed_forward(hidden_states)
        if self.is_moe_layer:
            hidden_states, _ = hidden_states
        # The MoE path returns flattened tokens; restore the residual's shape.
        hidden_states = residual + hidden_states.view(residual.shape)
        return hidden_states


@auto_docstring
class Llama4PreTrainedModel(PreTrainedModel):
    config: Llama4Config
    supports_gradient_checkpointing = True
    _skip_keys_device_placement = ["past_key_values"]
    _supports_flash_attn = False
    _supports_sdpa = True
    _supports_flex_attn = True

    _can_compile_fullgraph = True
    _supports_attention_backend = True

    def _init_weights(self, module):
        """Initialise ``module`` according to its type.

        The standard deviation comes from ``config.initializer_range`` when
        present, otherwise from ``config.text_config.initializer_range``.
        """
        std = (
            self.config.initializer_range
            if hasattr(self.config, "initializer_range")
            else self.config.text_config.initializer_range
        )
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.weight.data.fill_(1.0)
            module.bias.data.zero_()
        elif isinstance(module, Llama4TextRMSNorm):
            module.weight.data.fill_(1.0)
        elif isinstance(module, Llama4TextExperts):
            module.gate_up_proj.data.normal_(mean=0.0, std=std)
            module.down_proj.data.normal_(mean=0.0, std=std)
        elif isinstance(module, Llama4VisionModel):
            module.class_embedding.data.normal_(std=module.scale)
            module.positional_embedding_vlm.data.normal_(std=module.scale)


@auto_docstring
class Llama4TextModel(Llama4PreTrainedModel):
    _no_split_modules = ["Llama4TextDecoderLayer"]
    base_model_prefix = "model"
    config: Llama4TextConfig
    _can_record_outputs = {}

    def __init__(self, config: Llama4TextConfig):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

        self.embed_tokens = VocabParallelEmbedding(
            config.vocab_size, config.hidden_size, self.padding_idx
        )
        self.layers = nn.ModuleList(
            [
                Llama4TextDecoderLayer(config, layer_idx)
                for layer_idx in range(config.num_hidden_layers)
            ]
        )
        self.norm = Llama4TextRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.rotary_emb = Llama4TextRotaryEmbedding(config=config)
        self.gradient_checkpointing = False

        # Initialize weights and apply final processing
        self.post_init()

    @can_return_tuple
    @check_model_inputs
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> Union[tuple, BaseModelOutputWithPast]:
        # No docstring here on purpose: the original leaves documentation of
        # this method to @auto_docstring.
        #
        # Besides the standard arguments, the optional keyword
        # `layers_to_output_hidden_states` (a list of layer indices) is popped
        # from kwargs. When it is None the output of every decoder layer is
        # collected; otherwise only the outputs of the listed layers are.
        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError(
                "You must specify exactly one of input_ids or inputs_embeds"
            )

        layers_to_output_hidden_states: Optional[List[int]] = kwargs.pop(
            "layers_to_output_hidden_states", None
        )

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(
                input_ids.to(self.embed_tokens.weight.device)
            )

        if use_cache and past_key_values is None:
            past_key_values = DynamicCache(config=self.config)

        if cache_position is None:
            # Continue numbering after whatever the cache already holds.
            past_seen_tokens = (
                past_key_values.get_seq_length() if past_key_values is not None else 0
            )
            cache_position = torch.arange(
                past_seen_tokens,
                past_seen_tokens + inputs_embeds.shape[1],
                device=inputs_embeds.device,
            )

        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        # It may already have been prepared by e.g. `generate`
        if not isinstance(causal_mask_mapping := attention_mask, dict):
            # Prepare mask arguments
            mask_kwargs = {
                "config": self.config,
                "input_embeds": inputs_embeds,
                "attention_mask": attention_mask,
                "cache_position": cache_position,
                "past_key_values": past_key_values,
                "position_ids": position_ids,
            }
            # Create the masks
            causal_mask_mapping = {
                "full_attention": create_causal_mask(**mask_kwargs),
                "chunked_attention": create_chunked_causal_mask(**mask_kwargs),
            }

        hidden_states = inputs_embeds

        # create position embeddings to be shared across the decoder layers
        freq_cis = self.rotary_emb(hidden_states, position_ids)

        all_hidden_states = ()
        for idx, decoder_layer in enumerate(self.layers):
            hidden_states = decoder_layer(
                hidden_states,
                # Each layer picks the mask matching its own attention type.
                attention_mask=causal_mask_mapping[decoder_layer.attention_type],
                position_ids=position_ids,
                past_key_values=past_key_values,
                use_cache=use_cache,
                cache_position=cache_position,
                position_embeddings=freq_cis,
                **kwargs,
            )
            if (
                layers_to_output_hidden_states is None
                or idx in layers_to_output_hidden_states
            ):
                all_hidden_states += (hidden_states,)

        hidden_states = self.norm(hidden_states)

        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values if use_cache else None,
            hidden_states=all_hidden_states,
        )


class Llama4ForCausalLM(Llama4PreTrainedModel, GenerationMixin):
    _no_split_modules = ["Llama4TextDecoderLayer"]
    base_model_prefix = "language_model"
    _tied_weights_keys = ["lm_head.weight"]
    _tp_plan = {"lm_head": "colwise_rep"}
    config: Llama4TextConfig

    def __init__(self, config: Llama4TextConfig):
        super().__init__(config)
        self.model = Llama4TextModel(config)
        self.vocab_size = config.vocab_size
        self.lm_head = ParallelLMHead(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Union[Cache, list[torch.FloatTensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        **kwargs: Unpack[TransformersKwargs],
    ) -> Union[tuple, CausalLMOutputWithPast]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, Llama4ForCausalLM

        >>> model = Llama4ForCausalLM.from_pretrained("meta-llama4/Llama4-2-7b-hf")
        >>> tokenizer = AutoTokenizer.from_pretrained("meta-llama4/Llama4-2-7b-hf")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```"""
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            cache_position=cache_position,
            **kwargs,
        )

        hidden_states = outputs[0]
        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
        slice_indices = (
            slice(-logits_to_keep, None)
            if isinstance(logits_to_keep, int)
            else logits_to_keep
        )
        logits = self.lm_head(hidden_states[:, slice_indices, :], gather_output=True)
        loss = None
        if labels is not None:
            loss = self.loss_function(
                logits=logits,
                labels=labels,
                vocab_size=self.config.vocab_size,
                **kwargs,
            )

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


# ---------------------------------------------------------------------------
# Header of the next file in this dump, preserved from the original line:
# diff --git a/idea1/specforge/modeling/target/custom_backend/phi3.py b/idea1/specforge/modeling/target/custom_backend/phi3.py new file mode 100644 index 0000000000000000000000000000000000000000..2515701f90f8c58cd164fc3e345549877212f379 --- /dev/null +++ b/idea1/specforge/modeling/target/custom_backend/phi3.py @@ -0,0 +1,495 @@
# coding=utf-8
# Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
class Phi3MLP(nn.Module):
    """Phi3 gated feed-forward block built from column/row parallel linears.

    The gate and up projections are fused into a single `gate_up_proj`; the
    output of `down_proj` is summed across the tensor-parallel group.
    """

    def __init__(self, config):
        super().__init__()

        self.config = config

        # Process group used to reduce the partial down-projection outputs.
        self.tp_group = get_tp_group()

        fused_width = 2 * config.intermediate_size
        self.gate_up_proj = ColumnParallelLinear(
            config.hidden_size,
            fused_width,
            bias=False,
            layout_type="gate_up",
        )
        self.down_proj = RowParallelLinear(
            config.intermediate_size, config.hidden_size, bias=False
        )
        self.activation_fn = ACT2FN[config.hidden_act]

    def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor:
        fused = self.gate_up_proj(hidden_states)

        # First half of the last dimension is the gate, second half the "up" branch.
        gate, up = fused.chunk(2, dim=-1)
        gated = up * self.activation_fn(gate)

        out = self.down_proj(gated)
        # In-place sum of each rank's partial result.
        dist.all_reduce(out, op=dist.ReduceOp.SUM, group=self.tp_group)
        return out
class Phi3DecoderLayer(GradientCheckpointingLayer):
    """One Phi3 decoder layer: pre-norm self-attention and pre-norm MLP.

    Each sub-block output goes through residual dropout (`config.resid_pdrop`)
    before being added back to the residual stream.
    """

    def __init__(self, config: Phi3Config, layer_idx: int):
        super().__init__()
        self.hidden_size = config.hidden_size
        self.self_attn = Phi3Attention(config=config, layer_idx=layer_idx)
        self.mlp = Phi3MLP(config)
        self.input_layernorm = Phi3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.post_attention_layernorm = Phi3RMSNorm(
            config.hidden_size, eps=config.rms_norm_eps
        )
        self.config = config
        self.resid_attn_dropout = nn.Dropout(config.resid_pdrop)
        self.resid_mlp_dropout = nn.Dropout(config.resid_pdrop)

    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[
            tuple[torch.Tensor, torch.Tensor]
        ] = None,  # necessary, but kept here for BC
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> torch.FloatTensor:
        """Run the layer and return the updated hidden states.

        Args:
            hidden_states: Input to the layer.
            attention_mask: Mask forwarded to the attention module.
            position_ids: Forwarded to the attention module unchanged.
            past_key_values: Optional KV cache forwarded to the attention module.
            use_cache: Forwarded to the attention module unchanged.
            cache_position: Forwarded to the attention module unchanged.
            position_embeddings: `(cos, sin)` rotary tensors shared across layers.
            **kwargs: Extra attention keyword arguments, forwarded as-is.

        Returns:
            The hidden-state tensor only. Attention weights are not returned,
            so callers must not index the result as a tuple.
        """
        residual = hidden_states
        hidden_states = self.input_layernorm(hidden_states)

        # Attention weights are computed by the attention module but are not
        # part of this layer's output.
        hidden_states, _ = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            use_cache=use_cache,
            cache_position=cache_position,
            position_embeddings=position_embeddings,
            **kwargs,
        )
        hidden_states = residual + self.resid_attn_dropout(
            hidden_states
        )  # main diff with Llama

        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + self.resid_mlp_dropout(
            hidden_states
        )  # main diff with Llama
        return hidden_states
True + _supports_sdpa = True + _supports_flex_attn = True + + _can_compile_fullgraph = True + _supports_attention_backend = True + _can_record_outputs = {} + _version = "0.0.5" + + +@auto_docstring +class Phi3Model(Phi3PreTrainedModel): + def __init__(self, config: Phi3Config): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = VocabParallelEmbedding( + config.vocab_size, config.hidden_size, self.padding_idx + ) + self.layers = nn.ModuleList( + [ + Phi3DecoderLayer(config, layer_idx) + for layer_idx in range(config.num_hidden_layers) + ] + ) + self.norm = Phi3RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.rotary_emb = Phi3RotaryEmbedding(config=config) + self.gradient_checkpointing = False + + # Initialize weights and apply final processing + self.post_init() + + @check_model_inputs + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> BaseModelOutputWithPast: + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError( + "You must specify exactly one of input_ids or inputs_embeds" + ) + + layers_to_output_hidden_states: Optional[List[int]] = kwargs.pop( + "layers_to_output_hidden_states", None + ) + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + if use_cache and past_key_values is None: + past_key_values = DynamicCache() + + if cache_position is None: + past_seen_tokens = ( + past_key_values.get_seq_length() if past_key_values is not None else 0 + ) + cache_position = torch.arange( + past_seen_tokens, + past_seen_tokens + inputs_embeds.shape[1], + 
@auto_docstring
class Phi3ForCausalLM(Phi3PreTrainedModel, GenerationMixin):
    _tied_weights_keys = ["lm_head.weight"]
    _tp_plan = {"lm_head": "colwise_rep"}
    _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}

    def __init__(self, config):
        super().__init__(config)
        self.model = Phi3Model(config)
        self.vocab_size = config.vocab_size

        # Parallel head projecting hidden states to vocabulary logits.
        self.lm_head = ParallelLMHead(config.hidden_size, config.vocab_size, bias=False)

        # Standard HF hook: weight initialisation and final processing.
        self.post_init()

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        **kwargs: Unpack[TransformersKwargs],
    ) -> CausalLMOutputWithPast:
        r"""
        Example:

        ```python
        >>> from transformers import AutoTokenizer, Phi3ForCausalLM

        >>> model = Phi3ForCausalLM.from_pretrained("meta-phi3/Phi3-2-7b-hf")
        >>> tokenizer = AutoTokenizer.from_pretrained("meta-phi3/Phi3-2-7b-hf")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```"""
        decoder_out: BaseModelOutputWithPast = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            cache_position=cache_position,
            **kwargs,
        )

        final_states = decoder_out.last_hidden_state

        # An int keeps the trailing `logits_to_keep` positions (0 keeps all);
        # anything else is used directly as a sequence index.
        if isinstance(logits_to_keep, int):
            keep = slice(-logits_to_keep, None)
        else:
            keep = logits_to_keep
        logits = self.lm_head(final_states[:, keep, :], gather_output=True)

        if labels is None:
            loss = None
        else:
            loss = self.loss_function(
                logits=logits,
                labels=labels,
                vocab_size=self.config.vocab_size,
                **kwargs,
            )

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=decoder_out.past_key_values,
            hidden_states=decoder_out.hidden_states,
            attentions=decoder_out.attentions,
        )

    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        attention_mask=None,
        inputs_embeds=None,
        cache_position=None,
        position_ids=None,
        use_cache=True,
        logits_to_keep=None,
        **kwargs,
    ):
        # Overridden because the model may switch between the short and long
        # rope factors mid-generation, which invalidates the existing cache.
        #
        # The first time the input grows past the switching point, the cache is
        # dropped so it gets recomputed. That one step is slower, but it avoids
        # a failure.
        if bool(past_key_values) and bool(self.config.rope_scaling):
            switch_point = self.config.original_max_position_embeddings
            if (
                input_ids.shape[1] >= switch_point + 1
                and cache_position[0] <= switch_point
            ):
                past_key_values = None

        return super().prepare_inputs_for_generation(
            input_ids=input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            cache_position=cache_position,
            position_ids=position_ids,
            use_cache=use_cache,
            logits_to_keep=logits_to_keep,
            **kwargs,
        )
class Qwen2MLP(nn.Module):
    """Qwen2 gated feed-forward block built from column/row parallel linears.

    Computes `down_proj(act(gate_proj(x)) * up_proj(x))` and sums the result
    across the tensor-parallel group.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.intermediate_size = config.intermediate_size

        # Process group used to reduce the partial down-projection outputs.
        self.tp_group = get_tp_group()

        hidden, inner = self.hidden_size, self.intermediate_size
        self.gate_proj = ColumnParallelLinear(hidden, inner, bias=False)
        self.up_proj = ColumnParallelLinear(hidden, inner, bias=False)
        self.down_proj = RowParallelLinear(inner, hidden, bias=False)

        self.act_fn = ACT2FN[config.hidden_act]

    def forward(self, x):
        gate = self.act_fn(self.gate_proj(x))
        up = self.up_proj(x)
        out = self.down_proj(gate * up)
        # In-place sum of each rank's partial result.
        dist.all_reduce(out, op=dist.ReduceOp.SUM, group=self.tp_group)
        return out
class Qwen2DecoderLayer(GradientCheckpointingLayer):
    """One Qwen2 decoder layer: pre-norm self-attention and pre-norm MLP, each with a residual add.

    `attention_type` records this layer's entry in `config.layer_types`; the
    parent model uses it to pick the matching attention mask.
    """

    def __init__(self, config: Qwen2Config, layer_idx: int):
        super().__init__()
        self.hidden_size = config.hidden_size

        self.self_attn = Qwen2Attention(config=config, layer_idx=layer_idx)

        self.mlp = Qwen2MLP(config)
        norm_eps = config.rms_norm_eps
        self.input_layernorm = Qwen2RMSNorm(config.hidden_size, eps=norm_eps)
        self.post_attention_layernorm = Qwen2RMSNorm(config.hidden_size, eps=norm_eps)
        self.attention_type = config.layer_types[layer_idx]

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[
            tuple[torch.Tensor, torch.Tensor]
        ] = None,  # necessary, but kept here for BC
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> tuple[
        torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]
    ]:
        """Run the layer.

        Returns:
            `(hidden_states,)`, or `(hidden_states, attn_weights)` when
            `output_attentions` is truthy.
        """
        # Attention sub-block with residual connection.
        skip = hidden_states
        normed = self.input_layernorm(hidden_states)
        attn_out, attn_weights = self.self_attn(
            hidden_states=normed,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
            cache_position=cache_position,
            position_embeddings=position_embeddings,
            **kwargs,
        )
        hidden_states = skip + attn_out

        # Feed-forward sub-block with residual connection.
        skip = hidden_states
        mlp_out = self.mlp(self.post_attention_layernorm(hidden_states))
        hidden_states = skip + mlp_out

        if output_attentions:
            return (hidden_states, attn_weights)
        return (hidden_states,)
@auto_docstring
class Qwen2Model(Qwen2PreTrainedModel):
    # Qwen2 decoder stack: vocab-parallel token embedding, a list of decoder
    # layers, a final RMSNorm, and a rotary embedding shared by all layers.

    def __init__(self, config: Qwen2Config):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

        self.embed_tokens = VocabParallelEmbedding(
            config.vocab_size, config.hidden_size, self.padding_idx
        )
        decoder_stack = [
            Qwen2DecoderLayer(config, layer_idx)
            for layer_idx in range(config.num_hidden_layers)
        ]
        self.layers = nn.ModuleList(decoder_stack)
        self.norm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.rotary_emb = Qwen2RotaryEmbedding(config=config)
        self.gradient_checkpointing = False
        # Whether any layer is configured for sliding-window attention.
        self.has_sliding_layers = "sliding_attention" in self.config.layer_types

        # Standard HF hook: weight initialisation and final processing.
        self.post_init()

    def get_input_embeddings(self):
        return self.embed_tokens

    def set_input_embeddings(self, value):
        self.embed_tokens = value

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **flash_attn_kwargs: Unpack[FlashAttentionKwargs],
    ) -> BaseModelOutputWithPast:
        # Flags left as None fall back to the config defaults.
        if output_attentions is None:
            output_attentions = self.config.output_attentions
        if output_hidden_states is None:
            output_hidden_states = self.config.output_hidden_states
        if use_cache is None:
            use_cache = self.config.use_cache

        # Optional set of layer indices whose outputs should be collected;
        # removed from the kwargs so it is not forwarded to the decoder layers.
        layers_to_output_hidden_states = flash_attn_kwargs.pop(
            "layers_to_output_hidden_states", None
        )

        # Exactly one of the two inputs must be given.
        if (input_ids is None) == (inputs_embeds is None):
            raise ValueError(
                "You must specify exactly one of input_ids or inputs_embeds"
            )

        if self.gradient_checkpointing and self.training and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
            )
            use_cache = False

        # TODO (joao): remove this exception in v4.56 -- it exists for users that try to pass a legacy cache
        if past_key_values is not None and not isinstance(past_key_values, Cache):
            raise ValueError(
                "The `past_key_values` should be either a `Cache` object or `None`."
            )

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        if use_cache and past_key_values is None:
            past_key_values = DynamicCache()

        if cache_position is None:
            if past_key_values is not None:
                seen = past_key_values.get_seq_length()
            else:
                seen = 0
            cache_position = torch.arange(
                seen,
                seen + inputs_embeds.shape[1],
                device=inputs_embeds.device,
            )

        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        # A dict attention_mask is taken as an already-prepared mask mapping
        # (e.g. from `generate`); otherwise the masks are built here.
        if isinstance(attention_mask, dict):
            causal_mask_mapping = attention_mask
        else:
            mask_kwargs = {
                "config": self.config,
                "input_embeds": inputs_embeds,
                "attention_mask": attention_mask,
                "cache_position": cache_position,
                "past_key_values": past_key_values,
                "position_ids": position_ids,
            }
            causal_mask_mapping = {
                "full_attention": create_causal_mask(**mask_kwargs),
            }
            # The sliding-window mask is only built when some layer uses it.
            if self.has_sliding_layers:
                causal_mask_mapping["sliding_attention"] = (
                    create_sliding_window_causal_mask(**mask_kwargs)
                )

        hidden_states = inputs_embeds

        # Rotary position embeddings are computed once and shared by all layers.
        position_embeddings = self.rotary_emb(hidden_states, position_ids)

        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None

        for idx, decoder_layer in enumerate(self.layers):
            layer_outputs = decoder_layer(
                hidden_states,
                attention_mask=causal_mask_mapping[decoder_layer.attention_type],
                position_ids=position_ids,
                past_key_value=past_key_values,
                output_attentions=output_attentions,
                use_cache=use_cache,
                cache_position=cache_position,
                position_embeddings=position_embeddings,
                **flash_attn_kwargs,
            )

            hidden_states = layer_outputs[0]

            # Collect this layer's output when hidden states are requested and
            # the layer is selected (no selection means every layer).
            if output_hidden_states and (
                layers_to_output_hidden_states is None
                or idx in layers_to_output_hidden_states
            ):
                all_hidden_states += (hidden_states,)

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

        hidden_states = self.norm(hidden_states)

        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values if use_cache else None,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
        )
+ _tp_plan = {"lm_head": "colwise_rep"} + _pp_plan = {"lm_head": (["hidden_states"], ["logits"])} + + def __init__(self, config): + super().__init__(config) + self.model = Qwen2Model(config) + self.vocab_size = config.vocab_size + + # distributed the lm head + self.lm_head = ParallelLMHead(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def set_decoder(self, decoder): + self.model = decoder + + def get_decoder(self): + return self.model + + @can_return_tuple + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + **kwargs: Unpack[TransformersKwargs], + ) -> CausalLMOutputWithPast: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. 
+ + Example: + + ```python + >>> from transformers import AutoTokenizer, Qwen2ForCausalLM + + >>> model = Qwen2ForCausalLM.from_pretrained("meta-qwen2/Qwen2-2-7b-hf") + >>> tokenizer = AutoTokenizer.from_pretrained("meta-qwen2/Qwen2-2-7b-hf") + + >>> prompt = "Hey, are you conscious? Can you talk to me?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." + ```""" + output_attentions = ( + output_attentions + if output_attentions is not None + else self.config.output_attentions + ) + output_hidden_states = ( + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + + layers_to_output_hidden_states = kwargs.pop( + "layers_to_output_hidden_states", None + ) + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs: BaseModelOutputWithPast = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + cache_position=cache_position, + layers_to_output_hidden_states=layers_to_output_hidden_states, + **kwargs, + ) + + hidden_states = outputs.last_hidden_state + # Only compute necessary logits, and do not upcast them to float if we are not computing the loss + slice_indices = ( + slice(-logits_to_keep, None) + if isinstance(logits_to_keep, int) + else logits_to_keep + ) + logits = self.lm_head(hidden_states[:, slice_indices, :], gather_output=True) + + loss = None + if labels is not None: + loss = self.loss_function( + logits=logits, + labels=labels, + vocab_size=self.config.vocab_size, + **kwargs, 
+ ) + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@auto_docstring( + custom_intro=""" + The Qwen2 Model transformer with a sequence classification head on top (linear layer). + + [`Qwen2ForSequenceClassification`] uses the last token in order to do the classification, as other causal models + (e.g. GPT-2) do. + + Since it does classification on the last token, it requires to know the position of the last token. If a + `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If + no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the + padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in + each row of the batch). + """ +) +class Qwen2ForSequenceClassification(Qwen2PreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.model = Qwen2Model(config) + self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + @can_return_tuple + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + ) -> SequenceClassifierOutputWithPast: + r""" + labels (`torch.LongTensor` of shape 
`(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + + transformer_outputs: BaseModelOutputWithPast = self.model( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + ) + hidden_states = transformer_outputs.last_hidden_state + logits = self.score(hidden_states) + + if input_ids is not None: + batch_size = input_ids.shape[0] + else: + batch_size = inputs_embeds.shape[0] + + if self.config.pad_token_id is None and batch_size != 1: + raise ValueError( + "Cannot handle batch sizes > 1 if no padding token is defined." + ) + if self.config.pad_token_id is None: + last_non_pad_token = -1 + elif input_ids is not None: + # To handle both left- and right- padding, we take the rightmost token that is not equal to pad_token_id + non_pad_mask = (input_ids != self.config.pad_token_id).to( + logits.device, torch.int32 + ) + token_indices = torch.arange( + input_ids.shape[-1], device=logits.device, dtype=torch.int32 + ) + last_non_pad_token = (token_indices * non_pad_mask).argmax(-1) + else: + last_non_pad_token = -1 + logger.warning_once( + f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. 
Results may be " + "unexpected if using padding tokens in conjunction with `inputs_embeds.`" + ) + + pooled_logits = logits[ + torch.arange(batch_size, device=logits.device), last_non_pad_token + ] + + loss = None + if labels is not None: + loss = self.loss_function( + logits=logits, + labels=labels, + pooled_logits=pooled_logits, + config=self.config, + ) + + return SequenceClassifierOutputWithPast( + loss=loss, + logits=pooled_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) + + +@auto_docstring +class Qwen2ForTokenClassification(Qwen2PreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.model = Qwen2Model(config) + if getattr(config, "classifier_dropout", None) is not None: + classifier_dropout = config.classifier_dropout + elif getattr(config, "hidden_dropout", None) is not None: + classifier_dropout = config.hidden_dropout + else: + classifier_dropout = 0.1 + self.dropout = nn.Dropout(classifier_dropout) + self.score = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + @can_return_tuple + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + ) -> TokenClassifierOutput: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the 
sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + + outputs: BaseModelOutputWithPast = self.model( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + ) + sequence_output = outputs.last_hidden_state + sequence_output = self.dropout(sequence_output) + logits = self.score(sequence_output) + + loss = None + if labels is not None: + loss = self.loss_function(logits, labels, self.config) + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@auto_docstring +class Qwen2ForQuestionAnswering(Qwen2PreTrainedModel): + base_model_prefix = "transformer" + + def __init__(self, config): + super().__init__(config) + self.transformer = Qwen2Model(config) + self.qa_outputs = nn.Linear(config.hidden_size, 2) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.transformer.embed_tokens + + def set_input_embeddings(self, value): + self.transformer.embed_tokens = value + + @can_return_tuple + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + start_positions: Optional[torch.LongTensor] = None, + end_positions: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + **kwargs, + ) -> QuestionAnsweringModelOutput: + outputs: 
BaseModelOutputWithPast = self.transformer( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + ) + + sequence_output = outputs.last_hidden_state + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1).contiguous() + end_logits = end_logits.squeeze(-1).contiguous() + + loss = None + if start_positions is not None and end_positions is not None: + loss = self.loss_function( + start_logits, end_logits, start_positions, end_positions, **kwargs + ) + + return QuestionAnsweringModelOutput( + loss=loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +__all__ = [ + "Qwen2PreTrainedModel", + "Qwen2Model", + "Qwen2ForCausalLM", + "Qwen2ForSequenceClassification", + "Qwen2ForTokenClassification", + "Qwen2ForQuestionAnswering", +] diff --git a/idea1/specforge/modeling/target/custom_backend/qwen3.py b/idea1/specforge/modeling/target/custom_backend/qwen3.py new file mode 100644 index 0000000000000000000000000000000000000000..1b0df91f03a3fd74be205cc685ad864f73fd35e8 --- /dev/null +++ b/idea1/specforge/modeling/target/custom_backend/qwen3.py @@ -0,0 +1,606 @@ +# coding=utf-8 +# Copyright 2025 Qwen Team and HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
class Qwen3MLP(nn.Module):
    """Gated feed-forward block built from tensor-parallel linear layers.

    Computes ``down_proj(act_fn(gate_proj(x)) * up_proj(x))`` and then sums the
    result across the tensor-parallel group.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.intermediate_size = config.intermediate_size

        # Add TP support
        self.tp_group = get_tp_group()

        self.gate_proj = ColumnParallelLinear(
            self.hidden_size, self.intermediate_size, bias=False
        )
        self.up_proj = ColumnParallelLinear(
            self.hidden_size, self.intermediate_size, bias=False
        )
        self.down_proj = RowParallelLinear(
            self.intermediate_size, self.hidden_size, bias=False
        )
        self.act_fn = ACT2FN[config.hidden_act]

    def forward(self, x):
        down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
        # Add all_reduce for TP (in-place SUM over the TP group).
        # NOTE(review): presumably each rank holds a partial sum after the
        # row-parallel projection -- confirm against RowParallelLinear.
        dist.all_reduce(down_proj, op=dist.ReduceOp.SUM, group=self.tp_group)
        return down_proj


class Qwen3Attention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config: Qwen3Config, layer_idx: int):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        # Falls back to hidden_size / num_heads when the config has no `head_dim`.
        self.head_dim = getattr(
            config, "head_dim", config.hidden_size // config.num_attention_heads
        )
        self.total_num_kv_heads = config.num_key_value_heads
        self.num_key_value_groups = (
            config.num_attention_heads // config.num_key_value_heads
        )
        self.scaling = self.head_dim**-0.5
        self.attention_dropout = config.attention_dropout
        self.is_causal = True

        # Add TP support
        self.tp_group = get_tp_group()

        self.q_proj = ColumnParallelLinear(
            config.hidden_size,
            config.num_attention_heads * self.head_dim,
            bias=config.attention_bias,
        )
        self.k_proj = ColumnParallelLinear(
            config.hidden_size,
            config.num_key_value_heads * self.head_dim,
            bias=config.attention_bias,
        )
        self.v_proj = ColumnParallelLinear(
            config.hidden_size,
            config.num_key_value_heads * self.head_dim,
            bias=config.attention_bias,
        )
        self.o_proj = RowParallelLinear(
            config.num_attention_heads * self.head_dim,
            config.hidden_size,
            bias=config.attention_bias,
        )
        self.q_norm = Qwen3RMSNorm(
            self.head_dim, eps=config.rms_norm_eps
        )  # unlike olmo, only on the head dim!
        self.k_norm = Qwen3RMSNorm(
            self.head_dim, eps=config.rms_norm_eps
        )  # thus post q_norm does not need reshape
        # Sliding window logic is kept as is, assuming it's handled in config.layer_types
        self.sliding_window = (
            config.sliding_window
            if config.layer_types[layer_idx] == "sliding_attention"
            else None
        )

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor],
        past_key_value: Optional[Cache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
        """Run attention over `hidden_states` and return `(attn_output, attn_weights)`.

        `position_embeddings` is the `(cos, sin)` pair applied to queries and keys.
        When `past_key_value` is given, keys/values are first passed through its
        `update` method for this layer.
        """
        input_shape = hidden_states.shape[:-1]
        # -1 lets the head count follow whatever width the projection returns.
        hidden_shape = (*input_shape, -1, self.head_dim)

        query_states = self.q_norm(
            self.q_proj(hidden_states).view(hidden_shape)
        ).transpose(1, 2)
        key_states = self.k_norm(
            self.k_proj(hidden_states).view(hidden_shape)
        ).transpose(1, 2)
        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)

        cos, sin = position_embeddings
        query_states, key_states = apply_rotary_pos_emb(
            query_states, key_states, cos, sin
        )

        if past_key_value is not None:
            # sin and cos are specific to RoPE models; cache_position needed for the static cache
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
            key_states, value_states = past_key_value.update(
                key_states, value_states, self.layer_idx, cache_kwargs
            )

        # Eager attention unless the config selects another registered backend.
        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            attention_interface = ALL_ATTENTION_FUNCTIONS[
                self.config._attn_implementation
            ]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=0.0 if not self.training else self.attention_dropout,
            scaling=self.scaling,
            sliding_window=self.sliding_window,  # diff with Llama
            **kwargs,
        )

        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
        attn_output = self.o_proj(attn_output)
        # Add all_reduce for TP (in-place SUM over the TP group)
        dist.all_reduce(attn_output, op=dist.ReduceOp.SUM, group=self.tp_group)
        return attn_output, attn_weights


class Qwen3DecoderLayer(GradientCheckpointingLayer):
    """One pre-norm decoder layer: self-attention and MLP, each with a residual add."""

    def __init__(self, config: Qwen3Config, layer_idx: int):
        super().__init__()
        self.hidden_size = config.hidden_size

        self.self_attn = Qwen3Attention(config=config, layer_idx=layer_idx)

        self.mlp = Qwen3MLP(config)
        self.input_layernorm = Qwen3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.post_attention_layernorm = Qwen3RMSNorm(
            config.hidden_size, eps=config.rms_norm_eps
        )
        # Used by the model to pick the matching entry of the mask mapping.
        self.attention_type = config.layer_types[layer_idx]

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[
            tuple[torch.Tensor, torch.Tensor]
        ] = None,  # necessary, but kept here for BC
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
        """Return `(hidden_states,)`, plus the attention weights when `output_attentions` is truthy."""
        residual = hidden_states
        hidden_states = self.input_layernorm(hidden_states)
        # Self Attention
        hidden_states, self_attn_weights = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            use_cache=use_cache,
            cache_position=cache_position,
            position_embeddings=position_embeddings,
            **kwargs,
        )
        hidden_states = residual + hidden_states

        # Fully Connected
        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)
        if output_attentions:
            outputs += (self_attn_weights,)
        return outputs


class Qwen3RotaryEmbedding(nn.Module):
    """Produces the `(cos, sin)` rotary tables for a batch of position ids."""

    def __init__(self, config: Qwen3Config, device=None):
        super().__init__()
        # BC: "rope_type" was originally "type"
        if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
            self.rope_type = config.rope_scaling.get(
                "rope_type", config.rope_scaling.get("type")
            )
        else:
            self.rope_type = "default"
        self.max_seq_len_cached = config.max_position_embeddings
        self.original_max_seq_len = config.max_position_embeddings

        self.config = config
        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]

        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
        # Non-persistent: the buffer is not written to the state dict.
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        self.original_inv_freq = self.inv_freq

    @torch.no_grad()
    @dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
    def forward(self, x, position_ids):
        """Return `(cos, sin)` cast to `x.dtype`; `x` only supplies device and dtype."""
        inv_freq_expanded = (
            self.inv_freq[None, :, None]
            .float()
            .expand(position_ids.shape[0], -1, 1)
            .to(x.device)
        )
        position_ids_expanded = position_ids[:, None, :].float()

        # autocast is keyed by device type; "mps" is mapped to "cpu" here.
        device_type = (
            x.device.type
            if isinstance(x.device.type, str) and x.device.type != "mps"
            else "cpu"
        )
        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
            freqs = (
                inv_freq_expanded.float() @ position_ids_expanded.float()
            ).transpose(1, 2)
            emb = torch.cat((freqs, freqs), dim=-1)
            cos = emb.cos() * self.attention_scaling
            sin = emb.sin() * self.attention_scaling

        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)


@auto_docstring
class Qwen3PreTrainedModel(PreTrainedModel):
    # Shared base class: config binding, capability flags and weight init.
    config_class = Qwen3Config
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["Qwen3DecoderLayer"]
    _skip_keys_device_placement = ["past_key_values"]
    _supports_flash_attn_3 = True
    _supports_flash_attn_2 = True
    _supports_sdpa = True
    _supports_flex_attn = True
    _supports_cache_class = True
    _supports_quantized_cache = True
    _supports_static_cache = True
    _supports_attention_backend = True

    def _init_weights(self, module):
        # Normal(0, initializer_range) for Linear/Embedding weights, zero biases
        # and padding row, ones for RMSNorm weights. Other module types are left
        # untouched by this method.
        std = self.config.initializer_range
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, Qwen3RMSNorm):
            module.weight.data.fill_(1.0)


@auto_docstring
class Qwen3Model(Qwen3PreTrainedModel):
    # Decoder backbone: parallel token embedding, `num_hidden_layers` decoder
    # layers, a final RMSNorm and a rotary embedding shared by all layers.
    def __init__(self, config: Qwen3Config):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

        self.embed_tokens = VocabParallelEmbedding(
            config.vocab_size, config.hidden_size, self.padding_idx
        )
        self.layers = nn.ModuleList(
            [
                Qwen3DecoderLayer(config, layer_idx)
                for layer_idx in range(config.num_hidden_layers)
            ]
        )
        self.norm = Qwen3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.rotary_emb = Qwen3RotaryEmbedding(config=config)
        self.gradient_checkpointing = False
        self.has_sliding_layers = "sliding_attention" in self.config.layer_types

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embed_tokens

    def set_input_embeddings(self, value):
        self.embed_tokens = value

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **flash_attn_kwargs: Unpack[FlashAttentionKwargs],
    ) -> BaseModelOutputWithPast:
        output_attentions = (
            output_attentions
            if output_attentions is not None
            else self.config.output_attentions
        )
        output_hidden_states = (
            output_hidden_states
            if output_hidden_states is not None
            else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache

        # Optional container of layer indices; when given, only those layers'
        # outputs are collected into `hidden_states`. Popped so it is not
        # forwarded to the decoder layers.
        layers_to_output_hidden_states = flash_attn_kwargs.pop(
            "layers_to_output_hidden_states", None
        )

        # True when both are missing or both are supplied.
        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError(
                "You must specify exactly one of input_ids or inputs_embeds"
            )

        if self.gradient_checkpointing and self.training:
            if use_cache:
                logger.warning_once(
                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                )
                use_cache = False

        if use_cache and past_key_values is None:
            past_key_values = DynamicCache()

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        if cache_position is None:
            past_seen_tokens = (
                past_key_values.get_seq_length() if past_key_values is not None else 0
            )
            cache_position = torch.arange(
                past_seen_tokens,
                past_seen_tokens + inputs_embeds.shape[1],
                device=inputs_embeds.device,
            )

        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        # It may already have been prepared by e.g. `generate`
        if not isinstance(causal_mask_mapping := attention_mask, dict):
            # Prepare mask arguments
            mask_kwargs = {
                "config": self.config,
                "input_embeds": inputs_embeds,
                "attention_mask": attention_mask,
                "cache_position": cache_position,
                "past_key_values": past_key_values,
                "position_ids": position_ids,
            }
            # Create the masks
            causal_mask_mapping = {
                "full_attention": create_causal_mask(**mask_kwargs),
            }
            # The sliding window alternating layers are not always activated depending on the config
            if self.has_sliding_layers:
                causal_mask_mapping["sliding_attention"] = (
                    create_sliding_window_causal_mask(**mask_kwargs)
                )

        hidden_states = inputs_embeds

        # create position embeddings to be shared across the decoder layers
        position_embeddings = self.rotary_emb(hidden_states, position_ids)

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None

        for idx, decoder_layer in enumerate(self.layers):
            layer_outputs = decoder_layer(
                hidden_states,
                attention_mask=causal_mask_mapping[decoder_layer.attention_type],
                position_ids=position_ids,
                past_key_value=past_key_values,
                output_attentions=output_attentions,
                use_cache=use_cache,
                cache_position=cache_position,
                position_embeddings=position_embeddings,
                **flash_attn_kwargs,
            )

            hidden_states = layer_outputs[0]

            # Collected after the layer runs and before the final norm.
            if output_hidden_states:
                if (
                    layers_to_output_hidden_states is None
                    or idx in layers_to_output_hidden_states
                ):
                    all_hidden_states += (hidden_states,)

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

        hidden_states = self.norm(hidden_states)

        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
        )


@auto_docstring
class Qwen3ForCausalLM(Qwen3PreTrainedModel, GenerationMixin):
    # Causal language model: `Qwen3Model` backbone plus a `ParallelLMHead`.
    _tied_weights_keys = ["lm_head.weight"]
    _tp_plan = {"lm_head": "colwise_rep"}
    _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}

    def __init__(self, config):
        super().__init__(config)
        self.model = Qwen3Model(config)
        self.vocab_size = config.vocab_size

        # Use ParallelLMHead for lm_head
        self.lm_head = ParallelLMHead(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        self.model.embed_tokens = value

    def get_output_embeddings(self):
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    def set_decoder(self, decoder):
        self.model = decoder

    def get_decoder(self):
        return self.model

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        **kwargs,
    ) -> CausalLMOutputWithPast:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size - 1]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size - 1]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, Qwen3ForCausalLM

        >>> model = Qwen3ForCausalLM.from_pretrained("Qwen/Qwen3-8B")
        >>> tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-8B")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```"""
        output_attentions = (
            output_attentions
            if output_attentions is not None
            else self.config.output_attentions
        )
        output_hidden_states = (
            output_hidden_states
            if output_hidden_states is not None
            else self.config.output_hidden_states
        )

        # Take the layer filter out of kwargs and hand it to the backbone by
        # name (Qwen3Model.forward consumes it). Leaving it in kwargs would also
        # pass it on to `loss_function` below, which has no use for it.
        layers_to_output_hidden_states = kwargs.pop(
            "layers_to_output_hidden_states", None
        )

        outputs: BaseModelOutputWithPast = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            cache_position=cache_position,
            layers_to_output_hidden_states=layers_to_output_hidden_states,
            **kwargs,
        )

        hidden_states = outputs.last_hidden_state
        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss.
        # An int keeps the trailing `logits_to_keep` positions (0 -> slice(0, None), i.e. every position);
        # a tensor is used directly as the index along the sequence dimension.
        slice_indices = (
            slice(-logits_to_keep, None)
            if isinstance(logits_to_keep, int)
            else logits_to_keep
        )
        # NOTE(review): `gather_output=True` presumably collects the vocabulary shards
        # from all tensor-parallel ranks -- confirm against ParallelLMHead.
        logits = self.lm_head(hidden_states[:, slice_indices, :], gather_output=True)

        loss = None
        if labels is not None:
            loss = self.loss_function(
                logits=logits,
                labels=labels,
                vocab_size=self.config.vocab_size,
                **kwargs,
            )

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
b/idea1/specforge/modeling/target/custom_backend/qwen3_moe.py new file mode 100644 index 0000000000000000000000000000000000000000..61f1880f6d3f92ab112388bb7ec991e8535f8600 --- /dev/null +++ b/idea1/specforge/modeling/target/custom_backend/qwen3_moe.py @@ -0,0 +1,889 @@ +# coding=utf-8 +# Copyright 2025 Qwen Team and HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Callable, Optional, Union + +import torch +import torch.distributed as dist +import torch.nn as nn +import torch.nn.functional as F +from transformers import Qwen3MoeConfig +from transformers.activations import ACT2FN +from transformers.cache_utils import Cache, DynamicCache +from transformers.generation import GenerationMixin +from transformers.integrations import use_kernel_forward_from_hub +from transformers.masking_utils import ( + create_causal_mask, + create_sliding_window_causal_mask, +) +from transformers.modeling_flash_attention_utils import FlashAttentionKwargs +from transformers.modeling_layers import GradientCheckpointingLayer +from transformers.modeling_outputs import ( + MoeCausalLMOutputWithPast, + MoeModelOutputWithPast, +) +from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update +from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel +from transformers.models.qwen3_moe.modeling_qwen3_moe import ( + apply_rotary_pos_emb, + eager_attention_forward, +) +from 
transformers.processing_utils import Unpack +from transformers.utils import auto_docstring, can_return_tuple, logging + +from specforge.distributed import get_tp_group +from specforge.layers import ( + ColumnParallelLinear, + ParallelLMHead, + RowParallelLinear, + VocabParallelEmbedding, +) + +logger = logging.get_logger(__name__) + + +class Qwen3MoeAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config: Qwen3MoeConfig, layer_idx: int): + super().__init__() + self.config = config + self.layer_idx = layer_idx + self.head_dim = getattr( + config, "head_dim", config.hidden_size // config.num_attention_heads + ) + self.scaling = self.head_dim**-0.5 + self.attention_dropout = config.attention_dropout + self.is_causal = True + + # Add TP support and head calculations + self.tp_group = get_tp_group() + self.tp_size = ( + dist.get_world_size(self.tp_group) if self.tp_group is not None else 1 + ) + self.tp_rank = dist.get_rank(self.tp_group) if self.tp_group is not None else 0 + + # Calculate head distribution for TP + self.total_num_heads = config.num_attention_heads + self.total_num_kv_heads = config.num_key_value_heads + self.num_heads = ( + self.total_num_heads // self.tp_size + ) # this is the number heads per rank + + # Handle KV head replication when tp_size > total_num_kv_heads + if self.tp_size > self.total_num_kv_heads: + # In replication mode, each rank gets 1 KV head (replicated across groups) + self.num_kv_heads = 1 + self.num_kv_head_replicas = self.tp_size // self.total_num_kv_heads + self.num_key_value_groups = ( + self.num_heads // self.num_kv_heads + ) # this is size for expanding kv for gqa + self.kv_head_replicas = True + else: + self.num_kv_heads = self.total_num_kv_heads + self.num_kv_head_replicas = 1 + self.num_key_value_groups = config.num_attention_heads // self.num_kv_heads + self.kv_head_replicas = False + + self.q_proj = ColumnParallelLinear( + config.hidden_size, + 
config.num_attention_heads * self.head_dim, + bias=config.attention_bias, + ) + self.k_proj = ColumnParallelLinear( + config.hidden_size, + self.num_kv_heads * self.head_dim, + bias=config.attention_bias, + kv_head_replicas=self.kv_head_replicas, + kv_head_idx=self.tp_rank // self.num_kv_head_replicas, + total_num_kv_heads=config.num_key_value_heads, + ) + self.v_proj = ColumnParallelLinear( + config.hidden_size, + self.num_kv_heads * self.head_dim, + bias=config.attention_bias, + kv_head_replicas=self.kv_head_replicas, + kv_head_idx=self.tp_rank // self.num_kv_head_replicas, + total_num_kv_heads=config.num_key_value_heads, + ) + self.o_proj = RowParallelLinear( + config.num_attention_heads * self.head_dim, + config.hidden_size, + bias=config.attention_bias, + ) + + self.q_norm = Qwen3MoeRMSNorm( + self.head_dim, eps=config.rms_norm_eps + ) # unlike olmo, only on the head dim! + self.k_norm = Qwen3MoeRMSNorm( + self.head_dim, eps=config.rms_norm_eps + ) # thus post q_norm does not need reshape + self.sliding_window = getattr(config, "sliding_window", None) + + def forward( + self, + hidden_states: torch.Tensor, + position_embeddings: tuple[torch.Tensor, torch.Tensor], + attention_mask: Optional[torch.Tensor], + past_key_value: Optional[Cache] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs: Unpack[FlashAttentionKwargs], + ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]: + input_shape = hidden_states.shape[:-1] + hidden_shape = (*input_shape, -1, self.head_dim) + + query_states = self.q_norm( + self.q_proj(hidden_states).view(hidden_shape) + ).transpose(1, 2) + key_states = self.k_norm( + self.k_proj(hidden_states).view(hidden_shape) + ).transpose(1, 2) + value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2) + + cos, sin = position_embeddings + query_states, key_states = apply_rotary_pos_emb( + query_states, key_states, cos, sin + ) + + if past_key_value is not None: + # sin and cos are 
class Qwen3MoeSparseMoeBlock(nn.Module):
    """Top-k routed mixture-of-experts feed-forward block.

    A bias-free linear router scores every token against
    ``config.num_experts`` experts. For each token the
    ``config.num_experts_per_tok`` highest-probability experts are evaluated
    and their outputs are summed, weighted by the routing probabilities
    (re-normalized over the selected experts when ``config.norm_topk_prob``
    is set).
    """

    def __init__(self, config):
        super().__init__()
        self.num_experts = config.num_experts
        self.top_k = config.num_experts_per_tok
        self.norm_topk_prob = config.norm_topk_prob

        # gating
        self.gate = nn.Linear(config.hidden_size, config.num_experts, bias=False)
        self.experts = nn.ModuleList(
            [
                Qwen3MoeMLP(config, intermediate_size=config.moe_intermediate_size)
                for _ in range(self.num_experts)
            ]
        )

    def forward(
        self, hidden_states: torch.Tensor
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """Route every token through its top-k experts.

        Args:
            hidden_states: Tensor of shape
                ``(batch_size, sequence_length, hidden_dim)``. It does not
                need to be contiguous.

        Returns:
            A tuple ``(final_hidden_states, router_logits)`` where
            ``final_hidden_states`` has the same shape as the input and
            ``router_logits`` has shape
            ``(batch_size * sequence_length, num_experts)``.
        """
        batch_size, sequence_length, hidden_dim = hidden_states.shape
        # `reshape` instead of `view`: flattening a non-contiguous input
        # (e.g. a transposed tensor) with `view` raises a RuntimeError.
        hidden_states = hidden_states.reshape(-1, hidden_dim)
        # router_logits: (batch * sequence_length, n_experts)
        router_logits = self.gate(hidden_states)

        # Softmax in float32 for numerical stability, then keep the top-k.
        routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
        routing_weights, selected_experts = torch.topk(
            routing_weights, self.top_k, dim=-1
        )
        if self.norm_topk_prob:  # only diff with mixtral sparse moe block!
            routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
        # we cast back to the input dtype
        routing_weights = routing_weights.to(hidden_states.dtype)

        final_hidden_states = torch.zeros(
            (batch_size * sequence_length, hidden_dim),
            dtype=hidden_states.dtype,
            device=hidden_states.device,
        )

        # One-hot encode the selected experts to create an expert mask of shape
        # (num_experts, top_k, num_tokens); it is used to look up which tokens
        # each expert is going to be solicited for.
        expert_mask = F.one_hot(
            selected_experts, num_classes=self.num_experts
        ).permute(2, 1, 0)

        # Only visit experts that received at least one token. Converting the
        # hit list to Python ints once lets us index the ModuleList and the
        # mask with plain integers instead of 1-element tensors.
        expert_hit = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero()
        for expert_idx in expert_hit.flatten().tolist():
            expert_layer = self.experts[expert_idx]
            # idx: which top-k slot chose this expert; top_x: which token.
            idx, top_x = torch.where(expert_mask[expert_idx])

            # Gather the tokens routed to this expert, run the expert and scale
            # each output row by that token's routing weight for this expert.
            current_state = hidden_states[top_x]
            current_hidden_states = (
                expert_layer(current_state) * routing_weights[top_x, idx, None]
            )

            # Accumulate into the rows of the tokens that were processed.
            final_hidden_states.index_add_(
                0, top_x, current_hidden_states.to(hidden_states.dtype)
            )

        final_hidden_states = final_hidden_states.reshape(
            batch_size, sequence_length, hidden_dim
        )
        return final_hidden_states, router_logits
config.hidden_size, eps=config.rms_norm_eps + ) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[tuple[torch.Tensor]] = None, + output_attentions: Optional[bool] = False, + output_router_logits: Optional[bool] = False, + use_cache: Optional[bool] = False, + cache_position: Optional[torch.LongTensor] = None, + position_embeddings: Optional[ + tuple[torch.Tensor, torch.Tensor] + ] = None, # necessary, but kept here for BC + **kwargs: Unpack[FlashAttentionKwargs], + ) -> tuple[ + torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]] + ]: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`, *optional*): attention mask of size + `(batch, sequence_length)` where padding elements are indicated by 0. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_router_logits (`bool`, *optional*): + Whether or not to return the logits of all the routers. They are useful for computing the router loss, + and should not be returned during inference. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states + cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): + Indices depicting the position of the input sequence tokens in the sequence. 
class Qwen3MoeRotaryEmbedding(nn.Module):
    """Produce the rotary position embedding tables (cos, sin) for a batch.

    The inverse frequencies and the attention scaling factor come from the
    RoPE init function selected by the config's ``rope_scaling`` entry
    (``"default"`` when the config has none).
    """

    def __init__(self, config: Qwen3MoeConfig, device=None):
        super().__init__()
        rope_scaling = getattr(config, "rope_scaling", None)
        if rope_scaling is None:
            self.rope_type = "default"
        else:
            # BC: "rope_type" was originally "type"
            self.rope_type = rope_scaling.get("rope_type", rope_scaling.get("type"))

        max_positions = config.max_position_embeddings
        self.max_seq_len_cached = max_positions
        self.original_max_seq_len = max_positions

        self.config = config
        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]

        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
        # Non-persistent: the buffer is not written to the state dict.
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        self.original_inv_freq = self.inv_freq

    @torch.no_grad()
    @dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
    def forward(self, x, position_ids):
        """Return ``(cos, sin)`` for ``position_ids``, cast to ``x.dtype``.

        ``x`` is only consulted for its device and dtype. ``position_ids`` is
        indexed as a 2-D tensor whose first dimension is the batch.
        """
        batch = position_ids.shape[0]
        inv_freq_expanded = self.inv_freq[None, :, None].float()
        inv_freq_expanded = inv_freq_expanded.expand(batch, -1, 1).to(x.device)
        position_ids_expanded = position_ids[:, None, :].float()

        # autocast is entered on "cpu" for mps (and for any non-string type).
        device_type = x.device.type
        if not isinstance(device_type, str) or device_type == "mps":
            device_type = "cpu"

        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
            angles = inv_freq_expanded.float() @ position_ids_expanded.float()
            angles = angles.transpose(1, 2)
            emb = torch.cat((angles, angles), dim=-1)
            cos = emb.cos() * self.attention_scaling
            sin = emb.sin() * self.attention_scaling

        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
module.weight.data.fill_(1.0) + + +@auto_docstring +class Qwen3MoeModel(Qwen3MoePreTrainedModel): + def __init__(self, config: Qwen3MoeConfig): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = VocabParallelEmbedding( + config.vocab_size, config.hidden_size, self.padding_idx + ) + self.layers = nn.ModuleList( + [ + Qwen3MoeDecoderLayer(config, layer_idx) + for layer_idx in range(config.num_hidden_layers) + ] + ) + self.norm = Qwen3MoeRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.rotary_emb = Qwen3MoeRotaryEmbedding(config=config) + self.gradient_checkpointing = False + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + @can_return_tuple + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[list[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + output_router_logits: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + **flash_attn_kwargs: Unpack[FlashAttentionKwargs], + ) -> MoeModelOutputWithPast: + output_attentions = ( + output_attentions + if output_attentions is not None + else self.config.output_attentions + ) + output_router_logits = ( + output_router_logits + if output_router_logits is not None + else self.config.output_router_logits + ) + output_hidden_states = ( + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + layers_to_output_hidden_states = flash_attn_kwargs.pop( + "layers_to_output_hidden_states", None + ) + 
use_cache = use_cache if use_cache is not None else self.config.use_cache + + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError( + "You must specify exactly one of input_ids or inputs_embeds" + ) + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + if use_cache and past_key_values is None: + past_key_values = DynamicCache() + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + if cache_position is None: + past_seen_tokens = ( + past_key_values.get_seq_length() if past_key_values is not None else 0 + ) + cache_position = torch.arange( + past_seen_tokens, + past_seen_tokens + inputs_embeds.shape[1], + device=inputs_embeds.device, + ) + if position_ids is None: + position_ids = cache_position.unsqueeze(0) + + mask_function = ( + create_causal_mask + if self.config.sliding_window is None + else create_sliding_window_causal_mask + ) + causal_mask = mask_function( + config=self.config, + input_embeds=inputs_embeds, + attention_mask=attention_mask, + cache_position=cache_position, + past_key_values=past_key_values, + position_ids=position_ids, + ) + + hidden_states = inputs_embeds + + # create position embeddings to be shared across the decoder layers + position_embeddings = self.rotary_emb(hidden_states, position_ids) + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + all_router_logits = () if output_router_logits else None + + for idx, decoder_layer in enumerate(self.layers): + layer_outputs = decoder_layer( + hidden_states, + attention_mask=causal_mask, + position_ids=position_ids, + past_key_value=past_key_values, + output_attentions=output_attentions, + output_router_logits=output_router_logits, + use_cache=use_cache, + cache_position=cache_position, + 
def load_balancing_loss_func(
    gate_logits: Union[torch.Tensor, tuple[torch.Tensor], None],
    num_experts: Optional[int] = None,
    top_k=2,
    attention_mask: Optional[torch.Tensor] = None,
) -> Union[torch.Tensor, int]:
    r"""
    Computes auxiliary load balancing loss as in Switch Transformer - implemented in Pytorch.

    See Switch Transformer (https://huggingface.co/papers/2101.03961) for more details. This function implements the loss
    function presented in equations (4) - (6) of the paper. It aims at penalizing cases where the routing between
    experts is too unbalanced.

    Args:
        gate_logits:
            Logits from the `gate`, should be a tuple of per-layer tensors of
            shape [batch_size X sequence_length, num_experts]. `None` entries
            (layers without a router) are skipped. Anything that is not a tuple
            yields a loss of `0`.
        num_experts:
            Number of experts. When `None`, it is inferred from the last dimension of the gate logits.
        top_k:
            The number of experts to route per-token, can be also interpreted as the `top-k` routing
            parameter.
        attention_mask (`torch.Tensor`, *optional*):
            The attention_mask used in forward function
            shape [batch_size X sequence_length] if not None.

    Returns:
        The auxiliary loss as a scalar tensor, or the integer `0` when there are no router logits to evaluate.
    """
    if not isinstance(gate_logits, tuple):
        return 0

    # Dense (non-MoE) decoder layers contribute `None` instead of router
    # logits; drop them so they do not break the concatenation below.
    layer_gates = [layer_gate for layer_gate in gate_logits if layer_gate is not None]
    if not layer_gates:
        return 0

    compute_device = layer_gates[0].device
    concatenated_gate_logits = torch.cat(
        [layer_gate.to(compute_device) for layer_gate in layer_gates], dim=0
    )

    if num_experts is None:
        # The router emits one logit per expert.
        num_experts = concatenated_gate_logits.shape[-1]

    routing_weights = torch.nn.functional.softmax(concatenated_gate_logits, dim=-1)

    _, selected_experts = torch.topk(routing_weights, top_k, dim=-1)

    expert_mask = torch.nn.functional.one_hot(selected_experts, num_experts)

    if attention_mask is None:
        # Compute the percentage of tokens routed to each experts
        tokens_per_expert = torch.mean(expert_mask.float(), dim=0)

        # Compute the average probability of routing to these experts
        router_prob_per_expert = torch.mean(routing_weights, dim=0)
    else:
        batch_size, sequence_length = attention_mask.shape
        # Number of stacked router outputs (only layers that have a router).
        num_hidden_layers = concatenated_gate_logits.shape[0] // (
            batch_size * sequence_length
        )

        # Compute the mask that masks all padding tokens as 0 with the same shape of expert_mask
        expert_attention_mask = (
            attention_mask[None, :, :, None, None]
            .expand(
                (num_hidden_layers, batch_size, sequence_length, top_k, num_experts)
            )
            .reshape(-1, top_k, num_experts)
            .to(compute_device)
        )

        # Compute the percentage of tokens routed to each experts
        tokens_per_expert = torch.sum(
            expert_mask.float() * expert_attention_mask, dim=0
        ) / torch.sum(expert_attention_mask, dim=0)

        # Compute the mask that masks all padding tokens as 0 with the same shape of tokens_per_expert
        router_per_expert_attention_mask = (
            attention_mask[None, :, :, None]
            .expand((num_hidden_layers, batch_size, sequence_length, num_experts))
            .reshape(-1, num_experts)
            .to(compute_device)
        )

        # Compute the average probability of routing to these experts
        router_prob_per_expert = torch.sum(
            routing_weights * router_per_expert_attention_mask, dim=0
        ) / torch.sum(router_per_expert_attention_mask, dim=0)

    overall_loss = torch.sum(tokens_per_expert * router_prob_per_expert.unsqueeze(0))
    return overall_loss * num_experts
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Example: + + ```python + >>> from transformers import AutoTokenizer, Qwen3MoeForCausalLM + + >>> model = Qwen3MoeForCausalLM.from_pretrained("Qwen/Qwen3-MoE-15B-A2B") + >>> tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-MoE-15B-A2B") + + >>> prompt = "Hey, are you conscious? Can you talk to me?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." + ```""" + + output_attentions = ( + output_attentions + if output_attentions is not None + else self.config.output_attentions + ) + output_router_logits = ( + output_router_logits + if output_router_logits is not None + else self.config.output_router_logits + ) + + output_hidden_states = ( + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs: MoeModelOutputWithPast = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + output_router_logits=output_router_logits, + cache_position=cache_position, + **kwargs, + ) + + hidden_states = outputs.last_hidden_state + # Only compute necessary logits, and do not upcast them to float if we are not computing the loss + slice_indices = ( + 
slice(-logits_to_keep, None) + if isinstance(logits_to_keep, int) + else logits_to_keep + ) + logits = self.lm_head(hidden_states[:, slice_indices, :], gather_output=True) + + loss = None + if labels is not None: + loss = self.loss_function(logits, labels, self.vocab_size, **kwargs) + + aux_loss = None + if output_router_logits: + aux_loss = load_balancing_loss_func( + outputs.router_logits, + self.num_experts, + self.num_experts_per_tok, + attention_mask, + ) + if labels is not None and aux_loss != 0: + loss += self.router_aux_loss_coef * aux_loss.to( + loss.device + ) # make sure to reside in the same device + + return MoeCausalLMOutputWithPast( + loss=loss, + aux_loss=aux_loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + router_logits=outputs.router_logits, + ) diff --git a/idea1/specforge/modeling/target/dflash_target_model.py b/idea1/specforge/modeling/target/dflash_target_model.py new file mode 100644 index 0000000000000000000000000000000000000000..0df938239f125b14c8ada68ed456fc78c6011a3d --- /dev/null +++ b/idea1/specforge/modeling/target/dflash_target_model.py @@ -0,0 +1,315 @@ +from abc import ABC, abstractmethod +from dataclasses import dataclass +from typing import List, Optional + +import torch +import torch.distributed as dist +import torch.nn as nn +from sglang.srt.configs.model_config import ModelConfig +from sglang.srt.managers.schedule_batch import Req, ScheduleBatch +from sglang.srt.managers.scheduler import Scheduler +from sglang.srt.mem_cache.cache_init_params import CacheInitParams +from sglang.srt.mem_cache.radix_cache import RadixCache +from sglang.srt.model_executor.forward_batch_info import CaptureHiddenMode, ForwardBatch +from sglang.srt.sampling.sampling_params import SamplingParams +from sglang.srt.server_args import ServerArgs +from sglang.srt.speculative.spec_info import SpeculativeAlgorithm +from sglang.srt.utils import require_mlp_sync, 
@dataclass
class DFlashTargetOutput:
    """Tensors handed from the target model to DFlash training."""

    # [batch, seq_len, hidden_size]
    hidden_states: torch.Tensor
    # [batch, seq_len]
    input_ids: torch.Tensor
    # [batch, seq_len]
    attention_mask: torch.Tensor
    # [batch, seq_len]
    loss_mask: torch.Tensor


class DFlashTargetModel(ABC):
    """Interface that every DFlash target model backend implements.

    Subclasses provide `from_pretrained` and `generate_dflash_data`; the
    base class only keeps track of the layers selected for capture.
    """

    def __init__(self):
        # No layers are selected until `set_capture_layers` is called.
        self.capture_layer_ids = None

    @classmethod
    @abstractmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path: str,
        torch_dtype: torch.dtype = None,
        device: str = None,
        cache_dir: Optional[str] = None,
        **kwargs,
    ) -> "DFlashTargetModel":
        """Build a backend instance from a pretrained model name or path."""

    @abstractmethod
    def generate_dflash_data(
        self,
        input_ids: torch.Tensor,
        attention_mask: torch.Tensor,
        loss_mask: torch.Tensor,
    ) -> DFlashTargetOutput:
        """Produce the context hidden states used for DFlash training."""

    def set_capture_layers(self, layer_ids: List[int]) -> None:
        """Remember the ids of the layers whose hidden states to capture.

        The list is stored as given (no copy is made).
        """
        self.capture_layer_ids = layer_ids
dist.get_rank(get_tp_group()) + moe_ep_rank = tp_rank // (server_args.tp_size // server_args.ep_size) + model_config = ModelConfig.from_server_args(server_args) + + model_runner = SGLangRunner( + model_config=model_config, + mem_fraction_static=server_args.mem_fraction_static, + gpu_id=torch.cuda.current_device(), + tp_rank=dist.get_rank(get_tp_group()), + tp_size=server_args.tp_size, + moe_ep_rank=moe_ep_rank, + moe_ep_size=server_args.ep_size, + pp_rank=0, + pp_size=1, + server_args=server_args, + nccl_port=None, + ) + return cls(model_runner) + + def set_capture_layers(self, layer_ids: List[int]) -> None: + super().set_capture_layers(layer_ids) + if hasattr(self.model_runner.model, "set_eagle3_layers_to_capture"): + self.model_runner.model.set_eagle3_layers_to_capture(layer_ids) + print(self.model_runner.model.model.layers_to_capture) + + @torch.no_grad + def _extend(self, reqs): + cache_params = CacheInitParams( + disable=False, + req_to_token_pool=self.model_runner.req_to_token_pool, + token_to_kv_pool_allocator=self.model_runner.token_to_kv_pool_allocator, + page_size=self.model_runner.server_args.page_size, + ) + tree_cache = RadixCache(cache_params) + + batch = ScheduleBatch.init_new( + reqs=reqs, + req_to_token_pool=self.model_runner.req_to_token_pool, + token_to_kv_pool_allocator=self.model_runner.token_to_kv_pool_allocator, + tree_cache=tree_cache, + model_config=self.model_runner.model_config, + enable_overlap=False, + spec_algorithm=SpeculativeAlgorithm.NONE, + ) + batch.prepare_for_extend() + + if require_mlp_sync(self.model_runner.server_args): + Scheduler.prepare_mlp_sync_batch_raw( + batch, + dp_size=self.model_runner.server_args.dp_size, + attn_tp_size=1, + tp_group=self.model_runner.tp_group, + get_idle_batch=None, + disable_cuda_graph=self.model_runner.server_args.disable_cuda_graph, + spec_algorithm=SpeculativeAlgorithm.NONE, + speculative_num_draft_tokens=None, + require_mlp_tp_gather=require_mlp_tp_gather( + self.model_runner.server_args + ), 
+ disable_overlap_schedule=self.model_runner.server_args.disable_overlap_schedule, + offload_tags=set(), + ) + + model_worker_batch = batch.get_model_worker_batch() + forward_batch = ForwardBatch.init_new(model_worker_batch, self.model_runner) + forward_batch.capture_hidden_mode = CaptureHiddenMode.FULL + + output = self.model_runner.forward(forward_batch) + if hasattr(output, "logits_output"): + output = output.logits_output + + input_lens = [len(req.origin_input_ids) for req in reqs] + if ( + hasattr(output, "aux_hidden_states") + and output.aux_hidden_states is not None + ): + hidden_states_list = torch.split( + output.aux_hidden_states, input_lens, dim=0 + ) + elif hasattr(output, "hidden_states") and output.hidden_states is not None: + hidden_states_list = torch.split(output.hidden_states, input_lens, dim=0) + else: + raise ValueError("SGLang output does not contain hidden states.") + + self.model_runner.req_to_token_pool.clear() + self.model_runner.token_to_kv_pool_allocator.clear() + + return hidden_states_list + + @torch.no_grad() + def generate_dflash_data( + self, + input_ids: torch.Tensor, + attention_mask: torch.Tensor, + loss_mask: torch.Tensor, + ) -> DFlashTargetOutput: + sampling_params = SamplingParams(temperature=0, max_new_tokens=1) + reqs, data_cache = [], [] + + if isinstance(input_ids, torch.Tensor): + input_ids_list = torch.split(input_ids, 1, dim=0) + attn_mask_list = torch.split(attention_mask, 1, dim=0) + loss_mask_list = torch.split(loss_mask, 1, dim=0) + + for idx, (curr_ids, curr_attn, curr_loss) in enumerate( + zip(input_ids_list, attn_mask_list, loss_mask_list) + ): + req = Req( + rid=str(idx), + origin_input_text="", + origin_input_ids=curr_ids.view(-1).tolist(), + sampling_params=sampling_params, + ) + req.fill_ids = req.origin_input_ids + req.extend_input_len = len(req.fill_ids) - len(req.prefix_indices) + data_cache.append((curr_ids, curr_attn, curr_loss)) + reqs.append(req) + + hidden_states_list = self._extend(reqs) + + # Stack 
back to batch + hidden_states = torch.cat([h.unsqueeze(0) for h in hidden_states_list], dim=0) + input_ids = torch.cat([d[0] for d in data_cache], dim=0) + attention_mask = torch.cat([d[1] for d in data_cache], dim=0) + loss_mask = torch.cat([d[2] for d in data_cache], dim=0) + + return DFlashTargetOutput( + hidden_states=hidden_states, + input_ids=input_ids, + attention_mask=attention_mask, + loss_mask=loss_mask, + ) + + +class HFDFlashTargetModel(DFlashTargetModel): + def __init__(self, model: nn.Module): + super().__init__() + self.model = model + + @classmethod + def from_pretrained( + cls, + pretrained_model_name_or_path: str, + torch_dtype: torch.dtype = None, + device: str = None, + cache_dir: Optional[str] = None, + trust_remote_code: bool = True, + **kwargs, + ) -> "HFDFlashTargetModel": + + target_model = AutoModelForCausalLM.from_pretrained( + pretrained_model_name_or_path, + torch_dtype=torch_dtype, + cache_dir=cache_dir, + output_hidden_states=True, + trust_remote_code=trust_remote_code, + **kwargs, + ).eval() + + if device: + target_model = target_model.to(device) + + return cls(target_model) + + @torch.no_grad() + def generate_dflash_data( + self, + input_ids: torch.Tensor, + attention_mask: torch.Tensor, + loss_mask: torch.Tensor, + ) -> DFlashTargetOutput: + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + output_hidden_states=True, + use_cache=False, + ) + + # hidden_states[0] = embedding output; hidden_states[i+1] = layer i output + offset = 1 + selected = [] + if self.capture_layer_ids is not None: + for idx in self.capture_layer_ids: + selected.append(outputs.hidden_states[idx + offset]) + hidden_states = torch.cat(selected, dim=-1) + else: + hidden_states = outputs.hidden_states[-1] + + return DFlashTargetOutput( + hidden_states=hidden_states, + input_ids=input_ids, + attention_mask=attention_mask, + loss_mask=loss_mask, + ) + + +def get_dflash_target_model( + pretrained_model_name_or_path: str, + backend: str = 
"sglang", + torch_dtype: torch.dtype = None, + device: str = None, + cache_dir: Optional[str] = None, + **kwargs, +) -> DFlashTargetModel: + if backend == "sglang": + return SGLangDFlashTargetModel.from_pretrained( + pretrained_model_name_or_path=pretrained_model_name_or_path, + torch_dtype=torch_dtype, + device=device, + cache_dir=cache_dir, + **kwargs, + ) + elif backend == "hf": + return HFDFlashTargetModel.from_pretrained( + pretrained_model_name_or_path=pretrained_model_name_or_path, + torch_dtype=torch_dtype, + device=device, + cache_dir=cache_dir, + **kwargs, + ) + else: + raise ValueError(f"Invalid backend: {backend}") diff --git a/idea1/specforge/modeling/target/eagle3_target_model.py b/idea1/specforge/modeling/target/eagle3_target_model.py new file mode 100644 index 0000000000000000000000000000000000000000..2acf50ba5321cc7b35b735612d8c32da3a9d6f12 --- /dev/null +++ b/idea1/specforge/modeling/target/eagle3_target_model.py @@ -0,0 +1,873 @@ +from abc import ABC, abstractmethod +from dataclasses import dataclass +from typing import List, Optional, Tuple + +import sglang.srt.managers.mm_utils as mm_utils +import torch +import torch.distributed as dist +import torch.nn as nn +from sglang.srt.configs.model_config import ModelConfig +from sglang.srt.layers.rotary_embedding import MRotaryEmbedding +from sglang.srt.managers.mm_utils import ( + MultiModalityDataPaddingPatternMultimodalTokens, + init_mm_embedding_cache, +) +from sglang.srt.managers.schedule_batch import ( + Modality, + MultimodalDataItem, + MultimodalInputs, + Req, + ScheduleBatch, +) + +# - prepare_mlp_sync_batch_raw is now a module-level function, not a Scheduler method +from sglang.srt.managers.scheduler_dp_attn_mixin import prepare_mlp_sync_batch_raw +from sglang.srt.mem_cache.cache_init_params import CacheInitParams +from sglang.srt.mem_cache.radix_cache import RadixCache +from sglang.srt.model_executor.forward_batch_info import CaptureHiddenMode, ForwardBatch +from 
sglang.srt.multimodal.processors.base_processor import BaseMultimodalProcessor +from sglang.srt.sampling.sampling_params import SamplingParams +from sglang.srt.server_args import ServerArgs +from sglang.srt.speculative.spec_info import SpeculativeAlgorithm +from sglang.srt.utils import require_mlp_sync, require_mlp_tp_gather +from transformers import AutoModelForCausalLM + +from specforge.distributed import get_tp_device_mesh, get_tp_group +from specforge.utils import padding + +from .sglang_backend import SGLangRunner, wrap_eagle3_logits_processors_in_module +from .sglang_backend.utils import LogitsProcessorForEAGLE3 + + +@dataclass +class Eagle3TargetOutput: + hidden_states: torch.Tensor + target: torch.Tensor + loss_mask: torch.Tensor + input_ids: torch.Tensor + attention_mask: torch.Tensor + last_hidden_states: Optional[torch.Tensor] = None + + +class Eagle3TargetModel(ABC): + """ + This offers a layer of abstraction for the target model backend. The user can choose different backends to suit their needs: + 1. SGLang backend: for the mainstream model support with the fastest inference speed + 2. HuggingFace backend: for models that are not supported by SGLang but can be loaded by HuggingFace. + 3. Custom backend: for models with customized architecture and inference plan. + """ + + def __init__(self): + self.aux_hidden_states_layers = None + + @classmethod + @abstractmethod + def from_pretrained( + cls, + pretrained_model_name_or_path: str, + torch_dtype: torch.dtype = None, + device: str = None, + cache_dir: Optional[str] = None, + **kwargs, + ) -> "Eagle3TargetModel": + """ + Initialize the target model backend from a pretrained model path. + """ + + @abstractmethod + def generate_eagle3_data( + self, + input_ids: torch.Tensor, + attention_mask: torch.Tensor, + loss_mask: torch.Tensor, + ) -> Eagle3TargetOutput: + """ + Generate the eagle3 data from the target model. 
+ """ + + def set_aux_hidden_states_layers( + self, aux_hidden_states_layers: Optional[List[int]] = None + ) -> None: + """ + Set the layers to capture the aux hidden states from the target model outputs. + """ + if aux_hidden_states_layers is None: + if hasattr(self.model.config, "num_hidden_layers"): + num_layers = self.model.config.num_hidden_layers + else: + raise ValueError( + f"Failed to set aux hidden states layers as model config {self.model.config} does not have num_hidden_layers" + ) + aux_hidden_states_layers = [ + 1, + num_layers // 2 - 1, + num_layers - 4, + ] + self.aux_hidden_states_layers = aux_hidden_states_layers + assert ( + len(self.aux_hidden_states_layers) == 3 + ), "aux_hidden_states_layers is expected to be 3 layers for EAGLE3" + + +class HFEagle3TargetModel(Eagle3TargetModel): + + def __init__(self, model: nn.Module): + super().__init__() + self.model = model + + @classmethod + def from_pretrained( + cls, + pretrained_model_name_or_path: str, + torch_dtype: torch.dtype = None, + device: str = None, + cache_dir: Optional[str] = None, + **kwargs, + ) -> "HFEagle3TargetModel": + """ + Initialize the HuggingFace target model backend from a pretrained model path. + """ + tp_size = get_tp_group().size() + + if tp_size > 1: + device_kwargs = { + "tp_plan": "auto", + "tp_size": tp_size, + "device_mesh": get_tp_device_mesh(), + } + else: + device_kwargs = { + "device_map": device, + } + + target_model = AutoModelForCausalLM.from_pretrained( + pretrained_model_name_or_path, + torch_dtype=torch_dtype, + cache_dir=cache_dir, + **device_kwargs, + **kwargs, + ) + return cls(target_model) + + def _get_transformer_layers(self): + """ + Helper to find the module list containing the transformer layers. + Adapts to common architectures (Llama, Qwen, Mistral, OPT, etc.) 
+ """ + if hasattr(self.model, "model") and hasattr(self.model.model, "layers"): + return self.model.model.layers + elif hasattr(self.model, "layers"): + return self.model.layers + elif hasattr(self.model, "transformer") and hasattr( + self.model.transformer, "h" + ): + return self.model.transformer.h + else: + raise ValueError( + "Could not locate transformer layers in the model architecture to register hooks." + ) + + @torch.no_grad() + def generate_eagle3_data( + self, + input_ids: torch.Tensor, + attention_mask: torch.Tensor, + loss_mask: torch.Tensor, + ) -> Eagle3TargetOutput: + """ + Optimized HF backend: + Instead of returning all hidden states (memory heavy), we use forward hooks + to capture only the specific layers required by Eagle3. + """ + captured_states = {} + handles = [] + + def get_hook(layer_idx): + def hook(module, input, output): + # HF outputs for layers are usually tuples (hidden_states, present_key_value, ...) + # We only need the hidden_states (first element) + if isinstance(output, tuple): + hidden = output[0] + else: + hidden = output + captured_states[layer_idx] = hidden + + return hook + + # Locate the transformer layers ModuleList + layers = self._get_transformer_layers() + + target_indices = self.aux_hidden_states_layers + + # Register hooks + for idx in target_indices: + # Ensure index is within bounds + if 0 <= idx < len(layers): + handles.append(layers[idx].register_forward_hook(get_hook(idx))) + else: + raise ValueError( + f"Layer index {idx} out of bounds for model with {len(layers)} layers." 
+ ) + + try: + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + output_hidden_states=False, + output_attentions=False, + output_router_logits=False, + use_cache=False, + ) + target = outputs.logits + finally: + # Always remove hooks to prevent memory leaks or side effects on subsequent calls + for handle in handles: + handle.remove() + + # Verify we captured everything + if len(captured_states) != 3: + raise RuntimeError( + f"Expected to capture 3 layers, but captured {len(captured_states)}" + ) + + # Extract in the correct order + hidden_states0 = captured_states[target_indices[0]] + hidden_states1 = captured_states[target_indices[1]] + hidden_states2 = captured_states[target_indices[2]] + + hidden_states = torch.cat( + (hidden_states0, hidden_states1, hidden_states2), dim=-1 + ) + + # apply pading + target = outputs.logits + target = padding(target, left=False) + input_ids = padding(input_ids, left=False) + loss_mask = loss_mask[..., None].to(target.device) + + return Eagle3TargetOutput( + hidden_states=hidden_states, + target=target, + loss_mask=loss_mask, + input_ids=input_ids, + attention_mask=attention_mask, + ) + + +class SGLangEagle3TargetModel(Eagle3TargetModel): + + def __init__(self, model_runner: SGLangRunner, hf_config=None): + super().__init__() + self.model_runner = model_runner + self.hf_config = hf_config + + # VLM-specific attributes (initialized from hf_config if available) + self._init_vlm_attributes() + + def _init_vlm_attributes(self): + """Initialize VLM-specific attributes from hf_config for models like Qwen2.5-VL""" + if self.hf_config is None: + self.is_vlm = False + return + + # Check if this is a VLM model by looking for vision_config + self.is_vlm = hasattr(self.hf_config, "vision_config") + + if not self.is_vlm: + return + + init_mm_embedding_cache(1024 * 1024 * 512) + # Model type (e.g., "qwen2_5_vl", "qwen2_vl") + self.model_type = getattr(self.hf_config, "model_type", None) + + # Vision config 
attributes + vision_config = self.hf_config.vision_config + self.spatial_merge_size = getattr(vision_config, "spatial_merge_size", 2) + self.tokens_per_second = getattr(vision_config, "tokens_per_second", None) + + # Special token IDs from hf_config + self.image_token_id = getattr(self.hf_config, "image_token_id", None) + self.video_token_id = getattr(self.hf_config, "video_token_id", None) + self.vision_start_token_id = getattr( + self.hf_config, "vision_start_token_id", None + ) + self.vision_end_token_id = getattr(self.hf_config, "vision_end_token_id", None) + + @classmethod + def from_pretrained( + cls, + pretrained_model_name_or_path: str, + torch_dtype: torch.dtype = None, + device: str = None, + cache_dir: Optional[str] = None, + trust_remote_code: bool = False, + **kwargs, + ) -> "SGLangEagle3TargetModel": + tp_size = dist.get_world_size(get_tp_group()) + # NOTE: sglang 0.5.9 requires dtype to be non-None + # If torch_dtype is None, use "auto" to let sglang decide the dtype + dtype_arg = torch_dtype if torch_dtype is not None else "auto" + server_args = ServerArgs( + model_path=pretrained_model_name_or_path, + trust_remote_code=trust_remote_code, + dtype=dtype_arg, + enable_return_hidden_states=True, + disable_cuda_graph=True, # we use piecewise cuda graph for prefill instead + tp_size=tp_size, + pp_size=1, + **kwargs, + ) + + tp_rank = dist.get_rank(get_tp_group()) + moe_ep_rank = tp_rank // (server_args.tp_size // server_args.ep_size) + model_config = ModelConfig.from_server_args(server_args) + # - Added is_draft_worker=False parameter (new in 0.5.9) + # - Other new parameters (dp_rank, attn_cp_rank, moe_dp_rank, etc.) 
use default values + model_runner = SGLangRunner( + model_config=model_config, + mem_fraction_static=server_args.mem_fraction_static, + gpu_id=torch.cuda.current_device(), + tp_rank=dist.get_rank(get_tp_group()), + tp_size=server_args.tp_size, + moe_ep_rank=moe_ep_rank, + moe_ep_size=server_args.ep_size, + pp_rank=0, + pp_size=1, + server_args=server_args, + nccl_port=None, + is_draft_worker=False, + ) + wrap_eagle3_logits_processors_in_module( + model_runner.model, return_full_logits=False + ) + + # Get hf_config from model_config for VLM attributes + hf_config = getattr(model_config, "hf_config", None) + + return cls(model_runner, hf_config=hf_config) + + def set_aux_hidden_states_layers( + self, aux_hidden_states_layers: Optional[List[int]] = None + ) -> None: + self.model_runner.model.set_eagle3_layers_to_capture(aux_hidden_states_layers) + + @torch.no_grad + def _extend( + self, + reqs, + capture_aux_hidden_states: bool = True, + return_last_hidden_states: bool = False, + return_logits: bool = False, + ): + # set the logits processor for the model runner + for name, module in self.model_runner.model.named_modules(): + if isinstance(module, LogitsProcessorForEAGLE3): + module.return_last_hidden_states = return_last_hidden_states + module.return_logits = return_logits + + cache_params = CacheInitParams( + disable=False, + req_to_token_pool=self.model_runner.req_to_token_pool, + token_to_kv_pool_allocator=self.model_runner.token_to_kv_pool_allocator, + page_size=self.model_runner.server_args.page_size, + ) + tree_cache = RadixCache(cache_params) + + batch = ScheduleBatch.init_new( + reqs=reqs, + req_to_token_pool=self.model_runner.req_to_token_pool, + token_to_kv_pool_allocator=self.model_runner.token_to_kv_pool_allocator, + tree_cache=tree_cache, + model_config=self.model_runner.model_config, + enable_overlap=False, + spec_algorithm=SpeculativeAlgorithm.NONE, + ) + batch.prepare_for_extend() + self._maybe_prepare_mlp_sync_batch(batch) + model_worker_batch = 
batch.get_model_worker_batch() + forward_batch = ForwardBatch.init_new(model_worker_batch, self.model_runner) + forward_batch.capture_hidden_mode = CaptureHiddenMode.FULL + eagle3_output = self.model_runner.forward(forward_batch) + aux_hidden_states_list = None + input_lens = [len(req.origin_input_ids) for req in reqs] + + if return_logits: + if hasattr(eagle3_output, "logits_output"): + raw_logits = eagle3_output.logits_output.logits + else: + raw_logits = eagle3_output.logits + logits = torch.split(raw_logits, input_lens, dim=0) + else: + logits = [None] * len(reqs) + + if capture_aux_hidden_states: + raw_aux_hidden_states = ( + eagle3_output.logits_output.aux_hidden_states + ) # concat hidden shape: (total_tokens, H*3) + aux_hidden_states_list = torch.split( + raw_aux_hidden_states, input_lens, dim=0 + ) + else: + aux_hidden_states_list = [None] * len(reqs) + + if return_last_hidden_states: + last_hidden_states = torch.split( + eagle3_output.logits_output.last_hidden_states, input_lens, dim=0 + ) + else: + last_hidden_states = [None] * len(reqs) + + # TODO: can we not clear? 
+ self.model_runner.req_to_token_pool.clear() + self.model_runner.token_to_kv_pool_allocator.clear() + return logits, aux_hidden_states_list, last_hidden_states + + def _maybe_prepare_mlp_sync_batch(self, batch: ScheduleBatch): + if require_mlp_sync(self.model_runner.server_args): + # - Removed spec_algorithm and speculative_num_draft_tokens parameters + # - Added attn_cp_size parameter + # - Changed from Scheduler.prepare_mlp_sync_batch_raw to direct function call + prepare_mlp_sync_batch_raw( + batch, + dp_size=self.model_runner.server_args.dp_size, + attn_tp_size=1, + attn_cp_size=getattr(self.model_runner.server_args, "attn_cp_size", 1), + tp_group=self.model_runner.tp_group, + get_idle_batch=None, + disable_cuda_graph=self.model_runner.server_args.disable_cuda_graph, + require_mlp_tp_gather=require_mlp_tp_gather( + self.model_runner.server_args + ), + disable_overlap_schedule=self.model_runner.server_args.disable_overlap_schedule, + offload_tags=set(), + ) + + def extend( + self, + input_ids: torch.Tensor, + attention_mask: torch.Tensor, + loss_mask: torch.Tensor, + return_last_hidden_states: bool = False, + return_logits: bool = True, + ): + sampling_params = SamplingParams(temperature=0, max_new_tokens=1, top_k=1) + reqs, data_cache = [], [] + + if isinstance(input_ids, torch.Tensor): + input_ids = torch.split(input_ids, 1, dim=0) + attention_mask = torch.split(attention_mask, 1, dim=0) + loss_mask = torch.split(loss_mask, 1, dim=0) + + for idx, (input_id_, attention_mask_, loss_mask_) in enumerate( + zip( + input_ids, + attention_mask, + loss_mask, + ) + ): + req = Req( + rid=str(idx), + origin_input_text="", + origin_input_ids=input_id_.view(-1).tolist(), + sampling_params=sampling_params, + ) + req.fill_ids = req.origin_input_ids + req.extend_input_len = len(req.fill_ids) - len(req.prefix_indices) + req.logprob_start_len = len(req.origin_input_ids) - 1 + data_cache.append([input_id_, attention_mask_, loss_mask_]) + reqs.append(req) + + logits_list, 
aux_hidden_states_list, last_hidden_states_list = self._extend( + reqs, + capture_aux_hidden_states=True, + return_last_hidden_states=return_last_hidden_states, + return_logits=return_logits, + ) + + return data_cache, logits_list, aux_hidden_states_list, last_hidden_states_list + + def get_rope_index( + self, + input_ids: torch.Tensor, + image_grid_thw: Optional[torch.Tensor] = None, + video_grid_thw: Optional[torch.Tensor] = None, + second_per_grid_ts: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + """ + Get M-RoPE position indices for VLM models like Qwen2.5-VL. + + This is a wrapper around MRotaryEmbedding.get_rope_index that uses + the VLM-specific attributes initialized from hf_config. + + Args: + input_ids: (batch_size, seq_len) input token IDs + image_grid_thw: (num_images, 3) image grid dimensions (t, h, w) + video_grid_thw: (num_videos, 3) video grid dimensions (t, h, w) + second_per_grid_ts: Optional temporal information for videos + attention_mask: (batch_size, seq_len) attention mask + + Returns: + position_ids: (3, batch_size, seq_len) M-RoPE position IDs + rope_deltas: Optional position deltas for incremental decoding + """ + if not self.is_vlm: + raise ValueError("get_rope_index is only available for VLM models") + + from sglang.srt.layers.rotary_embedding import MRotaryEmbedding + + position_ids, rope_deltas = MRotaryEmbedding.get_rope_index( + spatial_merge_size=self.spatial_merge_size, + image_token_id=self.image_token_id, + video_token_id=self.video_token_id, + vision_start_token_id=self.vision_start_token_id, + model_type=self.model_type, + input_ids=input_ids, + image_grid_thw=image_grid_thw, + video_grid_thw=video_grid_thw, + second_per_grid_ts=second_per_grid_ts, + attention_mask=attention_mask, + tokens_per_second=self.tokens_per_second, + ) + + return position_ids, rope_deltas + + def extend_vlm( + self, + input_ids: torch.Tensor, + attention_mask: 
torch.Tensor, + loss_mask: torch.Tensor, + return_last_hidden_states: bool = False, + return_logits: bool = True, + pixel_values: Optional[List[torch.Tensor]] = None, + image_grid_thw: Optional[List[torch.Tensor]] = None, + ): + """ + Args: + input_ids: (batch_size, seq_len) or List of (1, seq_len) tensors + attention_mask: (batch_size, seq_len) or List of (1, seq_len) tensors + loss_mask: (batch_size, seq_len) or List of (1, seq_len) tensors + pixel_values: List of pixel_values tensors, one per sample in batch + image_grid_thw: List of image_grid_thw tensors, one per sample in batch + """ + mm_utils.embedding_cache.clear() + sampling_params = SamplingParams(temperature=0, max_new_tokens=1, top_k=1) + reqs, data_cache = [], [] + + # Split tensors if needed + if isinstance(input_ids, torch.Tensor): + batch_size = input_ids.shape[0] + input_ids = torch.split(input_ids, 1, dim=0) + attention_mask = torch.split(attention_mask, 1, dim=0) + loss_mask = torch.split(loss_mask, 1, dim=0) + else: + batch_size = len(input_ids) + # Process image_grid_thw - convert to list if needed + if image_grid_thw is None: + image_grid_thw = [None] * batch_size + elif not isinstance(image_grid_thw, (list, tuple)): + image_grid_thw = [image_grid_thw] + + # pixel_values is a single 2D tensor (total_patches, patch_dim) for Qwen2.5-VL + # We need to track offset and slice it based on image_grid_thw for each sample + pixel_values_offset = 0 # Track current offset in pixel_values + + for idx, (input_id_, attention_mask_, loss_mask_, image_grid_thw_) in enumerate( + zip( + input_ids, + attention_mask, + loss_mask, + image_grid_thw, + ) + ): + # Compute num_patches for this sample from image_grid_thw_ + # image_grid_thw_: (num_images, 3) where each row is (t, h, w) + if image_grid_thw_ is not None: + # Ensure image_grid_thw_ is 2D: (num_images, 3) + if image_grid_thw_.dim() == 1: + image_grid_thw_ = image_grid_thw_.unsqueeze(0) # (3,) -> (1, 3) + elif image_grid_thw_.dim() == 0: + raise 
ValueError( + f"image_grid_thw_ is 0-dim tensor, expected at least 1D. Value: {image_grid_thw_}" + ) + + # Calculate num_patches for this sample: sum(t * h * w) for all images + num_patches = ( + ( + image_grid_thw_[:, 0] + * image_grid_thw_[:, 1] + * image_grid_thw_[:, 2] + ) + .sum() + .item() + ) + num_patches = int(num_patches) + + # Slice pixel_values for this sample + pixel_value_ = pixel_values[ + pixel_values_offset : pixel_values_offset + num_patches + ] + pixel_values_offset += num_patches + else: + pixel_value_ = None + num_patches = 0 + + # Compute mrope positions for VLM models (e.g., Qwen2.5-VL) + input_id_flat = input_id_.view(-1) + + # Count image tokens + num_img_tokens = (input_id_flat == self.image_token_id).sum().item() + # print(f"[extend_vlm] num_img_tokens in input_ids: {num_img_tokens}") + + mrope_positions, mrope_position_delta = MRotaryEmbedding.get_rope_index( + spatial_merge_size=self.spatial_merge_size, + image_token_id=self.image_token_id, + video_token_id=self.video_token_id, + vision_start_token_id=self.vision_start_token_id, + model_type=self.model_type, + input_ids=input_id_flat.unsqueeze(0).cpu(), + image_grid_thw=( + image_grid_thw_.cpu() if image_grid_thw_ is not None else None + ), + tokens_per_second=self.tokens_per_second, + ) + + offset = BaseMultimodalProcessor.get_mm_items_offset( + input_id_flat, self.image_token_id + ) + mm_item = MultimodalDataItem( + modality=Modality.IMAGE, + feature=pixel_value_, # torch.Tensor: (num_patches, patch_dim) + pad_value=self.image_token_id, # Required for placeholder tensor creation + offsets=offset, # List of (start, end) tuples + ) + mm_item.set("image_grid_thw", image_grid_thw_.cpu()) + mm_item.set_pad_value() + mm_inputs = MultimodalInputs( + mm_items=[mm_item], + im_token_id=self.image_token_id, + im_start_id=self.vision_start_token_id, + im_end_id=self.vision_end_token_id, + mrope_positions=( + mrope_positions.squeeze(1) if mrope_positions is not None else None + ), + 
mrope_position_delta=mrope_position_delta, + ) + pattern = MultiModalityDataPaddingPatternMultimodalTokens() + input_id_list = pattern.pad_input_tokens( + input_id_.view(-1).tolist(), mm_inputs + ) + req = Req( + rid=str(idx), + origin_input_text="", + origin_input_ids=input_id_list, + sampling_params=sampling_params, + ) + req.fill_ids = req.origin_input_ids + req.extend_input_len = len(req.fill_ids) - len(req.prefix_indices) + req.logprob_start_len = len(req.origin_input_ids) - 1 + req.multimodal_inputs = mm_inputs + data_cache.append([input_id_, attention_mask_, loss_mask_]) + reqs.append(req) + + logits_list, aux_hidden_states_list, last_hidden_states_list = self._extend( + reqs, + capture_aux_hidden_states=True, + return_last_hidden_states=return_last_hidden_states, + return_logits=return_logits, + ) + + return data_cache, logits_list, aux_hidden_states_list, last_hidden_states_list + + @torch.no_grad() + def generate_eagle3_data( + self, + input_ids: torch.Tensor, + attention_mask: torch.Tensor, + loss_mask: torch.Tensor, + pixel_values: Optional[torch.Tensor] = None, + image_grid_thw: Optional[torch.Tensor] = None, + is_vlm: bool = False, + ) -> Eagle3TargetOutput: + """ + return: + data_for_draft: List[Dict[str, torch.Tensor]] of draft_batch_size, draft_micro_batch_size = 1 + - input_ids: (1, seq_len) + - attention_mask: (1, seq_len) + - loss_mask: (1, seq_len) + - target: (1, seq_len, vocab_size) or (1, seq_len, hidden_size) + - hidden_states: (1, seq_len, hidden_size) + - pixel_values: (patch_len, patch_width) + - image_grid_thw (batch_size, 3) + """ + if is_vlm: + data_cache, logits_list, aux_hidden_states_list, last_hidden_states_list = ( + self.extend_vlm( + input_ids, + attention_mask, + loss_mask, + return_last_hidden_states=False, + return_logits=True, + pixel_values=pixel_values, + image_grid_thw=image_grid_thw, + ) + ) + else: + data_cache, logits_list, aux_hidden_states_list, last_hidden_states_list = ( + self.extend( + input_ids, + 
attention_mask, + loss_mask, + return_last_hidden_states=False, + return_logits=True, + ) + ) + aux_hidden_states_out = [] + target_out = [] + loss_mask_out = [] + input_ids_out = [] + last_hidden_states_out = [] + + for idx, (data, logits, aux_hidden_states, last_hidden_states) in enumerate( + zip( + data_cache, logits_list, aux_hidden_states_list, last_hidden_states_list + ) + ): + aux_hidden_states_out.append(aux_hidden_states.unsqueeze(0)) + loss_mask_out.append(data[2]) + input_ids_out.append(data[0]) + + # when generating hidden states for offline training, we don't compute logits and only keep the last_hidden_states + # when training online, we don't keep the last_hidden_states and only keep the logits + if logits is not None: + target_out.append(logits.unsqueeze(0)) + else: + target_out.append(None) + + if last_hidden_states is not None: + last_hidden_states_out.append(last_hidden_states.unsqueeze(0)) + else: + last_hidden_states_out.append(None) + + aux_hidden_states_out = torch.cat(aux_hidden_states_out, dim=0) + + loss_mask_out = torch.cat(loss_mask_out, dim=0) + input_ids_out = torch.cat(input_ids_out, dim=0) + + if target_out[0] is not None: + target_out = torch.cat(target_out, dim=0) + else: + target_out = None + + if last_hidden_states_out[0] is not None: + last_hidden_states_out = torch.cat(last_hidden_states_out, dim=0) + else: + last_hidden_states_out = None + + target_out = padding(target_out, left=False) + input_ids_out = padding(input_ids_out, left=False) + loss_mask_out = loss_mask_out[..., None] + + return Eagle3TargetOutput( + hidden_states=aux_hidden_states_out, + target=target_out, + loss_mask=loss_mask_out, + input_ids=input_ids_out, + attention_mask=attention_mask, + last_hidden_states=last_hidden_states_out, + ) + + +class CustomEagle3TargetModel(Eagle3TargetModel): + + def __init__(self, model: nn.Module): + super().__init__() + self.model = model + + @classmethod + def from_pretrained( + cls, + pretrained_model_name_or_path: str, + 
torch_dtype: torch.dtype = None, + device: str = None, + cache_dir: Optional[str] = None, + **kwargs, + ) -> "CustomEagle3TargetModel": + from specforge.modeling.auto import AutoDistributedTargetModel + + target_model = AutoDistributedTargetModel.from_pretrained( + pretrained_model_name_or_path=pretrained_model_name_or_path, + torch_dtype=torch_dtype, + cache_dir=cache_dir, + device=device, + **kwargs, + ) + return cls(target_model) + + @torch.no_grad() + def generate_eagle3_data( + self, + input_ids: torch.Tensor, + attention_mask: torch.Tensor, + loss_mask: torch.Tensor, + ) -> Eagle3TargetOutput: + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + output_hidden_states=True, + layers_to_output_hidden_states=self.aux_hidden_states_layers, + use_cache=False, + ) + + # For custom backends, the model implementation is responsible for only + # returning the requested layers in `outputs.hidden_states`. + hidden_states = torch.cat(outputs.hidden_states, dim=-1) + + target = outputs.logits + target = padding(target, left=False) + input_ids = padding(input_ids, left=False) + loss_mask = loss_mask[..., None].to(target.device) + + return Eagle3TargetOutput( + hidden_states=hidden_states, + target=target, + loss_mask=loss_mask, + input_ids=input_ids, + attention_mask=attention_mask, + ) + + +def get_eagle3_target_model( + pretrained_model_name_or_path: str, + backend: str = "sglang", + torch_dtype: torch.dtype = None, + device: str = None, + cache_dir: Optional[str] = None, + **kwargs, +) -> Eagle3TargetModel: + if backend == "sglang": + return SGLangEagle3TargetModel.from_pretrained( + pretrained_model_name_or_path=pretrained_model_name_or_path, + torch_dtype=torch_dtype, + device=device, + cache_dir=cache_dir, + **kwargs, + ) + elif backend == "hf": + return HFEagle3TargetModel.from_pretrained( + pretrained_model_name_or_path=pretrained_model_name_or_path, + torch_dtype=torch_dtype, + device=device, + cache_dir=cache_dir, + **kwargs, + ) + 
elif backend == "custom": + return CustomEagle3TargetModel.from_pretrained( + pretrained_model_name_or_path=pretrained_model_name_or_path, + torch_dtype=torch_dtype, + device=device, + cache_dir=cache_dir, + **kwargs, + ) + else: + raise ValueError(f"Invalid backend: {backend}") diff --git a/idea1/specforge/modeling/target/sglang_backend/__init__.py b/idea1/specforge/modeling/target/sglang_backend/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..0e02ab7b3950bf2405141a61a89c071f42a9a2a7 --- /dev/null +++ b/idea1/specforge/modeling/target/sglang_backend/__init__.py @@ -0,0 +1,4 @@ +from .model_runner import SGLangRunner +from .utils import wrap_eagle3_logits_processors_in_module + +__all__ = ["SGLangRunner", "wrap_eagle3_logits_processors_in_module"] diff --git a/idea1/specforge/modeling/target/sglang_backend/__pycache__/__init__.cpython-311.pyc b/idea1/specforge/modeling/target/sglang_backend/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..577c00b4121181e9c666617669ce3b93adec5401 Binary files /dev/null and b/idea1/specforge/modeling/target/sglang_backend/__pycache__/__init__.cpython-311.pyc differ diff --git a/idea1/specforge/modeling/target/sglang_backend/__pycache__/model_runner.cpython-311.pyc b/idea1/specforge/modeling/target/sglang_backend/__pycache__/model_runner.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..08fc6510dbb312381bd53ea11bb1a44fc738241c Binary files /dev/null and b/idea1/specforge/modeling/target/sglang_backend/__pycache__/model_runner.cpython-311.pyc differ diff --git a/idea1/specforge/modeling/target/sglang_backend/__pycache__/patch.cpython-311.pyc b/idea1/specforge/modeling/target/sglang_backend/__pycache__/patch.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..13d4051850628bebfb2526424b9fbce25351034e Binary files /dev/null and 
b/idea1/specforge/modeling/target/sglang_backend/__pycache__/patch.cpython-311.pyc differ diff --git a/idea1/specforge/modeling/target/sglang_backend/__pycache__/utils.cpython-311.pyc b/idea1/specforge/modeling/target/sglang_backend/__pycache__/utils.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4732033f69487649122117efae5412d0437514fb Binary files /dev/null and b/idea1/specforge/modeling/target/sglang_backend/__pycache__/utils.cpython-311.pyc differ diff --git a/idea1/specforge/modeling/target/sglang_backend/model_runner.py b/idea1/specforge/modeling/target/sglang_backend/model_runner.py new file mode 100644 index 0000000000000000000000000000000000000000..501ee34e6b36b99a97b87226d8fc40cdfa5c9d36 --- /dev/null +++ b/idea1/specforge/modeling/target/sglang_backend/model_runner.py @@ -0,0 +1,173 @@ +import logging +import os + +import torch +from sglang.srt.distributed import ( + get_pp_group, + get_tp_group, + get_world_group, + set_custom_all_reduce, + set_mscclpp_all_reduce, + set_torch_symm_mem_all_reduce, +) +from sglang.srt.layers.dp_attention import ( + get_attention_tp_group, + initialize_dp_attention, +) +from sglang.srt.model_executor.model_runner import ModelRunner +from sglang.srt.utils import ( + cpu_has_amx_support, + get_available_gpu_memory, + get_bool_env_var, + is_hip, + is_npu, + monkey_patch_p2p_access_check, +) + +from .patch import ( + init_distributed_environment, + initialize_dp_attention, + initialize_model_parallel, +) + +_is_hip = is_hip() +_is_npu = is_npu() +_is_cpu_amx_available = cpu_has_amx_support() + +# Use a small KV cache pool size for tests in CI +SGLANG_CI_SMALL_KV_SIZE = os.getenv("SGLANG_CI_SMALL_KV_SIZE", None) + +# Detect stragger ranks in model loading +UNBALANCED_MODEL_LOADING_TIMEOUT_S = 300 + +logger = logging.getLogger(__name__) + + +class SGLangRunner(ModelRunner): + + def init_torch_distributed(self): + logger.info("Init torch distributed begin.") + + try: + 
torch.get_device_module(self.device).set_device(self.gpu_id) + except Exception: + logger.warning( + f"Context: {self.device=} {self.gpu_id=} {os.environ.get('CUDA_VISIBLE_DEVICES')=} {self.tp_rank=} {self.tp_size=}" + ) + raise + + if self.device == "cuda": + if self.server_args.elastic_ep_backend == "mooncake": + backend = "mooncake" + if self.server_args.mooncake_ib_device: + mooncake_ib_device = self.server_args.mooncake_ib_device.split(",") + try: + from mooncake import ep as mooncake_ep + + mooncake_ep.set_device_filter(mooncake_ib_device) + except: + pass # A warning will be raised in `init_distributed_environment` + else: + backend = "nccl" + elif self.device == "xpu": + backend = "xccl" + elif self.device == "hpu": + backend = "hccl" + elif self.device == "cpu": + backend = "gloo" + elif self.device == "npu": + backend = "hccl" + + before_avail_memory = get_available_gpu_memory(self.device, self.gpu_id) + if not self.server_args.enable_p2p_check: + monkey_patch_p2p_access_check() + + if self.server_args.dist_init_addr: + dist_init_method = f"tcp://{self.server_args.dist_init_addr}" + else: + dist_init_method = f"tcp://127.0.0.1:{self.dist_port}" + set_custom_all_reduce(not self.server_args.disable_custom_all_reduce) + set_mscclpp_all_reduce(self.server_args.enable_mscclpp) + set_torch_symm_mem_all_reduce(self.server_args.enable_torch_symm_mem) + + if not self.is_draft_worker: + if self.device == "cpu": + if _is_cpu_amx_available: + # Bind OpenMP threads to CPU cores + torch.ops.sgl_kernel.init_cpu_threads_env(self.local_omp_cpuid) + + # Set local size to hint SGLang to use shared memory based AllReduce + os.environ["LOCAL_SIZE"] = str(self.tp_size) + torch.ops.sgl_kernel.initialize(self.tp_size, self.tp_rank) + + @torch.library.register_fake("sgl_kernel::shm_allgather") + def _(data, dim): + return torch.cat([data] * self.tp_size, dim=dim) + + else: + logger.warning( + "init_cpu_threads_env and shared memory based AllReduce is disabled since intel amx 
backend is not available" + ) + + # Only initialize the distributed environment on the target model worker. + init_distributed_environment( + backend=backend, + world_size=self.tp_size * self.pp_size, + rank=self.tp_size * self.pp_rank + self.tp_rank, + local_rank=self.gpu_id, + ) + # NOTE: Updated for sglang 0.5.9 + # - Removed torch_compile parameter (no longer supported) + # - Added new parameters: attention_data_parallel_size, attention_context_model_parallel_size, moe_data_model_parallel_size + + # Debug: Print the values + dp_size = getattr(self.server_args, "dp_size", 1) + attn_cp_size = getattr(self.server_args, "attn_cp_size", 1) + moe_dp_size = getattr(self.server_args, "moe_dp_size", 1) + print( + f"[DEBUG] tp_size={self.tp_size}, dp_size={dp_size}, attn_cp_size={attn_cp_size}, moe_dp_size={moe_dp_size}" + ) + + initialize_model_parallel( + tensor_model_parallel_size=self.tp_size, + pipeline_model_parallel_size=self.pp_size, + expert_model_parallel_size=self.moe_ep_size, + attention_data_parallel_size=dp_size, + attention_context_model_parallel_size=attn_cp_size, + moe_data_model_parallel_size=moe_dp_size, + duplicate_tp_group=self.server_args.enable_pdmux, + ) + initialize_dp_attention( + server_args=self.server_args, + model_config=self.model_config, + ) + + min_per_gpu_memory = get_available_gpu_memory( + self.device, + self.gpu_id, + distributed=get_world_group().world_size > 1, + cpu_group=get_world_group().cpu_group, + ) + self.tp_group = get_tp_group() + self.pp_group = get_pp_group() + self.attention_tp_group = get_attention_tp_group() + + # Check memory for tensor parallelism + local_gpu_memory = get_available_gpu_memory(self.device, self.gpu_id) + if self.tp_size > 1 and not self.is_draft_worker: + if min_per_gpu_memory < local_gpu_memory * 0.9: + if get_bool_env_var("SGL_DISABLE_TP_MEMORY_INBALANCE_CHECK"): + logger.warning( + "The memory capacity is unbalanced. Some GPUs may be occupied by other processes. 
" + f"{min_per_gpu_memory=}, {local_gpu_memory=}, {local_gpu_memory * 0.9=}" + ) + else: + raise ValueError( + "The memory capacity is unbalanced. Some GPUs may be occupied by other processes. " + f"{min_per_gpu_memory=}, {local_gpu_memory=}, {local_gpu_memory * 0.9=}" + ) + + logger.info( + f"Init torch distributed ends. mem usage={(before_avail_memory - local_gpu_memory):.2f} GB" + ) + return min_per_gpu_memory diff --git a/idea1/specforge/modeling/target/sglang_backend/patch.py b/idea1/specforge/modeling/target/sglang_backend/patch.py new file mode 100644 index 0000000000000000000000000000000000000000..1ec608f8e871b2af844acbbfd935e3dc1089c21c --- /dev/null +++ b/idea1/specforge/modeling/target/sglang_backend/patch.py @@ -0,0 +1,390 @@ +import logging +from typing import Optional + +import sglang.srt.distributed.parallel_state as parallel_state +import torch +import torch.distributed as dist +from sglang.srt.configs.model_config import ModelConfig +from sglang.srt.distributed import init_model_parallel_group +from sglang.srt.distributed.parallel_state import GroupCoordinator +from sglang.srt.layers.dp_attention import ( + _DpGatheredBufferWrapper, + compute_dp_attention_local_info, + compute_dp_attention_world_info, +) +from sglang.srt.server_args import ServerArgs +from sglang.srt.utils import get_bool_env_var + +from specforge.distributed import get_tp_group as get_specforge_tp_group + +logger = logging.getLogger(__name__) + + +def init_distributed_environment( + world_size: int = -1, + rank: int = -1, + local_rank: int = -1, + backend: str = "nccl", +): + logger.debug( + "world_size=%d rank=%d backend=%s", + world_size, + rank, + backend, + ) + assert ( + torch.distributed.is_initialized() + ), "distributed environment should be initialized first" + + tp_group = get_specforge_tp_group() + world_size = dist.get_world_size() + tp_size = dist.get_world_size(tp_group) + num_tp_groups = world_size // tp_size + tp_ranks = [] + for i in range(num_tp_groups): + 
tp_ranks.append(list(range(i * tp_size, (i + 1) * tp_size))) + + parallel_state._WORLD = GroupCoordinator( + group_ranks=tp_ranks, + local_rank=local_rank, + torch_distributed_backend=backend, + use_pynccl=False, + use_pymscclpp=False, + use_custom_allreduce=False, + use_torch_symm_mem_all_reduce=False, + use_hpu_communicator=False, + use_xpu_communicator=False, + use_npu_communicator=False, + group_name="world", + ) + # we destroy the newly created world group and replace it + # with the existing tp group from specforge to save CUDA memory + group_to_destroy = parallel_state._WORLD.device_group + parallel_state._WORLD.device_group = tp_group + dist.destroy_process_group(group_to_destroy) + + +def initialize_model_parallel( + tensor_model_parallel_size: int = 1, + expert_model_parallel_size: int = 1, + pipeline_model_parallel_size: int = 1, + attention_data_parallel_size: int = 1, + attention_context_model_parallel_size: int = 1, + moe_data_model_parallel_size: int = 1, + backend: Optional[str] = None, + duplicate_tp_group: bool = False, + # NOTE: torch_compile parameter was removed in sglang 0.5.9 + # torch_compile: Optional[bool] = None, +) -> None: + """ + Initialize model parallel groups. + + Arguments: + tensor_model_parallel_size: number of GPUs used for tensor model + parallelism. + pipeline_model_parallel_size: number of GPUs used for pipeline model + parallelism. + attention_data_parallel_size: number of GPUs used for attention data + parallelism. (Added in sglang 0.5.9) + attention_context_model_parallel_size: number of GPUs used for attention context + parallelism. (Added in sglang 0.5.9) + moe_data_model_parallel_size: number of GPUs used for moe data + parallelism. (Added in sglang 0.5.9) + + Let's say we have a total of 8 GPUs denoted by g0 ... g7 and we + use 2 GPUs to parallelize the model tensor, and 4 GPUs to parallelize + the model pipeline. 
The present function will + create 4 tensor model-parallel groups and 2 pipeline model-parallel groups: + 4 tensor model-parallel groups: + [g0, g1], [g2, g3], [g4, g5], [g6, g7] + 2 pipeline model-parallel groups: + [g0, g2, g4, g6], [g1, g3, g5, g7] + Note that for efficiency, the caller should make sure adjacent ranks + are on the same DGX box. For example if we are using 2 DGX-1 boxes + with a total of 16 GPUs, rank 0 to 7 belong to the first box and + ranks 8 to 15 belong to the second box. + """ + # Get world size and rank. Ensure some consistencies. + assert torch.distributed.is_initialized() + world_size: int = parallel_state._WORLD.world_size + backend = backend or dist.get_backend(parallel_state._WORLD.device_group) + + if world_size != tensor_model_parallel_size * pipeline_model_parallel_size: + raise RuntimeError( + f"world_size ({world_size}) is not equal to " + f"tensor_model_parallel_size ({tensor_model_parallel_size}) x " + f"pipeline_model_parallel_size ({pipeline_model_parallel_size})" + ) + + # Build the tensor model-parallel groups. 
+ num_tensor_model_parallel_groups: int = ( + dist.get_world_size() // tensor_model_parallel_size + ) + assert ( + parallel_state._TP is None + ), "tensor model parallel group is already initialized" + group_ranks = [] + for i in range(num_tensor_model_parallel_groups): + ranks = list( + range(i * tensor_model_parallel_size, (i + 1) * tensor_model_parallel_size) + ) + group_ranks.append(ranks) + + # message queue broadcaster is only used in tensor model parallel group + # NOTE: torch_compile parameter was removed in sglang 0.5.9 + parallel_state._TP = init_model_parallel_group( + group_ranks, + parallel_state._WORLD.local_rank, + backend, + use_message_queue_broadcaster=get_bool_env_var( + "SGLANG_USE_MESSAGE_QUEUE_BROADCASTER", "true" + ), + group_name="tp", + pynccl_use_current_stream=duplicate_tp_group, + ) + + if duplicate_tp_group: + assert ( + parallel_state._PDMUX_PREFILL_TP_GROUP is None + ), "tensor model parallel group for PD-Multiplexing Prefill is already initialized" + # NOTE: torch_compile parameter was removed in sglang 0.5.9 + parallel_state._PDMUX_PREFILL_TP_GROUP = init_model_parallel_group( + group_ranks, + parallel_state._WORLD.local_rank, + backend, + use_message_queue_broadcaster=get_bool_env_var( + "SGLANG_USE_MESSAGE_QUEUE_BROADCASTER", "true" + ), + group_name="pdmux_prefill_tp", + pynccl_use_current_stream=True, + ) + # NOTE: Check pynccl_comm exists before accessing it (may be None in sglang 0.5.9) + if parallel_state._TP.pynccl_comm is not None: + parallel_state._TP.pynccl_comm.disabled = False + if parallel_state._PDMUX_PREFILL_TP_GROUP.pynccl_comm is not None: + parallel_state._PDMUX_PREFILL_TP_GROUP.pynccl_comm.disabled = False + + moe_ep_size = expert_model_parallel_size + + moe_tp_size = tensor_model_parallel_size // moe_ep_size + assert ( + parallel_state._MOE_EP is None + ), "expert model parallel group is already initialized" + group_ranks = [] + for i in range(num_tensor_model_parallel_groups): + for j in range(moe_tp_size): + 
st = i * tensor_model_parallel_size + j + en = (i + 1) * tensor_model_parallel_size + j + ranks = list(range(st, en, moe_tp_size)) + group_ranks.append(ranks) + + parallel_state._MOE_EP = init_model_parallel_group( + group_ranks, + parallel_state._WORLD.local_rank, + backend, + use_custom_allreduce=False, + group_name="moe_ep", + ) + + assert ( + parallel_state._MOE_TP is None + ), "moe tensor model parallel group is already initialized" + if moe_ep_size == 1: + parallel_state._MOE_TP = parallel_state._TP + else: + group_ranks = [] + for i in range(num_tensor_model_parallel_groups): + for j in range(moe_ep_size): + st = i * tensor_model_parallel_size + j * moe_tp_size + en = i * tensor_model_parallel_size + (j + 1) * moe_tp_size + ranks = list(range(st, en)) + group_ranks.append(ranks) + parallel_state._MOE_TP = init_model_parallel_group( + group_ranks, + parallel_state._WORLD.local_rank, + backend, + use_custom_allreduce=False, + group_name="moe_tp", + ) + + # Build the pipeline model-parallel groups. 
+ num_pipeline_model_parallel_groups: int = ( + dist.get_world_size() // pipeline_model_parallel_size + ) + assert ( + parallel_state._PP is None + ), "pipeline model parallel group is already initialized" + group_ranks = [] + for i in range(num_pipeline_model_parallel_groups): + ranks = list( + range(i, dist.get_world_size(), num_pipeline_model_parallel_groups) + ) + group_ranks.append(ranks) + # pipeline parallel does not need custom allreduce + parallel_state._PP = init_model_parallel_group( + group_ranks, + parallel_state._WORLD.local_rank, + backend, + use_custom_allreduce=False, + group_name="pp", + ) + + # NOTE: Added for sglang 0.5.9 - Initialize attention parallel groups + # These are required by get_attention_tp_group() and get_attention_cp_group() + from sglang.srt.layers.sampler import SYNC_TOKEN_IDS_ACROSS_TP + + attn_dp_size = attention_data_parallel_size + attn_cp_size = attention_context_model_parallel_size + attn_tp_size = tensor_model_parallel_size // attn_cp_size // attn_dp_size + + # Initialize _ATTN_CP (attention context parallel group) + if not hasattr(parallel_state, "_ATTN_CP"): + parallel_state._ATTN_CP = None + assert ( + parallel_state._ATTN_CP is None + ), "attention context model parallel group is already initialized" + if attn_cp_size == tensor_model_parallel_size: + parallel_state._ATTN_CP = parallel_state._TP + else: + group_ranks = [] + for tp_group_idx in range(num_tensor_model_parallel_groups): + for dp_idx in range(attn_dp_size): + for attn_tp_idx in range(attn_tp_size): + st = ( + tp_group_idx * tensor_model_parallel_size + + dp_idx * attn_tp_size * attn_cp_size + + attn_tp_idx + ) + en = ( + tp_group_idx * tensor_model_parallel_size + + (dp_idx + 1) * attn_tp_size * attn_cp_size + + attn_tp_idx + ) + ranks = list(range(st, en, attn_tp_size)) + group_ranks.append(ranks) + parallel_state._ATTN_CP = init_model_parallel_group( + group_ranks, + parallel_state._WORLD.local_rank, + backend, + group_name="attn_cp", + ) + + # Initialize 
_ATTN_TP (attention tensor parallel group) + if not hasattr(parallel_state, "_ATTN_TP"): + parallel_state._ATTN_TP = None + assert ( + parallel_state._ATTN_TP is None + ), "attention tensor model parallel group is already initialized" + if attn_tp_size == tensor_model_parallel_size: + parallel_state._ATTN_TP = parallel_state._TP + else: + group_ranks = [] + for tp_group_idx in range(num_tensor_model_parallel_groups): + for cp_dp_combined_idx in range(attn_cp_size * attn_dp_size): + st = ( + tp_group_idx * tensor_model_parallel_size + + cp_dp_combined_idx * attn_tp_size + ) + en = ( + tp_group_idx * tensor_model_parallel_size + + (cp_dp_combined_idx + 1) * attn_tp_size + ) + ranks = list(range(st, en)) + group_ranks.append(ranks) + parallel_state._ATTN_TP = init_model_parallel_group( + group_ranks, + parallel_state._WORLD.local_rank, + backend, + use_pynccl=SYNC_TOKEN_IDS_ACROSS_TP, + use_mscclpp_allreduce=False, + use_custom_allreduce=False, + use_torch_symm_mem_allreduce=False, + group_name="attention_tp", + ) + + # Initialize _MOE_DP (moe data parallel group) + if not hasattr(parallel_state, "_MOE_DP"): + parallel_state._MOE_DP = None + assert ( + parallel_state._MOE_DP is None + ), "moe data parallel group is already initialized" + moe_dp_size = moe_data_model_parallel_size + moe_tp_size_for_dp = tensor_model_parallel_size // moe_ep_size // moe_dp_size + if moe_dp_size == tensor_model_parallel_size: + parallel_state._MOE_DP = parallel_state._TP + else: + group_ranks = [] + for tp_group_idx in range(num_tensor_model_parallel_groups): + for tp_ep_combined_idx in range(moe_tp_size_for_dp * moe_ep_size): + st = tp_group_idx * tensor_model_parallel_size + tp_ep_combined_idx + en = ( + tp_group_idx + 1 + ) * tensor_model_parallel_size + tp_ep_combined_idx + ranks = list(range(st, en, moe_tp_size_for_dp * moe_ep_size)) + group_ranks.append(ranks) + parallel_state._MOE_DP = init_model_parallel_group( + group_ranks, + parallel_state._WORLD.local_rank, + backend, + 
group_name="moe_dp", + ) + + +def initialize_dp_attention( + server_args: ServerArgs, + model_config: ModelConfig, +): + """ + Initialize data parallel attention. + + Updated for sglang 0.5.9: + - Added attn_cp_size parameter support + - Removed _ATTN_TP_GROUP creation (now handled by initialize_model_parallel in sglang 0.5.9) + """ + import sglang.srt.layers.dp_attention as dp_attention + + enable_dp_attention = server_args.enable_dp_attention + tp_size = server_args.tp_size + dp_size = server_args.dp_size + moe_dense_tp_size = server_args.moe_dense_tp_size + pp_size = server_args.pp_size + # NOTE: attn_cp_size is new in sglang 0.5.9 + attn_cp_size = getattr(server_args, "attn_cp_size", 1) + + tp_rank = parallel_state.get_tensor_model_parallel_rank() + + dp_attention._ENABLE_DP_ATTENTION_FLAG = enable_dp_attention + + # NOTE: Added attn_cp_size parameter for sglang 0.5.9 + ( + dp_attention._ATTN_TP_RANK, + dp_attention._ATTN_TP_SIZE, + dp_attention._ATTN_DP_RANK, + ) = compute_dp_attention_world_info( + enable_dp_attention, tp_rank, tp_size, dp_size, attn_cp_size + ) + _, _, dp_attention._LOCAL_ATTN_DP_RANK = compute_dp_attention_local_info( + enable_dp_attention, tp_rank, tp_size, dp_size, moe_dense_tp_size + ) + + if enable_dp_attention: + dp_attention._ATTN_DP_SIZE = dp_size + if moe_dense_tp_size is None: + dp_attention._LOCAL_ATTN_DP_SIZE = dp_attention._ATTN_DP_SIZE + else: + dp_attention._LOCAL_ATTN_DP_SIZE = max( + 1, dp_size // (tp_size // moe_dense_tp_size) + ) + else: + dp_attention._ATTN_DP_SIZE = 1 + dp_attention._LOCAL_ATTN_DP_SIZE = 1 + + # NOTE: In sglang 0.5.9, _ATTN_TP_GROUP is created in initialize_model_parallel. + # We no longer need to manually create it here to avoid conflicts. + # The assertion error occurs because we were trying to recreate an already-initialized group. 
+ + _DpGatheredBufferWrapper.set_metadata( + hidden_size=model_config.hidden_size, + dtype=model_config.dtype, + device=torch.device(server_args.device), + ) diff --git a/idea1/specforge/modeling/target/sglang_backend/utils.py b/idea1/specforge/modeling/target/sglang_backend/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..87d384bffece2d757d022baa2380e6ce4c89993a --- /dev/null +++ b/idea1/specforge/modeling/target/sglang_backend/utils.py @@ -0,0 +1,173 @@ +""" +This file contains the wrapper for the SGL model. +""" + +from dataclasses import dataclass +from typing import List, Optional, Union + +import torch +import torch.nn as nn +from sglang.srt.layers.logits_processor import ( + LogitsMetadata, + LogitsProcessor, + LogitsProcessorOutput, +) +from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode +from sglang.srt.server_args import get_global_server_args + + +@dataclass +class ReplacedLogitsProcessorEagle3Output: + """ + A dataclass to store the logits and aux hidden states needed for EAGLE3. + """ + + logits: torch.Tensor + aux_hidden_states: torch.Tensor + last_hidden_states: Optional[torch.Tensor] = None + + +def replaced_logits_processor_forward_for_eagle3( + self, + input_ids, + hidden_states, + lm_head, + logits_metadata: Union[LogitsMetadata, ForwardBatch], + aux_hidden_states: Optional[List[torch.Tensor]] = None, + hidden_states_before_norm: Optional[torch.Tensor] = None, + return_last_hidden_states: bool = False, + return_logits: bool = False, +) -> LogitsProcessorOutput: + """ + This is a modified forward function for the SGLang's logits processor, adapted from https://github.com/sgl-project/sglang/blob/v0.5.4/python/sglang/srt/layers/logits_processor.py. + The modification is to return the logits and aux hidden states instead of the last hidden states. 
+ + Updated for sglang 0.5.9: + - Added hidden_states_before_norm parameter for compatibility + """ + + if isinstance(logits_metadata, ForwardBatch): + logits_metadata = LogitsMetadata.from_forward_batch(logits_metadata) + + # Check if multi-item scoring is enabled via server args (only for prefill-only requests) + multi_item_delimiter = get_global_server_args().multi_item_scoring_delimiter + if multi_item_delimiter is not None and logits_metadata.is_prefill_only: + return self.compute_logprobs_for_multi_item_scoring( + input_ids, hidden_states, lm_head, logits_metadata, multi_item_delimiter + ) + + # Get the last hidden states and last logits for the next token prediction + if ( + logits_metadata.forward_mode.is_decode_or_idle() + or logits_metadata.forward_mode.is_target_verify() + or logits_metadata.forward_mode.is_draft_extend_v2() + ): + pruned_states = hidden_states + if aux_hidden_states is not None: + aux_pruned_states = [hidden for hidden in aux_hidden_states] + else: + aux_pruned_states = None + sample_indices = None + input_logprob_indices = None + else: + raise RuntimeError( + f"The modified logits processor is not supported for this forward mode: {logits_metadata.forward_mode}" + ) + + if return_last_hidden_states: + last_hidden_states = pruned_states + else: + last_hidden_states = None + + if return_logits: + # Compute logits for both input and sampled tokens. + logits = self._get_logits(pruned_states, lm_head, logits_metadata) + else: + logits = None + + # get the aux hidden states + hidden_states_to_store: Optional[torch.Tensor] = None + if logits_metadata.capture_hidden_mode.need_capture(): + if logits_metadata.capture_hidden_mode.is_full(): + if aux_hidden_states is not None: + aux_hidden_states = torch.cat(aux_hidden_states, dim=-1) + hidden_states_to_store = aux_hidden_states + else: + hidden_states_to_store = hidden_states + elif logits_metadata.capture_hidden_mode.is_last(): + # Get the last token hidden states. 
If sample_indices is None, + # pruned states only contain the last tokens already. + if aux_hidden_states is not None: + aux_pruned_states = torch.cat(aux_pruned_states, dim=-1) + hidden_states_to_store = ( + aux_pruned_states[sample_indices] + if sample_indices is not None + else aux_pruned_states + ) + else: + hidden_states_to_store = ( + pruned_states[sample_indices] + if sample_indices is not None + else pruned_states + ) + else: + assert False, "Should never reach" + + assert ( + not logits_metadata.extend_return_logprob + ), "extend_return_logprob is not supported" + # Decode mode or extend mode without return_logprob. + return ReplacedLogitsProcessorEagle3Output( + logits=logits, + aux_hidden_states=hidden_states_to_store, + last_hidden_states=last_hidden_states, + ) + + +class LogitsProcessorForEAGLE3(torch.nn.Module): + def __init__( + self, + logits_processor: LogitsProcessor, + return_last_hidden_states: bool = False, + return_logits: bool = False, + ): + super().__init__() + self.logits_processor = logits_processor + self.return_last_hidden_states = return_last_hidden_states + self.return_logits = return_logits + + def forward( + self, + input_ids, + hidden_states, + lm_head, + logits_metadata, + aux_hidden_states: Optional[torch.Tensor] = None, + hidden_states_before_norm: Optional[torch.Tensor] = None, + ) -> LogitsProcessorOutput: + logits_metadata.forward_mode = ForwardMode.DECODE + ret = replaced_logits_processor_forward_for_eagle3( + self.logits_processor, + input_ids, + hidden_states, + lm_head, + logits_metadata, + aux_hidden_states, + hidden_states_before_norm, + self.return_last_hidden_states, + self.return_logits, + ) + return ret + + +def wrap_eagle3_logits_processors_in_module( + module: nn.Module, return_full_logits: bool = False +): + """ + This function will wrap the SGLang's original logits processor with the modified one for EAGLE3. 
+ """ + for name, submodule in module.named_modules(): + if isinstance(submodule, LogitsProcessor): + wrapped = LogitsProcessorForEAGLE3(submodule, return_full_logits) + setattr(module, name, wrapped) + print(f"wrapped {name} with LogitsProcessorForEAGLE3") diff --git a/idea1/specforge/modeling/target/target_head.py b/idea1/specforge/modeling/target/target_head.py new file mode 100644 index 0000000000000000000000000000000000000000..86ab4f501a536422f8205721f60f5e7e8c23fa05 --- /dev/null +++ b/idea1/specforge/modeling/target/target_head.py @@ -0,0 +1,97 @@ +import glob +import json +import os +from typing import Optional + +import torch +import torch.nn as nn +from huggingface_hub import snapshot_download +from safetensors import safe_open +from transformers import AutoConfig + +from specforge.utils import padding + + +class TargetHead(nn.Module): + def __init__(self, model_path, trust_remote_code: bool = False): + super().__init__() + self.config = AutoConfig.from_pretrained( + model_path, trust_remote_code=trust_remote_code + ) + self.text_config = getattr(self.config, "text_config", self.config) + + self.hidden_size = self.text_config.hidden_size + self.vocab_size = self.text_config.vocab_size + + self.fc = nn.Linear(self.hidden_size, self.vocab_size, bias=False) + + @classmethod + def from_pretrained( + cls, + model_path, + lm_head_key: str = "lm_head.weight", + cache_dir: Optional[str] = None, + trust_remote_code: bool = False, + ) -> "TargetHead": + target_head = cls(model_path, trust_remote_code=trust_remote_code) + target_head.load_weights( + model_path=model_path, + lm_head_key=lm_head_key, + cache_dir=cache_dir, + ) + target_head.freeze_weights() + target_head = target_head.eval().cuda().to(torch.bfloat16) + return target_head + + @torch.no_grad() + def load_weights( + self, + model_path, + lm_head_key: str = "lm_head.weight", + cache_dir: Optional[str] = None, + ): + if os.path.exists(model_path): + self.model_path = model_path + else: + self.model_path = 
snapshot_download(repo_id=model_path) + + # model_path is a local directory + # check if there is file ending with index.json + glob_path = os.path.join(self.model_path, "*.index.json") + index_json_path = glob.glob(glob_path) + + if len(index_json_path) == 0: + raise FileNotFoundError(f"No index.json file found in {self.model_path}") + if len(index_json_path) > 1: + raise FileNotFoundError( + f"Multiple index.json files found in {self.model_path}" + ) + index_json_path = index_json_path[0] + + with open(index_json_path, "r") as f: + index_json = json.load(f) + ckpt_file = index_json["weight_map"][lm_head_key] + + if ckpt_file.endswith(".safetensors"): + with safe_open( + os.path.join(self.model_path, ckpt_file), framework="pt" + ) as f: + lm_head = f.get_tensor(lm_head_key) + else: + state_dict = torch.load(os.path.join(self.model_path, ckpt_file)) + lm_head = state_dict[lm_head_key] + self.fc.weight.copy_(lm_head) + + def freeze_weights(self): + for param in self.fc.parameters(): + param.requires_grad = False + + def forward(self, hidden_states): + return self.fc(hidden_states) + + def preprocess(self, input_ids, target, loss_mask): + # apply pading + target = padding(target, left=False) + input_ids = padding(input_ids, left=False) + loss_mask = loss_mask[..., None] + return input_ids, target, loss_mask diff --git a/idea1/specforge/modeling/target/target_utils.py b/idea1/specforge/modeling/target/target_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..9dacba6be6129d57651e2b761fce4d4dbea7744f --- /dev/null +++ b/idea1/specforge/modeling/target/target_utils.py @@ -0,0 +1,192 @@ +import gc +import glob +import json +import os +from typing import Optional + +import torch +import torch.nn as nn +from huggingface_hub import snapshot_download +from safetensors import safe_open +from transformers import AutoConfig + + +class TargetEmbeddingsAndHead(nn.Module): + """ + Efficiently loads only the embedding layer and lm_head from a pretrained 
model. + Handles safetensors slicing and Weight Tying correctly. + """ + + def __init__(self, config): + super().__init__() + self.config = config + # Support for MLLMs with separate text_config + if hasattr(config, "text_config"): + self.embed_tokens = nn.Embedding( + config.text_config.vocab_size, + config.text_config.hidden_size, + padding_idx=config.text_config.pad_token_id, + ) + self.lm_head = nn.Linear( + config.text_config.hidden_size, + config.text_config.vocab_size, + bias=False, + ) + else: + self.embed_tokens = nn.Embedding( + config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id + ) + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + @classmethod + def from_pretrained( + cls, + model_path: str, + embed_key: Optional[str] = None, + lm_head_key: Optional[str] = None, + cache_dir: Optional[str] = None, + device: str = "cuda", + dtype: torch.dtype = torch.bfloat16, + trust_remote_code: bool = False, + ) -> "TargetEmbeddingsAndHead": + + # 1. Load Config + config = AutoConfig.from_pretrained( + model_path, cache_dir=cache_dir, trust_remote_code=trust_remote_code + ) + instance = cls(config) + + if embed_key is None: + embed_key = "model.embed_tokens.weight" + if lm_head_key is None: + lm_head_key = "lm_head.weight" + + # 2. Resolve Model Path + local_model_path = model_path + if not os.path.exists(local_model_path): + try: + local_model_path = snapshot_download( + repo_id=model_path, + cache_dir=cache_dir, + allow_patterns=["*.json", "*.safetensors", "*.bin", "*.model"], + ) + except Exception as e: + print(f"Warning: Snapshot download failed or path check failed: {e}") + + # 3. Handle Weight Tying + tie_weights = getattr(config, "tie_word_embeddings", False) + + # 4. Load Weights + instance._load_weights(local_model_path, embed_key, lm_head_key, tie_weights) + + # 5. 
Move to Device & Freeze + instance.to(device=device, dtype=dtype) + instance.eval() + instance.requires_grad_(False) + + return instance + + def _load_weights( + self, model_path: str, embed_key: str, lm_head_key: str, tie_weights: bool + ): + index_files = glob.glob(os.path.join(model_path, "*.index.json")) + weight_map = {} + files_to_load = {} + + if index_files: + with open(index_files[0], "r") as f: + index = json.load(f) + weight_map = index.get("weight_map", {}) + + if embed_key in weight_map: + files_to_load[embed_key] = weight_map[embed_key] + else: + raise ValueError( + f"Embedding key '{embed_key}' not found in weight map." + ) + + if not tie_weights: + if lm_head_key in weight_map: + files_to_load[lm_head_key] = weight_map[lm_head_key] + else: + print( + f"Warning: {lm_head_key} not found. Ensure model doesn't use tied weights manually." + ) + else: + safetensors = glob.glob(os.path.join(model_path, "*.safetensors")) + bins = glob.glob(os.path.join(model_path, "*.bin")) + target_file = safetensors[0] if safetensors else (bins[0] if bins else None) + + if not target_file: + raise FileNotFoundError("No checkpoint found.") + + files_to_load[embed_key] = os.path.basename(target_file) + if not tie_weights: + files_to_load[lm_head_key] = os.path.basename(target_file) + + loaded_keys = set() + + file_to_keys_map = {} + for key, filename in files_to_load.items(): + full_path = os.path.join(model_path, filename) + if full_path not in file_to_keys_map: + file_to_keys_map[full_path] = [] + file_to_keys_map[full_path].append(key) + + for file_path, keys in file_to_keys_map.items(): + self._load_file_content(file_path, keys, embed_key, lm_head_key) + loaded_keys.update(keys) + + if tie_weights: + print( + "Weight tying detected: Sharing weights between Embeddings and LM Head." 
+ ) + self.lm_head.weight = self.embed_tokens.weight + + if embed_key not in loaded_keys: + raise RuntimeError("Failed to load embeddings.") + if not tie_weights and lm_head_key not in loaded_keys: + print( + "Warning: LM Head weights were not found (and tie_weights is False). Head is random." + ) + + def _load_file_content( + self, + file_path: str, + keys_to_extract: list, + target_embed_key: str, + target_head_key: str, + ): + """Helper to load specific keys from a file""" + print(f"Loading {keys_to_extract} from {os.path.basename(file_path)}...") + + state_dict_part = {} + + if file_path.endswith(".safetensors"): + with safe_open(file_path, framework="pt") as f: + for k in keys_to_extract: + if k in f.keys(): + state_dict_part[k] = f.get_tensor(k) + else: + print( + f"Warning: Loading .bin file {os.path.basename(file_path)} into RAM. Convert to safetensors for efficiency." + ) + full_state = torch.load(file_path, map_location="cpu") + for k in keys_to_extract: + if k in full_state: + state_dict_part[k] = full_state[k] + del full_state + gc.collect() + + for k, tensor in state_dict_part.items(): + if k == target_embed_key: + self.embed_tokens.weight.data.copy_(tensor) + print(" -> Loaded Embeddings") + elif k == target_head_key: + if tensor.shape == self.lm_head.weight.data.shape: + self.lm_head.weight.data.copy_(tensor) + print(" -> Loaded LM Head") + else: + raise RuntimeError( + f"Shape mismatch for {k}. 
@torch.no_grad()
def padding(tensor, left=True):
    """Shift ``tensor`` by one slot along dim 1, filling the gap with zeros.

    Args:
        tensor: Tensor with at least two dimensions.
        left: When true, a zero slice is placed in front and the last slice is
            dropped; when false, the first slice is dropped and a zero slice is
            appended.

    Returns:
        A new tensor with the same shape and dtype as ``tensor``.
    """
    # One zero slice shaped like a single position along dim 1.
    zero_slice = torch.zeros_like(tensor[:, -1:])
    pieces = (zero_slice, tensor[:, :-1]) if left else (tensor[:, 1:], zero_slice)
    return torch.cat(pieces, dim=1)
class Tracker(abc.ABC):
    """
    Abstract Base Class for experiment trackers.

    Each tracker implementation should handle its own initialization, logging,
    and cleanup. It should also provide a class method to validate
    command-line arguments before initialization.

    Attributes set by ``__init__``: ``args``, ``output_dir``, ``rank`` (the
    process rank, or 0 when no process group is initialized) and
    ``is_initialized`` (starts as ``False``; subclasses flip it once their
    backend is ready).
    """

    def __init__(self, args, output_dir: str):
        self.args = args
        self.output_dir = output_dir
        # dist.get_rank() raises without a default process group; a plain
        # single-process run is treated as rank 0 so trackers stay usable.
        if dist.is_available() and dist.is_initialized():
            self.rank = dist.get_rank()
        else:
            self.rank = 0
        self.is_initialized = False

    @classmethod
    @abc.abstractmethod
    def validate_args(cls, parser, args) -> None:
        """
        Validate necessary arguments for this tracker.
        This method is called during argument parsing.
        It should raise an error if required arguments are missing.
        """

    @abc.abstractmethod
    def log(self, log_dict: Dict[str, Any], step: Optional[int] = None) -> None:
        """
        Log metrics to the tracker.
        """

    @abc.abstractmethod
    def close(self) -> None:
        """
        Close the tracker and clean up resources.
        """
os.path.expanduser("~/.netrc") + if os.path.exists(netrc_path): + netrc_file = netrc.netrc(netrc_path) + if "api.wandb.ai" in netrc_file.hosts: + _, _, password = netrc_file.authenticators("api.wandb.ai") + if password: + args.wandb_key = password + return + except (FileNotFoundError, netrc.NetrcParseError): + pass + + if args.wandb_key is None: + parser.error( + "When --report-to is 'wandb', you must provide a wandb API key via one of:\n" + " 1. --wandb-key argument\n" + " 2. WANDB_API_KEY environment variable\n" + " 3. `wandb login` command" + ) + + def __init__(self, args, output_dir: str): + super().__init__(args, output_dir) + if self.rank == 0: + if args.wandb_dir is None: + args.wandb_dir = self._default_wandb_dir() + os.makedirs(args.wandb_dir, exist_ok=True) + + if not args.wandb_offline: + wandb.login(key=args.wandb_key) + init_kwargs = { + "project": args.wandb_project, + "name": args.wandb_name, + "config": vars(args), + "dir": args.wandb_dir, + } + if args.wandb_offline: + init_kwargs["mode"] = "offline" + wandb.init(**init_kwargs) + self.is_initialized = True + + def log(self, log_dict: Dict[str, Any], step: Optional[int] = None): + if self.rank == 0 and self.is_initialized: + wandb.log(log_dict, step=step) + + def close(self): + if self.rank == 0 and self.is_initialized and wandb.run: + wandb.finish() + self.is_initialized = False + + +class SwanlabTracker(Tracker): + """Tracks experiments using SwanLab.""" + + @classmethod + def validate_args(cls, parser, args): + if swanlab is None: + parser.error( + "To use --report-to swanlab, you must install swanlab: 'pip install swanlab'" + ) + + if args.swanlab_key is not None: + return + if "SWANLAB_API_KEY" in os.environ: + args.swanlab_key = os.environ["SWANLAB_API_KEY"] + return + # Swanlab can run in anonymous mode if no key is provided in a non-distributed env. + # However, a key is often required for distributed runs to sync correctly. 
+ if ( + dist.is_initialized() + and dist.get_world_size() > 1 + and args.swanlab_key is None + ): + parser.error( + "In a distributed environment, when --report-to is 'swanlab', you must provide a swanlab API key via:\n" + " 1. --swanlab-key argument\n" + " 2. SWANLAB_API_KEY environment variable" + ) + + def __init__(self, args, output_dir: str): + super().__init__(args, output_dir) + if self.rank == 0: + if args.swanlab_key: + swanlab.login(api_key=args.swanlab_key) + + swanlog_dir = os.path.join(output_dir, "swanlog") + os.makedirs(swanlog_dir, exist_ok=True) + swanlab.init( + project=args.swanlab_project, + experiment_name=args.swanlab_name, + config=vars(args), + logdir=swanlog_dir, + ) + self.is_initialized = True + + def log(self, log_dict: Dict[str, Any], step: Optional[int] = None): + if self.rank == 0 and self.is_initialized: + swanlab.log(log_dict, step=step) + + def close(self): + if self.rank == 0 and self.is_initialized and swanlab.get_run() is not None: + swanlab.finish() + self.is_initialized = False + + +class TensorboardTracker(Tracker): + """Tracks experiments using TensorBoard.""" + + @classmethod + def validate_args(cls, parser, args): + if SummaryWriter is None: + parser.error( + "To use --report-to tensorboard, you must have tensorboard installed: 'pip install tensorboard'" + ) + + def __init__(self, args, output_dir: str): + super().__init__(args, output_dir) + if self.rank == 0: + log_dir = os.path.join(output_dir, "runs") + self.writer = SummaryWriter(log_dir=log_dir) + self.is_initialized = True + + def log(self, log_dict: Dict[str, Any], step: Optional[int] = None): + if self.rank == 0 and self.is_initialized: + for key, value in log_dict.items(): + if isinstance(value, (int, float)): + self.writer.add_scalar(key, value, global_step=step) + + def close(self): + if self.rank == 0 and self.is_initialized: + self.writer.close() + self.is_initialized = False + + +class MLflowTracker(Tracker): + """Tracks experiments using MLflow.""" + + 
@classmethod + def validate_args(cls, parser, args): + if mlflow is None: + parser.error( + "To use --report-to mlflow, you must install mlflow: 'pip install mlflow'" + ) + # Set tracking URI from environment variable if not explicitly provided + if args.mlflow_tracking_uri is None and "MLFLOW_TRACKING_URI" in os.environ: + args.mlflow_tracking_uri = os.environ["MLFLOW_TRACKING_URI"] + elif args.mlflow_tracking_uri is None: + print( + "Warning: MLflow tracking URI not set. Defaulting to local './mlruns'." + ) + + # Set experiment name from environment variable if not explicitly provided + if ( + args.mlflow_experiment_name is None + and "MLFLOW_EXPERIMENT_NAME" in os.environ + ): + args.mlflow_experiment_name = os.environ["MLFLOW_EXPERIMENT_NAME"] + + def __init__(self, args, output_dir: str): + super().__init__(args, output_dir) + if self.rank == 0: + if args.mlflow_tracking_uri: + mlflow.set_tracking_uri(args.mlflow_tracking_uri) + + # This will either use the set URI or the default + mlflow.set_experiment(args.mlflow_experiment_name) + mlflow.start_run(run_name=args.mlflow_run_name) + mlflow.log_params(vars(args)) + self.is_initialized = True + + def log(self, log_dict: Dict[str, Any], step: Optional[int] = None): + if self.rank == 0 and self.is_initialized: + # MLflow's log_metrics takes a dictionary directly + mlflow.log_metrics(log_dict, step=step) + + def close(self): + if self.rank == 0 and self.is_initialized: + mlflow.end_run() + self.is_initialized = False + + +# --- Tracker Factory --- +TRACKER_REGISTRY = { + "wandb": WandbTracker, + "swanlab": SwanlabTracker, + "tensorboard": TensorboardTracker, + "mlflow": MLflowTracker, + "none": NoOpTracker, +} + + +def get_tracker_class(report_to: str) -> Optional[Tracker]: + """Returns the tracker class based on the name.""" + return TRACKER_REGISTRY.get(report_to) + + +def create_tracker(args, output_dir: str) -> Tracker: + """Factory function to create an experiment tracker instance.""" + tracker_class = 
@contextmanager
def default_torch_dtype(dtype: torch.dtype):
    """Temporarily switch torch's default floating-point dtype to ``dtype``.

    The previous default is restored when the ``with`` block exits, including
    when the block raises.
    """
    previous_dtype = torch.get_default_dtype()
    torch.set_default_dtype(dtype)
    try:
        yield
    finally:
        # Without this, an exception in the body would leave the whole
        # process on the temporary dtype.
        torch.set_default_dtype(previous_dtype)
def get_last_checkpoint(folder, prefix="epoch"):
    """
    Find the most recent checkpoint directory inside ``folder``.

    Only sub-directories named ``<prefix>_<epoch>`` or
    ``<prefix>_<epoch>_step_<step>`` are considered; the winner is the one with
    the largest ``(epoch, step)`` pair, where a missing step counts as 0.

    Args:
        folder: The folder path containing checkpoints.
        prefix: The prefix for checkpoint directories, default is "epoch".

    Returns:
        tuple: ``(checkpoint_path, (epoch, step))``
            - ``(None, (0, 0))`` if no checkpoint is found.
            - step is 0 if not present in the directory name.

    Raises:
        OSError: if ``folder`` cannot be listed (e.g. it does not exist).
    """
    # Match: epoch_X or epoch_X_step_Y
    pattern = re.compile(rf"^{re.escape(prefix)}_(\d+)(?:_step_(\d+))?$")

    best_key = None
    best_name = None
    for name in os.listdir(folder):
        match = pattern.search(name)
        if match is None or not os.path.isdir(os.path.join(folder, name)):
            continue
        key = (int(match.group(1)), int(match.group(2) or 0))
        # Strict comparison keeps the first entry on ties, like max() does.
        if best_key is None or key > best_key:
            best_key, best_name = key, name

    if best_name is None:
        return None, (0, 0)
    return os.path.join(folder, best_name), best_key
AutoConfig.from_pretrained(target_model_path, cache_dir=cache_dir) + + # If no template specified, use default llama3-8B-eagle3.json + if template_config_path is None: + # Use the script execution directory as base + import sys + + script_dir = os.path.dirname(os.path.abspath(sys.argv[0])) + project_root = os.path.dirname(script_dir) # Go up one level from scripts/ + template_config_path = os.path.join( + project_root, "configs", "llama3-8B-eagle3.json" + ) + + # Read template config + with open(template_config_path, "r") as f: + draft_config = json.load(f) + + # Adjust architecture config based on target model type + if hasattr(target_config, "model_type"): + # Default to llama architecture + draft_config["model_type"] = "llama" + + # Align key parameters + param_mappings = { + "vocab_size": "vocab_size", + "hidden_size": "hidden_size", + "num_attention_heads": "num_attention_heads", + "num_key_value_heads": "num_key_value_heads", + "intermediate_size": "intermediate_size", + "max_position_embeddings": "max_position_embeddings", + "rms_norm_eps": "rms_norm_eps", + "hidden_act": "hidden_act", + "bos_token_id": "bos_token_id", + "eos_token_id": "eos_token_id", + "torch_dtype": "torch_dtype", + } + + # Copy parameters from target model to draft config + for target_param, draft_param in param_mappings.items(): + if hasattr(target_config, target_param): + value = getattr(target_config, target_param) + # Special handling for torch_dtype to make it JSON serializable + if target_param == "torch_dtype" and isinstance(value, torch.dtype): + value = str(value).replace("torch.", "") + draft_config[draft_param] = value + + # Special handling for some parameters + # Ensure num_hidden_layers is always 1 (EAGLE3 feature) + draft_config["num_hidden_layers"] = 1 + + # Keep some fixed draft model specific parameters + draft_config["tie_word_embeddings"] = False + draft_config["use_cache"] = True + + # If template doesn't have draft_vocab_size, set default + if "draft_vocab_size" not 
def save_draft_model_config(config_dict: dict, output_path: str):
    """
    Save draft model config to file

    The parent directory of ``output_path`` is created when missing. The file
    is written as UTF-8 JSON with two-space indentation and non-ASCII
    characters kept as-is.

    Args:
        config_dict (dict): Config dictionary
        output_path (str): Output file path
    """
    parent_dir = os.path.dirname(output_path)
    # A bare filename has an empty dirname, and os.makedirs("") raises.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(config_dict, f, indent=2, ensure_ascii=False)

    print(f"Draft model config saved to: {output_path}")
def get_full_optimizer_state(optimizer_state_dict: dict):
    """
    Convert optimizer state dict with DTensor to full tensors for saving

    Every top-level entry other than ``"state"`` is carried over unchanged.
    Inside ``"state"``, each ``DTensor`` value is replaced by the result of its
    ``full_tensor()`` call; all other values are kept as they are. The input
    dict is not modified.

    Args:
        optimizer_state_dict (dict): Optimizer state dict possibly containing DTensors
    Returns:
        dict: Optimizer state dict with full tensors
    """
    full_optimizer_state_dict = {
        k: v for k, v in optimizer_state_dict.items() if k != "state"
    }
    if "state" in optimizer_state_dict:
        full_optimizer_state_dict["state"] = {
            param_id: {
                state_key: (
                    # Use the DTensor name this module already imports rather
                    # than reaching through torch.distributed.tensor, which is
                    # not guaranteed to expose it on every torch version.
                    state_tensor.full_tensor()
                    if isinstance(state_tensor, DTensor)
                    else state_tensor
                )
                for state_key, state_tensor in param_state.items()
            }
            for param_id, param_state in optimizer_state_dict["state"].items()
        }
    return full_optimizer_state_dict
def safe_conversations_generator(file_path):
    """
    Stream cleaned records from a JSON-lines conversation file.

    For every non-blank line this yields a dict with:
    - ``"conversations"``: the dict-typed messages of the row, each with its
      list/dict field values serialized to JSON strings (other values are kept
      unchanged); non-dict messages are dropped.
    - ``"tools"`` (only when the row has that key): ``[]`` for ``None``; a JSON
      string when the value is, or parses to, a list/dict; otherwise the value
      itself (including a string that is not valid JSON).

    Rows whose ``conversations`` is neither a list nor ``None`` are skipped with
    a warning, as is any line that raises while being processed.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        for line_no, raw_line in enumerate(f, start=1):
            text = raw_line.strip()
            if not text:
                continue
            try:
                row = json.loads(text)
                conversations = row.get("conversations", [])

                if conversations is None:
                    conversations = []
                elif not isinstance(conversations, list):
                    # A plain string or other non-list value: skip this line.
                    logger.warning(
                        f"Line {line_no}: 'conversations' is not a list. Please check!"
                    )
                    continue

                # Stringify nested containers so every message field has a
                # uniform primitive type (Arrow compatibility).
                result = {
                    "conversations": [
                        {
                            field: (
                                json.dumps(value, ensure_ascii=False)
                                if isinstance(value, (list, dict))
                                else value
                            )
                            for field, value in message.items()
                        }
                        for message in conversations
                        if isinstance(message, dict)
                    ]
                }

                if "tools" in row:
                    tools = row["tools"]
                    if tools is None:
                        result["tools"] = []
                    else:
                        if isinstance(tools, str):
                            # A JSON-encoded string is decoded first; when that
                            # fails the raw string is kept below.
                            try:
                                tools = json.loads(tools)
                            except json.JSONDecodeError:
                                logger.warning(
                                    f"Line {line_no}: 'tools' is a string but not valid JSON, keeping as-is"
                                )
                        if isinstance(tools, (list, dict)):
                            result["tools"] = json.dumps(tools, ensure_ascii=False)
                        else:
                            result["tools"] = tools

                yield result

            except Exception as e:
                logger.warning(f"Skipping line {line_no}: {e}")
                continue
+ fd_locks = _try_acquire(args) + + dev_list = ",".join(str(x.gpu_id) for x in fd_locks) + os.environ["CUDA_VISIBLE_DEVICES"] = dev_list + + if args.env: + for env_var in args.env: + name, value = env_var.split("=") + os.environ[name] = value + print( + f"[gpu_lock_exec] Setting environment variable: {name}={value}", + flush=True, + ) + print(f"[gpu_lock_exec] Acquired GPUs: {dev_list}", flush=True) + + _os_execvp(args) + + +def _os_execvp(args): + cmd = args.cmd + if cmd[0] == "--": + cmd = cmd[1:] + + # propagate the environment variables + os.execvp(cmd[0], cmd) + + +def _parse_args(): + p = argparse.ArgumentParser() + p.add_argument( + "--count", type=int, default=None, help="Acquire this many GPUs (any free ones)" + ) + p.add_argument( + "--devices", + type=str, + default=None, + help="Comma separated explicit devices to acquire (e.g. 0,1)", + ) + p.add_argument( + "--total-gpus", type=int, default=8, help="Total GPUs on the machine" + ) + p.add_argument( + "--timeout", + type=int, + default=3600, + help="Seconds to wait for locks before failing", + ) + p.add_argument( + "--env", + type=str, + default=None, + nargs="*", + help="Environment variables to set (e.g. HF_TOKEN=1234567890)", + ) + p.add_argument( + "--lock-path-pattern", + type=str, + default="/dev/shm/custom_gpu_lock_{gpu_id}.lock", + help='Filename pattern with "{gpu_id}" placeholder', + ) + p.add_argument( + "--print-only", + action="store_true", + help="Probe free devices and print them (does NOT hold locks)", + ) + p.add_argument( + "cmd", + nargs=argparse.REMAINDER, + help="Command to exec after '--' (required unless --print-only)", + ) + args = p.parse_args() + + if "{gpu_id}" not in args.lock_path_pattern: + raise Exception("ERROR: --lock-path-pattern must contain '{i}' placeholder.") + + if not args.cmd and not args.print_only: + raise Exception("ERROR: missing command to run. 
Use -- before command.") + + return args + + +def _execute_print_only(args): + free = [] + _ensure_lock_files(path_pattern=args.lock_path_pattern, total_gpus=args.total_gpus) + for i in range(args.total_gpus): + try: + fd_lock = FdLock(args.lock_path_pattern, i) + fd_lock.open() + try: + fd_lock.lock() + fcntl.flock(fd_lock.fd, fcntl.LOCK_UN) + free.append(i) + except BlockingIOError: + pass + fd_lock.close() + except Exception as e: + print( + f"Warning: Error while probing lock: {e}", file=sys.stderr, flush=True + ) + + print("Free GPUs:", ",".join(str(x) for x in free), flush=True) + + +def _try_acquire(args): + if args.devices: + devs = _parse_devices(args.devices) + return _try_acquire_specific(devs, args.lock_path_pattern, args.timeout) + else: + return _try_acquire_count( + args.count, args.total_gpus, args.lock_path_pattern, args.timeout + ) + + +def _try_acquire_specific(devs: List[int], path_pattern: str, timeout: int): + fd_locks = [] + start = time.time() + try: + _ensure_lock_files(path_pattern, max(devs) + 1) + for gpu_id in devs: + fd_lock = FdLock(path_pattern, gpu_id=gpu_id) + fd_lock.open() + while True: + try: + fd_lock.lock() + break + except BlockingIOError: + if time.time() - start > timeout: + raise TimeoutError(f"Timeout while waiting for GPU {gpu_id}") + time.sleep(SLEEP_BACKOFF * random.random()) + fd_locks.append(fd_lock) + return fd_locks + except Exception as e: + print( + f"Error during specific GPU acquisition: {e}", file=sys.stderr, flush=True + ) + for fd_lock in fd_locks: + fd_lock.close() + raise + + +def _try_acquire_count(count: int, total_gpus: int, path_pattern: str, timeout: int): + start = time.time() + _ensure_lock_files(path_pattern, total_gpus) + while True: + fd_locks: List = [] + for gpu_id in range(total_gpus): + fd_lock = FdLock(path_pattern, gpu_id=gpu_id) + fd_lock.open() + try: + fd_lock.lock() + except BlockingIOError: + fd_lock.close() + continue + + fd_locks.append(fd_lock) + if len(fd_locks) == count: + return 
fd_locks + + gotten_gpu_ids = [x.gpu_id for x in fd_locks] + for fd_lock in fd_locks: + fd_lock.close() + del fd_lock + + if time.time() - start > timeout: + raise TimeoutError(f"Timeout acquiring {count} GPUs (out of {total_gpus})") + + print( + f"[gpu_lock_exec] try_acquire_count failed, sleep and retry (only got: {gotten_gpu_ids})", + flush=True, + ) + time.sleep(SLEEP_BACKOFF * random.random()) + + +class FdLock: + def __init__(self, path_pattern, gpu_id: int): + self.gpu_id = gpu_id + self.path = _get_lock_path(path_pattern, self.gpu_id) + self.fd = None + + def open(self): + assert self.fd is None + self.fd = open(self.path, "a+") + # try to avoid lock disappear when execvp + os.set_inheritable(self.fd.fileno(), True) + + def lock(self): + assert self.fd is not None + fcntl.flock(self.fd, fcntl.LOCK_EX | fcntl.LOCK_NB) + + def close(self): + assert self.fd is not None + try: + self.fd.close() + except Exception as e: + print( + f"Warning: Failed to close file descriptor: {e}", + file=sys.stderr, + flush=True, + ) + self.fd = None + + +def _ensure_lock_files(path_pattern: str, total_gpus: int): + lock_dir = os.path.dirname(path_pattern) + if lock_dir: + os.makedirs(lock_dir, exist_ok=True) + for gpu_id in range(total_gpus): + p = _get_lock_path(path_pattern, gpu_id) + try: + open(p, "a").close() + except Exception as e: + print( + f"Warning: Could not create lock file {p}: {e}", + file=sys.stderr, + flush=True, + ) + + +def _get_lock_path(path_pattern: str, gpu_id: int) -> str: + return path_pattern.format(gpu_id=gpu_id) + + +def _parse_devices(s: str) -> List[int]: + return [int(x) for x in s.split(",") if x.strip() != ""] + + +if __name__ == "__main__": + main() diff --git a/idea1/tests/test_data/__init__.py b/idea1/tests/test_data/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/idea1/tests/test_data/data/tool_use_conversation.jsonl 
b/idea1/tests/test_data/data/tool_use_conversation.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..98145132ca1d9d2e0bae1aa375d7f46c0873fb28 --- /dev/null +++ b/idea1/tests/test_data/data/tool_use_conversation.jsonl @@ -0,0 +1 @@ +{"id": 9, "conversations": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": ":user content."}, {"role": "assistant", "content": "assistant content1.", "reasoning_content": "", "tool_calls": [{"id": "call_111", "type": "function", "function": {"name": "read_file", "arguments": "{\"end_line\": 120, \"file_path\": \"xml.jpg\", \"start_line\": 1}"}}]}, {"role": "tool", "content": "tool content1."}, {"role": "assistant", "content": "assistant content2.", "reasoning_content": "", "tool_calls": [{"id": "call_222", "type": "function", "function": {"name": "search_replace", "arguments": "{\"file_path\": \"spec.pdf\", \"replacements\": \n[{\"new_text\": \"