Add files using upload-large-folder tool
Browse files
This view is limited to 50 files because it contains too many changes. See raw diff
- SpecForge-ext/.github/ISSUE_TEMPLATE/1-bug-report.yaml +38 -0
- SpecForge-ext/.github/ISSUE_TEMPLATE/2-feature-request.yaml +23 -0
- SpecForge-ext/.github/workflows/lint.yaml +22 -0
- SpecForge-ext/.github/workflows/publish_docs.yaml +72 -0
- SpecForge-ext/.github/workflows/publish_pypi.yaml +33 -0
- SpecForge-ext/.github/workflows/test.yaml +63 -0
- SpecForge-ext/cache/compiled_kernels/fxgraph/b7/fb7yof3yk2k4yeeufmf3rp4g2vv24l2ugw2wy6l6vsmz4q7x37uw/dqu44yvnqab6lpc7c524ppofvnrpyphyicmvwgol774itktqbe6 +0 -0
- SpecForge-ext/cache/compiled_kernels/fxgraph/d5/fd5mtfiljkcqso2ovhkqewcgmm352ybgny3jo64kzoxueahy6joc/mdrdlva6q5sia32yf5vu6qd2ly7pmheoa7pyfstdzofb743b37j +0 -0
- SpecForge-ext/cache/compiled_kernels/fxgraph/gy/fgyjasy24lyvf45hmbtxzqve4lgbh5xzxmkza7fmcqqortd67gcc/4bg5eqhunja4mv5ckfxus66wcew7soy42pgbqzxrrjxj7hxkuyi +0 -0
- SpecForge-ext/cache/compiled_kernels/fxgraph/kp/fkp2diorfj5u3lv4yqas3fhord3y5dha4rxjvk6clv6mpo6wq5ts/72shc3jpmfkbxncw3rpaeflm7lrlu74btuqitszauoym3ykbgak +0 -0
- SpecForge-ext/cache/compiled_kernels/fxgraph/ut/futctst56igpyuhuqwbj7ifo6wjbelbfftnnnzt6mvpdv4laznjz/dhqctmvlhieh3qlw4a5j6y2cphxvtax4r6yljewoxpn6hjg6coj +0 -0
- SpecForge-ext/cache/compiled_kernels/fxgraph/va/fvayrjdgzr3pmbuvfegior263vfw5xzrxpu77pd2o4whnn4i7oe2/r46qg2hi5tlizn4e6hm6gfprjga5kb46ijq7utpaiuhyp7zzokk +0 -0
- SpecForge-ext/cache/compiled_kernels/fxgraph/yw/fyw74tihmwdurnkl74w5ng6i55dk7dj65ql2ezo5bq4cxgbcw5p5/qhhy6gvdfgumsjojnqxnbpx2e5yb5fsfqmeafki2x7itryz4zt3 +0 -0
- SpecForge-ext/cache/compiled_kernels/fxgraph/zh/fzh3cbljcdwt76rsppwrcnk6dkxcmc5r6vtprpqx5patcfb3rsuv/z5dccw35nurtwmemllsbeulds4dudef7jdbkz5xya7g52sv36tr +0 -0
- SpecForge-ext/cache/compiled_kernels/triton/7/25SMJXR2INGZCZI64NAKGLW77JZOIG6LAES6NHHOQFOTKNXS6PHA/__grp__triton_red_fused_argmax_1.json +1 -0
- SpecForge-ext/cache/compiled_kernels/triton/7/25SMJXR2INGZCZI64NAKGLW77JZOIG6LAES6NHHOQFOTKNXS6PHA/triton_red_fused_argmax_1.cubin +0 -0
- SpecForge-ext/cache/compiled_kernels/triton/7/25SMJXR2INGZCZI64NAKGLW77JZOIG6LAES6NHHOQFOTKNXS6PHA/triton_red_fused_argmax_1.json +1 -0
- SpecForge-ext/cache/compiled_kernels/triton/7/25SMJXR2INGZCZI64NAKGLW77JZOIG6LAES6NHHOQFOTKNXS6PHA/triton_red_fused_argmax_1.llir +206 -0
- SpecForge-ext/cache/compiled_kernels/triton/7/25SMJXR2INGZCZI64NAKGLW77JZOIG6LAES6NHHOQFOTKNXS6PHA/triton_red_fused_argmax_1.ptx +490 -0
- SpecForge-ext/cache/compiled_kernels/triton/7/25SMJXR2INGZCZI64NAKGLW77JZOIG6LAES6NHHOQFOTKNXS6PHA/triton_red_fused_argmax_1.source +323 -0
- SpecForge-ext/cache/compiled_kernels/triton/7/25SMJXR2INGZCZI64NAKGLW77JZOIG6LAES6NHHOQFOTKNXS6PHA/triton_red_fused_argmax_1.ttgir +218 -0
- SpecForge-ext/cache/compiled_kernels/triton/7/25SMJXR2INGZCZI64NAKGLW77JZOIG6LAES6NHHOQFOTKNXS6PHA/triton_red_fused_argmax_1.ttir +217 -0
- SpecForge-ext/cache/compiled_kernels/triton/7/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/__grp__triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.json +1 -0
- SpecForge-ext/cache/compiled_kernels/triton/7/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.cubin +0 -0
- SpecForge-ext/cache/compiled_kernels/triton/7/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.json +1 -0
- SpecForge-ext/cache/compiled_kernels/triton/7/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.llir +266 -0
- SpecForge-ext/cache/compiled_kernels/triton/7/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ptx +640 -0
- SpecForge-ext/cache/compiled_kernels/triton/7/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.source +379 -0
- SpecForge-ext/cache/compiled_kernels/triton/7/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ttgir +270 -0
- SpecForge-ext/cache/compiled_kernels/triton/7/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ttir +246 -0
- SpecForge-ext/cache/compiled_kernels/triton/7/A7DYCXJM4X5DHYLAIRTU6BFB3S5UCV3W4C27BWQBJGXYAG3NWQWA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json +1 -0
- SpecForge-ext/cache/compiled_kernels/triton/7/A7DYCXJM4X5DHYLAIRTU6BFB3S5UCV3W4C27BWQBJGXYAG3NWQWA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.source +0 -0
- SpecForge-ext/cache/compiled_kernels/triton/7/A7DYCXJM4X5DHYLAIRTU6BFB3S5UCV3W4C27BWQBJGXYAG3NWQWA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttgir +841 -0
- SpecForge-ext/cache/compiled_kernels/triton/7/A7DYCXJM4X5DHYLAIRTU6BFB3S5UCV3W4C27BWQBJGXYAG3NWQWA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttir +799 -0
- SpecForge-ext/cache/compiled_kernels/triton/7/BZGMO2ROUNNVUMEUFCQSSF6FGNZS2MEIPAEECDHYOJVVX3KB75AA/__grp__triton_red_fused_argmax_1.json +1 -0
- SpecForge-ext/cache/compiled_kernels/triton/7/BZGMO2ROUNNVUMEUFCQSSF6FGNZS2MEIPAEECDHYOJVVX3KB75AA/triton_red_fused_argmax_1.cubin +0 -0
- SpecForge-ext/cache/compiled_kernels/triton/7/BZGMO2ROUNNVUMEUFCQSSF6FGNZS2MEIPAEECDHYOJVVX3KB75AA/triton_red_fused_argmax_1.json +1 -0
- SpecForge-ext/cache/compiled_kernels/triton/7/BZGMO2ROUNNVUMEUFCQSSF6FGNZS2MEIPAEECDHYOJVVX3KB75AA/triton_red_fused_argmax_1.llir +1166 -0
- SpecForge-ext/cache/compiled_kernels/triton/7/BZGMO2ROUNNVUMEUFCQSSF6FGNZS2MEIPAEECDHYOJVVX3KB75AA/triton_red_fused_argmax_1.ptx +0 -0
- SpecForge-ext/cache/compiled_kernels/triton/7/BZGMO2ROUNNVUMEUFCQSSF6FGNZS2MEIPAEECDHYOJVVX3KB75AA/triton_red_fused_argmax_1.source +323 -0
- SpecForge-ext/cache/compiled_kernels/triton/7/BZGMO2ROUNNVUMEUFCQSSF6FGNZS2MEIPAEECDHYOJVVX3KB75AA/triton_red_fused_argmax_1.ttgir +217 -0
- SpecForge-ext/cache/compiled_kernels/triton/7/BZGMO2ROUNNVUMEUFCQSSF6FGNZS2MEIPAEECDHYOJVVX3KB75AA/triton_red_fused_argmax_1.ttir +213 -0
- SpecForge-ext/cache/compiled_kernels/triton/7/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/__grp__triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.json +1 -0
- SpecForge-ext/cache/compiled_kernels/triton/7/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.cubin +0 -0
- SpecForge-ext/cache/compiled_kernels/triton/7/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.llir +318 -0
- SpecForge-ext/cache/compiled_kernels/triton/7/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ptx +736 -0
- SpecForge-ext/cache/compiled_kernels/triton/7/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.source +418 -0
- SpecForge-ext/cache/compiled_kernels/triton/7/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ttgir +280 -0
- SpecForge-ext/cache/compiled_kernels/triton/7/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ttir +283 -0
- SpecForge-ext/cache/compiled_kernels/triton/7/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/__grp__triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json +1 -0
SpecForge-ext/.github/ISSUE_TEMPLATE/1-bug-report.yaml
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: 🐞 Bug report
|
| 2 |
+
description: Create a report to help us reproduce and fix the bug
|
| 3 |
+
title: "[Bug] "
|
| 4 |
+
labels: ['Bug']
|
| 5 |
+
|
| 6 |
+
body:
|
| 7 |
+
- type: checkboxes
|
| 8 |
+
attributes:
|
| 9 |
+
label: Checklist
|
| 10 |
+
options:
|
| 11 |
+
- label: 1. I have searched related issues but cannot get the expected help.
|
| 12 |
+
- label: 2. The bug has not been fixed in the latest version.
|
| 13 |
+
- label: 3. Please note that if the bug-related issue you submitted lacks corresponding environment info and a minimal reproducible demo, it will be challenging for us to reproduce and resolve the issue, reducing the likelihood of receiving feedback.
|
| 14 |
+
- label: 4. If the issue you raised is not a bug but a question, please raise a discussion at https://github.com/sgl-project/SpecForge/discussions/new/choose — otherwise, it will be closed.
|
| 15 |
+
- label: 5. Please use English, otherwise it will be closed.
|
| 16 |
+
- type: textarea
|
| 17 |
+
attributes:
|
| 18 |
+
label: Describe the bug
|
| 19 |
+
description: A clear and concise description of what the bug is.
|
| 20 |
+
validations:
|
| 21 |
+
required: true
|
| 22 |
+
- type: textarea
|
| 23 |
+
attributes:
|
| 24 |
+
label: Reproduction
|
| 25 |
+
description: |
|
| 26 |
+
What command or script did you run? Which **model** are you using?
|
| 27 |
+
placeholder: |
|
| 28 |
+
A placeholder for the command.
|
| 29 |
+
validations:
|
| 30 |
+
required: true
|
| 31 |
+
- type: textarea
|
| 32 |
+
attributes:
|
| 33 |
+
label: Environment
|
| 34 |
+
description: |
|
| 35 |
+
Please provide necessary environment information here. Otherwise the issue will be closed.
|
| 36 |
+
placeholder: Environment here.
|
| 37 |
+
validations:
|
| 38 |
+
required: true
|
SpecForge-ext/.github/ISSUE_TEMPLATE/2-feature-request.yaml
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: 🚀 Feature request
|
| 2 |
+
description: Suggest an idea for this project
|
| 3 |
+
title: "[Feature] "
|
| 4 |
+
|
| 5 |
+
body:
|
| 6 |
+
- type: checkboxes
|
| 7 |
+
attributes:
|
| 8 |
+
label: Checklist
|
| 9 |
+
options:
|
| 10 |
+
- label: 1. If the issue you raised is not a feature but a question, please raise a discussion at https://github.com/sgl-project/SpecForge/discussions/new/choose — otherwise, it will be closed.
|
| 11 |
+
- label: 2. Please use English, otherwise it will be closed.
|
| 12 |
+
- type: textarea
|
| 13 |
+
attributes:
|
| 14 |
+
label: Motivation
|
| 15 |
+
description: |
|
| 16 |
+
A clear and concise description of the motivation of the feature.
|
| 17 |
+
validations:
|
| 18 |
+
required: true
|
| 19 |
+
- type: textarea
|
| 20 |
+
attributes:
|
| 21 |
+
label: Related resources
|
| 22 |
+
description: |
|
| 23 |
+
If there is an official code release or third-party implementations, please also provide the information here, which would be very helpful.
|
SpecForge-ext/.github/workflows/lint.yaml
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: Lint
|
| 2 |
+
|
| 3 |
+
on: [ pull_request ]
|
| 4 |
+
|
| 5 |
+
jobs:
|
| 6 |
+
lint:
|
| 7 |
+
runs-on: ubuntu-latest
|
| 8 |
+
steps:
|
| 9 |
+
- uses: actions/checkout@v4
|
| 10 |
+
|
| 11 |
+
- name: Set up Python
|
| 12 |
+
uses: actions/setup-python@v4
|
| 13 |
+
with:
|
| 14 |
+
python-version: '3.11'
|
| 15 |
+
|
| 16 |
+
- name: Install pre-commit hook
|
| 17 |
+
run: |
|
| 18 |
+
python -m pip install pre-commit
|
| 19 |
+
pre-commit install
|
| 20 |
+
|
| 21 |
+
- name: Linting
|
| 22 |
+
run: pre-commit run --all-files --show-diff-on-failure
|
SpecForge-ext/.github/workflows/publish_docs.yaml
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: Release Documentation
|
| 2 |
+
|
| 3 |
+
on:
|
| 4 |
+
push:
|
| 5 |
+
branches:
|
| 6 |
+
- main
|
| 7 |
+
paths:
|
| 8 |
+
- "docs/**"
|
| 9 |
+
- "version.txt"
|
| 10 |
+
workflow_dispatch:
|
| 11 |
+
|
| 12 |
+
concurrency:
|
| 13 |
+
group: release-docs-${{ github.ref }}
|
| 14 |
+
cancel-in-progress: true
|
| 15 |
+
|
| 16 |
+
jobs:
|
| 17 |
+
deploy-github-pages:
|
| 18 |
+
runs-on: ubuntu-latest
|
| 19 |
+
if: github.repository == 'sgl-project/specforge' || github.repository == 'sleepcoo/SpecForge'
|
| 20 |
+
permissions:
|
| 21 |
+
contents: write
|
| 22 |
+
steps:
|
| 23 |
+
- name: Checkout code
|
| 24 |
+
uses: actions/checkout@v4
|
| 25 |
+
|
| 26 |
+
- name: Setup Python
|
| 27 |
+
uses: actions/setup-python@v5
|
| 28 |
+
with:
|
| 29 |
+
python-version: '3.13'
|
| 30 |
+
|
| 31 |
+
- name: Setup Node.js
|
| 32 |
+
uses: actions/setup-node@v4
|
| 33 |
+
with:
|
| 34 |
+
node-version: '20'
|
| 35 |
+
cache: 'npm'
|
| 36 |
+
cache-dependency-path: docs/spec_bundle/package-lock.json
|
| 37 |
+
|
| 38 |
+
- name: Install dependencies
|
| 39 |
+
run: |
|
| 40 |
+
sudo apt-get update && sudo apt-get install -y pandoc parallel retry
|
| 41 |
+
pip install -r docs/requirements.txt
|
| 42 |
+
|
| 43 |
+
- name: Build spec bundle dashboard
|
| 44 |
+
run: |
|
| 45 |
+
# Copy logos to public directory
|
| 46 |
+
cp assets/logo.png docs/spec_bundle/public/logo.png
|
| 47 |
+
cp docs/_static/imgs/specbundle-logo.png docs/spec_bundle/public/specbundle-logo.png
|
| 48 |
+
cd docs/spec_bundle
|
| 49 |
+
npm ci
|
| 50 |
+
npm run build
|
| 51 |
+
# Clean up node_modules to prevent Sphinx from processing them
|
| 52 |
+
rm -rf node_modules
|
| 53 |
+
cd ..
|
| 54 |
+
|
| 55 |
+
- name: Build documentation
|
| 56 |
+
run: |
|
| 57 |
+
cd docs
|
| 58 |
+
make compile
|
| 59 |
+
make html
|
| 60 |
+
# Copy SpecBundle to root of output directory
|
| 61 |
+
mkdir -p _build/html/SpecBundle
|
| 62 |
+
cp -r spec_bundle/dist/* _build/html/SpecBundle/
|
| 63 |
+
|
| 64 |
+
- name: Add .nojekyll file
|
| 65 |
+
run: |
|
| 66 |
+
touch ./docs/_build/html/.nojekyll
|
| 67 |
+
|
| 68 |
+
- name: Deploy
|
| 69 |
+
uses: peaceiris/actions-gh-pages@v4
|
| 70 |
+
with:
|
| 71 |
+
github_token: ${{ secrets.GITHUB_TOKEN }}
|
| 72 |
+
publish_dir: ./docs/_build/html
|
SpecForge-ext/.github/workflows/publish_pypi.yaml
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: Publish to PyPI
|
| 2 |
+
|
| 3 |
+
on:
|
| 4 |
+
workflow_dispatch:
|
| 5 |
+
|
| 6 |
+
jobs:
|
| 7 |
+
build-n-publish:
|
| 8 |
+
if: github.event_name == 'workflow_dispatch'
|
| 9 |
+
name: Build and publish Python distributions to PyPI
|
| 10 |
+
runs-on: ubuntu-latest
|
| 11 |
+
timeout-minutes: 20
|
| 12 |
+
environment:
|
| 13 |
+
name: pypi
|
| 14 |
+
url: https://pypi.org/p/specforgeee
|
| 15 |
+
permissions:
|
| 16 |
+
id-token: write
|
| 17 |
+
steps:
|
| 18 |
+
- uses: actions/checkout@v2
|
| 19 |
+
|
| 20 |
+
- uses: actions/setup-python@v2
|
| 21 |
+
with:
|
| 22 |
+
python-version: '3.11'
|
| 23 |
+
|
| 24 |
+
- run: python setup.py sdist build
|
| 25 |
+
|
| 26 |
+
# publish to PyPI; this job only runs when the workflow is dispatched manually (no branch filter is applied)
|
| 27 |
+
- name: Publish package to PyPI
|
| 28 |
+
id: publish
|
| 29 |
+
uses: pypa/gh-action-pypi-publish@release/v1
|
| 30 |
+
with:
|
| 31 |
+
user: __token__
|
| 32 |
+
password: ${{ secrets.PYPI_TOKEN }}
|
| 33 |
+
verbose: true
|
SpecForge-ext/.github/workflows/test.yaml
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: PR Test
|
| 2 |
+
|
| 3 |
+
on:
|
| 4 |
+
pull_request:
|
| 5 |
+
branches: [ main ]
|
| 6 |
+
workflow_dispatch:
|
| 7 |
+
|
| 8 |
+
concurrency:
|
| 9 |
+
group: pr-test-${{ github.ref }}
|
| 10 |
+
cancel-in-progress: true
|
| 11 |
+
|
| 12 |
+
permissions:
|
| 13 |
+
contents: read
|
| 14 |
+
|
| 15 |
+
jobs:
|
| 16 |
+
unit-test:
|
| 17 |
+
if: (github.repository == 'sgl-project/SpecForge' || github.event_name == 'pull_request') &&
|
| 18 |
+
github.event.pull_request.draft == false
|
| 19 |
+
runs-on: [self-hosted]
|
| 20 |
+
container:
|
| 21 |
+
image: lmsysorg/sglang:v0.5.5 # we lock to this version to avoid repeated docker pull
|
| 22 |
+
options: --gpus all --shm-size=2g --rm -v /dev/shm
|
| 23 |
+
steps:
|
| 24 |
+
- name: Checkout code
|
| 25 |
+
uses: actions/checkout@v4
|
| 26 |
+
|
| 27 |
+
- name: Restore cache
|
| 28 |
+
run: |
|
| 29 |
+
if [ -d /github/home/cache ] && [ ! -z "$(ls -A /github/home/cache/)" ]; then
|
| 30 |
+
cp -p -r /github/home/cache ./
|
| 31 |
+
fi
|
| 32 |
+
|
| 33 |
+
if [ -d /github/home/sf ] && [ ! -z "$(ls -A /github/home/sf/)" ]; then
|
| 34 |
+
cp -p -r /github/home/sf ./
|
| 35 |
+
fi
|
| 36 |
+
|
| 37 |
+
- name: Remove flashinfer # this is needed to prevent flashinfer JIT compilation from hanging the program
|
| 38 |
+
run: |
|
| 39 |
+
rm -rf /github/home/.cache/flashinfer
|
| 40 |
+
|
| 41 |
+
- name: Install dependencies
|
| 42 |
+
shell: bash
|
| 43 |
+
run: |
|
| 44 |
+
# if sf venv does not exist, create it
|
| 45 |
+
if [ ! -d sf ]; then
|
| 46 |
+
uv venv sf -p 3.11
|
| 47 |
+
fi
|
| 48 |
+
source sf/bin/activate
|
| 49 |
+
uv pip install setuptools
|
| 50 |
+
MAX_JOBS=8 uv pip install -v ".[fa]" --prerelease=allow --no-build-isolation
|
| 51 |
+
|
| 52 |
+
- name: Run test
|
| 53 |
+
timeout-minutes: 30
|
| 54 |
+
shell: bash
|
| 55 |
+
run: |
|
| 56 |
+
source sf/bin/activate
|
| 57 |
+
export PYTHONPATH=$PWD
|
| 58 |
+
python -m unittest discover -s ./tests -p "test_*.py" -v
|
| 59 |
+
|
| 60 |
+
- name: Save cache
|
| 61 |
+
run: |
|
| 62 |
+
cp -p -r sf /github/home/
|
| 63 |
+
cp -p -r cache /github/home/
|
SpecForge-ext/cache/compiled_kernels/fxgraph/b7/fb7yof3yk2k4yeeufmf3rp4g2vv24l2ugw2wy6l6vsmz4q7x37uw/dqu44yvnqab6lpc7c524ppofvnrpyphyicmvwgol774itktqbe6
ADDED
|
Binary file (54.4 kB). View file
|
|
|
SpecForge-ext/cache/compiled_kernels/fxgraph/d5/fd5mtfiljkcqso2ovhkqewcgmm352ybgny3jo64kzoxueahy6joc/mdrdlva6q5sia32yf5vu6qd2ly7pmheoa7pyfstdzofb743b37j
ADDED
|
Binary file (54.4 kB). View file
|
|
|
SpecForge-ext/cache/compiled_kernels/fxgraph/gy/fgyjasy24lyvf45hmbtxzqve4lgbh5xzxmkza7fmcqqortd67gcc/4bg5eqhunja4mv5ckfxus66wcew7soy42pgbqzxrrjxj7hxkuyi
ADDED
|
Binary file (54.4 kB). View file
|
|
|
SpecForge-ext/cache/compiled_kernels/fxgraph/kp/fkp2diorfj5u3lv4yqas3fhord3y5dha4rxjvk6clv6mpo6wq5ts/72shc3jpmfkbxncw3rpaeflm7lrlu74btuqitszauoym3ykbgak
ADDED
|
Binary file (54.4 kB). View file
|
|
|
SpecForge-ext/cache/compiled_kernels/fxgraph/ut/futctst56igpyuhuqwbj7ifo6wjbelbfftnnnzt6mvpdv4laznjz/dhqctmvlhieh3qlw4a5j6y2cphxvtax4r6yljewoxpn6hjg6coj
ADDED
|
Binary file (54.4 kB). View file
|
|
|
SpecForge-ext/cache/compiled_kernels/fxgraph/va/fvayrjdgzr3pmbuvfegior263vfw5xzrxpu77pd2o4whnn4i7oe2/r46qg2hi5tlizn4e6hm6gfprjga5kb46ijq7utpaiuhyp7zzokk
ADDED
|
Binary file (54.4 kB). View file
|
|
|
SpecForge-ext/cache/compiled_kernels/fxgraph/yw/fyw74tihmwdurnkl74w5ng6i55dk7dj65ql2ezo5bq4cxgbcw5p5/qhhy6gvdfgumsjojnqxnbpx2e5yb5fsfqmeafki2x7itryz4zt3
ADDED
|
Binary file (54.4 kB). View file
|
|
|
SpecForge-ext/cache/compiled_kernels/fxgraph/zh/fzh3cbljcdwt76rsppwrcnk6dkxcmc5r6vtprpqx5patcfb3rsuv/z5dccw35nurtwmemllsbeulds4dudef7jdbkz5xya7g52sv36tr
ADDED
|
Binary file (54.4 kB). View file
|
|
|
SpecForge-ext/cache/compiled_kernels/triton/7/25SMJXR2INGZCZI64NAKGLW77JZOIG6LAES6NHHOQFOTKNXS6PHA/__grp__triton_red_fused_argmax_1.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"child_paths": {"triton_red_fused_argmax_1.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/25SMJXR2INGZCZI64NAKGLW77JZOIG6LAES6NHHOQFOTKNXS6PHA/triton_red_fused_argmax_1.source", "triton_red_fused_argmax_1.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/25SMJXR2INGZCZI64NAKGLW77JZOIG6LAES6NHHOQFOTKNXS6PHA/triton_red_fused_argmax_1.ttir", "triton_red_fused_argmax_1.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/25SMJXR2INGZCZI64NAKGLW77JZOIG6LAES6NHHOQFOTKNXS6PHA/triton_red_fused_argmax_1.ttgir", "triton_red_fused_argmax_1.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/25SMJXR2INGZCZI64NAKGLW77JZOIG6LAES6NHHOQFOTKNXS6PHA/triton_red_fused_argmax_1.llir", "triton_red_fused_argmax_1.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/25SMJXR2INGZCZI64NAKGLW77JZOIG6LAES6NHHOQFOTKNXS6PHA/triton_red_fused_argmax_1.ptx", "triton_red_fused_argmax_1.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/25SMJXR2INGZCZI64NAKGLW77JZOIG6LAES6NHHOQFOTKNXS6PHA/triton_red_fused_argmax_1.cubin", "triton_red_fused_argmax_1.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/25SMJXR2INGZCZI64NAKGLW77JZOIG6LAES6NHHOQFOTKNXS6PHA/triton_red_fused_argmax_1.json"}}
|
SpecForge-ext/cache/compiled_kernels/triton/7/25SMJXR2INGZCZI64NAKGLW77JZOIG6LAES6NHHOQFOTKNXS6PHA/triton_red_fused_argmax_1.cubin
ADDED
|
Binary file (14.6 kB). View file
|
|
|
SpecForge-ext/cache/compiled_kernels/triton/7/25SMJXR2INGZCZI64NAKGLW77JZOIG6LAES6NHHOQFOTKNXS6PHA/triton_red_fused_argmax_1.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"hash": "d764c4de3a434d91651ee340a32edffa72e41bcb0125e69cee815d3536f2f3ce", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 256, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused_argmax_1"}
|
SpecForge-ext/cache/compiled_kernels/triton/7/25SMJXR2INGZCZI64NAKGLW77JZOIG6LAES6NHHOQFOTKNXS6PHA/triton_red_fused_argmax_1.llir
ADDED
|
@@ -0,0 +1,206 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
; ModuleID = 'LLVMDialectModule'
|
| 2 |
+
source_filename = "LLVMDialectModule"
|
| 3 |
+
target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64"
|
| 4 |
+
|
| 5 |
+
@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8], align 16
|
| 6 |
+
|
| 7 |
+
; Function Attrs: nounwind
|
| 8 |
+
define ptx_kernel void @triton_red_fused_argmax_1(ptr addrspace(1) %0, ptr addrspace(1) %1, i64 %2, i64 %3, i32 %4, i32 %5, ptr addrspace(1) readnone captures(none) %6, ptr addrspace(1) readnone captures(none) %7) local_unnamed_addr #0 !dbg !4 {
|
| 9 |
+
%9 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7
|
| 10 |
+
%10 = shl i32 %9, 6, !dbg !8
|
| 11 |
+
%11 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9
|
| 12 |
+
%12 = and i32 %11, 252, !dbg !9
|
| 13 |
+
%13 = lshr exact i32 %12, 2, !dbg !9
|
| 14 |
+
%14 = or disjoint i32 %13, %10, !dbg !10
|
| 15 |
+
%15 = icmp slt i32 %14, %4, !dbg !11
|
| 16 |
+
%16 = and i32 %11, 3, !dbg !12
|
| 17 |
+
%17 = sext i32 %14 to i64, !dbg !13
|
| 18 |
+
%.frozen = freeze i64 %2, !dbg !14
|
| 19 |
+
%18 = sdiv i64 %17, %.frozen, !dbg !14
|
| 20 |
+
%19 = mul i64 %18, %.frozen, !dbg !13
|
| 21 |
+
%.decomposed = sub i64 %17, %19, !dbg !13
|
| 22 |
+
%20 = mul i64 %18, %3, !dbg !15
|
| 23 |
+
%.idx = mul nsw i64 %.decomposed, 128000
|
| 24 |
+
%21 = getelementptr i8, ptr addrspace(1) %0, i64 %.idx
|
| 25 |
+
%invariant.gep = getelementptr float, ptr addrspace(1) %21, i64 %20, !dbg !16
|
| 26 |
+
%.fr = freeze i1 %15
|
| 27 |
+
%22 = zext nneg i32 %16 to i64, !dbg !16
|
| 28 |
+
br i1 %.fr, label %.split.us, label %.split.preheader
|
| 29 |
+
|
| 30 |
+
.split.preheader: ; preds = %8
|
| 31 |
+
%invariant.gep11 = getelementptr float, ptr addrspace(1) %invariant.gep, i64 %22, !dbg !16
|
| 32 |
+
br label %.split, !dbg !16
|
| 33 |
+
|
| 34 |
+
.split.us: ; preds = %8, %.split.us
|
| 35 |
+
%indvars.iv7 = phi i64 [ %indvars.iv.next8, %.split.us ], [ 0, %8 ]
|
| 36 |
+
%23 = phi i32 [ %44, %.split.us ], [ 2147483647, %8 ]
|
| 37 |
+
%24 = phi float [ %42, %.split.us ], [ 0xFFF0000000000000, %8 ]
|
| 38 |
+
%25 = or disjoint i64 %indvars.iv7, %22, !dbg !17
|
| 39 |
+
%gep.us = getelementptr float, ptr addrspace(1) %invariant.gep, i64 %25, !dbg !18
|
| 40 |
+
%26 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !19
|
| 41 |
+
%27 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %gep.us, i64 %26, i1 true) #4, !dbg !19
|
| 42 |
+
%28 = bitcast i32 %27 to float, !dbg !19
|
| 43 |
+
%29 = fcmp ogt float %24, %28, !dbg !20
|
| 44 |
+
%30 = fcmp oeq float %24, %28, !dbg !24
|
| 45 |
+
%31 = fcmp uno float %24, 0.000000e+00, !dbg !25
|
| 46 |
+
%32 = fcmp uno float %28, 0.000000e+00, !dbg !26
|
| 47 |
+
%33 = xor i1 %32, true, !dbg !27
|
| 48 |
+
%34 = and i1 %31, %33, !dbg !28
|
| 49 |
+
%35 = or i1 %29, %34, !dbg !29
|
| 50 |
+
%36 = and i1 %31, %32, !dbg !30
|
| 51 |
+
%37 = or i1 %30, %36, !dbg !31
|
| 52 |
+
%38 = sext i32 %23 to i64, !dbg !32
|
| 53 |
+
%39 = icmp sgt i64 %25, %38, !dbg !32
|
| 54 |
+
%40 = and i1 %39, %37, !dbg !33
|
| 55 |
+
%41 = or i1 %35, %40, !dbg !34
|
| 56 |
+
%42 = select i1 %41, float %24, float %28, !dbg !35
|
| 57 |
+
%43 = trunc nuw nsw i64 %25 to i32, !dbg !36
|
| 58 |
+
%44 = select i1 %41, i32 %23, i32 %43, !dbg !36
|
| 59 |
+
%indvars.iv.next8 = add nuw nsw i64 %indvars.iv7, 4, !dbg !16
|
| 60 |
+
%45 = icmp samesign ult i64 %indvars.iv7, 31996, !dbg !16
|
| 61 |
+
br i1 %45, label %.split.us, label %.split3.us, !dbg !16
|
| 62 |
+
|
| 63 |
+
.split: ; preds = %.split.preheader, %.split
|
| 64 |
+
%indvars.iv = phi i64 [ 0, %.split.preheader ], [ %indvars.iv.next, %.split ]
|
| 65 |
+
%gep12 = getelementptr float, ptr addrspace(1) %invariant.gep11, i64 %indvars.iv, !dbg !18
|
| 66 |
+
%46 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !19
|
| 67 |
+
%47 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %gep12, i64 %46, i1 false) #4, !dbg !19
|
| 68 |
+
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 4, !dbg !16
|
| 69 |
+
%48 = icmp samesign ult i64 %indvars.iv, 31996, !dbg !16
|
| 70 |
+
br i1 %48, label %.split, label %.split3.us, !dbg !16
|
| 71 |
+
|
| 72 |
+
.split3.us: ; preds = %.split, %.split.us
|
| 73 |
+
%.us-phi = phi float [ %42, %.split.us ], [ 0xFFF0000000000000, %.split ], !dbg !9
|
| 74 |
+
%.us-phi4 = phi i32 [ %44, %.split.us ], [ 2147483647, %.split ], !dbg !9
|
| 75 |
+
%49 = and i32 %11, 63, !dbg !9
|
| 76 |
+
%50 = or disjoint i32 %10, %49, !dbg !10
|
| 77 |
+
%51 = icmp slt i32 %50, %4, !dbg !11
|
| 78 |
+
%52 = bitcast float %.us-phi to i32, !dbg !37
|
| 79 |
+
%53 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %52, i32 2, i32 31), !dbg !37
|
| 80 |
+
%54 = bitcast i32 %53 to float, !dbg !37
|
| 81 |
+
%55 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %.us-phi4, i32 2, i32 31), !dbg !37
|
| 82 |
+
%56 = fcmp ogt float %.us-phi, %54, !dbg !39
|
| 83 |
+
%57 = fcmp oeq float %.us-phi, %54, !dbg !40
|
| 84 |
+
%58 = fcmp uno float %.us-phi, 0.000000e+00, !dbg !41
|
| 85 |
+
%59 = fcmp uno float %54, 0.000000e+00, !dbg !42
|
| 86 |
+
%60 = xor i1 %59, true, !dbg !43
|
| 87 |
+
%61 = and i1 %58, %60, !dbg !44
|
| 88 |
+
%62 = or i1 %56, %61, !dbg !45
|
| 89 |
+
%63 = and i1 %58, %59, !dbg !46
|
| 90 |
+
%64 = or i1 %57, %63, !dbg !47
|
| 91 |
+
%65 = icmp slt i32 %.us-phi4, %55, !dbg !48
|
| 92 |
+
%66 = and i1 %65, %64, !dbg !49
|
| 93 |
+
%67 = or i1 %62, %66, !dbg !50
|
| 94 |
+
%68 = select i1 %67, float %.us-phi, float %54, !dbg !51
|
| 95 |
+
%69 = select i1 %67, i32 %.us-phi4, i32 %55, !dbg !52
|
| 96 |
+
%70 = bitcast float %68 to i32, !dbg !37
|
| 97 |
+
%71 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %70, i32 1, i32 31), !dbg !37
|
| 98 |
+
%72 = bitcast i32 %71 to float, !dbg !37
|
| 99 |
+
%73 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %69, i32 1, i32 31), !dbg !37
|
| 100 |
+
%74 = fcmp ogt float %68, %72, !dbg !39
|
| 101 |
+
%75 = fcmp oeq float %68, %72, !dbg !40
|
| 102 |
+
%76 = fcmp uno float %68, 0.000000e+00, !dbg !41
|
| 103 |
+
%77 = fcmp uno float %72, 0.000000e+00, !dbg !42
|
| 104 |
+
%78 = xor i1 %77, true, !dbg !43
|
| 105 |
+
%79 = and i1 %76, %78, !dbg !44
|
| 106 |
+
%80 = or i1 %74, %79, !dbg !45
|
| 107 |
+
%81 = and i1 %77, %76, !dbg !46
|
| 108 |
+
%82 = or i1 %75, %81, !dbg !47
|
| 109 |
+
%83 = icmp slt i32 %69, %73, !dbg !48
|
| 110 |
+
%84 = and i1 %83, %82, !dbg !49
|
| 111 |
+
%85 = or i1 %80, %84, !dbg !50
|
| 112 |
+
%86 = select i1 %85, i32 %69, i32 %73, !dbg !52
|
| 113 |
+
%87 = sext i32 %50 to i64, !dbg !53
|
| 114 |
+
%88 = getelementptr i64, ptr addrspace(1) %1, i64 %87, !dbg !53
|
| 115 |
+
%89 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %12, !dbg !54
|
| 116 |
+
%90 = insertelement <1 x i32> poison, i32 %86, i64 0, !dbg !54
|
| 117 |
+
store <1 x i32> %90, ptr addrspace(3) %89, align 4, !dbg !54
|
| 118 |
+
tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !54
|
| 119 |
+
%91 = shl nuw nsw i32 %49, 2, !dbg !54
|
| 120 |
+
%92 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %91, !dbg !54
|
| 121 |
+
%93 = load i32, ptr addrspace(3) %92, align 4, !dbg !54
|
| 122 |
+
%94 = sext i32 %93 to i64, !dbg !54
|
| 123 |
+
%95 = and i32 %11, 192, !dbg !54
|
| 124 |
+
%96 = icmp eq i32 %95, 0, !dbg !54
|
| 125 |
+
%97 = and i1 %96, %51, !dbg !54
|
| 126 |
+
tail call void asm sideeffect "@$2 st.global.b64 [ $1 + 0 ], { $0 };", "l,l,b"(i64 %94, ptr addrspace(1) %88, i1 %97) #4, !dbg !54
|
| 127 |
+
ret void, !dbg !55
|
| 128 |
+
}
|
| 129 |
+
|
| 130 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
| 131 |
+
declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
|
| 132 |
+
|
| 133 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
| 134 |
+
declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
|
| 135 |
+
|
| 136 |
+
; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
|
| 137 |
+
declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2
|
| 138 |
+
|
| 139 |
+
; Function Attrs: convergent nocallback nounwind
|
| 140 |
+
declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3
|
| 141 |
+
|
| 142 |
+
attributes #0 = { nounwind "nvvm.reqntid"="256" }
|
| 143 |
+
attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
| 144 |
+
attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
|
| 145 |
+
attributes #3 = { convergent nocallback nounwind }
|
| 146 |
+
attributes #4 = { nounwind }
|
| 147 |
+
|
| 148 |
+
!llvm.dbg.cu = !{!0}
|
| 149 |
+
!llvm.module.flags = !{!2, !3}
|
| 150 |
+
|
| 151 |
+
!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
|
| 152 |
+
!1 = !DIFile(filename: "c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w")
|
| 153 |
+
!2 = !{i32 2, !"Debug Info Version", i32 3}
|
| 154 |
+
!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
|
| 155 |
+
!4 = distinct !DISubprogram(name: "triton_red_fused_argmax_1", linkageName: "triton_red_fused_argmax_1", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
|
| 156 |
+
!5 = !DISubroutineType(cc: DW_CC_normal, types: !6)
|
| 157 |
+
!6 = !{}
|
| 158 |
+
!7 = !DILocation(line: 22, column: 28, scope: !4)
|
| 159 |
+
!8 = !DILocation(line: 22, column: 33, scope: !4)
|
| 160 |
+
!9 = !DILocation(line: 23, column: 44, scope: !4)
|
| 161 |
+
!10 = !DILocation(line: 23, column: 23, scope: !4)
|
| 162 |
+
!11 = !DILocation(line: 24, column: 21, scope: !4)
|
| 163 |
+
!12 = !DILocation(line: 25, column: 37, scope: !4)
|
| 164 |
+
!13 = !DILocation(line: 27, column: 19, scope: !4)
|
| 165 |
+
!14 = !DILocation(line: 28, column: 19, scope: !4)
|
| 166 |
+
!15 = !DILocation(line: 38, column: 56, scope: !4)
|
| 167 |
+
!16 = !DILocation(line: 32, column: 40, scope: !4)
|
| 168 |
+
!17 = !DILocation(line: 33, column: 31, scope: !4)
|
| 169 |
+
!18 = !DILocation(line: 38, column: 34, scope: !4)
|
| 170 |
+
!19 = !DILocation(line: 38, column: 61, scope: !4)
|
| 171 |
+
!20 = !DILocation(line: 144, column: 21, scope: !21, inlinedAt: !23)
|
| 172 |
+
!21 = distinct !DILexicalBlockFile(scope: !4, file: !22, discriminator: 0)
|
| 173 |
+
!22 = !DIFile(filename: "triton_helpers.py", directory: "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime")
|
| 174 |
+
!23 = !DILocation(line: 41, column: 38, scope: !4)
|
| 175 |
+
!24 = !DILocation(line: 145, column: 23, scope: !21, inlinedAt: !23)
|
| 176 |
+
!25 = !DILocation(line: 147, column: 29, scope: !21, inlinedAt: !23)
|
| 177 |
+
!26 = !DILocation(line: 148, column: 29, scope: !21, inlinedAt: !23)
|
| 178 |
+
!27 = !DILocation(line: 149, column: 31, scope: !21, inlinedAt: !23)
|
| 179 |
+
!28 = !DILocation(line: 149, column: 27, scope: !21, inlinedAt: !23)
|
| 180 |
+
!29 = !DILocation(line: 149, column: 16, scope: !21, inlinedAt: !23)
|
| 181 |
+
!30 = !DILocation(line: 151, column: 27, scope: !21, inlinedAt: !23)
|
| 182 |
+
!31 = !DILocation(line: 151, column: 17, scope: !21, inlinedAt: !23)
|
| 183 |
+
!32 = !DILocation(line: 154, column: 31, scope: !21, inlinedAt: !23)
|
| 184 |
+
!33 = !DILocation(line: 154, column: 21, scope: !21, inlinedAt: !23)
|
| 185 |
+
!34 = !DILocation(line: 154, column: 12, scope: !21, inlinedAt: !23)
|
| 186 |
+
!35 = !DILocation(line: 155, column: 35, scope: !21, inlinedAt: !23)
|
| 187 |
+
!36 = !DILocation(line: 155, column: 69, scope: !21, inlinedAt: !23)
|
| 188 |
+
!37 = !DILocation(line: 165, column: 42, scope: !21, inlinedAt: !38)
|
| 189 |
+
!38 = !DILocation(line: 45, column: 75, scope: !4)
|
| 190 |
+
!39 = !DILocation(line: 144, column: 21, scope: !21, inlinedAt: !38)
|
| 191 |
+
!40 = !DILocation(line: 145, column: 23, scope: !21, inlinedAt: !38)
|
| 192 |
+
!41 = !DILocation(line: 147, column: 29, scope: !21, inlinedAt: !38)
|
| 193 |
+
!42 = !DILocation(line: 148, column: 29, scope: !21, inlinedAt: !38)
|
| 194 |
+
!43 = !DILocation(line: 149, column: 31, scope: !21, inlinedAt: !38)
|
| 195 |
+
!44 = !DILocation(line: 149, column: 27, scope: !21, inlinedAt: !38)
|
| 196 |
+
!45 = !DILocation(line: 149, column: 16, scope: !21, inlinedAt: !38)
|
| 197 |
+
!46 = !DILocation(line: 151, column: 27, scope: !21, inlinedAt: !38)
|
| 198 |
+
!47 = !DILocation(line: 151, column: 17, scope: !21, inlinedAt: !38)
|
| 199 |
+
!48 = !DILocation(line: 154, column: 31, scope: !21, inlinedAt: !38)
|
| 200 |
+
!49 = !DILocation(line: 154, column: 21, scope: !21, inlinedAt: !38)
|
| 201 |
+
!50 = !DILocation(line: 154, column: 12, scope: !21, inlinedAt: !38)
|
| 202 |
+
!51 = !DILocation(line: 155, column: 35, scope: !21, inlinedAt: !38)
|
| 203 |
+
!52 = !DILocation(line: 155, column: 69, scope: !21, inlinedAt: !38)
|
| 204 |
+
!53 = !DILocation(line: 47, column: 25, scope: !4)
|
| 205 |
+
!54 = !DILocation(line: 47, column: 36, scope: !4)
|
| 206 |
+
!55 = !DILocation(line: 47, column: 4, scope: !4)
|
SpecForge-ext/cache/compiled_kernels/triton/7/25SMJXR2INGZCZI64NAKGLW77JZOIG6LAES6NHHOQFOTKNXS6PHA/triton_red_fused_argmax_1.ptx
ADDED
|
@@ -0,0 +1,490 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
//
|
| 2 |
+
// Generated by LLVM NVPTX Back-End
|
| 3 |
+
//
|
| 4 |
+
|
| 5 |
+
.version 8.7
|
| 6 |
+
.target sm_90a
|
| 7 |
+
.address_size 64
|
| 8 |
+
|
| 9 |
+
// .globl triton_red_fused_argmax_1 // -- Begin function triton_red_fused_argmax_1
|
| 10 |
+
.extern .shared .align 16 .b8 global_smem[];
|
| 11 |
+
// @triton_red_fused_argmax_1
|
| 12 |
+
.visible .entry triton_red_fused_argmax_1(
|
| 13 |
+
.param .u64 .ptr .global .align 1 triton_red_fused_argmax_1_param_0,
|
| 14 |
+
.param .u64 .ptr .global .align 1 triton_red_fused_argmax_1_param_1,
|
| 15 |
+
.param .u64 triton_red_fused_argmax_1_param_2,
|
| 16 |
+
.param .u64 triton_red_fused_argmax_1_param_3,
|
| 17 |
+
.param .u32 triton_red_fused_argmax_1_param_4,
|
| 18 |
+
.param .u32 triton_red_fused_argmax_1_param_5,
|
| 19 |
+
.param .u64 .ptr .global .align 1 triton_red_fused_argmax_1_param_6,
|
| 20 |
+
.param .u64 .ptr .global .align 1 triton_red_fused_argmax_1_param_7
|
| 21 |
+
)
|
| 22 |
+
.reqntid 256
|
| 23 |
+
{
|
| 24 |
+
.reg .pred %p<39>;
|
| 25 |
+
.reg .b32 %r<55>;
|
| 26 |
+
.reg .b64 %rd<54>;
|
| 27 |
+
.loc 1 18 0 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:18:0
|
| 28 |
+
$L__func_begin0:
|
| 29 |
+
.loc 1 18 0 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:18:0
|
| 30 |
+
|
| 31 |
+
// %bb.0:
|
| 32 |
+
ld.param.b32 %r12, [triton_red_fused_argmax_1_param_4];
|
| 33 |
+
$L__tmp0:
|
| 34 |
+
.loc 1 22 28 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:22:28
|
| 35 |
+
mov.u32 %r13, %ctaid.x;
|
| 36 |
+
.loc 1 22 33 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:22:33
|
| 37 |
+
shl.b32 %r1, %r13, 6;
|
| 38 |
+
ld.param.b64 %rd20, [triton_red_fused_argmax_1_param_2];
|
| 39 |
+
.loc 1 23 44 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:23:44
|
| 40 |
+
mov.u32 %r2, %tid.x;
|
| 41 |
+
bfe.u32 %r4, %r2, 2, 6;
|
| 42 |
+
.loc 1 23 23 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:23:23
|
| 43 |
+
or.b32 %r14, %r4, %r1;
|
| 44 |
+
.loc 1 25 37 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:25:37
|
| 45 |
+
and.b32 %r5, %r2, 3;
|
| 46 |
+
.loc 1 27 19 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:27:19
|
| 47 |
+
cvt.s64.s32 %rd1, %r14;
|
| 48 |
+
.loc 1 28 19 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:28:19
|
| 49 |
+
or.b64 %rd21, %rd1, %rd20;
|
| 50 |
+
and.b64 %rd22, %rd21, -4294967296;
|
| 51 |
+
setp.ne.b64 %p1, %rd22, 0;
|
| 52 |
+
cvt.u32.u64 %r50, %rd1;
|
| 53 |
+
@%p1 bra $L__BB0_2;
|
| 54 |
+
bra.uni $L__BB0_1;
|
| 55 |
+
$L__BB0_2:
|
| 56 |
+
div.s64 %rd49, %rd1, %rd20;
|
| 57 |
+
bra.uni $L__BB0_3;
|
| 58 |
+
$L__BB0_1:
|
| 59 |
+
cvt.u32.u64 %r15, %rd20;
|
| 60 |
+
div.u32 %r17, %r50, %r15;
|
| 61 |
+
cvt.u64.u32 %rd49, %r17;
|
| 62 |
+
$L__BB0_3:
|
| 63 |
+
.loc 1 0 19 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:0:19
|
| 64 |
+
ld.param.b64 %rd19, [triton_red_fused_argmax_1_param_3];
|
| 65 |
+
ld.param.b64 %rd18, [triton_red_fused_argmax_1_param_1];
|
| 66 |
+
ld.param.b64 %rd17, [triton_red_fused_argmax_1_param_0];
|
| 67 |
+
and.b32 %r3, %r2, 252;
|
| 68 |
+
.loc 1 32 40 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:32:40
|
| 69 |
+
cvt.u64.u32 %rd6, %r5;
|
| 70 |
+
setp.ge.s32 %p2, %r50, %r12;
|
| 71 |
+
@%p2 bra $L__BB0_6;
|
| 72 |
+
// %bb.4: // %.split.us.preheader
|
| 73 |
+
shl.b64 %rd35, %rd19, 2;
|
| 74 |
+
mul.lo.s64 %rd36, %rd20, 128000;
|
| 75 |
+
sub.s64 %rd37, %rd35, %rd36;
|
| 76 |
+
mul.lo.s64 %rd38, %rd49, %rd37;
|
| 77 |
+
add.s32 %r26, %r1, %r4;
|
| 78 |
+
mad.wide.s32 %rd39, %r26, 128000, %rd38;
|
| 79 |
+
shl.b64 %rd40, %rd6, 2;
|
| 80 |
+
add.s64 %rd41, %rd39, %rd40;
|
| 81 |
+
add.s64 %rd50, %rd17, %rd41;
|
| 82 |
+
mov.b32 %r53, 0fFF800000;
|
| 83 |
+
mov.b32 %r54, 2147483647;
|
| 84 |
+
mov.b64 %rd51, 0;
|
| 85 |
+
$L__BB0_5: // %.split.us
|
| 86 |
+
// =>This Inner Loop Header: Depth=1
|
| 87 |
+
.loc 1 38 34 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:38:34
|
| 88 |
+
add.s64 %rd45, %rd6, %rd51;
|
| 89 |
+
.loc 1 38 61 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:38:61
|
| 90 |
+
// begin inline asm
|
| 91 |
+
mov.u64 %rd42, 0x0;
|
| 92 |
+
createpolicy.fractional.L2::evict_first.b64 %rd42, 1.0;
|
| 93 |
+
// end inline asm
|
| 94 |
+
mov.b32 %r28, 0;
|
| 95 |
+
mov.pred %p5, -1;
|
| 96 |
+
// begin inline asm
|
| 97 |
+
mov.u32 %r27, %r28;
|
| 98 |
+
@%p5 ld.global.L1::evict_first.L2::cache_hint.b32 { %r27 }, [ %rd50 + 0 ], %rd42;
|
| 99 |
+
// end inline asm
|
| 100 |
+
$L__tmp1:
|
| 101 |
+
.loc 2 144 21 // triton_helpers.py:144:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:41:38 ]
|
| 102 |
+
setp.gt.f32 %p6, %r53, %r27;
|
| 103 |
+
.loc 2 145 23 // triton_helpers.py:145:23 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:41:38 ]
|
| 104 |
+
setp.eq.f32 %p7, %r53, %r27;
|
| 105 |
+
.loc 2 147 29 // triton_helpers.py:147:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:41:38 ]
|
| 106 |
+
setp.nan.f32 %p8, %r53, %r53;
|
| 107 |
+
.loc 2 148 29 // triton_helpers.py:148:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:41:38 ]
|
| 108 |
+
setp.nan.f32 %p9, %r27, %r27;
|
| 109 |
+
setp.num.f32 %p10, %r27, %r27;
|
| 110 |
+
.loc 2 149 27 // triton_helpers.py:149:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:41:38 ]
|
| 111 |
+
and.pred %p11, %p8, %p10;
|
| 112 |
+
.loc 2 149 16 // triton_helpers.py:149:16 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:41:38 ]
|
| 113 |
+
or.pred %p12, %p6, %p11;
|
| 114 |
+
.loc 2 151 27 // triton_helpers.py:151:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:41:38 ]
|
| 115 |
+
and.pred %p13, %p8, %p9;
|
| 116 |
+
.loc 2 151 17 // triton_helpers.py:151:17 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:41:38 ]
|
| 117 |
+
or.pred %p14, %p7, %p13;
|
| 118 |
+
.loc 2 154 31 // triton_helpers.py:154:31 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:41:38 ]
|
| 119 |
+
cvt.s64.s32 %rd46, %r54;
|
| 120 |
+
setp.gt.s64 %p15, %rd45, %rd46;
|
| 121 |
+
.loc 2 154 21 // triton_helpers.py:154:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:41:38 ]
|
| 122 |
+
and.pred %p16, %p15, %p14;
|
| 123 |
+
.loc 2 154 12 // triton_helpers.py:154:12 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:41:38 ]
|
| 124 |
+
or.pred %p17, %p12, %p16;
|
| 125 |
+
.loc 2 155 35 // triton_helpers.py:155:35 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:41:38 ]
|
| 126 |
+
selp.f32 %r53, %r53, %r27, %p17;
|
| 127 |
+
cvt.u32.u64 %r29, %rd45;
|
| 128 |
+
.loc 2 155 69 // triton_helpers.py:155:69 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:41:38 ]
|
| 129 |
+
selp.b32 %r54, %r54, %r29, %p17;
|
| 130 |
+
$L__tmp2:
|
| 131 |
+
.loc 1 32 40 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:32:40
|
| 132 |
+
add.s64 %rd11, %rd51, 4;
|
| 133 |
+
add.s64 %rd50, %rd50, 16;
|
| 134 |
+
setp.lt.u64 %p18, %rd51, 31996;
|
| 135 |
+
mov.b64 %rd51, %rd11;
|
| 136 |
+
@%p18 bra $L__BB0_5;
|
| 137 |
+
bra.uni $L__BB0_8;
|
| 138 |
+
$L__BB0_6: // %.split.preheader
|
| 139 |
+
shl.b64 %rd24, %rd19, 2;
|
| 140 |
+
mul.lo.s64 %rd25, %rd20, 128000;
|
| 141 |
+
sub.s64 %rd26, %rd24, %rd25;
|
| 142 |
+
mul.lo.s64 %rd27, %rd49, %rd26;
|
| 143 |
+
add.s32 %r19, %r1, %r4;
|
| 144 |
+
mad.wide.s32 %rd28, %r19, 128000, %rd27;
|
| 145 |
+
shl.b64 %rd29, %rd6, 2;
|
| 146 |
+
add.s64 %rd30, %rd28, %rd29;
|
| 147 |
+
add.s64 %rd52, %rd17, %rd30;
|
| 148 |
+
mov.b64 %rd53, -4;
|
| 149 |
+
$L__BB0_7: // %.split
|
| 150 |
+
// =>This Inner Loop Header: Depth=1
|
| 151 |
+
.loc 1 38 61 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:38:61
|
| 152 |
+
// begin inline asm
|
| 153 |
+
mov.u64 %rd31, 0x0;
|
| 154 |
+
createpolicy.fractional.L2::evict_first.b64 %rd31, 1.0;
|
| 155 |
+
// end inline asm
|
| 156 |
+
mov.b32 %r21, 0;
|
| 157 |
+
mov.pred %p3, 0;
|
| 158 |
+
// begin inline asm
|
| 159 |
+
mov.u32 %r20, %r21;
|
| 160 |
+
@%p3 ld.global.L1::evict_first.L2::cache_hint.b32 { %r20 }, [ %rd52 + 0 ], %rd31;
|
| 161 |
+
// end inline asm
|
| 162 |
+
.loc 1 32 40 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:32:40
|
| 163 |
+
add.s64 %rd53, %rd53, 4;
|
| 164 |
+
add.s64 %rd52, %rd52, 16;
|
| 165 |
+
setp.lt.u64 %p4, %rd53, 31996;
|
| 166 |
+
mov.b32 %r54, 2147483647;
|
| 167 |
+
mov.b32 %r53, 0fFF800000;
|
| 168 |
+
@%p4 bra $L__BB0_7;
|
| 169 |
+
$L__BB0_8: // %.split3.us
|
| 170 |
+
.loc 1 23 44 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:23:44
|
| 171 |
+
and.b32 %r30, %r2, 63;
|
| 172 |
+
.loc 1 23 23 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:23:23
|
| 173 |
+
or.b32 %r31, %r1, %r30;
|
| 174 |
+
.loc 1 24 21 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:24:21
|
| 175 |
+
setp.lt.s32 %p20, %r31, %r12;
|
| 176 |
+
$L__tmp3:
|
| 177 |
+
.loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ]
|
| 178 |
+
shfl.sync.bfly.b32 %r32, %r53, 2, 31, -1;
|
| 179 |
+
shfl.sync.bfly.b32 %r33, %r54, 2, 31, -1;
|
| 180 |
+
.loc 2 144 21 // triton_helpers.py:144:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ]
|
| 181 |
+
setp.gt.f32 %p21, %r53, %r32;
|
| 182 |
+
.loc 2 145 23 // triton_helpers.py:145:23 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ]
|
| 183 |
+
setp.eq.f32 %p22, %r53, %r32;
|
| 184 |
+
.loc 2 147 29 // triton_helpers.py:147:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ]
|
| 185 |
+
setp.nan.f32 %p23, %r53, %r53;
|
| 186 |
+
.loc 2 148 29 // triton_helpers.py:148:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ]
|
| 187 |
+
setp.nan.f32 %p24, %r32, %r32;
|
| 188 |
+
setp.num.f32 %p25, %r32, %r32;
|
| 189 |
+
.loc 2 149 27 // triton_helpers.py:149:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ]
|
| 190 |
+
and.pred %p26, %p23, %p25;
|
| 191 |
+
.loc 2 149 16 // triton_helpers.py:149:16 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ]
|
| 192 |
+
or.pred %p27, %p21, %p26;
|
| 193 |
+
.loc 2 151 27 // triton_helpers.py:151:27 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ]
|
| 194 |
+
and.pred %p28, %p23, %p24;
|
| 195 |
+
.loc 2 151 17 // triton_helpers.py:151:17 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ]
|
| 196 |
+
or.pred %p29, %p22, %p28;
|
| 197 |
+
.loc 2 154 31 // triton_helpers.py:154:31 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ]
|
| 198 |
+
setp.lt.s32 %p30, %r54, %r33;
|
| 199 |
+
.loc 2 154 21 // triton_helpers.py:154:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ]
|
| 200 |
+
and.pred %p31, %p30, %p29;
|
| 201 |
+
.loc 2 154 12 // triton_helpers.py:154:12 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ]
|
| 202 |
+
or.pred %p32, %p27, %p31;
|
| 203 |
+
.loc 2 155 35 // triton_helpers.py:155:35 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ]
|
| 204 |
+
selp.f32 %r34, %r53, %r32, %p32;
|
| 205 |
+
.loc 2 155 69 // triton_helpers.py:155:69 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ]
|
| 206 |
+
selp.b32 %r35, %r54, %r33, %p32;
|
| 207 |
+
.loc 2 165 42 // triton_helpers.py:165:42 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ]
|
| 208 |
+
shfl.sync.bfly.b32 %r36, %r34, 1, 31, -1;
|
| 209 |
+
shfl.sync.bfly.b32 %r37, %r35, 1, 31, -1;
|
| 210 |
+
.loc 2 144 21 // triton_helpers.py:144:21 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ]
|
| 211 |
+
setp.gt.f32 %p33, %r34, %r36;
|
| 212 |
+
.loc 2 145 23 // triton_helpers.py:145:23 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ]
|
| 213 |
+
setp.eq.f32 %p34, %r34, %r36;
|
| 214 |
+
.loc 2 147 29 // triton_helpers.py:147:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ]
|
| 215 |
+
setp.nan.f32 %p35, %r34, %r34;
|
| 216 |
+
.loc 2 148 29 // triton_helpers.py:148:29 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ]
|
| 217 |
+
setp.nan.f32 %p36, %r36, %r36;
|
| 218 |
+
.loc 2 154 31 // triton_helpers.py:154:31 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ]
|
| 219 |
+
setp.lt.s32 %p37, %r35, %r37;
|
| 220 |
+
.loc 2 155 69 // triton_helpers.py:155:69 @[ c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:45:75 ]
|
| 221 |
+
selp.b32 %r38, %r35, %r37, %p35;
|
| 222 |
+
selp.b32 %r39, %r38, %r37, %p36;
|
| 223 |
+
selp.b32 %r40, %r35, %r39, %p34;
|
| 224 |
+
selp.b32 %r41, %r40, %r37, %p37;
|
| 225 |
+
selp.b32 %r42, %r41, %r35, %p36;
|
| 226 |
+
selp.b32 %r43, %r42, %r41, %p35;
|
| 227 |
+
selp.b32 %r44, %r35, %r43, %p33;
|
| 228 |
+
$L__tmp4:
|
| 229 |
+
.loc 1 47 25 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:47:25
|
| 230 |
+
mad.wide.s32 %rd48, %r31, 8, %rd18;
|
| 231 |
+
.loc 1 47 36 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:47:36
|
| 232 |
+
mov.b32 %r45, global_smem;
|
| 233 |
+
add.s32 %r46, %r45, %r3;
|
| 234 |
+
st.shared.b32 [%r46], %r44;
|
| 235 |
+
bar.sync 0;
|
| 236 |
+
shl.b32 %r47, %r30, 2;
|
| 237 |
+
add.s32 %r48, %r45, %r47;
|
| 238 |
+
ld.shared.s32 %rd47, [%r48];
|
| 239 |
+
and.b32 %r49, %r2, 192;
|
| 240 |
+
setp.eq.b32 %p38, %r49, 0;
|
| 241 |
+
and.pred %p19, %p38, %p20;
|
| 242 |
+
// begin inline asm
|
| 243 |
+
@%p19 st.global.b64 [ %rd48 + 0 ], { %rd47 };
|
| 244 |
+
// end inline asm
|
| 245 |
+
.loc 1 47 4 // c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py:47:4
|
| 246 |
+
ret;
|
| 247 |
+
$L__tmp5:
|
| 248 |
+
$L__func_end0:
|
| 249 |
+
// -- End function
|
| 250 |
+
}
|
| 251 |
+
.file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py"
|
| 252 |
+
.file 2 "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py"
|
| 253 |
+
.section .debug_abbrev
|
| 254 |
+
{
|
| 255 |
+
.b8 1 // Abbreviation Code
|
| 256 |
+
.b8 17 // DW_TAG_compile_unit
|
| 257 |
+
.b8 1 // DW_CHILDREN_yes
|
| 258 |
+
.b8 37 // DW_AT_producer
|
| 259 |
+
.b8 8 // DW_FORM_string
|
| 260 |
+
.b8 19 // DW_AT_language
|
| 261 |
+
.b8 5 // DW_FORM_data2
|
| 262 |
+
.b8 3 // DW_AT_name
|
| 263 |
+
.b8 8 // DW_FORM_string
|
| 264 |
+
.b8 16 // DW_AT_stmt_list
|
| 265 |
+
.b8 6 // DW_FORM_data4
|
| 266 |
+
.b8 27 // DW_AT_comp_dir
|
| 267 |
+
.b8 8 // DW_FORM_string
|
| 268 |
+
.b8 0 // EOM(1)
|
| 269 |
+
.b8 0 // EOM(2)
|
| 270 |
+
.b8 2 // Abbreviation Code
|
| 271 |
+
.b8 46 // DW_TAG_subprogram
|
| 272 |
+
.b8 0 // DW_CHILDREN_no
|
| 273 |
+
.b8 3 // DW_AT_name
|
| 274 |
+
.b8 8 // DW_FORM_string
|
| 275 |
+
.b8 32 // DW_AT_inline
|
| 276 |
+
.b8 11 // DW_FORM_data1
|
| 277 |
+
.b8 0 // EOM(1)
|
| 278 |
+
.b8 0 // EOM(2)
|
| 279 |
+
.b8 3 // Abbreviation Code
|
| 280 |
+
.b8 46 // DW_TAG_subprogram
|
| 281 |
+
.b8 1 // DW_CHILDREN_yes
|
| 282 |
+
.b8 17 // DW_AT_low_pc
|
| 283 |
+
.b8 1 // DW_FORM_addr
|
| 284 |
+
.b8 18 // DW_AT_high_pc
|
| 285 |
+
.b8 1 // DW_FORM_addr
|
| 286 |
+
.b8 49 // DW_AT_abstract_origin
|
| 287 |
+
.b8 19 // DW_FORM_ref4
|
| 288 |
+
.b8 0 // EOM(1)
|
| 289 |
+
.b8 0 // EOM(2)
|
| 290 |
+
.b8 4 // Abbreviation Code
|
| 291 |
+
.b8 29 // DW_TAG_inlined_subroutine
|
| 292 |
+
.b8 0 // DW_CHILDREN_no
|
| 293 |
+
.b8 49 // DW_AT_abstract_origin
|
| 294 |
+
.b8 19 // DW_FORM_ref4
|
| 295 |
+
.b8 17 // DW_AT_low_pc
|
| 296 |
+
.b8 1 // DW_FORM_addr
|
| 297 |
+
.b8 18 // DW_AT_high_pc
|
| 298 |
+
.b8 1 // DW_FORM_addr
|
| 299 |
+
.b8 88 // DW_AT_call_file
|
| 300 |
+
.b8 11 // DW_FORM_data1
|
| 301 |
+
.b8 89 // DW_AT_call_line
|
| 302 |
+
.b8 11 // DW_FORM_data1
|
| 303 |
+
.b8 87 // DW_AT_call_column
|
| 304 |
+
.b8 11 // DW_FORM_data1
|
| 305 |
+
.b8 0 // EOM(1)
|
| 306 |
+
.b8 0 // EOM(2)
|
| 307 |
+
.b8 0 // EOM(3)
|
| 308 |
+
}
|
| 309 |
+
.section .debug_info
|
| 310 |
+
{
|
| 311 |
+
.b32 234 // Length of Unit
|
| 312 |
+
.b8 2 // DWARF version number
|
| 313 |
+
.b8 0
|
| 314 |
+
.b32 .debug_abbrev // Offset Into Abbrev. Section
|
| 315 |
+
.b8 8 // Address Size (in bytes)
|
| 316 |
+
.b8 1 // Abbrev [1] 0xb:0xe3 DW_TAG_compile_unit
|
| 317 |
+
.b8 116 // DW_AT_producer
|
| 318 |
+
.b8 114
|
| 319 |
+
.b8 105
|
| 320 |
+
.b8 116
|
| 321 |
+
.b8 111
|
| 322 |
+
.b8 110
|
| 323 |
+
.b8 0
|
| 324 |
+
.b8 2 // DW_AT_language
|
| 325 |
+
.b8 0
|
| 326 |
+
.b8 99 // DW_AT_name
|
| 327 |
+
.b8 52
|
| 328 |
+
.b8 119
|
| 329 |
+
.b8 100
|
| 330 |
+
.b8 104
|
| 331 |
+
.b8 119
|
| 332 |
+
.b8 108
|
| 333 |
+
.b8 117
|
| 334 |
+
.b8 54
|
| 335 |
+
.b8 121
|
| 336 |
+
.b8 98
|
| 337 |
+
.b8 51
|
| 338 |
+
.b8 119
|
| 339 |
+
.b8 99
|
| 340 |
+
.b8 119
|
| 341 |
+
.b8 97
|
| 342 |
+
.b8 122
|
| 343 |
+
.b8 100
|
| 344 |
+
.b8 110
|
| 345 |
+
.b8 122
|
| 346 |
+
.b8 109
|
| 347 |
+
.b8 103
|
| 348 |
+
.b8 122
|
| 349 |
+
.b8 101
|
| 350 |
+
.b8 119
|
| 351 |
+
.b8 105
|
| 352 |
+
.b8 101
|
| 353 |
+
.b8 109
|
| 354 |
+
.b8 118
|
| 355 |
+
.b8 122
|
| 356 |
+
.b8 110
|
| 357 |
+
.b8 120
|
| 358 |
+
.b8 118
|
| 359 |
+
.b8 114
|
| 360 |
+
.b8 114
|
| 361 |
+
.b8 51
|
| 362 |
+
.b8 53
|
| 363 |
+
.b8 50
|
| 364 |
+
.b8 53
|
| 365 |
+
.b8 101
|
| 366 |
+
.b8 111
|
| 367 |
+
.b8 106
|
| 368 |
+
.b8 117
|
| 369 |
+
.b8 112
|
| 370 |
+
.b8 113
|
| 371 |
+
.b8 106
|
| 372 |
+
.b8 108
|
| 373 |
+
.b8 100
|
| 374 |
+
.b8 111
|
| 375 |
+
.b8 53
|
| 376 |
+
.b8 112
|
| 377 |
+
.b8 116
|
| 378 |
+
.b8 46
|
| 379 |
+
.b8 112
|
| 380 |
+
.b8 121
|
| 381 |
+
.b8 0
|
| 382 |
+
.b32 .debug_line // DW_AT_stmt_list
|
| 383 |
+
.b8 47 // DW_AT_comp_dir
|
| 384 |
+
.b8 119
|
| 385 |
+
.b8 111
|
| 386 |
+
.b8 114
|
| 387 |
+
.b8 107
|
| 388 |
+
.b8 115
|
| 389 |
+
.b8 112
|
| 390 |
+
.b8 97
|
| 391 |
+
.b8 99
|
| 392 |
+
.b8 101
|
| 393 |
+
.b8 47
|
| 394 |
+
.b8 104
|
| 395 |
+
.b8 97
|
| 396 |
+
.b8 110
|
| 397 |
+
.b8 114
|
| 398 |
+
.b8 117
|
| 399 |
+
.b8 105
|
| 400 |
+
.b8 47
|
| 401 |
+
.b8 83
|
| 402 |
+
.b8 112
|
| 403 |
+
.b8 101
|
| 404 |
+
.b8 99
|
| 405 |
+
.b8 70
|
| 406 |
+
.b8 111
|
| 407 |
+
.b8 114
|
| 408 |
+
.b8 103
|
| 409 |
+
.b8 101
|
| 410 |
+
.b8 45
|
| 411 |
+
.b8 101
|
| 412 |
+
.b8 120
|
| 413 |
+
.b8 116
|
| 414 |
+
.b8 47
|
| 415 |
+
.b8 99
|
| 416 |
+
.b8 97
|
| 417 |
+
.b8 99
|
| 418 |
+
.b8 104
|
| 419 |
+
.b8 101
|
| 420 |
+
.b8 47
|
| 421 |
+
.b8 99
|
| 422 |
+
.b8 111
|
| 423 |
+
.b8 109
|
| 424 |
+
.b8 112
|
| 425 |
+
.b8 105
|
| 426 |
+
.b8 108
|
| 427 |
+
.b8 101
|
| 428 |
+
.b8 100
|
| 429 |
+
.b8 95
|
| 430 |
+
.b8 107
|
| 431 |
+
.b8 101
|
| 432 |
+
.b8 114
|
| 433 |
+
.b8 110
|
| 434 |
+
.b8 101
|
| 435 |
+
.b8 108
|
| 436 |
+
.b8 115
|
| 437 |
+
.b8 47
|
| 438 |
+
.b8 52
|
| 439 |
+
.b8 119
|
| 440 |
+
.b8 0
|
| 441 |
+
.b8 2 // Abbrev [2] 0x8b:0x1c DW_TAG_subprogram
|
| 442 |
+
.b8 116 // DW_AT_name
|
| 443 |
+
.b8 114
|
| 444 |
+
.b8 105
|
| 445 |
+
.b8 116
|
| 446 |
+
.b8 111
|
| 447 |
+
.b8 110
|
| 448 |
+
.b8 95
|
| 449 |
+
.b8 114
|
| 450 |
+
.b8 101
|
| 451 |
+
.b8 100
|
| 452 |
+
.b8 95
|
| 453 |
+
.b8 102
|
| 454 |
+
.b8 117
|
| 455 |
+
.b8 115
|
| 456 |
+
.b8 101
|
| 457 |
+
.b8 100
|
| 458 |
+
.b8 95
|
| 459 |
+
.b8 97
|
| 460 |
+
.b8 114
|
| 461 |
+
.b8 103
|
| 462 |
+
.b8 109
|
| 463 |
+
.b8 97
|
| 464 |
+
.b8 120
|
| 465 |
+
.b8 95
|
| 466 |
+
.b8 49
|
| 467 |
+
.b8 0
|
| 468 |
+
.b8 1 // DW_AT_inline
|
| 469 |
+
.b8 3 // Abbrev [3] 0xa7:0x46 DW_TAG_subprogram
|
| 470 |
+
.b64 $L__func_begin0 // DW_AT_low_pc
|
| 471 |
+
.b64 $L__func_end0 // DW_AT_high_pc
|
| 472 |
+
.b32 139 // DW_AT_abstract_origin
|
| 473 |
+
.b8 4 // Abbrev [4] 0xbc:0x18 DW_TAG_inlined_subroutine
|
| 474 |
+
.b32 139 // DW_AT_abstract_origin
|
| 475 |
+
.b64 $L__tmp1 // DW_AT_low_pc
|
| 476 |
+
.b64 $L__tmp2 // DW_AT_high_pc
|
| 477 |
+
.b8 1 // DW_AT_call_file
|
| 478 |
+
.b8 41 // DW_AT_call_line
|
| 479 |
+
.b8 38 // DW_AT_call_column
|
| 480 |
+
.b8 4 // Abbrev [4] 0xd4:0x18 DW_TAG_inlined_subroutine
|
| 481 |
+
.b32 139 // DW_AT_abstract_origin
|
| 482 |
+
.b64 $L__tmp3 // DW_AT_low_pc
|
| 483 |
+
.b64 $L__tmp4 // DW_AT_high_pc
|
| 484 |
+
.b8 1 // DW_AT_call_file
|
| 485 |
+
.b8 45 // DW_AT_call_line
|
| 486 |
+
.b8 75 // DW_AT_call_column
|
| 487 |
+
.b8 0 // End Of Children Mark
|
| 488 |
+
.b8 0 // End Of Children Mark
|
| 489 |
+
}
|
| 490 |
+
.section .debug_macinfo { }
|
SpecForge-ext/cache/compiled_kernels/triton/7/25SMJXR2INGZCZI64NAKGLW77JZOIG6LAES6NHHOQFOTKNXS6PHA/triton_red_fused_argmax_1.source
ADDED
|
@@ -0,0 +1,323 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":18:0)
|
| 2 |
+
#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":143:0)
|
| 3 |
+
#loc47 = loc(unknown)
|
| 4 |
+
#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":86:0)
|
| 5 |
+
#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":63:0)
|
| 6 |
+
#loc68 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":164:0)
|
| 7 |
+
#loc72 = loc("in_ptr0"(#loc))
|
| 8 |
+
#loc73 = loc("out_ptr0"(#loc))
|
| 9 |
+
#loc74 = loc("ks0"(#loc))
|
| 10 |
+
#loc75 = loc("ks1"(#loc))
|
| 11 |
+
#loc76 = loc("xnumel"(#loc))
|
| 12 |
+
#loc77 = loc("r0_numel"(#loc))
|
| 13 |
+
#loc106 = loc("a_value"(#loc35))
|
| 14 |
+
#loc107 = loc("a_index"(#loc35))
|
| 15 |
+
#loc108 = loc("b_value"(#loc35))
|
| 16 |
+
#loc109 = loc("b_index"(#loc35))
|
| 17 |
+
#loc122 = loc("x"(#loc55))
|
| 18 |
+
#loc123 = loc("x"(#loc59))
|
| 19 |
+
#loc124 = loc("value"(#loc68))
|
| 20 |
+
#loc125 = loc("index"(#loc68))
|
| 21 |
+
module {
|
| 22 |
+
tt.func public @triton_red_fused_argmax_1(%in_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<i64> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
|
| 23 |
+
%r0_numel_0 = arith.constant 32000 : i32 loc(#loc78)
|
| 24 |
+
%xoffset = tt.get_program_id x : i32 loc(#loc79)
|
| 25 |
+
%xoffset_1 = arith.constant 64 : i32 loc(#loc80)
|
| 26 |
+
%xoffset_2 = arith.constant 64 : i32 loc(#loc80)
|
| 27 |
+
%xoffset_3 = arith.muli %xoffset, %xoffset_2 : i32 loc(#loc80)
|
| 28 |
+
%xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc81)
|
| 29 |
+
%xindex_4 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc82)
|
| 30 |
+
%xindex_5 = tt.splat %xoffset_3 : i32 -> tensor<64x1xi32> loc(#loc83)
|
| 31 |
+
%xindex_6 = arith.addi %xindex_5, %xindex_4 : tensor<64x1xi32> loc(#loc83)
|
| 32 |
+
%xmask = tt.splat %xnumel : i32 -> tensor<64x1xi32> loc(#loc84)
|
| 33 |
+
%xmask_7 = arith.cmpi slt, %xindex_6, %xmask : tensor<64x1xi32> loc(#loc84)
|
| 34 |
+
%r0_base = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32> loc(#loc85)
|
| 35 |
+
%r0_base_8 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<4xi32> -> tensor<1x4xi32> loc(#loc86)
|
| 36 |
+
%x0 = arith.extsi %xindex_6 : tensor<64x1xi32> to tensor<64x1xi64> loc(#loc87)
|
| 37 |
+
%x0_9 = tt.splat %ks0 : i64 -> tensor<64x1xi64> loc(#loc87)
|
| 38 |
+
%x0_10 = arith.remsi %x0, %x0_9 : tensor<64x1xi64> loc(#loc87)
|
| 39 |
+
%x1 = arith.extsi %xindex_6 : tensor<64x1xi32> to tensor<64x1xi64> loc(#loc88)
|
| 40 |
+
%x1_11 = tt.splat %ks0 : i64 -> tensor<64x1xi64> loc(#loc88)
|
| 41 |
+
%x1_12 = arith.divsi %x1, %x1_11 : tensor<64x1xi64> loc(#loc88)
|
| 42 |
+
%_tmp2 = arith.constant 0xFF800000 : f32 loc(#loc89)
|
| 43 |
+
%_tmp2_13 = arith.constant dense<0xFF800000> : tensor<64x4xf32> loc(#loc89)
|
| 44 |
+
%_tmp2_index = arith.constant 2147483647 : i32 loc(#loc90)
|
| 45 |
+
%_tmp2_index_14 = arith.constant dense<2147483647> : tensor<64x4xi32> loc(#loc90)
|
| 46 |
+
%c0_i32 = arith.constant 0 : i32 loc(#loc14)
|
| 47 |
+
%c4_i32 = arith.constant 4 : i32 loc(#loc14)
|
| 48 |
+
%0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc14)
|
| 49 |
+
%1 = arith.bitcast %r0_numel_0 : i32 to i32 loc(#loc14)
|
| 50 |
+
%2 = arith.bitcast %c4_i32 : i32 to i32 loc(#loc14)
|
| 51 |
+
%3 = ub.poison : i32 loc(#loc14)
|
| 52 |
+
%_tmp2_index_15:2 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp2_16 = %_tmp2_13, %_tmp2_index_17 = %_tmp2_index_14) -> (tensor<64x4xf32>, tensor<64x4xi32>) : i32 {
|
| 53 |
+
%r0_index = tt.splat %r0_offset : i32 -> tensor<1x4xi32> loc(#loc92)
|
| 54 |
+
%r0_index_18 = arith.addi %r0_index, %r0_base_8 : tensor<1x4xi32> loc(#loc92)
|
| 55 |
+
%r0_mask = arith.constant dense<32000> : tensor<1x4xi32> loc(#loc93)
|
| 56 |
+
%r0_mask_19 = arith.cmpi slt, %r0_index_18, %r0_mask : tensor<1x4xi32> loc(#loc93)
|
| 57 |
+
%tmp0 = arith.constant 32000 : i32 loc(#loc94)
|
| 58 |
+
%tmp0_20 = arith.constant 32000 : i64 loc(#loc94)
|
| 59 |
+
%tmp0_21 = arith.constant dense<32000> : tensor<64x1xi64> loc(#loc94)
|
| 60 |
+
%tmp0_22 = arith.muli %tmp0_21, %x0_10 : tensor<64x1xi64> loc(#loc94)
|
| 61 |
+
%tmp0_23 = arith.extsi %r0_index_18 : tensor<1x4xi32> to tensor<1x4xi64> loc(#loc95)
|
| 62 |
+
%tmp0_24 = tt.broadcast %tmp0_23 : tensor<1x4xi64> -> tensor<64x4xi64> loc(#loc95)
|
| 63 |
+
%tmp0_25 = tt.broadcast %tmp0_22 : tensor<64x1xi64> -> tensor<64x4xi64> loc(#loc95)
|
| 64 |
+
%tmp0_26 = arith.addi %tmp0_24, %tmp0_25 : tensor<64x4xi64> loc(#loc95)
|
| 65 |
+
%tmp0_27 = tt.splat %ks1 : i64 -> tensor<64x1xi64> loc(#loc96)
|
| 66 |
+
%tmp0_28 = arith.muli %tmp0_27, %x1_12 : tensor<64x1xi64> loc(#loc96)
|
| 67 |
+
%tmp0_29 = tt.broadcast %tmp0_28 : tensor<64x1xi64> -> tensor<64x4xi64> loc(#loc97)
|
| 68 |
+
%tmp0_30 = arith.addi %tmp0_26, %tmp0_29 : tensor<64x4xi64> loc(#loc97)
|
| 69 |
+
%tmp0_31 = tt.splat %in_ptr0 : !tt.ptr<f32> -> tensor<64x4x!tt.ptr<f32>> loc(#loc98)
|
| 70 |
+
%tmp0_32 = tt.addptr %tmp0_31, %tmp0_30 : tensor<64x4x!tt.ptr<f32>>, tensor<64x4xi64> loc(#loc98)
|
| 71 |
+
%tmp0_33 = tt.broadcast %r0_mask_19 : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc99)
|
| 72 |
+
%tmp0_34 = tt.broadcast %xmask_7 : tensor<64x1xi1> -> tensor<64x4xi1> loc(#loc99)
|
| 73 |
+
%tmp0_35 = arith.andi %tmp0_33, %tmp0_34 : tensor<64x4xi1> loc(#loc99)
|
| 74 |
+
%tmp0_36 = arith.constant 0.000000e+00 : f32 loc(#loc100)
|
| 75 |
+
%tmp0_37 = arith.constant dense<0.000000e+00> : tensor<64x4xf32> loc(#loc100)
|
| 76 |
+
%tmp0_38 = tt.load %tmp0_32, %tmp0_35, %tmp0_37 evictionPolicy = evict_first : tensor<64x4x!tt.ptr<f32>> loc(#loc100)
|
| 77 |
+
%8:2 = tt.call @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32S64_4S_i32S64_4S_fp32S64_4S_i32S1_4S__(%_tmp2_16, %_tmp2_index_17, %tmp0_38, %r0_index_18) : (tensor<64x4xf32>, tensor<64x4xi32>, tensor<64x4xf32>, tensor<1x4xi32>) -> (tensor<64x4xf32>, tensor<64x4xi32>) loc(#loc24)
|
| 78 |
+
%_tmp2_39 = tt.broadcast %r0_mask_19 : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc101)
|
| 79 |
+
%_tmp2_40 = tt.broadcast %xmask_7 : tensor<64x1xi1> -> tensor<64x4xi1> loc(#loc101)
|
| 80 |
+
%_tmp2_41 = arith.andi %_tmp2_39, %_tmp2_40 : tensor<64x4xi1> loc(#loc101)
|
| 81 |
+
%_tmp2_42 = arith.select %_tmp2_41, %8#0, %_tmp2_16 : tensor<64x4xi1>, tensor<64x4xf32> loc(#loc102)
|
| 82 |
+
%_tmp2_index_43 = tt.broadcast %r0_mask_19 : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc103)
|
| 83 |
+
%_tmp2_index_44 = tt.broadcast %xmask_7 : tensor<64x1xi1> -> tensor<64x4xi1> loc(#loc103)
|
| 84 |
+
%_tmp2_index_45 = arith.andi %_tmp2_index_43, %_tmp2_index_44 : tensor<64x4xi1> loc(#loc103)
|
| 85 |
+
%_tmp2_index_46 = arith.select %_tmp2_index_45, %8#1, %_tmp2_index_17 : tensor<64x4xi1>, tensor<64x4xi32> loc(#loc104)
|
| 86 |
+
scf.yield %_tmp2_42, %_tmp2_index_46 : tensor<64x4xf32>, tensor<64x4xi32> loc(#loc29)
|
| 87 |
+
} loc(#loc126)
|
| 88 |
+
%4:2 = tt.call @"torch._inductor.runtime.triton_helpers.max_with_index__fp32S64_4S_i32S64_4S__(2,)cconstexpr_1_"(%_tmp2_index_15#0, %_tmp2_index_15#1) : (tensor<64x4xf32>, tensor<64x4xi32>) -> (tensor<64xf32>, tensor<64xi32>) loc(#loc30)
|
| 89 |
+
%tmp2 = tt.expand_dims %4#1 {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc105)
|
| 90 |
+
%5 = tt.splat %out_ptr0 : !tt.ptr<i64> -> tensor<64x1x!tt.ptr<i64>> loc(#loc32)
|
| 91 |
+
%6 = tt.addptr %5, %xindex_6 : tensor<64x1x!tt.ptr<i64>>, tensor<64x1xi32> loc(#loc32)
|
| 92 |
+
%7 = arith.extsi %tmp2 : tensor<64x1xi32> to tensor<64x1xi64> loc(#loc33)
|
| 93 |
+
tt.store %6, %7, %xmask_7 : tensor<64x1x!tt.ptr<i64>> loc(#loc33)
|
| 94 |
+
tt.return loc(#loc34)
|
| 95 |
+
} loc(#loc)
|
| 96 |
+
tt.func private @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32S64_4S_i32S64_4S_fp32S64_4S_i32S1_4S__(%a_value: tensor<64x4xf32> loc("a_value"(#loc35)), %a_index: tensor<64x4xi32> loc("a_index"(#loc35)), %b_value: tensor<64x4xf32> loc("b_value"(#loc35)), %b_index: tensor<1x4xi32> loc("b_index"(#loc35))) -> (tensor<64x4xf32>, tensor<64x4xi32>) attributes {noinline = false} {
|
| 97 |
+
%mask = arith.cmpf ogt, %a_value, %b_value : tensor<64x4xf32> loc(#loc127)
|
| 98 |
+
%equal = arith.cmpf oeq, %a_value, %b_value : tensor<64x4xf32> loc(#loc128)
|
| 99 |
+
%0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__fp32S64_4S__(%a_value) : (tensor<64x4xf32>) -> i1 loc(#loc38)
|
| 100 |
+
%1:2 = scf.if %0 -> (tensor<64x4xi1>, tensor<64x4xi1>) {
|
| 101 |
+
%a_isnan = arith.cmpf une, %a_value, %a_value : tensor<64x4xf32> loc(#loc112)
|
| 102 |
+
%b_isnan = arith.cmpf une, %b_value, %b_value : tensor<64x4xf32> loc(#loc113)
|
| 103 |
+
%mask_4 = arith.constant true loc(#loc114)
|
| 104 |
+
%mask_5 = arith.constant dense<true> : tensor<64x4xi1> loc(#loc114)
|
| 105 |
+
%mask_6 = arith.xori %b_isnan, %mask_5 : tensor<64x4xi1> loc(#loc114)
|
| 106 |
+
%mask_7 = arith.andi %a_isnan, %mask_6 : tensor<64x4xi1> loc(#loc115)
|
| 107 |
+
%mask_8 = arith.ori %mask, %mask_7 : tensor<64x4xi1> loc(#loc129)
|
| 108 |
+
%equal_9 = arith.andi %a_isnan, %b_isnan : tensor<64x4xi1> loc(#loc117)
|
| 109 |
+
%equal_10 = arith.ori %equal, %equal_9 : tensor<64x4xi1> loc(#loc130)
|
| 110 |
+
scf.yield %mask_8, %equal_10 : tensor<64x4xi1>, tensor<64x4xi1> loc(#loc130)
|
| 111 |
+
} else {
|
| 112 |
+
scf.yield %mask, %equal : tensor<64x4xi1>, tensor<64x4xi1> loc(#loc47)
|
| 113 |
+
} loc(#loc39)
|
| 114 |
+
%mask_0 = tt.broadcast %b_index : tensor<1x4xi32> -> tensor<64x4xi32> loc(#loc119)
|
| 115 |
+
%mask_1 = arith.cmpi slt, %a_index, %mask_0 : tensor<64x4xi32> loc(#loc119)
|
| 116 |
+
%mask_2 = arith.andi %1#1, %mask_1 : tensor<64x4xi1> loc(#loc120)
|
| 117 |
+
%mask_3 = arith.ori %1#0, %mask_2 : tensor<64x4xi1> loc(#loc121)
|
| 118 |
+
%2 = arith.select %mask_3, %a_value, %b_value : tensor<64x4xi1>, tensor<64x4xf32> loc(#loc51)
|
| 119 |
+
%3 = tt.broadcast %b_index : tensor<1x4xi32> -> tensor<64x4xi32> loc(#loc52)
|
| 120 |
+
%4 = arith.select %mask_3, %a_index, %3 : tensor<64x4xi1>, tensor<64x4xi32> loc(#loc52)
|
| 121 |
+
tt.return %2, %4 : tensor<64x4xf32>, tensor<64x4xi32> loc(#loc53)
|
| 122 |
+
^bb1: // no predecessors
|
| 123 |
+
%5 = ub.poison : tensor<64x4xf32> loc(#loc54)
|
| 124 |
+
%6 = ub.poison : tensor<64x4xi32> loc(#loc54)
|
| 125 |
+
tt.return %5, %6 : tensor<64x4xf32>, tensor<64x4xi32> loc(#loc54)
|
| 126 |
+
} loc(#loc35)
|
| 127 |
+
tt.func private @torch._inductor.runtime.triton_helpers.is_floating__fp32S64_4S__(%x: tensor<64x4xf32> loc("x"(#loc55))) -> i1 attributes {noinline = false} {
|
| 128 |
+
%0 = tt.call @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32S64_4S__(%x) : (tensor<64x4xf32>) -> tensor<64x4xf32> loc(#loc56)
|
| 129 |
+
%true = arith.constant true loc(#loc57)
|
| 130 |
+
tt.return %true : i1 loc(#loc57)
|
| 131 |
+
^bb1: // no predecessors
|
| 132 |
+
%1 = ub.poison : i1 loc(#loc58)
|
| 133 |
+
tt.return %1 : i1 loc(#loc58)
|
| 134 |
+
} loc(#loc55)
|
| 135 |
+
tt.func private @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32S64_4S__(%x: tensor<64x4xf32> loc("x"(#loc59))) -> tensor<64x4xf32> attributes {noinline = false} {
|
| 136 |
+
%0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc60)
|
| 137 |
+
%1 = arith.uitofp %0 : tensor<1xi1> to tensor<1xf32> loc(#loc61)
|
| 138 |
+
%2 = tt.expand_dims %1 {axis = 0 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc61)
|
| 139 |
+
%3 = tt.broadcast %2 : tensor<1x1xf32> -> tensor<64x4xf32> loc(#loc61)
|
| 140 |
+
%4 = arith.addf %x, %3 : tensor<64x4xf32> loc(#loc61)
|
| 141 |
+
tt.return %4 : tensor<64x4xf32> loc(#loc62)
|
| 142 |
+
^bb1: // no predecessors
|
| 143 |
+
%5 = ub.poison : tensor<64x4xf32> loc(#loc63)
|
| 144 |
+
tt.return %5 : tensor<64x4xf32> loc(#loc63)
|
| 145 |
+
} loc(#loc59)
|
| 146 |
+
tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() -> tensor<1xi1> attributes {noinline = false} {
|
| 147 |
+
%false = arith.constant false loc(#loc65)
|
| 148 |
+
%cst = arith.constant dense<false> : tensor<1xi1> loc(#loc65)
|
| 149 |
+
tt.return %cst : tensor<1xi1> loc(#loc66)
|
| 150 |
+
^bb1: // no predecessors
|
| 151 |
+
%0 = ub.poison : tensor<1xi1> loc(#loc67)
|
| 152 |
+
tt.return %0 : tensor<1xi1> loc(#loc67)
|
| 153 |
+
} loc(#loc64)
|
| 154 |
+
tt.func private @"torch._inductor.runtime.triton_helpers.max_with_index__fp32S64_4S_i32S64_4S__(2,)cconstexpr_1_"(%value: tensor<64x4xf32> loc("value"(#loc68)), %index: tensor<64x4xi32> loc("index"(#loc68))) -> (tensor<64xf32>, tensor<64xi32>) attributes {noinline = false} {
|
| 155 |
+
%0:2 = "tt.reduce"(%value, %index) <{axis = 1 : i32}> ({
|
| 156 |
+
^bb0(%arg2: f32 loc(unknown), %arg3: i32 loc(unknown), %arg4: f32 loc(unknown), %arg5: i32 loc(unknown)):
|
| 157 |
+
%3:2 = tt.call @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32_i32_fp32_i32__(%arg2, %arg3, %arg4, %arg5) : (f32, i32, f32, i32) -> (f32, i32) loc(#loc69)
|
| 158 |
+
tt.reduce.return %3#0, %3#1 : f32, i32 loc(#loc69)
|
| 159 |
+
}) : (tensor<64x4xf32>, tensor<64x4xi32>) -> (tensor<64xf32>, tensor<64xi32>) loc(#loc69)
|
| 160 |
+
tt.return %0#0, %0#1 : tensor<64xf32>, tensor<64xi32> loc(#loc70)
|
| 161 |
+
^bb1: // no predecessors
|
| 162 |
+
%1 = ub.poison : tensor<64xf32> loc(#loc71)
|
| 163 |
+
%2 = ub.poison : tensor<64xi32> loc(#loc71)
|
| 164 |
+
tt.return %1, %2 : tensor<64xf32>, tensor<64xi32> loc(#loc71)
|
| 165 |
+
} loc(#loc68)
|
| 166 |
+
tt.func private @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32_i32_fp32_i32__(%a_value: f32 loc("a_value"(#loc35)), %a_index: i32 loc("a_index"(#loc35)), %b_value: f32 loc("b_value"(#loc35)), %b_index: i32 loc("b_index"(#loc35))) -> (f32, i32) attributes {noinline = false} {
|
| 167 |
+
%mask = arith.cmpf ogt, %a_value, %b_value : f32 loc(#loc127)
|
| 168 |
+
%equal = arith.cmpf oeq, %a_value, %b_value : f32 loc(#loc128)
|
| 169 |
+
%0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__fp32__(%a_value) : (f32) -> i1 loc(#loc38)
|
| 170 |
+
%1:2 = scf.if %0 -> (i1, i1) {
|
| 171 |
+
%a_isnan = arith.cmpf une, %a_value, %a_value : f32 loc(#loc112)
|
| 172 |
+
%b_isnan = arith.cmpf une, %b_value, %b_value : f32 loc(#loc113)
|
| 173 |
+
%mask_3 = arith.constant true loc(#loc114)
|
| 174 |
+
%mask_4 = arith.xori %b_isnan, %mask_3 : i1 loc(#loc114)
|
| 175 |
+
%mask_5 = arith.andi %a_isnan, %mask_4 : i1 loc(#loc115)
|
| 176 |
+
%mask_6 = arith.ori %mask, %mask_5 : i1 loc(#loc129)
|
| 177 |
+
%equal_7 = arith.andi %a_isnan, %b_isnan : i1 loc(#loc117)
|
| 178 |
+
%equal_8 = arith.ori %equal, %equal_7 : i1 loc(#loc130)
|
| 179 |
+
scf.yield %mask_6, %equal_8 : i1, i1 loc(#loc130)
|
| 180 |
+
} else {
|
| 181 |
+
scf.yield %mask, %equal : i1, i1 loc(#loc47)
|
| 182 |
+
} loc(#loc39)
|
| 183 |
+
%mask_0 = arith.cmpi slt, %a_index, %b_index : i32 loc(#loc119)
|
| 184 |
+
%mask_1 = arith.andi %1#1, %mask_0 : i1 loc(#loc120)
|
| 185 |
+
%mask_2 = arith.ori %1#0, %mask_1 : i1 loc(#loc121)
|
| 186 |
+
%2 = arith.select %mask_2, %a_value, %b_value : f32 loc(#loc51)
|
| 187 |
+
%3 = arith.select %mask_2, %a_index, %b_index : i32 loc(#loc52)
|
| 188 |
+
tt.return %2, %3 : f32, i32 loc(#loc53)
|
| 189 |
+
^bb1: // no predecessors
|
| 190 |
+
%4 = ub.poison : f32 loc(#loc54)
|
| 191 |
+
%5 = ub.poison : i32 loc(#loc54)
|
| 192 |
+
tt.return %4, %5 : f32, i32 loc(#loc54)
|
| 193 |
+
} loc(#loc35)
|
| 194 |
+
tt.func private @torch._inductor.runtime.triton_helpers.is_floating__fp32__(%x: f32 loc("x"(#loc55))) -> i1 attributes {noinline = false} {
|
| 195 |
+
%0 = tt.call @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32__(%x) : (f32) -> tensor<1xf32> loc(#loc56)
|
| 196 |
+
%true = arith.constant true loc(#loc57)
|
| 197 |
+
tt.return %true : i1 loc(#loc57)
|
| 198 |
+
^bb1: // no predecessors
|
| 199 |
+
%1 = ub.poison : i1 loc(#loc58)
|
| 200 |
+
tt.return %1 : i1 loc(#loc58)
|
| 201 |
+
} loc(#loc55)
|
| 202 |
+
tt.func private @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32__(%x: f32 loc("x"(#loc59))) -> tensor<1xf32> attributes {noinline = false} {
|
| 203 |
+
%0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc60)
|
| 204 |
+
%1 = arith.uitofp %0 : tensor<1xi1> to tensor<1xf32> loc(#loc61)
|
| 205 |
+
%2 = tt.splat %x : f32 -> tensor<1xf32> loc(#loc61)
|
| 206 |
+
%3 = arith.addf %2, %1 : tensor<1xf32> loc(#loc61)
|
| 207 |
+
tt.return %3 : tensor<1xf32> loc(#loc62)
|
| 208 |
+
^bb1: // no predecessors
|
| 209 |
+
%4 = ub.poison : tensor<1xf32> loc(#loc63)
|
| 210 |
+
tt.return %4 : tensor<1xf32> loc(#loc63)
|
| 211 |
+
} loc(#loc59)
|
| 212 |
+
} loc(#loc)
|
| 213 |
+
#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":19:15)
|
| 214 |
+
#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":22:28)
|
| 215 |
+
#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":22:33)
|
| 216 |
+
#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":23:36)
|
| 217 |
+
#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":23:44)
|
| 218 |
+
#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":23:23)
|
| 219 |
+
#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":24:21)
|
| 220 |
+
#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":25:27)
|
| 221 |
+
#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":25:37)
|
| 222 |
+
#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":27:19)
|
| 223 |
+
#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":28:19)
|
| 224 |
+
#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":29:55)
|
| 225 |
+
#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":30:58)
|
| 226 |
+
#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":32:40)
|
| 227 |
+
#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":33:31)
|
| 228 |
+
#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":34:29)
|
| 229 |
+
#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:47)
|
| 230 |
+
#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:41)
|
| 231 |
+
#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:56)
|
| 232 |
+
#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:52)
|
| 233 |
+
#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:34)
|
| 234 |
+
#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:71)
|
| 235 |
+
#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:61)
|
| 236 |
+
#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":41:38)
|
| 237 |
+
#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":43:35)
|
| 238 |
+
#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":43:54)
|
| 239 |
+
#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":44:41)
|
| 240 |
+
#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":44:66)
|
| 241 |
+
#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":44:8)
|
| 242 |
+
#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":45:75)
|
| 243 |
+
#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":46:20)
|
| 244 |
+
#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":47:25)
|
| 245 |
+
#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":47:36)
|
| 246 |
+
#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":47:4)
|
| 247 |
+
#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":144:21)
|
| 248 |
+
#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":145:23)
|
| 249 |
+
#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":146:19)
|
| 250 |
+
#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":146:7)
|
| 251 |
+
#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":147:29)
|
| 252 |
+
#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":148:29)
|
| 253 |
+
#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:31)
|
| 254 |
+
#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:27)
|
| 255 |
+
#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:16)
|
| 256 |
+
#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:27)
|
| 257 |
+
#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:17)
|
| 258 |
+
#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:31)
|
| 259 |
+
#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:21)
|
| 260 |
+
#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:12)
|
| 261 |
+
#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:35)
|
| 262 |
+
#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:69)
|
| 263 |
+
#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:11)
|
| 264 |
+
#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:4)
|
| 265 |
+
#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:29)
|
| 266 |
+
#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:11)
|
| 267 |
+
#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:4)
|
| 268 |
+
#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:30)
|
| 269 |
+
#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:15)
|
| 270 |
+
#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:11)
|
| 271 |
+
#loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:4)
|
| 272 |
+
#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":118:0)
|
| 273 |
+
#loc65 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:31)
|
| 274 |
+
#loc66 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:11)
|
| 275 |
+
#loc67 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:4)
|
| 276 |
+
#loc69 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:42)
|
| 277 |
+
#loc70 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:11)
|
| 278 |
+
#loc71 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:4)
|
| 279 |
+
#loc78 = loc("r0_numel"(#loc1))
|
| 280 |
+
#loc79 = loc("xoffset"(#loc2))
|
| 281 |
+
#loc80 = loc("xoffset"(#loc3))
|
| 282 |
+
#loc81 = loc("xindex"(#loc4))
|
| 283 |
+
#loc82 = loc("xindex"(#loc5))
|
| 284 |
+
#loc83 = loc("xindex"(#loc6))
|
| 285 |
+
#loc84 = loc("xmask"(#loc7))
|
| 286 |
+
#loc85 = loc("r0_base"(#loc8))
|
| 287 |
+
#loc86 = loc("r0_base"(#loc9))
|
| 288 |
+
#loc87 = loc("x0"(#loc10))
|
| 289 |
+
#loc88 = loc("x1"(#loc11))
|
| 290 |
+
#loc89 = loc("_tmp2"(#loc12))
|
| 291 |
+
#loc90 = loc("_tmp2_index"(#loc13))
|
| 292 |
+
#loc91 = loc("_tmp2"(#loc14))
|
| 293 |
+
#loc92 = loc("r0_index"(#loc15))
|
| 294 |
+
#loc93 = loc("r0_mask"(#loc16))
|
| 295 |
+
#loc94 = loc("tmp0"(#loc17))
|
| 296 |
+
#loc95 = loc("tmp0"(#loc18))
|
| 297 |
+
#loc96 = loc("tmp0"(#loc19))
|
| 298 |
+
#loc97 = loc("tmp0"(#loc20))
|
| 299 |
+
#loc98 = loc("tmp0"(#loc21))
|
| 300 |
+
#loc99 = loc("tmp0"(#loc22))
|
| 301 |
+
#loc100 = loc("tmp0"(#loc23))
|
| 302 |
+
#loc101 = loc("_tmp2"(#loc25))
|
| 303 |
+
#loc102 = loc("_tmp2"(#loc26))
|
| 304 |
+
#loc103 = loc("_tmp2_index"(#loc27))
|
| 305 |
+
#loc104 = loc("_tmp2_index"(#loc28))
|
| 306 |
+
#loc105 = loc("tmp2"(#loc31))
|
| 307 |
+
#loc110 = loc("mask"(#loc36))
|
| 308 |
+
#loc111 = loc("equal"(#loc37))
|
| 309 |
+
#loc112 = loc("a_isnan"(#loc40))
|
| 310 |
+
#loc113 = loc("b_isnan"(#loc41))
|
| 311 |
+
#loc114 = loc("mask"(#loc42))
|
| 312 |
+
#loc115 = loc("mask"(#loc43))
|
| 313 |
+
#loc116 = loc("mask"(#loc44))
|
| 314 |
+
#loc117 = loc("equal"(#loc45))
|
| 315 |
+
#loc118 = loc("equal"(#loc46))
|
| 316 |
+
#loc119 = loc("mask"(#loc48))
|
| 317 |
+
#loc120 = loc("mask"(#loc49))
|
| 318 |
+
#loc121 = loc("mask"(#loc50))
|
| 319 |
+
#loc126 = loc("_tmp2_index"(#loc91))
|
| 320 |
+
#loc127 = loc("mask"(#loc110))
|
| 321 |
+
#loc128 = loc("equal"(#loc111))
|
| 322 |
+
#loc129 = loc("mask"(#loc116))
|
| 323 |
+
#loc130 = loc("equal"(#loc118))
|
SpecForge-ext/cache/compiled_kernels/triton/7/25SMJXR2INGZCZI64NAKGLW77JZOIG6LAES6NHHOQFOTKNXS6PHA/triton_red_fused_argmax_1.ttgir
ADDED
|
@@ -0,0 +1,218 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [8, 4], warpsPerCTA = [8, 1], order = [1, 0]}>
|
| 2 |
+
#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 4], order = [0, 1]}>
|
| 3 |
+
#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":18:0)
|
| 4 |
+
#loc1 = loc(unknown)
|
| 5 |
+
#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":45:75)
|
| 6 |
+
#loc44 = loc("in_ptr0"(#loc))
|
| 7 |
+
#loc45 = loc("out_ptr0"(#loc))
|
| 8 |
+
#loc46 = loc("ks0"(#loc))
|
| 9 |
+
#loc47 = loc("ks1"(#loc))
|
| 10 |
+
#loc48 = loc("xnumel"(#loc))
|
| 11 |
+
#loc49 = loc("r0_numel"(#loc))
|
| 12 |
+
#loc85 = loc(callsite(#loc1 at #loc39))
|
| 13 |
+
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} {
|
| 14 |
+
tt.func public @triton_red_fused_argmax_1(%in_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<i64> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
|
| 15 |
+
%cst = arith.constant dense<32000> : tensor<64x1xi64, #blocked> loc(#loc1)
|
| 16 |
+
%cst_0 = arith.constant dense<0.000000e+00> : tensor<64x4xf32, #blocked> loc(#loc1)
|
| 17 |
+
%c0_i32 = arith.constant 0 : i32 loc(#loc1)
|
| 18 |
+
%c32000_i32 = arith.constant 32000 : i32 loc(#loc1)
|
| 19 |
+
%c4_i32 = arith.constant 4 : i32 loc(#loc1)
|
| 20 |
+
%cst_1 = arith.constant dense<true> : tensor<64x4xi1, #blocked> loc(#loc1)
|
| 21 |
+
%true = arith.constant true loc(#loc1)
|
| 22 |
+
%cst_2 = arith.constant dense<32000> : tensor<1x4xi32, #blocked> loc(#loc1)
|
| 23 |
+
%cst_3 = arith.constant dense<2147483647> : tensor<64x4xi32, #blocked> loc(#loc1)
|
| 24 |
+
%cst_4 = arith.constant dense<0xFF800000> : tensor<64x4xf32, #blocked> loc(#loc1)
|
| 25 |
+
%c64_i32 = arith.constant 64 : i32 loc(#loc1)
|
| 26 |
+
%xoffset = tt.get_program_id x : i32 loc(#loc50)
|
| 27 |
+
%xoffset_5 = arith.muli %xoffset, %c64_i32 : i32 loc(#loc51)
|
| 28 |
+
%xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc52)
|
| 29 |
+
%xindex_6 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc52)
|
| 30 |
+
%xindex_7 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc52)
|
| 31 |
+
%xindex_8 = tt.expand_dims %xindex_6 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<64x1xi32, #blocked1> loc(#loc52)
|
| 32 |
+
%xindex_9 = tt.splat %xoffset_5 : i32 -> tensor<64x1xi32, #blocked> loc(#loc53)
|
| 33 |
+
%xindex_10 = tt.splat %xoffset_5 : i32 -> tensor<64x1xi32, #blocked1> loc(#loc53)
|
| 34 |
+
%xindex_11 = arith.addi %xindex_9, %xindex_7 : tensor<64x1xi32, #blocked> loc(#loc53)
|
| 35 |
+
%xindex_12 = arith.addi %xindex_10, %xindex_8 : tensor<64x1xi32, #blocked1> loc(#loc53)
|
| 36 |
+
%xmask = tt.splat %xnumel : i32 -> tensor<64x1xi32, #blocked> loc(#loc54)
|
| 37 |
+
%xmask_13 = tt.splat %xnumel : i32 -> tensor<64x1xi32, #blocked1> loc(#loc54)
|
| 38 |
+
%xmask_14 = arith.cmpi slt, %xindex_11, %xmask : tensor<64x1xi32, #blocked> loc(#loc54)
|
| 39 |
+
%xmask_15 = arith.cmpi slt, %xindex_12, %xmask_13 : tensor<64x1xi32, #blocked1> loc(#loc54)
|
| 40 |
+
%r0_base = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc55)
|
| 41 |
+
%r0_base_16 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<4xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x4xi32, #blocked> loc(#loc55)
|
| 42 |
+
%x0 = arith.extsi %xindex_11 : tensor<64x1xi32, #blocked> to tensor<64x1xi64, #blocked> loc(#loc56)
|
| 43 |
+
%x0_17 = tt.splat %ks0 : i64 -> tensor<64x1xi64, #blocked> loc(#loc56)
|
| 44 |
+
%x0_18 = arith.remsi %x0, %x0_17 : tensor<64x1xi64, #blocked> loc(#loc56)
|
| 45 |
+
%x1 = arith.divsi %x0, %x0_17 : tensor<64x1xi64, #blocked> loc(#loc57)
|
| 46 |
+
%tmp0 = arith.muli %x0_18, %cst : tensor<64x1xi64, #blocked> loc(#loc58)
|
| 47 |
+
%tmp0_19 = tt.broadcast %tmp0 : tensor<64x1xi64, #blocked> -> tensor<64x4xi64, #blocked> loc(#loc59)
|
| 48 |
+
%tmp0_20 = tt.splat %ks1 : i64 -> tensor<64x1xi64, #blocked> loc(#loc60)
|
| 49 |
+
%tmp0_21 = arith.muli %tmp0_20, %x1 : tensor<64x1xi64, #blocked> loc(#loc60)
|
| 50 |
+
%tmp0_22 = tt.broadcast %tmp0_21 : tensor<64x1xi64, #blocked> -> tensor<64x4xi64, #blocked> loc(#loc61)
|
| 51 |
+
%tmp0_23 = tt.splat %in_ptr0 : !tt.ptr<f32> -> tensor<64x4x!tt.ptr<f32>, #blocked> loc(#loc62)
|
| 52 |
+
%tmp0_24 = tt.broadcast %xmask_14 : tensor<64x1xi1, #blocked> -> tensor<64x4xi1, #blocked> loc(#loc63)
|
| 53 |
+
%_tmp2_index:2 = scf.for %r0_offset = %c0_i32 to %c32000_i32 step %c4_i32 iter_args(%_tmp2 = %cst_4, %_tmp2_index_25 = %cst_3) -> (tensor<64x4xf32, #blocked>, tensor<64x4xi32, #blocked>) : i32 {
|
| 54 |
+
%r0_index = tt.splat %r0_offset : i32 -> tensor<1x4xi32, #blocked> loc(#loc65)
|
| 55 |
+
%r0_index_26 = arith.addi %r0_index, %r0_base_16 : tensor<1x4xi32, #blocked> loc(#loc65)
|
| 56 |
+
%r0_mask = arith.cmpi slt, %r0_index_26, %cst_2 : tensor<1x4xi32, #blocked> loc(#loc66)
|
| 57 |
+
%tmp0_27 = arith.extsi %r0_index_26 : tensor<1x4xi32, #blocked> to tensor<1x4xi64, #blocked> loc(#loc59)
|
| 58 |
+
%tmp0_28 = tt.broadcast %tmp0_27 : tensor<1x4xi64, #blocked> -> tensor<64x4xi64, #blocked> loc(#loc59)
|
| 59 |
+
%tmp0_29 = arith.addi %tmp0_28, %tmp0_19 : tensor<64x4xi64, #blocked> loc(#loc59)
|
| 60 |
+
%tmp0_30 = arith.addi %tmp0_29, %tmp0_22 : tensor<64x4xi64, #blocked> loc(#loc61)
|
| 61 |
+
%tmp0_31 = tt.addptr %tmp0_23, %tmp0_30 : tensor<64x4x!tt.ptr<f32>, #blocked>, tensor<64x4xi64, #blocked> loc(#loc62)
|
| 62 |
+
%tmp0_32 = tt.broadcast %r0_mask : tensor<1x4xi1, #blocked> -> tensor<64x4xi1, #blocked> loc(#loc63)
|
| 63 |
+
%tmp0_33 = arith.andi %tmp0_32, %tmp0_24 : tensor<64x4xi1, #blocked> loc(#loc63)
|
| 64 |
+
%tmp0_34 = tt.load %tmp0_31, %tmp0_33, %cst_0 evictionPolicy = evict_first : tensor<64x4x!tt.ptr<f32>, #blocked> loc(#loc67)
|
| 65 |
+
%mask = arith.cmpf ogt, %_tmp2, %tmp0_34 : tensor<64x4xf32, #blocked> loc(#loc110)
|
| 66 |
+
%equal = arith.cmpf oeq, %_tmp2, %tmp0_34 : tensor<64x4xf32, #blocked> loc(#loc111)
|
| 67 |
+
%a_isnan = arith.cmpf une, %_tmp2, %_tmp2 : tensor<64x4xf32, #blocked> loc(#loc90)
|
| 68 |
+
%b_isnan = arith.cmpf une, %tmp0_34, %tmp0_34 : tensor<64x4xf32, #blocked> loc(#loc91)
|
| 69 |
+
%mask_35 = arith.xori %b_isnan, %cst_1 : tensor<64x4xi1, #blocked> loc(#loc92)
|
| 70 |
+
%mask_36 = arith.andi %a_isnan, %mask_35 : tensor<64x4xi1, #blocked> loc(#loc93)
|
| 71 |
+
%mask_37 = arith.ori %mask, %mask_36 : tensor<64x4xi1, #blocked> loc(#loc112)
|
| 72 |
+
%equal_38 = arith.andi %a_isnan, %b_isnan : tensor<64x4xi1, #blocked> loc(#loc95)
|
| 73 |
+
%equal_39 = arith.ori %equal, %equal_38 : tensor<64x4xi1, #blocked> loc(#loc113)
|
| 74 |
+
%mask_40 = tt.broadcast %r0_index_26 : tensor<1x4xi32, #blocked> -> tensor<64x4xi32, #blocked> loc(#loc97)
|
| 75 |
+
%mask_41 = arith.cmpi slt, %_tmp2_index_25, %mask_40 : tensor<64x4xi32, #blocked> loc(#loc97)
|
| 76 |
+
%mask_42 = arith.andi %equal_39, %mask_41 : tensor<64x4xi1, #blocked> loc(#loc98)
|
| 77 |
+
%mask_43 = arith.ori %mask_37, %mask_42 : tensor<64x4xi1, #blocked> loc(#loc99)
|
| 78 |
+
%5 = arith.select %mask_43, %_tmp2, %tmp0_34 : tensor<64x4xi1, #blocked>, tensor<64x4xf32, #blocked> loc(#loc80)
|
| 79 |
+
%6 = arith.select %mask_43, %_tmp2_index_25, %mask_40 : tensor<64x4xi1, #blocked>, tensor<64x4xi32, #blocked> loc(#loc81)
|
| 80 |
+
%_tmp2_44 = arith.select %tmp0_33, %5, %_tmp2 : tensor<64x4xi1, #blocked>, tensor<64x4xf32, #blocked> loc(#loc82)
|
| 81 |
+
%_tmp2_index_45 = arith.select %tmp0_33, %6, %_tmp2_index_25 : tensor<64x4xi1, #blocked>, tensor<64x4xi32, #blocked> loc(#loc83)
|
| 82 |
+
scf.yield %_tmp2_44, %_tmp2_index_45 : tensor<64x4xf32, #blocked>, tensor<64x4xi32, #blocked> loc(#loc37)
|
| 83 |
+
} loc(#loc87)
|
| 84 |
+
%0:2 = "tt.reduce"(%_tmp2_index#0, %_tmp2_index#1) <{axis = 1 : i32}> ({
|
| 85 |
+
^bb0(%arg6: f32 loc(callsite(#loc1 at #loc39)), %arg7: i32 loc(callsite(#loc1 at #loc39)), %arg8: f32 loc(callsite(#loc1 at #loc39)), %arg9: i32 loc(callsite(#loc1 at #loc39))):
|
| 86 |
+
%mask = arith.cmpf ogt, %arg6, %arg8 : f32 loc(#loc114)
|
| 87 |
+
%equal = arith.cmpf oeq, %arg6, %arg8 : f32 loc(#loc115)
|
| 88 |
+
%a_isnan = arith.cmpf une, %arg6, %arg6 : f32 loc(#loc100)
|
| 89 |
+
%b_isnan = arith.cmpf une, %arg8, %arg8 : f32 loc(#loc101)
|
| 90 |
+
%mask_25 = arith.xori %b_isnan, %true : i1 loc(#loc102)
|
| 91 |
+
%mask_26 = arith.andi %a_isnan, %mask_25 : i1 loc(#loc103)
|
| 92 |
+
%mask_27 = arith.ori %mask, %mask_26 : i1 loc(#loc116)
|
| 93 |
+
%equal_28 = arith.andi %a_isnan, %b_isnan : i1 loc(#loc104)
|
| 94 |
+
%equal_29 = arith.ori %equal, %equal_28 : i1 loc(#loc117)
|
| 95 |
+
%mask_30 = arith.cmpi slt, %arg7, %arg9 : i32 loc(#loc105)
|
| 96 |
+
%mask_31 = arith.andi %equal_29, %mask_30 : i1 loc(#loc106)
|
| 97 |
+
%mask_32 = arith.ori %mask_27, %mask_31 : i1 loc(#loc107)
|
| 98 |
+
%5 = arith.select %mask_32, %arg6, %arg8 : f32 loc(#loc108)
|
| 99 |
+
%6 = arith.select %mask_32, %arg7, %arg9 : i32 loc(#loc109)
|
| 100 |
+
tt.reduce.return %5, %6 : f32, i32 loc(#loc84)
|
| 101 |
+
}) : (tensor<64x4xf32, #blocked>, tensor<64x4xi32, #blocked>) -> (tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked}>>, tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>>) loc(#loc84)
|
| 102 |
+
%tmp2 = tt.expand_dims %0#1 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc86)
|
| 103 |
+
%1 = tt.splat %out_ptr0 : !tt.ptr<i64> -> tensor<64x1x!tt.ptr<i64>, #blocked1> loc(#loc41)
|
| 104 |
+
%2 = tt.addptr %1, %xindex_12 : tensor<64x1x!tt.ptr<i64>, #blocked1>, tensor<64x1xi32, #blocked1> loc(#loc41)
|
| 105 |
+
%3 = ttg.convert_layout %tmp2 : tensor<64x1xi32, #blocked> -> tensor<64x1xi32, #blocked1> loc(#loc42)
|
| 106 |
+
%4 = arith.extsi %3 : tensor<64x1xi32, #blocked1> to tensor<64x1xi64, #blocked1> loc(#loc42)
|
| 107 |
+
tt.store %2, %4, %xmask_15 : tensor<64x1x!tt.ptr<i64>, #blocked1> loc(#loc42)
|
| 108 |
+
tt.return loc(#loc43)
|
| 109 |
+
} loc(#loc)
|
| 110 |
+
} loc(#loc)
|
| 111 |
+
#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":22:28)
|
| 112 |
+
#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":22:33)
|
| 113 |
+
#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":23:44)
|
| 114 |
+
#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":23:23)
|
| 115 |
+
#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":24:21)
|
| 116 |
+
#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":25:37)
|
| 117 |
+
#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":27:19)
|
| 118 |
+
#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":28:19)
|
| 119 |
+
#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:47)
|
| 120 |
+
#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:41)
|
| 121 |
+
#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:56)
|
| 122 |
+
#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:52)
|
| 123 |
+
#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:34)
|
| 124 |
+
#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:71)
|
| 125 |
+
#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":32:40)
|
| 126 |
+
#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":33:31)
|
| 127 |
+
#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":34:29)
|
| 128 |
+
#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:61)
|
| 129 |
+
#loc20 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":144:21)
|
| 130 |
+
#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":41:38)
|
| 131 |
+
#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":145:23)
|
| 132 |
+
#loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":147:29)
|
| 133 |
+
#loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":148:29)
|
| 134 |
+
#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:31)
|
| 135 |
+
#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:27)
|
| 136 |
+
#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:16)
|
| 137 |
+
#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:27)
|
| 138 |
+
#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:17)
|
| 139 |
+
#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:31)
|
| 140 |
+
#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:21)
|
| 141 |
+
#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:12)
|
| 142 |
+
#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:35)
|
| 143 |
+
#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:69)
|
| 144 |
+
#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":43:54)
|
| 145 |
+
#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":44:66)
|
| 146 |
+
#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":44:8)
|
| 147 |
+
#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:42)
|
| 148 |
+
#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":46:20)
|
| 149 |
+
#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":47:25)
|
| 150 |
+
#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":47:36)
|
| 151 |
+
#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":47:4)
|
| 152 |
+
#loc50 = loc("xoffset"(#loc2))
|
| 153 |
+
#loc51 = loc("xoffset"(#loc3))
|
| 154 |
+
#loc52 = loc("xindex"(#loc4))
|
| 155 |
+
#loc53 = loc("xindex"(#loc5))
|
| 156 |
+
#loc54 = loc("xmask"(#loc6))
|
| 157 |
+
#loc55 = loc("r0_base"(#loc7))
|
| 158 |
+
#loc56 = loc("x0"(#loc8))
|
| 159 |
+
#loc57 = loc("x1"(#loc9))
|
| 160 |
+
#loc58 = loc("tmp0"(#loc10))
|
| 161 |
+
#loc59 = loc("tmp0"(#loc11))
|
| 162 |
+
#loc60 = loc("tmp0"(#loc12))
|
| 163 |
+
#loc61 = loc("tmp0"(#loc13))
|
| 164 |
+
#loc62 = loc("tmp0"(#loc14))
|
| 165 |
+
#loc63 = loc("tmp0"(#loc15))
|
| 166 |
+
#loc64 = loc("_tmp2"(#loc16))
|
| 167 |
+
#loc65 = loc("r0_index"(#loc17))
|
| 168 |
+
#loc66 = loc("r0_mask"(#loc18))
|
| 169 |
+
#loc67 = loc("tmp0"(#loc19))
|
| 170 |
+
#loc68 = loc("mask"(#loc20))
|
| 171 |
+
#loc69 = loc("equal"(#loc22))
|
| 172 |
+
#loc70 = loc("a_isnan"(#loc23))
|
| 173 |
+
#loc71 = loc("b_isnan"(#loc24))
|
| 174 |
+
#loc72 = loc("mask"(#loc25))
|
| 175 |
+
#loc73 = loc("mask"(#loc26))
|
| 176 |
+
#loc74 = loc("mask"(#loc27))
|
| 177 |
+
#loc75 = loc("equal"(#loc28))
|
| 178 |
+
#loc76 = loc("equal"(#loc29))
|
| 179 |
+
#loc77 = loc("mask"(#loc30))
|
| 180 |
+
#loc78 = loc("mask"(#loc31))
|
| 181 |
+
#loc79 = loc("mask"(#loc32))
|
| 182 |
+
#loc80 = loc(callsite(#loc33 at #loc21))
|
| 183 |
+
#loc81 = loc(callsite(#loc34 at #loc21))
|
| 184 |
+
#loc82 = loc("_tmp2"(#loc35))
|
| 185 |
+
#loc83 = loc("_tmp2_index"(#loc36))
|
| 186 |
+
#loc84 = loc(callsite(#loc38 at #loc39))
|
| 187 |
+
#loc86 = loc("tmp2"(#loc40))
|
| 188 |
+
#loc87 = loc("_tmp2_index"(#loc64))
|
| 189 |
+
#loc88 = loc("mask"(#loc68))
|
| 190 |
+
#loc89 = loc("equal"(#loc69))
|
| 191 |
+
#loc90 = loc(callsite(#loc70 at #loc21))
|
| 192 |
+
#loc91 = loc(callsite(#loc71 at #loc21))
|
| 193 |
+
#loc92 = loc(callsite(#loc72 at #loc21))
|
| 194 |
+
#loc93 = loc(callsite(#loc73 at #loc21))
|
| 195 |
+
#loc94 = loc("mask"(#loc74))
|
| 196 |
+
#loc95 = loc(callsite(#loc75 at #loc21))
|
| 197 |
+
#loc96 = loc("equal"(#loc76))
|
| 198 |
+
#loc97 = loc(callsite(#loc77 at #loc21))
|
| 199 |
+
#loc98 = loc(callsite(#loc78 at #loc21))
|
| 200 |
+
#loc99 = loc(callsite(#loc79 at #loc21))
|
| 201 |
+
#loc100 = loc(callsite(#loc70 at #loc84))
|
| 202 |
+
#loc101 = loc(callsite(#loc71 at #loc84))
|
| 203 |
+
#loc102 = loc(callsite(#loc72 at #loc84))
|
| 204 |
+
#loc103 = loc(callsite(#loc73 at #loc84))
|
| 205 |
+
#loc104 = loc(callsite(#loc75 at #loc84))
|
| 206 |
+
#loc105 = loc(callsite(#loc77 at #loc84))
|
| 207 |
+
#loc106 = loc(callsite(#loc78 at #loc84))
|
| 208 |
+
#loc107 = loc(callsite(#loc79 at #loc84))
|
| 209 |
+
#loc108 = loc(callsite(#loc33 at #loc84))
|
| 210 |
+
#loc109 = loc(callsite(#loc34 at #loc84))
|
| 211 |
+
#loc110 = loc(callsite(#loc88 at #loc21))
|
| 212 |
+
#loc111 = loc(callsite(#loc89 at #loc21))
|
| 213 |
+
#loc112 = loc(callsite(#loc94 at #loc21))
|
| 214 |
+
#loc113 = loc(callsite(#loc96 at #loc21))
|
| 215 |
+
#loc114 = loc(callsite(#loc88 at #loc84))
|
| 216 |
+
#loc115 = loc(callsite(#loc89 at #loc84))
|
| 217 |
+
#loc116 = loc(callsite(#loc94 at #loc84))
|
| 218 |
+
#loc117 = loc(callsite(#loc96 at #loc84))
|
SpecForge-ext/cache/compiled_kernels/triton/7/25SMJXR2INGZCZI64NAKGLW77JZOIG6LAES6NHHOQFOTKNXS6PHA/triton_red_fused_argmax_1.ttir
ADDED
|
@@ -0,0 +1,217 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":18:0)
|
| 2 |
+
#loc1 = loc(unknown)
|
| 3 |
+
#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":45:75)
|
| 4 |
+
#loc48 = loc("in_ptr0"(#loc))
|
| 5 |
+
#loc49 = loc("out_ptr0"(#loc))
|
| 6 |
+
#loc50 = loc("ks0"(#loc))
|
| 7 |
+
#loc51 = loc("ks1"(#loc))
|
| 8 |
+
#loc52 = loc("xnumel"(#loc))
|
| 9 |
+
#loc53 = loc("r0_numel"(#loc))
|
| 10 |
+
#loc54 = loc(callsite(#loc1 at #loc2))
|
| 11 |
+
module {
|
| 12 |
+
tt.func public @triton_red_fused_argmax_1(%in_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<i64> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
|
| 13 |
+
%true = arith.constant true loc(#loc54)
|
| 14 |
+
%cst = arith.constant dense<true> : tensor<64x4xi1> loc(#loc1)
|
| 15 |
+
%c4_i32 = arith.constant 4 : i32 loc(#loc3)
|
| 16 |
+
%c32000_i32 = arith.constant 32000 : i32 loc(#loc3)
|
| 17 |
+
%c0_i32 = arith.constant 0 : i32 loc(#loc3)
|
| 18 |
+
%cst_0 = arith.constant dense<0.000000e+00> : tensor<64x4xf32> loc(#loc1)
|
| 19 |
+
%cst_1 = arith.constant dense<32000> : tensor<64x1xi64> loc(#loc1)
|
| 20 |
+
%cst_2 = arith.constant dense<32000> : tensor<1x4xi32> loc(#loc1)
|
| 21 |
+
%_tmp2_index = arith.constant dense<2147483647> : tensor<64x4xi32> loc(#loc55)
|
| 22 |
+
%_tmp2 = arith.constant dense<0xFF800000> : tensor<64x4xf32> loc(#loc56)
|
| 23 |
+
%c64_i32 = arith.constant 64 : i32 loc(#loc1)
|
| 24 |
+
%xoffset = tt.get_program_id x : i32 loc(#loc57)
|
| 25 |
+
%xoffset_3 = arith.muli %xoffset, %c64_i32 : i32 loc(#loc58)
|
| 26 |
+
%xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc59)
|
| 27 |
+
%xindex_4 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc60)
|
| 28 |
+
%xindex_5 = tt.splat %xoffset_3 : i32 -> tensor<64x1xi32> loc(#loc61)
|
| 29 |
+
%xindex_6 = arith.addi %xindex_5, %xindex_4 : tensor<64x1xi32> loc(#loc61)
|
| 30 |
+
%xmask = tt.splat %xnumel : i32 -> tensor<64x1xi32> loc(#loc62)
|
| 31 |
+
%xmask_7 = arith.cmpi slt, %xindex_6, %xmask : tensor<64x1xi32> loc(#loc62)
|
| 32 |
+
%r0_base = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32> loc(#loc63)
|
| 33 |
+
%r0_base_8 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<4xi32> -> tensor<1x4xi32> loc(#loc64)
|
| 34 |
+
%x0 = arith.extsi %xindex_6 : tensor<64x1xi32> to tensor<64x1xi64> loc(#loc65)
|
| 35 |
+
%x0_9 = tt.splat %ks0 : i64 -> tensor<64x1xi64> loc(#loc65)
|
| 36 |
+
%x0_10 = arith.remsi %x0, %x0_9 : tensor<64x1xi64> loc(#loc65)
|
| 37 |
+
%x1 = arith.divsi %x0, %x0_9 : tensor<64x1xi64> loc(#loc66)
|
| 38 |
+
%_tmp2_index_11:2 = scf.for %r0_offset = %c0_i32 to %c32000_i32 step %c4_i32 iter_args(%_tmp2_12 = %_tmp2, %_tmp2_index_13 = %_tmp2_index) -> (tensor<64x4xf32>, tensor<64x4xi32>) : i32 {
|
| 39 |
+
%r0_index = tt.splat %r0_offset : i32 -> tensor<1x4xi32> loc(#loc68)
|
| 40 |
+
%r0_index_14 = arith.addi %r0_index, %r0_base_8 : tensor<1x4xi32> loc(#loc68)
|
| 41 |
+
%r0_mask = arith.cmpi slt, %r0_index_14, %cst_2 : tensor<1x4xi32> loc(#loc69)
|
| 42 |
+
%tmp0 = arith.muli %x0_10, %cst_1 : tensor<64x1xi64> loc(#loc70)
|
| 43 |
+
%tmp0_15 = arith.extsi %r0_index_14 : tensor<1x4xi32> to tensor<1x4xi64> loc(#loc71)
|
| 44 |
+
%tmp0_16 = tt.broadcast %tmp0_15 : tensor<1x4xi64> -> tensor<64x4xi64> loc(#loc71)
|
| 45 |
+
%tmp0_17 = tt.broadcast %tmp0 : tensor<64x1xi64> -> tensor<64x4xi64> loc(#loc71)
|
| 46 |
+
%tmp0_18 = arith.addi %tmp0_16, %tmp0_17 : tensor<64x4xi64> loc(#loc71)
|
| 47 |
+
%tmp0_19 = tt.splat %ks1 : i64 -> tensor<64x1xi64> loc(#loc72)
|
| 48 |
+
%tmp0_20 = arith.muli %tmp0_19, %x1 : tensor<64x1xi64> loc(#loc72)
|
| 49 |
+
%tmp0_21 = tt.broadcast %tmp0_20 : tensor<64x1xi64> -> tensor<64x4xi64> loc(#loc73)
|
| 50 |
+
%tmp0_22 = arith.addi %tmp0_18, %tmp0_21 : tensor<64x4xi64> loc(#loc73)
|
| 51 |
+
%tmp0_23 = tt.splat %in_ptr0 : !tt.ptr<f32> -> tensor<64x4x!tt.ptr<f32>> loc(#loc74)
|
| 52 |
+
%tmp0_24 = tt.addptr %tmp0_23, %tmp0_22 : tensor<64x4x!tt.ptr<f32>>, tensor<64x4xi64> loc(#loc74)
|
| 53 |
+
%tmp0_25 = tt.broadcast %r0_mask : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc75)
|
| 54 |
+
%tmp0_26 = tt.broadcast %xmask_7 : tensor<64x1xi1> -> tensor<64x4xi1> loc(#loc75)
|
| 55 |
+
%tmp0_27 = arith.andi %tmp0_25, %tmp0_26 : tensor<64x4xi1> loc(#loc75)
|
| 56 |
+
%tmp0_28 = tt.load %tmp0_24, %tmp0_27, %cst_0 evictionPolicy = evict_first : tensor<64x4x!tt.ptr<f32>> loc(#loc76)
|
| 57 |
+
%mask = arith.cmpf ogt, %_tmp2_12, %tmp0_28 : tensor<64x4xf32> loc(#loc118)
|
| 58 |
+
%equal = arith.cmpf oeq, %_tmp2_12, %tmp0_28 : tensor<64x4xf32> loc(#loc119)
|
| 59 |
+
%a_isnan = arith.cmpf une, %_tmp2_12, %_tmp2_12 : tensor<64x4xf32> loc(#loc98)
|
| 60 |
+
%b_isnan = arith.cmpf une, %tmp0_28, %tmp0_28 : tensor<64x4xf32> loc(#loc99)
|
| 61 |
+
%mask_29 = arith.xori %b_isnan, %cst : tensor<64x4xi1> loc(#loc100)
|
| 62 |
+
%mask_30 = arith.andi %a_isnan, %mask_29 : tensor<64x4xi1> loc(#loc101)
|
| 63 |
+
%mask_31 = arith.ori %mask, %mask_30 : tensor<64x4xi1> loc(#loc120)
|
| 64 |
+
%equal_32 = arith.andi %a_isnan, %b_isnan : tensor<64x4xi1> loc(#loc103)
|
| 65 |
+
%equal_33 = arith.ori %equal, %equal_32 : tensor<64x4xi1> loc(#loc121)
|
| 66 |
+
%mask_34 = tt.broadcast %r0_index_14 : tensor<1x4xi32> -> tensor<64x4xi32> loc(#loc105)
|
| 67 |
+
%mask_35 = arith.cmpi slt, %_tmp2_index_13, %mask_34 : tensor<64x4xi32> loc(#loc105)
|
| 68 |
+
%mask_36 = arith.andi %equal_33, %mask_35 : tensor<64x4xi1> loc(#loc106)
|
| 69 |
+
%mask_37 = arith.ori %mask_31, %mask_36 : tensor<64x4xi1> loc(#loc107)
|
| 70 |
+
%4 = arith.select %mask_37, %_tmp2_12, %tmp0_28 : tensor<64x4xi1>, tensor<64x4xf32> loc(#loc89)
|
| 71 |
+
%5 = arith.select %mask_37, %_tmp2_index_13, %mask_34 : tensor<64x4xi1>, tensor<64x4xi32> loc(#loc90)
|
| 72 |
+
%_tmp2_38 = arith.select %tmp0_27, %4, %_tmp2_12 : tensor<64x4xi1>, tensor<64x4xf32> loc(#loc91)
|
| 73 |
+
%_tmp2_index_39 = arith.select %tmp0_27, %5, %_tmp2_index_13 : tensor<64x4xi1>, tensor<64x4xi32> loc(#loc92)
|
| 74 |
+
scf.yield %_tmp2_38, %_tmp2_index_39 : tensor<64x4xf32>, tensor<64x4xi32> loc(#loc42)
|
| 75 |
+
} loc(#loc95)
|
| 76 |
+
%0:2 = "tt.reduce"(%_tmp2_index_11#0, %_tmp2_index_11#1) <{axis = 1 : i32}> ({
|
| 77 |
+
^bb0(%arg6: f32 loc(callsite(#loc1 at #loc2)), %arg7: i32 loc(callsite(#loc1 at #loc2)), %arg8: f32 loc(callsite(#loc1 at #loc2)), %arg9: i32 loc(callsite(#loc1 at #loc2))):
|
| 78 |
+
%mask = arith.cmpf ogt, %arg6, %arg8 : f32 loc(#loc122)
|
| 79 |
+
%equal = arith.cmpf oeq, %arg6, %arg8 : f32 loc(#loc123)
|
| 80 |
+
%a_isnan = arith.cmpf une, %arg6, %arg6 : f32 loc(#loc108)
|
| 81 |
+
%b_isnan = arith.cmpf une, %arg8, %arg8 : f32 loc(#loc109)
|
| 82 |
+
%mask_12 = arith.xori %b_isnan, %true : i1 loc(#loc110)
|
| 83 |
+
%mask_13 = arith.andi %a_isnan, %mask_12 : i1 loc(#loc111)
|
| 84 |
+
%mask_14 = arith.ori %mask, %mask_13 : i1 loc(#loc124)
|
| 85 |
+
%equal_15 = arith.andi %a_isnan, %b_isnan : i1 loc(#loc112)
|
| 86 |
+
%equal_16 = arith.ori %equal, %equal_15 : i1 loc(#loc125)
|
| 87 |
+
%mask_17 = arith.cmpi slt, %arg7, %arg9 : i32 loc(#loc113)
|
| 88 |
+
%mask_18 = arith.andi %equal_16, %mask_17 : i1 loc(#loc114)
|
| 89 |
+
%mask_19 = arith.ori %mask_14, %mask_18 : i1 loc(#loc115)
|
| 90 |
+
%4 = arith.select %mask_19, %arg6, %arg8 : f32 loc(#loc116)
|
| 91 |
+
%5 = arith.select %mask_19, %arg7, %arg9 : i32 loc(#loc117)
|
| 92 |
+
tt.reduce.return %4, %5 : f32, i32 loc(#loc93)
|
| 93 |
+
}) : (tensor<64x4xf32>, tensor<64x4xi32>) -> (tensor<64xf32>, tensor<64xi32>) loc(#loc93)
|
| 94 |
+
%tmp2 = tt.expand_dims %0#1 {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc94)
|
| 95 |
+
%1 = tt.splat %out_ptr0 : !tt.ptr<i64> -> tensor<64x1x!tt.ptr<i64>> loc(#loc45)
|
| 96 |
+
%2 = tt.addptr %1, %xindex_6 : tensor<64x1x!tt.ptr<i64>>, tensor<64x1xi32> loc(#loc45)
|
| 97 |
+
%3 = arith.extsi %tmp2 : tensor<64x1xi32> to tensor<64x1xi64> loc(#loc46)
|
| 98 |
+
tt.store %2, %3, %xmask_7 : tensor<64x1x!tt.ptr<i64>> loc(#loc46)
|
| 99 |
+
tt.return loc(#loc47)
|
| 100 |
+
} loc(#loc)
|
| 101 |
+
} loc(#loc)
|
| 102 |
+
#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":32:40)
|
| 103 |
+
#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":30:58)
|
| 104 |
+
#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":29:55)
|
| 105 |
+
#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":22:28)
|
| 106 |
+
#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":22:33)
|
| 107 |
+
#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":23:36)
|
| 108 |
+
#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":23:44)
|
| 109 |
+
#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":23:23)
|
| 110 |
+
#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":24:21)
|
| 111 |
+
#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":25:27)
|
| 112 |
+
#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":25:37)
|
| 113 |
+
#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":27:19)
|
| 114 |
+
#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":28:19)
|
| 115 |
+
#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":33:31)
|
| 116 |
+
#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":34:29)
|
| 117 |
+
#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:47)
|
| 118 |
+
#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:41)
|
| 119 |
+
#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:56)
|
| 120 |
+
#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:52)
|
| 121 |
+
#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:34)
|
| 122 |
+
#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:71)
|
| 123 |
+
#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:61)
|
| 124 |
+
#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":144:21)
|
| 125 |
+
#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":41:38)
|
| 126 |
+
#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":145:23)
|
| 127 |
+
#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":147:29)
|
| 128 |
+
#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":148:29)
|
| 129 |
+
#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:31)
|
| 130 |
+
#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:27)
|
| 131 |
+
#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:16)
|
| 132 |
+
#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:27)
|
| 133 |
+
#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:17)
|
| 134 |
+
#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:31)
|
| 135 |
+
#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:21)
|
| 136 |
+
#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:12)
|
| 137 |
+
#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:35)
|
| 138 |
+
#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:69)
|
| 139 |
+
#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":43:54)
|
| 140 |
+
#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":44:66)
|
| 141 |
+
#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":44:8)
|
| 142 |
+
#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:42)
|
| 143 |
+
#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":46:20)
|
| 144 |
+
#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":47:25)
|
| 145 |
+
#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":47:36)
|
| 146 |
+
#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":47:4)
|
| 147 |
+
#loc55 = loc("_tmp2_index"(#loc4))
|
| 148 |
+
#loc56 = loc("_tmp2"(#loc5))
|
| 149 |
+
#loc57 = loc("xoffset"(#loc6))
|
| 150 |
+
#loc58 = loc("xoffset"(#loc7))
|
| 151 |
+
#loc59 = loc("xindex"(#loc8))
|
| 152 |
+
#loc60 = loc("xindex"(#loc9))
|
| 153 |
+
#loc61 = loc("xindex"(#loc10))
|
| 154 |
+
#loc62 = loc("xmask"(#loc11))
|
| 155 |
+
#loc63 = loc("r0_base"(#loc12))
|
| 156 |
+
#loc64 = loc("r0_base"(#loc13))
|
| 157 |
+
#loc65 = loc("x0"(#loc14))
|
| 158 |
+
#loc66 = loc("x1"(#loc15))
|
| 159 |
+
#loc67 = loc("_tmp2"(#loc3))
|
| 160 |
+
#loc68 = loc("r0_index"(#loc16))
|
| 161 |
+
#loc69 = loc("r0_mask"(#loc17))
|
| 162 |
+
#loc70 = loc("tmp0"(#loc18))
|
| 163 |
+
#loc71 = loc("tmp0"(#loc19))
|
| 164 |
+
#loc72 = loc("tmp0"(#loc20))
|
| 165 |
+
#loc73 = loc("tmp0"(#loc21))
|
| 166 |
+
#loc74 = loc("tmp0"(#loc22))
|
| 167 |
+
#loc75 = loc("tmp0"(#loc23))
|
| 168 |
+
#loc76 = loc("tmp0"(#loc24))
|
| 169 |
+
#loc77 = loc("mask"(#loc25))
|
| 170 |
+
#loc78 = loc("equal"(#loc27))
|
| 171 |
+
#loc79 = loc("a_isnan"(#loc28))
|
| 172 |
+
#loc80 = loc("b_isnan"(#loc29))
|
| 173 |
+
#loc81 = loc("mask"(#loc30))
|
| 174 |
+
#loc82 = loc("mask"(#loc31))
|
| 175 |
+
#loc83 = loc("mask"(#loc32))
|
| 176 |
+
#loc84 = loc("equal"(#loc33))
|
| 177 |
+
#loc85 = loc("equal"(#loc34))
|
| 178 |
+
#loc86 = loc("mask"(#loc35))
|
| 179 |
+
#loc87 = loc("mask"(#loc36))
|
| 180 |
+
#loc88 = loc("mask"(#loc37))
|
| 181 |
+
#loc89 = loc(callsite(#loc38 at #loc26))
|
| 182 |
+
#loc90 = loc(callsite(#loc39 at #loc26))
|
| 183 |
+
#loc91 = loc("_tmp2"(#loc40))
|
| 184 |
+
#loc92 = loc("_tmp2_index"(#loc41))
|
| 185 |
+
#loc93 = loc(callsite(#loc43 at #loc2))
|
| 186 |
+
#loc94 = loc("tmp2"(#loc44))
|
| 187 |
+
#loc95 = loc("_tmp2_index"(#loc67))
|
| 188 |
+
#loc96 = loc("mask"(#loc77))
|
| 189 |
+
#loc97 = loc("equal"(#loc78))
|
| 190 |
+
#loc98 = loc(callsite(#loc79 at #loc26))
|
| 191 |
+
#loc99 = loc(callsite(#loc80 at #loc26))
|
| 192 |
+
#loc100 = loc(callsite(#loc81 at #loc26))
|
| 193 |
+
#loc101 = loc(callsite(#loc82 at #loc26))
|
| 194 |
+
#loc102 = loc("mask"(#loc83))
|
| 195 |
+
#loc103 = loc(callsite(#loc84 at #loc26))
|
| 196 |
+
#loc104 = loc("equal"(#loc85))
|
| 197 |
+
#loc105 = loc(callsite(#loc86 at #loc26))
|
| 198 |
+
#loc106 = loc(callsite(#loc87 at #loc26))
|
| 199 |
+
#loc107 = loc(callsite(#loc88 at #loc26))
|
| 200 |
+
#loc108 = loc(callsite(#loc79 at #loc93))
|
| 201 |
+
#loc109 = loc(callsite(#loc80 at #loc93))
|
| 202 |
+
#loc110 = loc(callsite(#loc81 at #loc93))
|
| 203 |
+
#loc111 = loc(callsite(#loc82 at #loc93))
|
| 204 |
+
#loc112 = loc(callsite(#loc84 at #loc93))
|
| 205 |
+
#loc113 = loc(callsite(#loc86 at #loc93))
|
| 206 |
+
#loc114 = loc(callsite(#loc87 at #loc93))
|
| 207 |
+
#loc115 = loc(callsite(#loc88 at #loc93))
|
| 208 |
+
#loc116 = loc(callsite(#loc38 at #loc93))
|
| 209 |
+
#loc117 = loc(callsite(#loc39 at #loc93))
|
| 210 |
+
#loc118 = loc(callsite(#loc96 at #loc26))
|
| 211 |
+
#loc119 = loc(callsite(#loc97 at #loc26))
|
| 212 |
+
#loc120 = loc(callsite(#loc102 at #loc26))
|
| 213 |
+
#loc121 = loc(callsite(#loc104 at #loc26))
|
| 214 |
+
#loc122 = loc(callsite(#loc96 at #loc93))
|
| 215 |
+
#loc123 = loc(callsite(#loc97 at #loc93))
|
| 216 |
+
#loc124 = loc(callsite(#loc102 at #loc93))
|
| 217 |
+
#loc125 = loc(callsite(#loc104 at #loc93))
|
SpecForge-ext/cache/compiled_kernels/triton/7/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/__grp__triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"child_paths": {"triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.source", "triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ttir", "triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ttgir", "triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.llir", "triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ptx", "triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.cubin", 
"triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.json"}}
|
SpecForge-ext/cache/compiled_kernels/triton/7/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.cubin
ADDED
|
Binary file (22.3 kB). View file
|
|
|
SpecForge-ext/cache/compiled_kernels/triton/7/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"hash": "fe376ba41d2f05ff2bb7c13c37b09b85f38cb341d26f28c38925cd0f032bd098", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 2, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2"}
|
SpecForge-ext/cache/compiled_kernels/triton/7/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.llir
ADDED
|
@@ -0,0 +1,266 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
; ModuleID = 'LLVMDialectModule'
|
| 2 |
+
source_filename = "LLVMDialectModule"
|
| 3 |
+
target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64"
|
| 4 |
+
|
| 5 |
+
@assertFunc_0 = internal constant [8 x i8] c"unknown\00"
|
| 6 |
+
@assertFile_0 = internal constant [114 x i8] c"/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py\00"
|
| 7 |
+
@assertMessage_0 = internal constant [90 x i8] c"index out of bounds: 0 <= tmp15 < 1 + (triton_helpers.div_floor_integer(127 + ks1, 128))\00"
|
| 8 |
+
|
| 9 |
+
; Function Attrs: noreturn
|
| 10 |
+
declare !dbg !5 void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr #0
|
| 11 |
+
|
| 12 |
+
define ptx_kernel void @triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, i64 %5, i64 %6, i32 %7, i32 %8, ptr addrspace(1) readnone captures(none) %9, ptr addrspace(1) readnone captures(none) %10) local_unnamed_addr #1 !dbg !9 {
|
| 13 |
+
%12 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !10
|
| 14 |
+
%13 = icmp samesign ult i32 %12, 32, !dbg !11
|
| 15 |
+
%14 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !12
|
| 16 |
+
%15 = and i32 %14, 31, !dbg !12
|
| 17 |
+
%16 = zext nneg i32 %12 to i64, !dbg !13
|
| 18 |
+
%17 = mul i64 %5, %16, !dbg !13
|
| 19 |
+
%18 = icmp sgt i32 %8, 0, !dbg !14
|
| 20 |
+
br i1 %18, label %.lr.ph, label %._crit_edge, !dbg !14
|
| 21 |
+
|
| 22 |
+
.lr.ph: ; preds = %11
|
| 23 |
+
%19 = getelementptr i32, ptr addrspace(1) %0, i64 %17
|
| 24 |
+
br i1 %13, label %.lr.ph.split, label %.lr.ph.split.us
|
| 25 |
+
|
| 26 |
+
.lr.ph.split.us: ; preds = %.lr.ph, %.lr.ph.split.us
|
| 27 |
+
%20 = phi i32 [ %26, %.lr.ph.split.us ], [ 0, %.lr.ph ]
|
| 28 |
+
%21 = or disjoint i32 %20, %15, !dbg !15
|
| 29 |
+
%22 = sext i32 %21 to i64, !dbg !16
|
| 30 |
+
%23 = getelementptr i32, ptr addrspace(1) %19, i64 %22, !dbg !17
|
| 31 |
+
%24 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #5, !dbg !18
|
| 32 |
+
%25 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_first.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %23, i64 %24, i1 false) #5, !dbg !18
|
| 33 |
+
%26 = add i32 %20, 32, !dbg !14
|
| 34 |
+
%27 = icmp slt i32 %26, %8, !dbg !14
|
| 35 |
+
br i1 %27, label %.lr.ph.split.us, label %._crit_edge, !dbg !14
|
| 36 |
+
|
| 37 |
+
.lr.ph.split: ; preds = %.lr.ph, %.lr.ph.split
|
| 38 |
+
%28 = phi i64 [ %36, %.lr.ph.split ], [ 0, %.lr.ph ]
|
| 39 |
+
%29 = phi i32 [ %37, %.lr.ph.split ], [ 0, %.lr.ph ]
|
| 40 |
+
%30 = or disjoint i32 %29, %15, !dbg !15
|
| 41 |
+
%31 = icmp slt i32 %30, %8, !dbg !19
|
| 42 |
+
%32 = sext i32 %30 to i64, !dbg !16
|
| 43 |
+
%33 = getelementptr i32, ptr addrspace(1) %19, i64 %32, !dbg !17
|
| 44 |
+
%34 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #5, !dbg !18
|
| 45 |
+
%35 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_first.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %33, i64 %34, i1 %31) #5, !dbg !18
|
| 46 |
+
%narrow16 = select i1 %31, i32 %35, i32 0, !dbg !20
|
| 47 |
+
%spec.select = sext i32 %narrow16 to i64, !dbg !20
|
| 48 |
+
%36 = add i64 %28, %spec.select, !dbg !20
|
| 49 |
+
%37 = add i32 %29, 32, !dbg !14
|
| 50 |
+
%38 = icmp slt i32 %37, %8, !dbg !14
|
| 51 |
+
br i1 %38, label %.lr.ph.split, label %._crit_edge, !dbg !14
|
| 52 |
+
|
| 53 |
+
._crit_edge: ; preds = %.lr.ph.split.us, %.lr.ph.split, %11
|
| 54 |
+
%.lcssa = phi i64 [ 0, %11 ], [ %36, %.lr.ph.split ], [ 0, %.lr.ph.split.us ], !dbg !21
|
| 55 |
+
%extelt.offset = lshr i64 %.lcssa, 32, !dbg !22
|
| 56 |
+
%39 = trunc nuw i64 %extelt.offset to i32, !dbg !22
|
| 57 |
+
%40 = trunc i64 %.lcssa to i32, !dbg !22
|
| 58 |
+
%41 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %40, i32 16, i32 31), !dbg !22
|
| 59 |
+
%42 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %39, i32 16, i32 31), !dbg !22
|
| 60 |
+
%43 = insertelement <2 x i32> poison, i32 %41, i64 0, !dbg !22
|
| 61 |
+
%44 = insertelement <2 x i32> %43, i32 %42, i64 1, !dbg !22
|
| 62 |
+
%45 = bitcast <2 x i32> %44 to i64, !dbg !22
|
| 63 |
+
%46 = add i64 %.lcssa, %45, !dbg !26
|
| 64 |
+
%extelt.offset3 = lshr i64 %46, 32, !dbg !22
|
| 65 |
+
%47 = trunc nuw i64 %extelt.offset3 to i32, !dbg !22
|
| 66 |
+
%48 = trunc i64 %46 to i32, !dbg !22
|
| 67 |
+
%49 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %48, i32 8, i32 31), !dbg !22
|
| 68 |
+
%50 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %47, i32 8, i32 31), !dbg !22
|
| 69 |
+
%51 = insertelement <2 x i32> poison, i32 %49, i64 0, !dbg !22
|
| 70 |
+
%52 = insertelement <2 x i32> %51, i32 %50, i64 1, !dbg !22
|
| 71 |
+
%53 = bitcast <2 x i32> %52 to i64, !dbg !22
|
| 72 |
+
%54 = add i64 %46, %53, !dbg !26
|
| 73 |
+
%extelt.offset4 = lshr i64 %54, 32, !dbg !22
|
| 74 |
+
%55 = trunc nuw i64 %extelt.offset4 to i32, !dbg !22
|
| 75 |
+
%56 = trunc i64 %54 to i32, !dbg !22
|
| 76 |
+
%57 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %56, i32 4, i32 31), !dbg !22
|
| 77 |
+
%58 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %55, i32 4, i32 31), !dbg !22
|
| 78 |
+
%59 = insertelement <2 x i32> poison, i32 %57, i64 0, !dbg !22
|
| 79 |
+
%60 = insertelement <2 x i32> %59, i32 %58, i64 1, !dbg !22
|
| 80 |
+
%61 = bitcast <2 x i32> %60 to i64, !dbg !22
|
| 81 |
+
%62 = add i64 %54, %61, !dbg !26
|
| 82 |
+
%extelt.offset5 = lshr i64 %62, 32, !dbg !22
|
| 83 |
+
%63 = trunc nuw i64 %extelt.offset5 to i32, !dbg !22
|
| 84 |
+
%64 = trunc i64 %62 to i32, !dbg !22
|
| 85 |
+
%65 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %64, i32 2, i32 31), !dbg !22
|
| 86 |
+
%66 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %63, i32 2, i32 31), !dbg !22
|
| 87 |
+
%67 = insertelement <2 x i32> poison, i32 %65, i64 0, !dbg !22
|
| 88 |
+
%68 = insertelement <2 x i32> %67, i32 %66, i64 1, !dbg !22
|
| 89 |
+
%69 = bitcast <2 x i32> %68 to i64, !dbg !22
|
| 90 |
+
%70 = add i64 %62, %69, !dbg !26
|
| 91 |
+
%extelt.offset6 = lshr i64 %70, 32, !dbg !22
|
| 92 |
+
%71 = trunc nuw i64 %extelt.offset6 to i32, !dbg !22
|
| 93 |
+
%72 = trunc i64 %70 to i32, !dbg !22
|
| 94 |
+
%73 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %72, i32 1, i32 31), !dbg !22
|
| 95 |
+
%74 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %71, i32 1, i32 31), !dbg !22
|
| 96 |
+
%75 = insertelement <2 x i32> poison, i32 %73, i64 0, !dbg !22
|
| 97 |
+
%76 = insertelement <2 x i32> %75, i32 %74, i64 1, !dbg !22
|
| 98 |
+
%77 = bitcast <2 x i32> %76 to i64, !dbg !22
|
| 99 |
+
%78 = add i64 %70, %77, !dbg !26
|
| 100 |
+
%79 = trunc i64 %78 to i32, !dbg !27
|
| 101 |
+
%80 = getelementptr i32, ptr addrspace(1) %2, i64 %16, !dbg !28
|
| 102 |
+
%81 = and i32 %14, 32, !dbg !29
|
| 103 |
+
%82 = icmp eq i32 %81, 0, !dbg !29
|
| 104 |
+
%83 = and i32 %14, 63, !dbg !29
|
| 105 |
+
%84 = icmp eq i32 %83, 0, !dbg !29
|
| 106 |
+
%85 = and i1 %13, %84, !dbg !29
|
| 107 |
+
tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %79, ptr addrspace(1) %80, i1 %85) #5, !dbg !29
|
| 108 |
+
%86 = icmp slt i64 %5, 2, !dbg !30
|
| 109 |
+
%87 = icmp sgt i64 %5, 1, !dbg !31
|
| 110 |
+
%88 = select i1 %87, i64 %5, i64 0, !dbg !32
|
| 111 |
+
%89 = zext i1 %86 to i64, !dbg !33
|
| 112 |
+
%90 = add i64 %88, %89, !dbg !34
|
| 113 |
+
%91 = mul i64 %90, %16, !dbg !35
|
| 114 |
+
%92 = add i64 %5, 1, !dbg !36
|
| 115 |
+
%93 = add i64 %6, 127, !dbg !37
|
| 116 |
+
%94 = sdiv i64 %93, 128, !dbg !38
|
| 117 |
+
%95 = and i64 %93, 127, !dbg !42
|
| 118 |
+
%.not = icmp ne i64 %95, 0, !dbg !42
|
| 119 |
+
%96 = icmp slt i64 %93, 0, !dbg !43
|
| 120 |
+
%narrow = and i1 %96, %.not, !dbg !44
|
| 121 |
+
%97 = sext i1 %narrow to i64, !dbg !44
|
| 122 |
+
%98 = add nsw i64 %94, %97, !dbg !44
|
| 123 |
+
br i1 %18, label %.lr.ph14, label %._crit_edge15, !dbg !45
|
| 124 |
+
|
| 125 |
+
.lr.ph14: ; preds = %._crit_edge, %119
|
| 126 |
+
%99 = phi i32 [ %131, %119 ], [ 0, %._crit_edge ]
|
| 127 |
+
%100 = or disjoint i32 %99, %15, !dbg !46
|
| 128 |
+
%101 = icmp slt i32 %100, %8, !dbg !47
|
| 129 |
+
%102 = sext i32 %100 to i64, !dbg !48
|
| 130 |
+
%103 = add i64 %91, %102, !dbg !48
|
| 131 |
+
%104 = getelementptr i64, ptr addrspace(1) %1, i64 %103, !dbg !49
|
| 132 |
+
%105 = and i1 %13, %101, !dbg !50
|
| 133 |
+
%106 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #5, !dbg !51
|
| 134 |
+
%107 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_first.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %104, i64 %106, i1 %105) #5, !dbg !51
|
| 135 |
+
%108 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #5, !dbg !51
|
| 136 |
+
%109 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_first.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %104, i64 %108, i1 %105) #5, !dbg !51
|
| 137 |
+
%110 = icmp slt i32 %100, %79, !dbg !52
|
| 138 |
+
%sext7 = shl i64 %109, 32, !dbg !53
|
| 139 |
+
%111 = ashr exact i64 %sext7, 32, !dbg !53
|
| 140 |
+
%112 = select i1 %110, i64 %111, i64 %5, !dbg !53
|
| 141 |
+
%113 = icmp slt i64 %112, 0, !dbg !54
|
| 142 |
+
%114 = select i1 %113, i64 %92, i64 0, !dbg !55
|
| 143 |
+
%115 = add i64 %114, %112, !dbg !55
|
| 144 |
+
%116 = icmp slt i64 %115, 0, !dbg !56
|
| 145 |
+
%117 = icmp sgt i64 %115, %98, !dbg !57
|
| 146 |
+
%.not12 = or i1 %116, %117, !dbg !58
|
| 147 |
+
%.not9 = and i1 %105, %.not12, !dbg !59
|
| 148 |
+
br i1 %.not9, label %118, label %119, !dbg !59
|
| 149 |
+
|
| 150 |
+
118: ; preds = %.lr.ph14
|
| 151 |
+
tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 59, ptr nonnull @assertFunc_0, i64 1), !dbg !59
|
| 152 |
+
unreachable, !dbg !59
|
| 153 |
+
|
| 154 |
+
119: ; preds = %.lr.ph14
|
| 155 |
+
%sext = shl i64 %107, 32, !dbg !53
|
| 156 |
+
%120 = ashr exact i64 %sext, 32, !dbg !53
|
| 157 |
+
%121 = select i1 %110, i64 %120, i64 %5, !dbg !53
|
| 158 |
+
%122 = icmp slt i64 %121, 0, !dbg !54
|
| 159 |
+
%123 = select i1 %122, i64 %92, i64 0, !dbg !55
|
| 160 |
+
%124 = trunc i64 %109 to i32, !dbg !60
|
| 161 |
+
tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !59
|
| 162 |
+
%125 = getelementptr i32, ptr addrspace(1) %3, i64 %103, !dbg !61
|
| 163 |
+
%126 = and i1 %82, %105, !dbg !62
|
| 164 |
+
tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %124, ptr addrspace(1) %125, i1 %126) #5, !dbg !62
|
| 165 |
+
%127 = getelementptr i32, ptr addrspace(1) %4, i64 %121, !dbg !63
|
| 166 |
+
%128 = getelementptr i32, ptr addrspace(1) %127, i64 %123, !dbg !63
|
| 167 |
+
%129 = getelementptr i32, ptr addrspace(1) %128, i64 %16, !dbg !63
|
| 168 |
+
%130 = getelementptr i32, ptr addrspace(1) %129, i64 %17, !dbg !63
|
| 169 |
+
tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 1, ptr addrspace(1) %130, i1 %126) #5, !dbg !64
|
| 170 |
+
%131 = add i32 %99, 32, !dbg !45
|
| 171 |
+
%132 = icmp slt i32 %131, %8, !dbg !45
|
| 172 |
+
br i1 %132, label %.lr.ph14, label %._crit_edge15, !dbg !45
|
| 173 |
+
|
| 174 |
+
._crit_edge15: ; preds = %119, %._crit_edge
|
| 175 |
+
ret void, !dbg !65
|
| 176 |
+
}
|
| 177 |
+
|
| 178 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
| 179 |
+
declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #2
|
| 180 |
+
|
| 181 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
| 182 |
+
declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2
|
| 183 |
+
|
| 184 |
+
; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
|
| 185 |
+
declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #3
|
| 186 |
+
|
| 187 |
+
; Function Attrs: convergent nocallback nounwind
|
| 188 |
+
declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #4
|
| 189 |
+
|
| 190 |
+
attributes #0 = { noreturn }
|
| 191 |
+
attributes #1 = { "nvvm.reqntid"="64" }
|
| 192 |
+
attributes #2 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
| 193 |
+
attributes #3 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
|
| 194 |
+
attributes #4 = { convergent nocallback nounwind }
|
| 195 |
+
attributes #5 = { nounwind }
|
| 196 |
+
|
| 197 |
+
!llvm.dbg.cu = !{!0}
|
| 198 |
+
!llvm.module.flags = !{!2, !3}
|
| 199 |
+
!llvm.ident = !{!4}
|
| 200 |
+
|
| 201 |
+
!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
|
| 202 |
+
!1 = !DIFile(filename: "cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge")
|
| 203 |
+
!2 = !{i32 2, !"Debug Info Version", i32 3}
|
| 204 |
+
!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
|
| 205 |
+
!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
|
| 206 |
+
!5 = !DISubprogram(name: "__assertfail", linkageName: "__assertfail", scope: !6, file: !6, type: !7, spFlags: DISPFlagOptimized)
|
| 207 |
+
!6 = !DIFile(filename: "<unknown>", directory: "")
|
| 208 |
+
!7 = !DISubroutineType(cc: DW_CC_normal, types: !8)
|
| 209 |
+
!8 = !{}
|
| 210 |
+
!9 = distinct !DISubprogram(name: "triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2", linkageName: "triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2", scope: !1, file: !1, line: 18, type: !7, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
|
| 211 |
+
!10 = !DILocation(line: 22, column: 28, scope: !9)
|
| 212 |
+
!11 = !DILocation(line: 24, column: 21, scope: !9)
|
| 213 |
+
!12 = !DILocation(line: 25, column: 37, scope: !9)
|
| 214 |
+
!13 = !DILocation(line: 35, column: 45, scope: !9)
|
| 215 |
+
!14 = !DILocation(line: 29, column: 40, scope: !9)
|
| 216 |
+
!15 = !DILocation(line: 30, column: 31, scope: !9)
|
| 217 |
+
!16 = !DILocation(line: 35, column: 41, scope: !9)
|
| 218 |
+
!17 = !DILocation(line: 35, column: 34, scope: !9)
|
| 219 |
+
!18 = !DILocation(line: 35, column: 50, scope: !9)
|
| 220 |
+
!19 = !DILocation(line: 31, column: 29, scope: !9)
|
| 221 |
+
!20 = !DILocation(line: 39, column: 48, scope: !9)
|
| 222 |
+
!21 = !DILocation(line: 28, column: 43, scope: !9)
|
| 223 |
+
!22 = !DILocation(line: 291, column: 36, scope: !23, inlinedAt: !25)
|
| 224 |
+
!23 = distinct !DILexicalBlockFile(scope: !9, file: !24, discriminator: 0)
|
| 225 |
+
!24 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language")
|
| 226 |
+
!25 = !DILocation(line: 40, column: 25, scope: !9)
|
| 227 |
+
!26 = !DILocation(line: 261, column: 15, scope: !23, inlinedAt: !25)
|
| 228 |
+
!27 = !DILocation(line: 41, column: 19, scope: !9)
|
| 229 |
+
!28 = !DILocation(line: 42, column: 25, scope: !9)
|
| 230 |
+
!29 = !DILocation(line: 42, column: 36, scope: !9)
|
| 231 |
+
!30 = !DILocation(line: 49, column: 60, scope: !9)
|
| 232 |
+
!31 = !DILocation(line: 49, column: 86, scope: !9)
|
| 233 |
+
!32 = !DILocation(line: 49, column: 77, scope: !9)
|
| 234 |
+
!33 = !DILocation(line: 49, scope: !9)
|
| 235 |
+
!34 = !DILocation(line: 49, column: 68, scope: !9)
|
| 236 |
+
!35 = !DILocation(line: 49, column: 45, scope: !9)
|
| 237 |
+
!36 = !DILocation(line: 55, column: 20, scope: !9)
|
| 238 |
+
!37 = !DILocation(line: 59, column: 94, scope: !9)
|
| 239 |
+
!38 = !DILocation(line: 72, column: 16, scope: !39, inlinedAt: !41)
|
| 240 |
+
!39 = distinct !DILexicalBlockFile(scope: !9, file: !40, discriminator: 0)
|
| 241 |
+
!40 = !DIFile(filename: "triton_helpers.py", directory: "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime")
|
| 242 |
+
!41 = !DILocation(line: 59, column: 100, scope: !9)
|
| 243 |
+
!42 = !DILocation(line: 74, column: 34, scope: !39, inlinedAt: !41)
|
| 244 |
+
!43 = !DILocation(line: 75, column: 25, scope: !39, inlinedAt: !41)
|
| 245 |
+
!44 = !DILocation(line: 75, column: 47, scope: !39, inlinedAt: !41)
|
| 246 |
+
!45 = !DILocation(line: 43, column: 40, scope: !9)
|
| 247 |
+
!46 = !DILocation(line: 44, column: 31, scope: !9)
|
| 248 |
+
!47 = !DILocation(line: 45, column: 29, scope: !9)
|
| 249 |
+
!48 = !DILocation(line: 49, column: 41, scope: !9)
|
| 250 |
+
!49 = !DILocation(line: 49, column: 34, scope: !9)
|
| 251 |
+
!50 = !DILocation(line: 49, column: 103, scope: !9)
|
| 252 |
+
!51 = !DILocation(line: 49, column: 93, scope: !9)
|
| 253 |
+
!52 = !DILocation(line: 52, column: 22, scope: !9)
|
| 254 |
+
!53 = !DILocation(line: 54, column: 37, scope: !9)
|
| 255 |
+
!54 = !DILocation(line: 57, column: 24, scope: !9)
|
| 256 |
+
!55 = !DILocation(line: 58, column: 39, scope: !9)
|
| 257 |
+
!56 = !DILocation(line: 59, column: 32, scope: !9)
|
| 258 |
+
!57 = !DILocation(line: 59, column: 50, scope: !9)
|
| 259 |
+
!58 = !DILocation(line: 59, column: 112, scope: !9)
|
| 260 |
+
!59 = !DILocation(line: 59, column: 130, scope: !9)
|
| 261 |
+
!60 = !DILocation(line: 50, column: 23, scope: !9)
|
| 262 |
+
!61 = !DILocation(line: 61, column: 29, scope: !9)
|
| 263 |
+
!62 = !DILocation(line: 61, column: 94, scope: !9)
|
| 264 |
+
!63 = !DILocation(line: 62, column: 29, scope: !9)
|
| 265 |
+
!64 = !DILocation(line: 62, column: 95, scope: !9)
|
| 266 |
+
!65 = !DILocation(line: 43, column: 4, scope: !9)
|
SpecForge-ext/cache/compiled_kernels/triton/7/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ptx
ADDED
|
@@ -0,0 +1,640 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
//
|
| 2 |
+
// Generated by LLVM NVPTX Back-End
|
| 3 |
+
//
|
| 4 |
+
|
| 5 |
+
.version 8.7
|
| 6 |
+
.target sm_90a
|
| 7 |
+
.address_size 64
|
| 8 |
+
|
| 9 |
+
// .globl triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2 // -- Begin function triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2
|
| 10 |
+
.extern .func __assertfail
|
| 11 |
+
(
|
| 12 |
+
.param .b64 __assertfail_param_0,
|
| 13 |
+
.param .b64 __assertfail_param_1,
|
| 14 |
+
.param .b32 __assertfail_param_2,
|
| 15 |
+
.param .b64 __assertfail_param_3,
|
| 16 |
+
.param .b64 __assertfail_param_4
|
| 17 |
+
)
|
| 18 |
+
.noreturn;
|
| 19 |
+
.global .align 1 .b8 assertFunc_0[8] = {117, 110, 107, 110, 111, 119, 110};
|
| 20 |
+
.global .align 1 .b8 assertFile_0[114] = {47, 119, 111, 114, 107, 115, 112, 97, 99, 101, 47, 104, 97, 110, 114, 117, 105, 47, 83, 112, 101, 99, 70, 111, 114, 103, 101, 45, 101, 120, 116, 47, 99, 97, 99, 104, 101, 47, 99, 111, 109, 112, 105, 108, 101, 100, 95, 107, 101, 114, 110, 101, 108, 115, 47, 103, 101, 47, 99, 103, 101, 55, 112, 112, 118, 118, 54, 53, 104, 97, 115, 114, 119, 113, 51, 51, 97, 97, 53, 121, 106, 110, 116, 52, 116, 119, 100, 122, 118, 119, 51, 112, 97, 107, 50, 120, 52, 98, 117, 55, 110, 55, 120, 50, 104, 121, 101, 120, 109, 118, 46, 112, 121};
|
| 21 |
+
.global .align 1 .b8 assertMessage_0[90] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 49, 53, 32, 60, 32, 49, 32, 43, 32, 40, 116, 114, 105, 116, 111, 110, 95, 104, 101, 108, 112, 101, 114, 115, 46, 100, 105, 118, 95, 102, 108, 111, 111, 114, 95, 105, 110, 116, 101, 103, 101, 114, 40, 49, 50, 55, 32, 43, 32, 107, 115, 49, 44, 32, 32, 49, 50, 56, 41, 41};
|
| 22 |
+
// @triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2
|
| 23 |
+
.visible .entry triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2(
|
| 24 |
+
.param .u64 .ptr .global .align 1 triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_0,
|
| 25 |
+
.param .u64 .ptr .global .align 1 triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_1,
|
| 26 |
+
.param .u64 .ptr .global .align 1 triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_2,
|
| 27 |
+
.param .u64 .ptr .global .align 1 triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_3,
|
| 28 |
+
.param .u64 .ptr .global .align 1 triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_4,
|
| 29 |
+
.param .u64 triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_5,
|
| 30 |
+
.param .u64 triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_6,
|
| 31 |
+
.param .u32 triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_7,
|
| 32 |
+
.param .u32 triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_8,
|
| 33 |
+
.param .u64 .ptr .global .align 1 triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_9,
|
| 34 |
+
.param .u64 .ptr .global .align 1 triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_10
|
| 35 |
+
)
|
| 36 |
+
.reqntid 64
|
| 37 |
+
{
|
| 38 |
+
.reg .pred %p<32>;
|
| 39 |
+
.reg .b32 %r<53>;
|
| 40 |
+
.reg .b64 %rd<103>;
|
| 41 |
+
.loc 1 18 0 // cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:18:0
|
| 42 |
+
$L__func_begin0:
|
| 43 |
+
.loc 1 18 0 // cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:18:0
|
| 44 |
+
|
| 45 |
+
// %bb.0:
|
| 46 |
+
ld.param.b32 %r12, [triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_8];
|
| 47 |
+
ld.param.b64 %rd18, [triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_5];
|
| 48 |
+
ld.param.b64 %rd15, [triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_2];
|
| 49 |
+
$L__tmp0:
|
| 50 |
+
.loc 1 22 28 // cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:22:28
|
| 51 |
+
mov.u32 %r13, %ctaid.x;
|
| 52 |
+
.loc 1 25 37 // cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:25:37
|
| 53 |
+
mov.u32 %r1, %tid.x;
|
| 54 |
+
and.b32 %r2, %r1, 31;
|
| 55 |
+
.loc 1 35 45 // cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:35:45
|
| 56 |
+
cvt.u64.u32 %rd1, %r13;
|
| 57 |
+
mul.lo.s64 %rd2, %rd18, %rd1;
|
| 58 |
+
.loc 1 29 40 // cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:29:40
|
| 59 |
+
setp.lt.s32 %p2, %r12, 1;
|
| 60 |
+
mov.b64 %rd102, 0;
|
| 61 |
+
cvt.u32.u64 %r49, %rd1;
|
| 62 |
+
shl.b64 %rd100, %rd2, 2;
|
| 63 |
+
@%p2 bra $L__BB0_6;
|
| 64 |
+
// %bb.1: // %.lr.ph
|
| 65 |
+
.loc 1 0 40 // cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:0:40
|
| 66 |
+
ld.param.b64 %rd13, [triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_0];
|
| 67 |
+
.loc 1 24 21 // cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:24:21
|
| 68 |
+
setp.lt.u32 %p3, %r49, 32;
|
| 69 |
+
add.s64 %rd3, %rd13, %rd100;
|
| 70 |
+
@%p3 bra $L__BB0_4;
|
| 71 |
+
bra.uni $L__BB0_2;
|
| 72 |
+
$L__BB0_4: // %.lr.ph.split.preheader
|
| 73 |
+
.loc 1 0 21 // cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:0:21
|
| 74 |
+
mov.b32 %r51, 0;
|
| 75 |
+
mov.b64 %rd102, 0;
|
| 76 |
+
$L__BB0_5: // %.lr.ph.split
|
| 77 |
+
// =>This Inner Loop Header: Depth=1
|
| 78 |
+
.loc 1 31 29 // cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:31:29
|
| 79 |
+
add.s32 %r20, %r2, %r51;
|
| 80 |
+
setp.lt.s32 %p6, %r20, %r12;
|
| 81 |
+
.loc 1 35 34 // cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:35:34
|
| 82 |
+
mad.wide.s32 %rd28, %r20, 4, %rd3;
|
| 83 |
+
.loc 1 35 50 // cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:35:50
|
| 84 |
+
// begin inline asm
|
| 85 |
+
mov.u64 %rd27, 0x0;
|
| 86 |
+
createpolicy.fractional.L2::evict_first.b64 %rd27, 1.0;
|
| 87 |
+
// end inline asm
|
| 88 |
+
// begin inline asm
|
| 89 |
+
mov.u32 %r19, 0x0;
|
| 90 |
+
@%p6 ld.global.L1::evict_first.L2::cache_hint.b32 { %r19 }, [ %rd28 + 0 ], %rd27;
|
| 91 |
+
// end inline asm
|
| 92 |
+
.loc 1 39 48 // cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:39:48
|
| 93 |
+
selp.b32 %r21, %r19, 0, %p6;
|
| 94 |
+
cvt.s64.s32 %rd30, %r21;
|
| 95 |
+
add.s64 %rd102, %rd102, %rd30;
|
| 96 |
+
.loc 1 29 40 // cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:29:40
|
| 97 |
+
add.s32 %r51, %r51, 32;
|
| 98 |
+
setp.lt.s32 %p7, %r51, %r12;
|
| 99 |
+
@%p7 bra $L__BB0_5;
|
| 100 |
+
bra.uni $L__BB0_6;
|
| 101 |
+
$L__BB0_2: // %.lr.ph.split.us.preheader
|
| 102 |
+
.loc 1 0 40 // cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:0:40
|
| 103 |
+
mov.b32 %r50, 0;
|
| 104 |
+
$L__BB0_3: // %.lr.ph.split.us
|
| 105 |
+
// =>This Inner Loop Header: Depth=1
|
| 106 |
+
.loc 1 35 41 // cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:35:41
|
| 107 |
+
add.s32 %r17, %r2, %r50;
|
| 108 |
+
.loc 1 35 34 // cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:35:34
|
| 109 |
+
mad.wide.s32 %rd23, %r17, 4, %rd3;
|
| 110 |
+
.loc 1 35 50 // cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:35:50
|
| 111 |
+
// begin inline asm
|
| 112 |
+
mov.u64 %rd22, 0x0;
|
| 113 |
+
createpolicy.fractional.L2::evict_first.b64 %rd22, 1.0;
|
| 114 |
+
// end inline asm
|
| 115 |
+
mov.pred %p4, 0;
|
| 116 |
+
// begin inline asm
|
| 117 |
+
mov.u32 %r16, 0x0;
|
| 118 |
+
@%p4 ld.global.L1::evict_first.L2::cache_hint.b32 { %r16 }, [ %rd23 + 0 ], %rd22;
|
| 119 |
+
// end inline asm
|
| 120 |
+
.loc 1 29 40 // cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:29:40
|
| 121 |
+
add.s32 %r50, %r50, 32;
|
| 122 |
+
setp.lt.s32 %p5, %r50, %r12;
|
| 123 |
+
@%p5 bra $L__BB0_3;
|
| 124 |
+
$L__BB0_6: // %._crit_edge
|
| 125 |
+
.loc 1 24 21 // cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:24:21
|
| 126 |
+
setp.lt.u32 %p10, %r49, 32;
|
| 127 |
+
$L__tmp1:
|
| 128 |
+
.loc 2 291 36 // standard.py:291:36 @[ cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:40:25 ]
|
| 129 |
+
mov.b64 {_, %r24}, %rd102;
|
| 130 |
+
cvt.u32.u64 %r25, %rd102;
|
| 131 |
+
shfl.sync.bfly.b32 %r26, %r25, 16, 31, -1;
|
| 132 |
+
shfl.sync.bfly.b32 %r27, %r24, 16, 31, -1;
|
| 133 |
+
cvt.u64.u32 %rd32, %r26;
|
| 134 |
+
cvt.u64.u32 %rd33, %r27;
|
| 135 |
+
shl.b64 %rd34, %rd33, 32;
|
| 136 |
+
or.b64 %rd35, %rd32, %rd34;
|
| 137 |
+
.loc 2 261 15 // standard.py:261:15 @[ cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:40:25 ]
|
| 138 |
+
add.s64 %rd36, %rd102, %rd35;
|
| 139 |
+
.loc 2 291 36 // standard.py:291:36 @[ cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:40:25 ]
|
| 140 |
+
mov.b64 {_, %r28}, %rd36;
|
| 141 |
+
cvt.u32.u64 %r29, %rd36;
|
| 142 |
+
shfl.sync.bfly.b32 %r30, %r29, 8, 31, -1;
|
| 143 |
+
shfl.sync.bfly.b32 %r31, %r28, 8, 31, -1;
|
| 144 |
+
cvt.u64.u32 %rd37, %r30;
|
| 145 |
+
cvt.u64.u32 %rd38, %r31;
|
| 146 |
+
shl.b64 %rd39, %rd38, 32;
|
| 147 |
+
or.b64 %rd40, %rd37, %rd39;
|
| 148 |
+
.loc 2 261 15 // standard.py:261:15 @[ cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:40:25 ]
|
| 149 |
+
add.s64 %rd41, %rd36, %rd40;
|
| 150 |
+
.loc 2 291 36 // standard.py:291:36 @[ cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:40:25 ]
|
| 151 |
+
mov.b64 {_, %r32}, %rd41;
|
| 152 |
+
cvt.u32.u64 %r33, %rd41;
|
| 153 |
+
shfl.sync.bfly.b32 %r34, %r33, 4, 31, -1;
|
| 154 |
+
shfl.sync.bfly.b32 %r35, %r32, 4, 31, -1;
|
| 155 |
+
cvt.u64.u32 %rd42, %r34;
|
| 156 |
+
cvt.u64.u32 %rd43, %r35;
|
| 157 |
+
shl.b64 %rd44, %rd43, 32;
|
| 158 |
+
or.b64 %rd45, %rd42, %rd44;
|
| 159 |
+
.loc 2 261 15 // standard.py:261:15 @[ cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:40:25 ]
|
| 160 |
+
add.s64 %rd46, %rd41, %rd45;
|
| 161 |
+
.loc 2 291 36 // standard.py:291:36 @[ cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:40:25 ]
|
| 162 |
+
mov.b64 {_, %r36}, %rd46;
|
| 163 |
+
cvt.u32.u64 %r37, %rd46;
|
| 164 |
+
shfl.sync.bfly.b32 %r38, %r37, 2, 31, -1;
|
| 165 |
+
shfl.sync.bfly.b32 %r39, %r36, 2, 31, -1;
|
| 166 |
+
cvt.u64.u32 %rd47, %r38;
|
| 167 |
+
cvt.u64.u32 %rd48, %r39;
|
| 168 |
+
shl.b64 %rd49, %rd48, 32;
|
| 169 |
+
or.b64 %rd50, %rd47, %rd49;
|
| 170 |
+
.loc 2 261 15 // standard.py:261:15 @[ cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:40:25 ]
|
| 171 |
+
add.s64 %rd51, %rd46, %rd50;
|
| 172 |
+
.loc 2 291 36 // standard.py:291:36 @[ cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:40:25 ]
|
| 173 |
+
mov.b64 {_, %r40}, %rd51;
|
| 174 |
+
cvt.u32.u64 %r41, %rd51;
|
| 175 |
+
shfl.sync.bfly.b32 %r42, %r41, 1, 31, -1;
|
| 176 |
+
shfl.sync.bfly.b32 %r43, %r40, 1, 31, -1;
|
| 177 |
+
cvt.u64.u32 %rd52, %r42;
|
| 178 |
+
.loc 2 261 15 // standard.py:261:15 @[ cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:40:25 ]
|
| 179 |
+
add.s64 %rd53, %rd51, %rd52;
|
| 180 |
+
$L__tmp2:
|
| 181 |
+
.loc 1 41 19 // cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:41:19
|
| 182 |
+
cvt.u32.u64 %r22, %rd53;
|
| 183 |
+
.loc 1 42 25 // cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:42:25
|
| 184 |
+
shl.b64 %rd54, %rd1, 2;
|
| 185 |
+
add.s64 %rd31, %rd15, %rd54;
|
| 186 |
+
.loc 1 42 36 // cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:42:36
|
| 187 |
+
and.b32 %r44, %r1, 63;
|
| 188 |
+
setp.eq.b32 %p11, %r44, 0;
|
| 189 |
+
and.pred %p8, %p10, %p11;
|
| 190 |
+
// begin inline asm
|
| 191 |
+
@%p8 st.global.b32 [ %rd31 + 0 ], { %r22 };
|
| 192 |
+
// end inline asm
|
| 193 |
+
.loc 1 43 40 // cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:43:40
|
| 194 |
+
@%p2 bra $L__BB0_11;
|
| 195 |
+
// %bb.7: // %.lr.ph14.preheader
|
| 196 |
+
.loc 1 0 40 // cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:0:40
|
| 197 |
+
ld.param.b64 %rd19, [triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_6];
|
| 198 |
+
ld.param.b64 %rd17, [triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_4];
|
| 199 |
+
ld.param.b64 %rd16, [triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_3];
|
| 200 |
+
ld.param.b64 %rd14, [triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_1];
|
| 201 |
+
and.b32 %r8, %r1, 32;
|
| 202 |
+
setp.lt.s64 %p12, %rd18, 2;
|
| 203 |
+
setp.gt.s64 %p13, %rd18, 1;
|
| 204 |
+
selp.b64 %rd55, %rd18, 0, %p13;
|
| 205 |
+
selp.b64 %rd56, 1, 0, %p12;
|
| 206 |
+
add.s64 %rd57, %rd55, %rd56;
|
| 207 |
+
mul.lo.s64 %rd7, %rd57, %rd1;
|
| 208 |
+
add.s64 %rd8, %rd18, 1;
|
| 209 |
+
add.s64 %rd58, %rd19, 127;
|
| 210 |
+
shr.s64 %rd59, %rd58, 63;
|
| 211 |
+
shr.u64 %rd60, %rd59, 57;
|
| 212 |
+
add.s64 %rd61, %rd58, %rd60;
|
| 213 |
+
shr.s64 %rd62, %rd61, 7;
|
| 214 |
+
and.b64 %rd63, %rd58, 127;
|
| 215 |
+
setp.ne.b64 %p14, %rd63, 0;
|
| 216 |
+
setp.lt.s64 %p15, %rd58, 0;
|
| 217 |
+
and.pred %p16, %p15, %p14;
|
| 218 |
+
selp.b64 %rd64, -1, 0, %p16;
|
| 219 |
+
add.s64 %rd9, %rd62, %rd64;
|
| 220 |
+
mov.b32 %r52, 0;
|
| 221 |
+
$L__BB0_8: // %.lr.ph14
|
| 222 |
+
// =>This Inner Loop Header: Depth=1
|
| 223 |
+
.loc 1 45 29 // cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:45:29
|
| 224 |
+
add.s32 %r10, %r2, %r52;
|
| 225 |
+
setp.lt.s32 %p20, %r10, %r12;
|
| 226 |
+
.loc 1 49 41 // cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:49:41
|
| 227 |
+
cvt.s64.s32 %rd73, %r10;
|
| 228 |
+
add.s64 %rd10, %rd7, %rd73;
|
| 229 |
+
.loc 1 49 34 // cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:49:34
|
| 230 |
+
shl.b64 %rd74, %rd10, 3;
|
| 231 |
+
add.s64 %rd67, %rd14, %rd74;
|
| 232 |
+
.loc 1 49 103 // cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:49:103
|
| 233 |
+
and.pred %p18, %p10, %p20;
|
| 234 |
+
.loc 1 49 93 // cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:49:93
|
| 235 |
+
// begin inline asm
|
| 236 |
+
mov.u64 %rd65, 0x0;
|
| 237 |
+
createpolicy.fractional.L2::evict_first.b64 %rd65, 1.0;
|
| 238 |
+
// end inline asm
|
| 239 |
+
// begin inline asm
|
| 240 |
+
mov.u64 %rd66, 0x0;
|
| 241 |
+
@%p18 ld.global.L1::evict_first.L2::cache_hint.b64 { %rd66 }, [ %rd67 + 0 ], %rd65;
|
| 242 |
+
// end inline asm
|
| 243 |
+
// begin inline asm
|
| 244 |
+
mov.u64 %rd69, 0x0;
|
| 245 |
+
createpolicy.fractional.L2::evict_first.b64 %rd69, 1.0;
|
| 246 |
+
// end inline asm
|
| 247 |
+
// begin inline asm
|
| 248 |
+
mov.u64 %rd70, 0x0;
|
| 249 |
+
@%p18 ld.global.L1::evict_first.L2::cache_hint.b64 { %rd70 }, [ %rd67 + 0 ], %rd69;
|
| 250 |
+
// end inline asm
|
| 251 |
+
.loc 1 52 22 // cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:52:22
|
| 252 |
+
setp.lt.s32 %p21, %r10, %r22;
|
| 253 |
+
.loc 1 54 37 // cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:54:37
|
| 254 |
+
cvt.s64.s32 %rd75, %rd70;
|
| 255 |
+
selp.b64 %rd76, %rd75, %rd18, %p21;
|
| 256 |
+
.loc 1 58 39 // cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:58:39
|
| 257 |
+
shr.s64 %rd77, %rd76, 63;
|
| 258 |
+
and.b64 %rd78, %rd77, %rd8;
|
| 259 |
+
add.s64 %rd79, %rd78, %rd76;
|
| 260 |
+
.loc 1 59 32 // cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:59:32
|
| 261 |
+
setp.lt.s64 %p22, %rd79, 0;
|
| 262 |
+
.loc 1 59 50 // cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:59:50
|
| 263 |
+
setp.gt.s64 %p23, %rd79, %rd9;
|
| 264 |
+
.loc 1 59 112 // cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:59:112
|
| 265 |
+
or.pred %p24, %p22, %p23;
|
| 266 |
+
.loc 1 59 130 // cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:59:130
|
| 267 |
+
and.pred %p25, %p18, %p24;
|
| 268 |
+
not.pred %p26, %p25;
|
| 269 |
+
@%p26 bra $L__BB0_10;
|
| 270 |
+
bra.uni $L__BB0_9;
|
| 271 |
+
$L__BB0_10: // in Loop: Header=BB0_8 Depth=1
|
| 272 |
+
.loc 1 42 36 // cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:42:36
|
| 273 |
+
setp.eq.b32 %p30, %r8, 0;
|
| 274 |
+
.loc 1 54 37 // cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:54:37
|
| 275 |
+
cvt.s64.s32 %rd82, %rd66;
|
| 276 |
+
selp.b64 %rd83, %rd82, %rd18, %p21;
|
| 277 |
+
.loc 1 58 39 // cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:58:39
|
| 278 |
+
shr.s64 %rd84, %rd83, 63;
|
| 279 |
+
and.b64 %rd85, %rd84, %rd8;
|
| 280 |
+
.loc 1 50 23 // cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:50:23
|
| 281 |
+
cvt.u32.u64 %r47, %rd70;
|
| 282 |
+
.loc 1 59 130 // cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:59:130
|
| 283 |
+
bar.sync 0;
|
| 284 |
+
.loc 1 61 29 // cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:61:29
|
| 285 |
+
shl.b64 %rd86, %rd10, 2;
|
| 286 |
+
add.s64 %rd80, %rd16, %rd86;
|
| 287 |
+
.loc 1 61 94 // cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:61:94
|
| 288 |
+
and.pred %p27, %p30, %p18;
|
| 289 |
+
// begin inline asm
|
| 290 |
+
@%p27 st.global.b32 [ %rd80 + 0 ], { %r47 };
|
| 291 |
+
// end inline asm
|
| 292 |
+
.loc 1 62 29 // cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:62:29
|
| 293 |
+
shl.b64 %rd87, %rd83, 2;
|
| 294 |
+
add.s64 %rd88, %rd17, %rd87;
|
| 295 |
+
shl.b64 %rd89, %rd85, 2;
|
| 296 |
+
add.s64 %rd90, %rd88, %rd89;
|
| 297 |
+
add.s64 %rd92, %rd90, %rd54;
|
| 298 |
+
add.s64 %rd81, %rd92, %rd100;
|
| 299 |
+
mov.b32 %r48, 1;
|
| 300 |
+
.loc 1 62 95 // cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:62:95
|
| 301 |
+
// begin inline asm
|
| 302 |
+
@%p27 st.global.b32 [ %rd81 + 0 ], { %r48 };
|
| 303 |
+
// end inline asm
|
| 304 |
+
.loc 1 43 40 // cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:43:40
|
| 305 |
+
add.s32 %r52, %r52, 32;
|
| 306 |
+
setp.lt.s32 %p31, %r52, %r12;
|
| 307 |
+
@%p31 bra $L__BB0_8;
|
| 308 |
+
$L__BB0_11: // %._crit_edge15
|
| 309 |
+
.loc 1 43 4 // cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:43:4
|
| 310 |
+
ret;
|
| 311 |
+
$L__BB0_9:
|
| 312 |
+
.loc 1 59 130 // cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py:59:130
|
| 313 |
+
{ // callseq 0, 0
|
| 314 |
+
.param .b64 param0;
|
| 315 |
+
.param .b64 param1;
|
| 316 |
+
.param .b32 param2;
|
| 317 |
+
.param .b64 param3;
|
| 318 |
+
.param .b64 param4;
|
| 319 |
+
mov.b64 %rd94, assertFunc_0;
|
| 320 |
+
cvta.global.u64 %rd95, %rd94;
|
| 321 |
+
st.param.b64 [param3], %rd95;
|
| 322 |
+
mov.b64 %rd96, assertFile_0;
|
| 323 |
+
cvta.global.u64 %rd97, %rd96;
|
| 324 |
+
st.param.b64 [param1], %rd97;
|
| 325 |
+
mov.b64 %rd98, assertMessage_0;
|
| 326 |
+
cvta.global.u64 %rd99, %rd98;
|
| 327 |
+
st.param.b64 [param0], %rd99;
|
| 328 |
+
st.param.b64 [param4], 1;
|
| 329 |
+
st.param.b32 [param2], 59;
|
| 330 |
+
call.uni __assertfail, (param0, param1, param2, param3, param4);
|
| 331 |
+
} // callseq 0
|
| 332 |
+
trap;
|
| 333 |
+
$L__tmp3:
|
| 334 |
+
$L__func_end0:
|
| 335 |
+
// -- End function
|
| 336 |
+
}
|
| 337 |
+
.file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py"
|
| 338 |
+
.file 2 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py"
|
| 339 |
+
.section .debug_abbrev
|
| 340 |
+
{
|
| 341 |
+
.b8 1 // Abbreviation Code
|
| 342 |
+
.b8 17 // DW_TAG_compile_unit
|
| 343 |
+
.b8 1 // DW_CHILDREN_yes
|
| 344 |
+
.b8 37 // DW_AT_producer
|
| 345 |
+
.b8 8 // DW_FORM_string
|
| 346 |
+
.b8 19 // DW_AT_language
|
| 347 |
+
.b8 5 // DW_FORM_data2
|
| 348 |
+
.b8 3 // DW_AT_name
|
| 349 |
+
.b8 8 // DW_FORM_string
|
| 350 |
+
.b8 16 // DW_AT_stmt_list
|
| 351 |
+
.b8 6 // DW_FORM_data4
|
| 352 |
+
.b8 27 // DW_AT_comp_dir
|
| 353 |
+
.b8 8 // DW_FORM_string
|
| 354 |
+
.b8 0 // EOM(1)
|
| 355 |
+
.b8 0 // EOM(2)
|
| 356 |
+
.b8 2 // Abbreviation Code
|
| 357 |
+
.b8 46 // DW_TAG_subprogram
|
| 358 |
+
.b8 0 // DW_CHILDREN_no
|
| 359 |
+
.b8 3 // DW_AT_name
|
| 360 |
+
.b8 8 // DW_FORM_string
|
| 361 |
+
.b8 32 // DW_AT_inline
|
| 362 |
+
.b8 11 // DW_FORM_data1
|
| 363 |
+
.b8 0 // EOM(1)
|
| 364 |
+
.b8 0 // EOM(2)
|
| 365 |
+
.b8 3 // Abbreviation Code
|
| 366 |
+
.b8 46 // DW_TAG_subprogram
|
| 367 |
+
.b8 1 // DW_CHILDREN_yes
|
| 368 |
+
.b8 17 // DW_AT_low_pc
|
| 369 |
+
.b8 1 // DW_FORM_addr
|
| 370 |
+
.b8 18 // DW_AT_high_pc
|
| 371 |
+
.b8 1 // DW_FORM_addr
|
| 372 |
+
.b8 49 // DW_AT_abstract_origin
|
| 373 |
+
.b8 19 // DW_FORM_ref4
|
| 374 |
+
.b8 0 // EOM(1)
|
| 375 |
+
.b8 0 // EOM(2)
|
| 376 |
+
.b8 4 // Abbreviation Code
|
| 377 |
+
.b8 29 // DW_TAG_inlined_subroutine
|
| 378 |
+
.b8 0 // DW_CHILDREN_no
|
| 379 |
+
.b8 49 // DW_AT_abstract_origin
|
| 380 |
+
.b8 19 // DW_FORM_ref4
|
| 381 |
+
.b8 17 // DW_AT_low_pc
|
| 382 |
+
.b8 1 // DW_FORM_addr
|
| 383 |
+
.b8 18 // DW_AT_high_pc
|
| 384 |
+
.b8 1 // DW_FORM_addr
|
| 385 |
+
.b8 88 // DW_AT_call_file
|
| 386 |
+
.b8 11 // DW_FORM_data1
|
| 387 |
+
.b8 89 // DW_AT_call_line
|
| 388 |
+
.b8 11 // DW_FORM_data1
|
| 389 |
+
.b8 87 // DW_AT_call_column
|
| 390 |
+
.b8 11 // DW_FORM_data1
|
| 391 |
+
.b8 0 // EOM(1)
|
| 392 |
+
.b8 0 // EOM(2)
|
| 393 |
+
.b8 0 // EOM(3)
|
| 394 |
+
}
|
| 395 |
+
.section .debug_info
|
| 396 |
+
{
|
| 397 |
+
.b32 281 // Length of Unit
|
| 398 |
+
.b8 2 // DWARF version number
|
| 399 |
+
.b8 0
|
| 400 |
+
.b32 .debug_abbrev // Offset Into Abbrev. Section
|
| 401 |
+
.b8 8 // Address Size (in bytes)
|
| 402 |
+
.b8 1 // Abbrev [1] 0xb:0x112 DW_TAG_compile_unit
|
| 403 |
+
.b8 116 // DW_AT_producer
|
| 404 |
+
.b8 114
|
| 405 |
+
.b8 105
|
| 406 |
+
.b8 116
|
| 407 |
+
.b8 111
|
| 408 |
+
.b8 110
|
| 409 |
+
.b8 0
|
| 410 |
+
.b8 2 // DW_AT_language
|
| 411 |
+
.b8 0
|
| 412 |
+
.b8 99 // DW_AT_name
|
| 413 |
+
.b8 103
|
| 414 |
+
.b8 101
|
| 415 |
+
.b8 55
|
| 416 |
+
.b8 112
|
| 417 |
+
.b8 112
|
| 418 |
+
.b8 118
|
| 419 |
+
.b8 118
|
| 420 |
+
.b8 54
|
| 421 |
+
.b8 53
|
| 422 |
+
.b8 104
|
| 423 |
+
.b8 97
|
| 424 |
+
.b8 115
|
| 425 |
+
.b8 114
|
| 426 |
+
.b8 119
|
| 427 |
+
.b8 113
|
| 428 |
+
.b8 51
|
| 429 |
+
.b8 51
|
| 430 |
+
.b8 97
|
| 431 |
+
.b8 97
|
| 432 |
+
.b8 53
|
| 433 |
+
.b8 121
|
| 434 |
+
.b8 106
|
| 435 |
+
.b8 110
|
| 436 |
+
.b8 116
|
| 437 |
+
.b8 52
|
| 438 |
+
.b8 116
|
| 439 |
+
.b8 119
|
| 440 |
+
.b8 100
|
| 441 |
+
.b8 122
|
| 442 |
+
.b8 118
|
| 443 |
+
.b8 119
|
| 444 |
+
.b8 51
|
| 445 |
+
.b8 112
|
| 446 |
+
.b8 97
|
| 447 |
+
.b8 107
|
| 448 |
+
.b8 50
|
| 449 |
+
.b8 120
|
| 450 |
+
.b8 52
|
| 451 |
+
.b8 98
|
| 452 |
+
.b8 117
|
| 453 |
+
.b8 55
|
| 454 |
+
.b8 110
|
| 455 |
+
.b8 55
|
| 456 |
+
.b8 120
|
| 457 |
+
.b8 50
|
| 458 |
+
.b8 104
|
| 459 |
+
.b8 121
|
| 460 |
+
.b8 101
|
| 461 |
+
.b8 120
|
| 462 |
+
.b8 109
|
| 463 |
+
.b8 118
|
| 464 |
+
.b8 46
|
| 465 |
+
.b8 112
|
| 466 |
+
.b8 121
|
| 467 |
+
.b8 0
|
| 468 |
+
.b32 .debug_line // DW_AT_stmt_list
|
| 469 |
+
.b8 47 // DW_AT_comp_dir
|
| 470 |
+
.b8 119
|
| 471 |
+
.b8 111
|
| 472 |
+
.b8 114
|
| 473 |
+
.b8 107
|
| 474 |
+
.b8 115
|
| 475 |
+
.b8 112
|
| 476 |
+
.b8 97
|
| 477 |
+
.b8 99
|
| 478 |
+
.b8 101
|
| 479 |
+
.b8 47
|
| 480 |
+
.b8 104
|
| 481 |
+
.b8 97
|
| 482 |
+
.b8 110
|
| 483 |
+
.b8 114
|
| 484 |
+
.b8 117
|
| 485 |
+
.b8 105
|
| 486 |
+
.b8 47
|
| 487 |
+
.b8 83
|
| 488 |
+
.b8 112
|
| 489 |
+
.b8 101
|
| 490 |
+
.b8 99
|
| 491 |
+
.b8 70
|
| 492 |
+
.b8 111
|
| 493 |
+
.b8 114
|
| 494 |
+
.b8 103
|
| 495 |
+
.b8 101
|
| 496 |
+
.b8 45
|
| 497 |
+
.b8 101
|
| 498 |
+
.b8 120
|
| 499 |
+
.b8 116
|
| 500 |
+
.b8 47
|
| 501 |
+
.b8 99
|
| 502 |
+
.b8 97
|
| 503 |
+
.b8 99
|
| 504 |
+
.b8 104
|
| 505 |
+
.b8 101
|
| 506 |
+
.b8 47
|
| 507 |
+
.b8 99
|
| 508 |
+
.b8 111
|
| 509 |
+
.b8 109
|
| 510 |
+
.b8 112
|
| 511 |
+
.b8 105
|
| 512 |
+
.b8 108
|
| 513 |
+
.b8 101
|
| 514 |
+
.b8 100
|
| 515 |
+
.b8 95
|
| 516 |
+
.b8 107
|
| 517 |
+
.b8 101
|
| 518 |
+
.b8 114
|
| 519 |
+
.b8 110
|
| 520 |
+
.b8 101
|
| 521 |
+
.b8 108
|
| 522 |
+
.b8 115
|
| 523 |
+
.b8 47
|
| 524 |
+
.b8 103
|
| 525 |
+
.b8 101
|
| 526 |
+
.b8 0
|
| 527 |
+
.b8 2 // Abbrev [2] 0x8b:0x63 DW_TAG_subprogram
|
| 528 |
+
.b8 116 // DW_AT_name
|
| 529 |
+
.b8 114
|
| 530 |
+
.b8 105
|
| 531 |
+
.b8 116
|
| 532 |
+
.b8 111
|
| 533 |
+
.b8 110
|
| 534 |
+
.b8 95
|
| 535 |
+
.b8 114
|
| 536 |
+
.b8 101
|
| 537 |
+
.b8 100
|
| 538 |
+
.b8 95
|
| 539 |
+
.b8 102
|
| 540 |
+
.b8 117
|
| 541 |
+
.b8 115
|
| 542 |
+
.b8 101
|
| 543 |
+
.b8 100
|
| 544 |
+
.b8 95
|
| 545 |
+
.b8 95
|
| 546 |
+
.b8 116
|
| 547 |
+
.b8 111
|
| 548 |
+
.b8 95
|
| 549 |
+
.b8 99
|
| 550 |
+
.b8 111
|
| 551 |
+
.b8 112
|
| 552 |
+
.b8 121
|
| 553 |
+
.b8 95
|
| 554 |
+
.b8 97
|
| 555 |
+
.b8 114
|
| 556 |
+
.b8 97
|
| 557 |
+
.b8 110
|
| 558 |
+
.b8 103
|
| 559 |
+
.b8 101
|
| 560 |
+
.b8 95
|
| 561 |
+
.b8 105
|
| 562 |
+
.b8 110
|
| 563 |
+
.b8 100
|
| 564 |
+
.b8 101
|
| 565 |
+
.b8 120
|
| 566 |
+
.b8 95
|
| 567 |
+
.b8 112
|
| 568 |
+
.b8 117
|
| 569 |
+
.b8 116
|
| 570 |
+
.b8 95
|
| 571 |
+
.b8 108
|
| 572 |
+
.b8 116
|
| 573 |
+
.b8 95
|
| 574 |
+
.b8 110
|
| 575 |
+
.b8 101
|
| 576 |
+
.b8 119
|
| 577 |
+
.b8 95
|
| 578 |
+
.b8 122
|
| 579 |
+
.b8 101
|
| 580 |
+
.b8 114
|
| 581 |
+
.b8 111
|
| 582 |
+
.b8 115
|
| 583 |
+
.b8 95
|
| 584 |
+
.b8 115
|
| 585 |
+
.b8 99
|
| 586 |
+
.b8 97
|
| 587 |
+
.b8 108
|
| 588 |
+
.b8 97
|
| 589 |
+
.b8 114
|
| 590 |
+
.b8 95
|
| 591 |
+
.b8 116
|
| 592 |
+
.b8 101
|
| 593 |
+
.b8 110
|
| 594 |
+
.b8 115
|
| 595 |
+
.b8 111
|
| 596 |
+
.b8 114
|
| 597 |
+
.b8 95
|
| 598 |
+
.b8 115
|
| 599 |
+
.b8 117
|
| 600 |
+
.b8 109
|
| 601 |
+
.b8 95
|
| 602 |
+
.b8 117
|
| 603 |
+
.b8 110
|
| 604 |
+
.b8 115
|
| 605 |
+
.b8 113
|
| 606 |
+
.b8 117
|
| 607 |
+
.b8 101
|
| 608 |
+
.b8 101
|
| 609 |
+
.b8 122
|
| 610 |
+
.b8 101
|
| 611 |
+
.b8 95
|
| 612 |
+
.b8 118
|
| 613 |
+
.b8 105
|
| 614 |
+
.b8 101
|
| 615 |
+
.b8 119
|
| 616 |
+
.b8 95
|
| 617 |
+
.b8 119
|
| 618 |
+
.b8 104
|
| 619 |
+
.b8 101
|
| 620 |
+
.b8 114
|
| 621 |
+
.b8 101
|
| 622 |
+
.b8 95
|
| 623 |
+
.b8 50
|
| 624 |
+
.b8 0
|
| 625 |
+
.b8 1 // DW_AT_inline
|
| 626 |
+
.b8 3 // Abbrev [3] 0xee:0x2e DW_TAG_subprogram
|
| 627 |
+
.b64 $L__func_begin0 // DW_AT_low_pc
|
| 628 |
+
.b64 $L__func_end0 // DW_AT_high_pc
|
| 629 |
+
.b32 139 // DW_AT_abstract_origin
|
| 630 |
+
.b8 4 // Abbrev [4] 0x103:0x18 DW_TAG_inlined_subroutine
|
| 631 |
+
.b32 139 // DW_AT_abstract_origin
|
| 632 |
+
.b64 $L__tmp1 // DW_AT_low_pc
|
| 633 |
+
.b64 $L__tmp2 // DW_AT_high_pc
|
| 634 |
+
.b8 1 // DW_AT_call_file
|
| 635 |
+
.b8 40 // DW_AT_call_line
|
| 636 |
+
.b8 25 // DW_AT_call_column
|
| 637 |
+
.b8 0 // End Of Children Mark
|
| 638 |
+
.b8 0 // End Of Children Mark
|
| 639 |
+
}
|
| 640 |
+
.section .debug_macinfo { }
|
SpecForge-ext/cache/compiled_kernels/triton/7/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.source
ADDED
|
@@ -0,0 +1,379 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":18:0)
|
| 2 |
+
#loc77 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0)
|
| 3 |
+
#loc79 = loc(unknown)
|
| 4 |
+
#loc82 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0)
|
| 5 |
+
#loc86 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":69:0)
|
| 6 |
+
#loc97 = loc("in_ptr0"(#loc))
|
| 7 |
+
#loc98 = loc("in_ptr1"(#loc))
|
| 8 |
+
#loc99 = loc("out_ptr1"(#loc))
|
| 9 |
+
#loc100 = loc("out_ptr2"(#loc))
|
| 10 |
+
#loc101 = loc("out_ptr3"(#loc))
|
| 11 |
+
#loc102 = loc("ks0"(#loc))
|
| 12 |
+
#loc103 = loc("ks1"(#loc))
|
| 13 |
+
#loc104 = loc("xnumel"(#loc))
|
| 14 |
+
#loc105 = loc("r0_numel"(#loc))
|
| 15 |
+
#loc151 = loc("input"(#loc77))
|
| 16 |
+
#loc152 = loc("a"(#loc82))
|
| 17 |
+
#loc153 = loc("b"(#loc82))
|
| 18 |
+
#loc154 = loc("a"(#loc86))
|
| 19 |
+
module {
|
| 20 |
+
tt.func public @triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2(%in_ptr0: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<i64> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr1: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %out_ptr2: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %out_ptr3: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("out_ptr3"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 loc("r0_numel"(#loc))) attributes {noinline = false} {
|
| 21 |
+
%xnumel_0 = arith.constant 32 : i32 loc(#loc106)
|
| 22 |
+
%xoffset = tt.get_program_id x : i32 loc(#loc107)
|
| 23 |
+
%xoffset_1 = arith.constant 1 : i32 loc(#loc108)
|
| 24 |
+
%xoffset_2 = arith.constant 1 : i32 loc(#loc108)
|
| 25 |
+
%xoffset_3 = arith.muli %xoffset, %xoffset_2 : i32 loc(#loc108)
|
| 26 |
+
%xindex = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc109)
|
| 27 |
+
%xindex_4 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc110)
|
| 28 |
+
%xindex_5 = tt.splat %xoffset_3 : i32 -> tensor<1x1xi32> loc(#loc111)
|
| 29 |
+
%xindex_6 = arith.addi %xindex_5, %xindex_4 : tensor<1x1xi32> loc(#loc111)
|
| 30 |
+
%xmask = arith.constant dense<32> : tensor<1x1xi32> loc(#loc112)
|
| 31 |
+
%xmask_7 = arith.cmpi slt, %xindex_6, %xmask : tensor<1x1xi32> loc(#loc112)
|
| 32 |
+
%r0_base = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc113)
|
| 33 |
+
%r0_base_8 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<32xi32> -> tensor<1x32xi32> loc(#loc114)
|
| 34 |
+
%_tmp3 = arith.constant 0 : i64 loc(#loc115)
|
| 35 |
+
%_tmp3_9 = arith.constant dense<0> : tensor<1x32xi64> loc(#loc115)
|
| 36 |
+
%c0_i32 = arith.constant 0 : i32 loc(#loc11)
|
| 37 |
+
%c32_i32 = arith.constant 32 : i32 loc(#loc11)
|
| 38 |
+
%0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc11)
|
| 39 |
+
%1 = arith.bitcast %r0_numel : i32 to i32 loc(#loc11)
|
| 40 |
+
%2 = arith.bitcast %c32_i32 : i32 to i32 loc(#loc11)
|
| 41 |
+
%3 = ub.poison : i32 loc(#loc11)
|
| 42 |
+
%_tmp3_10 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp3_14 = %_tmp3_9) -> (tensor<1x32xi64>) : i32 {
|
| 43 |
+
%r0_index = tt.splat %r0_offset : i32 -> tensor<1x32xi32> loc(#loc117)
|
| 44 |
+
%r0_index_15 = arith.addi %r0_index, %r0_base_8 : tensor<1x32xi32> loc(#loc117)
|
| 45 |
+
%r0_mask = tt.splat %r0_numel : i32 -> tensor<1x32xi32> loc(#loc118)
|
| 46 |
+
%r0_mask_16 = arith.cmpi slt, %r0_index_15, %r0_mask : tensor<1x32xi32> loc(#loc118)
|
| 47 |
+
%tmp0 = arith.extsi %xindex_6 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc119)
|
| 48 |
+
%tmp0_17 = tt.splat %ks0 : i64 -> tensor<1x1xi64> loc(#loc119)
|
| 49 |
+
%tmp0_18 = arith.muli %tmp0_17, %tmp0 : tensor<1x1xi64> loc(#loc119)
|
| 50 |
+
%tmp0_19 = arith.extsi %r0_index_15 : tensor<1x32xi32> to tensor<1x32xi64> loc(#loc120)
|
| 51 |
+
%tmp0_20 = tt.broadcast %tmp0_18 : tensor<1x1xi64> -> tensor<1x32xi64> loc(#loc120)
|
| 52 |
+
%tmp0_21 = arith.addi %tmp0_19, %tmp0_20 : tensor<1x32xi64> loc(#loc120)
|
| 53 |
+
%tmp0_22 = tt.splat %in_ptr0 : !tt.ptr<i32> -> tensor<1x32x!tt.ptr<i32>> loc(#loc121)
|
| 54 |
+
%tmp0_23 = tt.addptr %tmp0_22, %tmp0_21 : tensor<1x32x!tt.ptr<i32>>, tensor<1x32xi64> loc(#loc121)
|
| 55 |
+
%tmp0_24 = tt.broadcast %xmask_7 : tensor<1x1xi1> -> tensor<1x32xi1> loc(#loc122)
|
| 56 |
+
%tmp0_25 = arith.andi %r0_mask_16, %tmp0_24 : tensor<1x32xi1> loc(#loc122)
|
| 57 |
+
%tmp0_26 = arith.constant 0.000000e+00 : f32 loc(#loc123)
|
| 58 |
+
%tmp0_27 = arith.constant dense<0.000000e+00> : tensor<1x32xf32> loc(#loc123)
|
| 59 |
+
%tmp0_28 = arith.fptosi %tmp0_27 : tensor<1x32xf32> to tensor<1x32xi32> loc(#loc123)
|
| 60 |
+
%tmp0_29 = tt.load %tmp0_23, %tmp0_25, %tmp0_28 evictionPolicy = evict_first : tensor<1x32x!tt.ptr<i32>> loc(#loc123)
|
| 61 |
+
%tmp1 = arith.extsi %tmp0_29 : tensor<1x32xi32> to tensor<1x32xi64> loc(#loc124)
|
| 62 |
+
%tmp4 = arith.addi %_tmp3_14, %tmp1 : tensor<1x32xi64> loc(#loc125)
|
| 63 |
+
%_tmp3_30 = tt.broadcast %xmask_7 : tensor<1x1xi1> -> tensor<1x32xi1> loc(#loc126)
|
| 64 |
+
%_tmp3_31 = arith.andi %r0_mask_16, %_tmp3_30 : tensor<1x32xi1> loc(#loc126)
|
| 65 |
+
%_tmp3_32 = arith.select %_tmp3_31, %tmp4, %_tmp3_14 : tensor<1x32xi1>, tensor<1x32xi64> loc(#loc127)
|
| 66 |
+
scf.yield %_tmp3_32 : tensor<1x32xi64> loc(#loc23)
|
| 67 |
+
} loc(#loc116)
|
| 68 |
+
%tmp3 = tt.call @"triton.language.standard.sum__i64S1_32S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp3_10) : (tensor<1x32xi64>) -> tensor<1xi64> loc(#loc128)
|
| 69 |
+
%tmp3_11 = tt.expand_dims %tmp3 {axis = 1 : i32} : tensor<1xi64> -> tensor<1x1xi64> loc(#loc129)
|
| 70 |
+
%tmp5 = arith.trunci %tmp3_11 : tensor<1x1xi64> to tensor<1x1xi32> loc(#loc130)
|
| 71 |
+
%4 = tt.splat %out_ptr1 : !tt.ptr<i32> -> tensor<1x1x!tt.ptr<i32>> loc(#loc27)
|
| 72 |
+
%5 = tt.addptr %4, %xindex_6 : tensor<1x1x!tt.ptr<i32>>, tensor<1x1xi32> loc(#loc27)
|
| 73 |
+
tt.store %5, %tmp5, %xmask_7 : tensor<1x1x!tt.ptr<i32>> loc(#loc28)
|
| 74 |
+
%c0_i32_12 = arith.constant 0 : i32 loc(#loc29)
|
| 75 |
+
%c32_i32_13 = arith.constant 32 : i32 loc(#loc29)
|
| 76 |
+
%6 = arith.bitcast %c0_i32_12 : i32 to i32 loc(#loc29)
|
| 77 |
+
%7 = arith.bitcast %r0_numel : i32 to i32 loc(#loc29)
|
| 78 |
+
%8 = arith.bitcast %c32_i32_13 : i32 to i32 loc(#loc29)
|
| 79 |
+
%9 = ub.poison : i32 loc(#loc29)
|
| 80 |
+
scf.for %r0_offset = %6 to %7 step %8 : i32 {
|
| 81 |
+
%r0_index = tt.splat %r0_offset : i32 -> tensor<1x32xi32> loc(#loc131)
|
| 82 |
+
%r0_index_14 = arith.addi %r0_index, %r0_base_8 : tensor<1x32xi32> loc(#loc131)
|
| 83 |
+
%r0_mask = tt.splat %r0_numel : i32 -> tensor<1x32xi32> loc(#loc132)
|
| 84 |
+
%r0_mask_15 = arith.cmpi slt, %r0_index_14, %r0_mask : tensor<1x32xi32> loc(#loc132)
|
| 85 |
+
%tmp6 = arith.constant 1 : i32 loc(#loc133)
|
| 86 |
+
%tmp6_16 = arith.extsi %tmp6 : i32 to i64 loc(#loc133)
|
| 87 |
+
%tmp6_17 = arith.cmpi sge, %tmp6_16, %ks0 : i64 loc(#loc133)
|
| 88 |
+
%tmp6_18 = arith.constant 1 : i32 loc(#loc134)
|
| 89 |
+
%tmp6_19 = arith.constant 1 : i32 loc(#loc134)
|
| 90 |
+
%tmp6_20 = arith.extui %tmp6_17 : i1 to i32 loc(#loc134)
|
| 91 |
+
%tmp6_21 = arith.muli %tmp6_19, %tmp6_20 : i32 loc(#loc134)
|
| 92 |
+
%tmp6_22 = arith.constant 1 : i32 loc(#loc135)
|
| 93 |
+
%tmp6_23 = arith.extsi %tmp6_22 : i32 to i64 loc(#loc135)
|
| 94 |
+
%tmp6_24 = arith.cmpi sgt, %ks0, %tmp6_23 : i64 loc(#loc135)
|
| 95 |
+
%tmp6_25 = arith.extui %tmp6_24 : i1 to i64 loc(#loc136)
|
| 96 |
+
%tmp6_26 = arith.muli %ks0, %tmp6_25 : i64 loc(#loc136)
|
| 97 |
+
%tmp6_27 = arith.extsi %tmp6_21 : i32 to i64 loc(#loc137)
|
| 98 |
+
%tmp6_28 = arith.addi %tmp6_27, %tmp6_26 : i64 loc(#loc137)
|
| 99 |
+
%tmp6_29 = arith.extsi %xindex_6 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc138)
|
| 100 |
+
%tmp6_30 = tt.splat %tmp6_28 : i64 -> tensor<1x1xi64> loc(#loc138)
|
| 101 |
+
%tmp6_31 = arith.muli %tmp6_29, %tmp6_30 : tensor<1x1xi64> loc(#loc138)
|
| 102 |
+
%tmp6_32 = arith.extsi %r0_index_14 : tensor<1x32xi32> to tensor<1x32xi64> loc(#loc139)
|
| 103 |
+
%tmp6_33 = tt.broadcast %tmp6_31 : tensor<1x1xi64> -> tensor<1x32xi64> loc(#loc139)
|
| 104 |
+
%tmp6_34 = arith.addi %tmp6_32, %tmp6_33 : tensor<1x32xi64> loc(#loc139)
|
| 105 |
+
%tmp6_35 = tt.splat %in_ptr1 : !tt.ptr<i64> -> tensor<1x32x!tt.ptr<i64>> loc(#loc140)
|
| 106 |
+
%tmp6_36 = tt.addptr %tmp6_35, %tmp6_34 : tensor<1x32x!tt.ptr<i64>>, tensor<1x32xi64> loc(#loc140)
|
| 107 |
+
%tmp6_37 = tt.broadcast %xmask_7 : tensor<1x1xi1> -> tensor<1x32xi1> loc(#loc141)
|
| 108 |
+
%tmp6_38 = arith.andi %r0_mask_15, %tmp6_37 : tensor<1x32xi1> loc(#loc141)
|
| 109 |
+
%tmp6_39 = arith.constant 0.000000e+00 : f32 loc(#loc142)
|
| 110 |
+
%tmp6_40 = arith.constant dense<0.000000e+00> : tensor<1x32xf32> loc(#loc142)
|
| 111 |
+
%tmp6_41 = arith.fptosi %tmp6_40 : tensor<1x32xf32> to tensor<1x32xi64> loc(#loc142)
|
| 112 |
+
%tmp6_42 = tt.load %tmp6_36, %tmp6_38, %tmp6_41 evictionPolicy = evict_first : tensor<1x32x!tt.ptr<i64>> loc(#loc142)
|
| 113 |
+
%tmp7 = arith.trunci %tmp6_42 : tensor<1x32xi64> to tensor<1x32xi32> loc(#loc143)
|
| 114 |
+
%tmp9 = tt.broadcast %tmp5 : tensor<1x1xi32> -> tensor<1x32xi32> loc(#loc144)
|
| 115 |
+
%tmp9_43 = arith.cmpi slt, %r0_index_14, %tmp9 : tensor<1x32xi32> loc(#loc144)
|
| 116 |
+
%tmp11 = arith.extsi %tmp7 : tensor<1x32xi32> to tensor<1x32xi64> loc(#loc145)
|
| 117 |
+
%tmp11_44 = tt.splat %ks0 : i64 -> tensor<1x32xi64> loc(#loc145)
|
| 118 |
+
%tmp11_45 = arith.select %tmp9_43, %tmp11, %tmp11_44 : tensor<1x32xi1>, tensor<1x32xi64> loc(#loc145)
|
| 119 |
+
%tmp12 = arith.constant 1 : i32 loc(#loc146)
|
| 120 |
+
%tmp12_46 = arith.constant 1 : i64 loc(#loc146)
|
| 121 |
+
%tmp12_47 = arith.addi %tmp12_46, %ks0 : i64 loc(#loc146)
|
| 122 |
+
%tmp13 = tt.splat %tmp12_47 : i64 -> tensor<1x32xi64> loc(#loc147)
|
| 123 |
+
%tmp13_48 = arith.addi %tmp11_45, %tmp13 : tensor<1x32xi64> loc(#loc147)
|
| 124 |
+
%tmp14 = arith.constant 0 : i32 loc(#loc148)
|
| 125 |
+
%tmp14_49 = arith.extsi %tmp14 : i32 to i64 loc(#loc148)
|
| 126 |
+
%tmp14_50 = tt.splat %tmp14_49 : i64 -> tensor<1x32xi64> loc(#loc148)
|
| 127 |
+
%tmp14_51 = arith.cmpi slt, %tmp11_45, %tmp14_50 : tensor<1x32xi64> loc(#loc148)
|
| 128 |
+
%tmp15 = arith.select %tmp14_51, %tmp13_48, %tmp11_45 : tensor<1x32xi1>, tensor<1x32xi64> loc(#loc149)
|
| 129 |
+
%c0_i32_52 = arith.constant 0 : i32 loc(#loc49)
|
| 130 |
+
%10 = arith.extsi %c0_i32_52 : i32 to i64 loc(#loc49)
|
| 131 |
+
%11 = tt.splat %10 : i64 -> tensor<1x32xi64> loc(#loc49)
|
| 132 |
+
%12 = arith.cmpi sle, %11, %tmp15 : tensor<1x32xi64> loc(#loc49)
|
| 133 |
+
%c127_i32 = arith.constant 127 : i32 loc(#loc50)
|
| 134 |
+
%c127_i64 = arith.constant 127 : i64 loc(#loc50)
|
| 135 |
+
%13 = arith.addi %c127_i64, %ks1 : i64 loc(#loc50)
|
| 136 |
+
%14 = tt.call @"torch._inductor.runtime.triton_helpers.div_floor_integer__i64__(1,)cconstexpr_128_"(%13) : (i64) -> i64 loc(#loc51)
|
| 137 |
+
%c1_i32 = arith.constant 1 : i32 loc(#loc52)
|
| 138 |
+
%c1_i64 = arith.constant 1 : i64 loc(#loc52)
|
| 139 |
+
%15 = arith.addi %c1_i64, %14 : i64 loc(#loc52)
|
| 140 |
+
%16 = tt.splat %15 : i64 -> tensor<1x32xi64> loc(#loc53)
|
| 141 |
+
%17 = arith.cmpi slt, %tmp15, %16 : tensor<1x32xi64> loc(#loc53)
|
| 142 |
+
%18 = arith.andi %12, %17 : tensor<1x32xi1> loc(#loc54)
|
| 143 |
+
%19 = tt.broadcast %xmask_7 : tensor<1x1xi1> -> tensor<1x32xi1> loc(#loc55)
|
| 144 |
+
%20 = arith.andi %r0_mask_15, %19 : tensor<1x32xi1> loc(#loc55)
|
| 145 |
+
%true = arith.constant true loc(#loc56)
|
| 146 |
+
%cst = arith.constant dense<true> : tensor<1x32xi1> loc(#loc56)
|
| 147 |
+
%21 = arith.xori %20, %cst : tensor<1x32xi1> loc(#loc56)
|
| 148 |
+
%22 = arith.ori %18, %21 : tensor<1x32xi1> loc(#loc57)
|
| 149 |
+
tt.assert %22, "index out of bounds: 0 <= tmp15 < 1 + (triton_helpers.div_floor_integer(127 + ks1, 128))" : tensor<1x32xi1> loc(#loc58)
|
| 150 |
+
%tmp17 = arith.constant 1 : i32 loc(#loc150)
|
| 151 |
+
%tmp17_53 = arith.constant dense<1> : tensor<1x1xi32> loc(#loc150)
|
| 152 |
+
%c1_i32_54 = arith.constant 1 : i32 loc(#loc60)
|
| 153 |
+
%23 = arith.extsi %c1_i32_54 : i32 to i64 loc(#loc60)
|
| 154 |
+
%24 = arith.cmpi sge, %23, %ks0 : i64 loc(#loc60)
|
| 155 |
+
%c1_i32_55 = arith.constant 1 : i32 loc(#loc61)
|
| 156 |
+
%c1_i32_56 = arith.constant 1 : i32 loc(#loc61)
|
| 157 |
+
%25 = arith.extui %24 : i1 to i32 loc(#loc61)
|
| 158 |
+
%26 = arith.muli %c1_i32_56, %25 : i32 loc(#loc61)
|
| 159 |
+
%c1_i32_57 = arith.constant 1 : i32 loc(#loc62)
|
| 160 |
+
%27 = arith.extsi %c1_i32_57 : i32 to i64 loc(#loc62)
|
| 161 |
+
%28 = arith.cmpi sgt, %ks0, %27 : i64 loc(#loc62)
|
| 162 |
+
%29 = arith.extui %28 : i1 to i64 loc(#loc63)
|
| 163 |
+
%30 = arith.muli %ks0, %29 : i64 loc(#loc63)
|
| 164 |
+
%31 = arith.extsi %26 : i32 to i64 loc(#loc64)
|
| 165 |
+
%32 = arith.addi %31, %30 : i64 loc(#loc64)
|
| 166 |
+
%33 = arith.extsi %xindex_6 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc65)
|
| 167 |
+
%34 = tt.splat %32 : i64 -> tensor<1x1xi64> loc(#loc65)
|
| 168 |
+
%35 = arith.muli %33, %34 : tensor<1x1xi64> loc(#loc65)
|
| 169 |
+
%36 = arith.extsi %r0_index_14 : tensor<1x32xi32> to tensor<1x32xi64> loc(#loc66)
|
| 170 |
+
%37 = tt.broadcast %35 : tensor<1x1xi64> -> tensor<1x32xi64> loc(#loc66)
|
| 171 |
+
%38 = arith.addi %36, %37 : tensor<1x32xi64> loc(#loc66)
|
| 172 |
+
%39 = tt.splat %out_ptr2 : !tt.ptr<i32> -> tensor<1x32x!tt.ptr<i32>> loc(#loc67)
|
| 173 |
+
%40 = tt.addptr %39, %38 : tensor<1x32x!tt.ptr<i32>>, tensor<1x32xi64> loc(#loc67)
|
| 174 |
+
%41 = tt.broadcast %xmask_7 : tensor<1x1xi1> -> tensor<1x32xi1> loc(#loc68)
|
| 175 |
+
%42 = arith.andi %r0_mask_15, %41 : tensor<1x32xi1> loc(#loc68)
|
| 176 |
+
tt.store %40, %tmp7, %42 : tensor<1x32x!tt.ptr<i32>> loc(#loc69)
|
| 177 |
+
%43 = arith.extsi %xindex_6 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc70)
|
| 178 |
+
%44 = tt.broadcast %43 : tensor<1x1xi64> -> tensor<1x32xi64> loc(#loc70)
|
| 179 |
+
%45 = arith.addi %tmp15, %44 : tensor<1x32xi64> loc(#loc70)
|
| 180 |
+
%46 = arith.extsi %xindex_6 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc71)
|
| 181 |
+
%47 = tt.splat %ks0 : i64 -> tensor<1x1xi64> loc(#loc71)
|
| 182 |
+
%48 = arith.muli %47, %46 : tensor<1x1xi64> loc(#loc71)
|
| 183 |
+
%49 = tt.broadcast %48 : tensor<1x1xi64> -> tensor<1x32xi64> loc(#loc72)
|
| 184 |
+
%50 = arith.addi %45, %49 : tensor<1x32xi64> loc(#loc72)
|
| 185 |
+
%51 = tt.splat %out_ptr3 : !tt.ptr<i32> -> tensor<1x32x!tt.ptr<i32>> loc(#loc73)
|
| 186 |
+
%52 = tt.addptr %51, %50 : tensor<1x32x!tt.ptr<i32>>, tensor<1x32xi64> loc(#loc73)
|
| 187 |
+
%53 = tt.broadcast %xmask_7 : tensor<1x1xi1> -> tensor<1x32xi1> loc(#loc74)
|
| 188 |
+
%54 = arith.andi %r0_mask_15, %53 : tensor<1x32xi1> loc(#loc74)
|
| 189 |
+
%cst_58 = arith.constant dense<1> : tensor<1x32xi32> loc(#loc75)
|
| 190 |
+
tt.store %52, %cst_58, %54 : tensor<1x32x!tt.ptr<i32>> loc(#loc75)
|
| 191 |
+
} loc(#loc29)
|
| 192 |
+
tt.return loc(#loc76)
|
| 193 |
+
} loc(#loc)
|
| 194 |
+
tt.func private @"triton.language.standard.sum__i64S1_32S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<1x32xi64> loc("input"(#loc77))) -> tensor<1xi64> attributes {noinline = false} {
|
| 195 |
+
%0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({
|
| 196 |
+
^bb0(%arg1: i64 loc(unknown), %arg2: i64 loc(unknown)):
|
| 197 |
+
%2 = tt.call @triton.language.standard._sum_combine__i64_i64__(%arg1, %arg2) : (i64, i64) -> i64 loc(#loc78)
|
| 198 |
+
tt.reduce.return %2 : i64 loc(#loc78)
|
| 199 |
+
}) : (tensor<1x32xi64>) -> tensor<1xi64> loc(#loc78)
|
| 200 |
+
tt.return %0 : tensor<1xi64> loc(#loc80)
|
| 201 |
+
^bb1: // no predecessors
|
| 202 |
+
%1 = ub.poison : tensor<1xi64> loc(#loc81)
|
| 203 |
+
tt.return %1 : tensor<1xi64> loc(#loc81)
|
| 204 |
+
} loc(#loc77)
|
| 205 |
+
tt.func private @triton.language.standard._sum_combine__i64_i64__(%a: i64 loc("a"(#loc82)), %b: i64 loc("b"(#loc82))) -> i64 attributes {noinline = false} {
|
| 206 |
+
%0 = arith.addi %a, %b : i64 loc(#loc83)
|
| 207 |
+
tt.return %0 : i64 loc(#loc84)
|
| 208 |
+
^bb1: // no predecessors
|
| 209 |
+
%1 = ub.poison : i64 loc(#loc85)
|
| 210 |
+
tt.return %1 : i64 loc(#loc85)
|
| 211 |
+
} loc(#loc82)
|
| 212 |
+
tt.func private @"torch._inductor.runtime.triton_helpers.div_floor_integer__i64__(1,)cconstexpr_128_"(%a: i64 loc("a"(#loc86))) -> i64 attributes {noinline = false} {
|
| 213 |
+
%quot = arith.constant 128 : i32 loc(#loc155)
|
| 214 |
+
%quot_0 = arith.constant 128 : i64 loc(#loc155)
|
| 215 |
+
%quot_1 = arith.divsi %a, %quot_0 : i64 loc(#loc155)
|
| 216 |
+
%remainder = arith.constant 128 : i32 loc(#loc156)
|
| 217 |
+
%remainder_2 = arith.constant 128 : i64 loc(#loc156)
|
| 218 |
+
%remainder_3 = arith.remsi %a, %remainder_2 : i64 loc(#loc156)
|
| 219 |
+
%fixed = arith.constant 0 : i32 loc(#loc157)
|
| 220 |
+
%fixed_4 = arith.extsi %fixed : i32 to i64 loc(#loc157)
|
| 221 |
+
%fixed_5 = arith.cmpi ne, %remainder_3, %fixed_4 : i64 loc(#loc157)
|
| 222 |
+
%fixed_6 = arith.constant 1 : i32 loc(#loc158)
|
| 223 |
+
%fixed_7 = arith.constant 1 : i64 loc(#loc158)
|
| 224 |
+
%fixed_8 = arith.subi %quot_1, %fixed_7 : i64 loc(#loc158)
|
| 225 |
+
%fixed_9 = arith.select %fixed_5, %fixed_8, %quot_1 : i64 loc(#loc159)
|
| 226 |
+
%c0_i32 = arith.constant 0 : i32 loc(#loc92)
|
| 227 |
+
%0 = arith.extsi %c0_i32 : i32 to i64 loc(#loc92)
|
| 228 |
+
%1 = arith.cmpi slt, %a, %0 : i64 loc(#loc92)
|
| 229 |
+
%false = arith.constant false loc(#loc93)
|
| 230 |
+
%2 = arith.cmpi ne, %1, %false : i1 loc(#loc93)
|
| 231 |
+
%3 = arith.select %2, %fixed_9, %quot_1 : i64 loc(#loc94)
|
| 232 |
+
tt.return %3 : i64 loc(#loc95)
|
| 233 |
+
^bb1: // no predecessors
|
| 234 |
+
%4 = ub.poison : i64 loc(#loc96)
|
| 235 |
+
tt.return %4 : i64 loc(#loc96)
|
| 236 |
+
} loc(#loc86)
|
| 237 |
+
} loc(#loc)
|
| 238 |
+
#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":19:13)
|
| 239 |
+
#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":22:28)
|
| 240 |
+
#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":22:33)
|
| 241 |
+
#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":23:36)
|
| 242 |
+
#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":23:44)
|
| 243 |
+
#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":23:23)
|
| 244 |
+
#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":24:21)
|
| 245 |
+
#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":25:27)
|
| 246 |
+
#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":25:37)
|
| 247 |
+
#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":28:43)
|
| 248 |
+
#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":29:40)
|
| 249 |
+
#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":30:31)
|
| 250 |
+
#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":31:29)
|
| 251 |
+
#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":35:45)
|
| 252 |
+
#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":35:41)
|
| 253 |
+
#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":35:34)
|
| 254 |
+
#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":35:60)
|
| 255 |
+
#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":35:50)
|
| 256 |
+
#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":36:23)
|
| 257 |
+
#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":38:23)
|
| 258 |
+
#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":39:35)
|
| 259 |
+
#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":39:48)
|
| 260 |
+
#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":39:8)
|
| 261 |
+
#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":40:25)
|
| 262 |
+
#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":40:28)
|
| 263 |
+
#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":41:19)
|
| 264 |
+
#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":42:25)
|
| 265 |
+
#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":42:36)
|
| 266 |
+
#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":43:40)
|
| 267 |
+
#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":44:31)
|
| 268 |
+
#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":45:29)
|
| 269 |
+
#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":49:60)
|
| 270 |
+
#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":49:52)
|
| 271 |
+
#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":49:86)
|
| 272 |
+
#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":49:77)
|
| 273 |
+
#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":49:68)
|
| 274 |
+
#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":49:45)
|
| 275 |
+
#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":49:41)
|
| 276 |
+
#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":49:34)
|
| 277 |
+
#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":49:103)
|
| 278 |
+
#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":49:93)
|
| 279 |
+
#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":50:23)
|
| 280 |
+
#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":52:22)
|
| 281 |
+
#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":54:37)
|
| 282 |
+
#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":55:20)
|
| 283 |
+
#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":56:24)
|
| 284 |
+
#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":57:24)
|
| 285 |
+
#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":58:39)
|
| 286 |
+
#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":59:32)
|
| 287 |
+
#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":59:94)
|
| 288 |
+
#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":59:100)
|
| 289 |
+
#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":59:55)
|
| 290 |
+
#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":59:50)
|
| 291 |
+
#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":59:42)
|
| 292 |
+
#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":59:122)
|
| 293 |
+
#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":59:112)
|
| 294 |
+
#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":59:110)
|
| 295 |
+
#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":59:130)
|
| 296 |
+
#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":60:35)
|
| 297 |
+
#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":61:55)
|
| 298 |
+
#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":61:47)
|
| 299 |
+
#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":61:81)
|
| 300 |
+
#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":61:72)
|
| 301 |
+
#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":61:63)
|
| 302 |
+
#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":61:40)
|
| 303 |
+
#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":61:36)
|
| 304 |
+
#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":61:29)
|
| 305 |
+
#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":61:104)
|
| 306 |
+
#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":61:94)
|
| 307 |
+
#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":62:53)
|
| 308 |
+
#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":62:62)
|
| 309 |
+
#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":62:58)
|
| 310 |
+
#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":62:29)
|
| 311 |
+
#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":62:105)
|
| 312 |
+
#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":62:95)
|
| 313 |
+
#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":43:4)
|
| 314 |
+
#loc78 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36)
|
| 315 |
+
#loc80 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11)
|
| 316 |
+
#loc81 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4)
|
| 317 |
+
#loc83 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15)
|
| 318 |
+
#loc84 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11)
|
| 319 |
+
#loc85 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4)
|
| 320 |
+
#loc87 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":72:16)
|
| 321 |
+
#loc88 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":73:20)
|
| 322 |
+
#loc89 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:34)
|
| 323 |
+
#loc90 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:44)
|
| 324 |
+
#loc91 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:47)
|
| 325 |
+
#loc92 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:25)
|
| 326 |
+
#loc93 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:32)
|
| 327 |
+
#loc94 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:47)
|
| 328 |
+
#loc95 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:11)
|
| 329 |
+
#loc96 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:4)
|
| 330 |
+
#loc106 = loc("xnumel"(#loc1))
|
| 331 |
+
#loc107 = loc("xoffset"(#loc2))
|
| 332 |
+
#loc108 = loc("xoffset"(#loc3))
|
| 333 |
+
#loc109 = loc("xindex"(#loc4))
|
| 334 |
+
#loc110 = loc("xindex"(#loc5))
|
| 335 |
+
#loc111 = loc("xindex"(#loc6))
|
| 336 |
+
#loc112 = loc("xmask"(#loc7))
|
| 337 |
+
#loc113 = loc("r0_base"(#loc8))
|
| 338 |
+
#loc114 = loc("r0_base"(#loc9))
|
| 339 |
+
#loc115 = loc("_tmp3"(#loc10))
|
| 340 |
+
#loc116 = loc("_tmp3"(#loc11))
|
| 341 |
+
#loc117 = loc("r0_index"(#loc12))
|
| 342 |
+
#loc118 = loc("r0_mask"(#loc13))
|
| 343 |
+
#loc119 = loc("tmp0"(#loc14))
|
| 344 |
+
#loc120 = loc("tmp0"(#loc15))
|
| 345 |
+
#loc121 = loc("tmp0"(#loc16))
|
| 346 |
+
#loc122 = loc("tmp0"(#loc17))
|
| 347 |
+
#loc123 = loc("tmp0"(#loc18))
|
| 348 |
+
#loc124 = loc("tmp1"(#loc19))
|
| 349 |
+
#loc125 = loc("tmp4"(#loc20))
|
| 350 |
+
#loc126 = loc("_tmp3"(#loc21))
|
| 351 |
+
#loc127 = loc("_tmp3"(#loc22))
|
| 352 |
+
#loc128 = loc("tmp3"(#loc24))
|
| 353 |
+
#loc129 = loc("tmp3"(#loc25))
|
| 354 |
+
#loc130 = loc("tmp5"(#loc26))
|
| 355 |
+
#loc131 = loc("r0_index"(#loc30))
|
| 356 |
+
#loc132 = loc("r0_mask"(#loc31))
|
| 357 |
+
#loc133 = loc("tmp6"(#loc32))
|
| 358 |
+
#loc134 = loc("tmp6"(#loc33))
|
| 359 |
+
#loc135 = loc("tmp6"(#loc34))
|
| 360 |
+
#loc136 = loc("tmp6"(#loc35))
|
| 361 |
+
#loc137 = loc("tmp6"(#loc36))
|
| 362 |
+
#loc138 = loc("tmp6"(#loc37))
|
| 363 |
+
#loc139 = loc("tmp6"(#loc38))
|
| 364 |
+
#loc140 = loc("tmp6"(#loc39))
|
| 365 |
+
#loc141 = loc("tmp6"(#loc40))
|
| 366 |
+
#loc142 = loc("tmp6"(#loc41))
|
| 367 |
+
#loc143 = loc("tmp7"(#loc42))
|
| 368 |
+
#loc144 = loc("tmp9"(#loc43))
|
| 369 |
+
#loc145 = loc("tmp11"(#loc44))
|
| 370 |
+
#loc146 = loc("tmp12"(#loc45))
|
| 371 |
+
#loc147 = loc("tmp13"(#loc46))
|
| 372 |
+
#loc148 = loc("tmp14"(#loc47))
|
| 373 |
+
#loc149 = loc("tmp15"(#loc48))
|
| 374 |
+
#loc150 = loc("tmp17"(#loc59))
|
| 375 |
+
#loc155 = loc("quot"(#loc87))
|
| 376 |
+
#loc156 = loc("remainder"(#loc88))
|
| 377 |
+
#loc157 = loc("fixed"(#loc89))
|
| 378 |
+
#loc158 = loc("fixed"(#loc90))
|
| 379 |
+
#loc159 = loc("fixed"(#loc91))
|
SpecForge-ext/cache/compiled_kernels/triton/7/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ttgir
ADDED
|
@@ -0,0 +1,270 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 2], order = [0, 1]}>
|
| 2 |
+
#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [2, 1], order = [1, 0]}>
|
| 3 |
+
#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":18:0)
|
| 4 |
+
#loc1 = loc(unknown)
|
| 5 |
+
#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":40:25)
|
| 6 |
+
#loc68 = loc("in_ptr0"(#loc))
|
| 7 |
+
#loc69 = loc("in_ptr1"(#loc))
|
| 8 |
+
#loc70 = loc("out_ptr1"(#loc))
|
| 9 |
+
#loc71 = loc("out_ptr2"(#loc))
|
| 10 |
+
#loc72 = loc("out_ptr3"(#loc))
|
| 11 |
+
#loc73 = loc("ks0"(#loc))
|
| 12 |
+
#loc74 = loc("ks1"(#loc))
|
| 13 |
+
#loc75 = loc("xnumel"(#loc))
|
| 14 |
+
#loc76 = loc("r0_numel"(#loc))
|
| 15 |
+
#loc91 = loc("tmp3"(#loc18))
|
| 16 |
+
#loc124 = loc(callsite(#loc1 at #loc91))
|
| 17 |
+
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 2 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} {
|
| 18 |
+
tt.func public @triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2(%in_ptr0: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<i64> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr1: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %out_ptr2: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %out_ptr3: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("out_ptr3"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 loc("r0_numel"(#loc))) attributes {noinline = false} {
|
| 19 |
+
%cst = arith.constant dense<0> : tensor<1x32xi64, #blocked> loc(#loc1)
|
| 20 |
+
%cst_0 = arith.constant dense<0> : tensor<1x32xi64, #blocked1> loc(#loc1)
|
| 21 |
+
%c1_i64 = arith.constant 1 : i64 loc(#loc1)
|
| 22 |
+
%c127_i64 = arith.constant 127 : i64 loc(#loc1)
|
| 23 |
+
%cst_1 = arith.constant dense<true> : tensor<1x32xi1, #blocked1> loc(#loc1)
|
| 24 |
+
%c0_i32 = arith.constant 0 : i32 loc(#loc1)
|
| 25 |
+
%c32_i32 = arith.constant 32 : i32 loc(#loc1)
|
| 26 |
+
%cst_2 = arith.constant dense<0> : tensor<1x32xi32, #blocked1> loc(#loc1)
|
| 27 |
+
%c0_i64 = arith.constant 0 : i64 loc(#loc1)
|
| 28 |
+
%c128_i64 = arith.constant 128 : i64 loc(#loc1)
|
| 29 |
+
%cst_3 = arith.constant dense<1> : tensor<1x32xi32, #blocked> loc(#loc1)
|
| 30 |
+
%xoffset = tt.get_program_id x : i32 loc(#loc77)
|
| 31 |
+
%xmask = arith.cmpi slt, %xoffset, %c32_i32 : i32 loc(#loc78)
|
| 32 |
+
%r0_base = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc79)
|
| 33 |
+
%r0_base_4 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc79)
|
| 34 |
+
%r0_base_5 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<32xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x32xi32, #blocked> loc(#loc79)
|
| 35 |
+
%r0_base_6 = tt.expand_dims %r0_base_4 {axis = 0 : i32} : tensor<32xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x32xi32, #blocked1> loc(#loc79)
|
| 36 |
+
%r0_mask = tt.splat %r0_numel : i32 -> tensor<1x32xi32, #blocked1> loc(#loc80)
|
| 37 |
+
%tmp0 = arith.extsi %xoffset : i32 to i64 loc(#loc81)
|
| 38 |
+
%tmp0_7 = arith.muli %ks0, %tmp0 : i64 loc(#loc81)
|
| 39 |
+
%tmp0_8 = tt.splat %tmp0_7 : i64 -> tensor<1x32xi64, #blocked1> loc(#loc121)
|
| 40 |
+
%tmp0_9 = tt.splat %in_ptr0 : !tt.ptr<i32> -> tensor<1x32x!tt.ptr<i32>, #blocked1> loc(#loc83)
|
| 41 |
+
%tmp0_10 = tt.splat %xmask : i1 -> tensor<1x32xi1, #blocked1> loc(#loc122)
|
| 42 |
+
%_tmp3 = scf.for %r0_offset = %c0_i32 to %r0_numel step %c32_i32 iter_args(%_tmp3_31 = %cst_0) -> (tensor<1x32xi64, #blocked1>) : i32 {
|
| 43 |
+
%r0_index = tt.splat %r0_offset : i32 -> tensor<1x32xi32, #blocked1> loc(#loc86)
|
| 44 |
+
%r0_index_32 = arith.addi %r0_index, %r0_base_6 : tensor<1x32xi32, #blocked1> loc(#loc86)
|
| 45 |
+
%r0_mask_33 = arith.cmpi slt, %r0_index_32, %r0_mask : tensor<1x32xi32, #blocked1> loc(#loc80)
|
| 46 |
+
%tmp0_34 = arith.extsi %r0_index_32 : tensor<1x32xi32, #blocked1> to tensor<1x32xi64, #blocked1> loc(#loc82)
|
| 47 |
+
%tmp0_35 = arith.addi %tmp0_34, %tmp0_8 : tensor<1x32xi64, #blocked1> loc(#loc82)
|
| 48 |
+
%tmp0_36 = tt.addptr %tmp0_9, %tmp0_35 : tensor<1x32x!tt.ptr<i32>, #blocked1>, tensor<1x32xi64, #blocked1> loc(#loc83)
|
| 49 |
+
%tmp0_37 = arith.andi %r0_mask_33, %tmp0_10 : tensor<1x32xi1, #blocked1> loc(#loc84)
|
| 50 |
+
%tmp0_38 = tt.load %tmp0_36, %tmp0_37, %cst_2 evictionPolicy = evict_first : tensor<1x32x!tt.ptr<i32>, #blocked1> loc(#loc87)
|
| 51 |
+
%tmp1 = arith.extsi %tmp0_38 : tensor<1x32xi32, #blocked1> to tensor<1x32xi64, #blocked1> loc(#loc88)
|
| 52 |
+
%tmp4 = arith.addi %_tmp3_31, %tmp1 : tensor<1x32xi64, #blocked1> loc(#loc89)
|
| 53 |
+
%_tmp3_39 = arith.select %tmp0_37, %tmp4, %_tmp3_31 : tensor<1x32xi1, #blocked1>, tensor<1x32xi64, #blocked1> loc(#loc90)
|
| 54 |
+
scf.yield %_tmp3_39 : tensor<1x32xi64, #blocked1> loc(#loc16)
|
| 55 |
+
} loc(#loc85)
|
| 56 |
+
%tmp3 = "tt.reduce"(%_tmp3) <{axis = 1 : i32}> ({
|
| 57 |
+
^bb0(%tmp3_31: i64 loc(callsite(#loc1 at #loc91)), %tmp3_32: i64 loc(callsite(#loc1 at #loc91))):
|
| 58 |
+
%tmp3_33 = arith.addi %tmp3_31, %tmp3_32 : i64 loc(#loc133)
|
| 59 |
+
tt.reduce.return %tmp3_33 : i64 loc(#loc123)
|
| 60 |
+
}) : (tensor<1x32xi64, #blocked1>) -> tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc123)
|
| 61 |
+
%0 = ttg.convert_layout %tmp3 : tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc20)
|
| 62 |
+
%tmp3_11 = tt.expand_dims %0 {axis = 1 : i32} : tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1x1xi64, #blocked> loc(#loc92)
|
| 63 |
+
%tmp3_12 = tt.expand_dims %tmp3 {axis = 1 : i32} : tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<1x1xi64, #blocked1> loc(#loc92)
|
| 64 |
+
%tmp5 = arith.trunci %tmp3_11 : tensor<1x1xi64, #blocked> to tensor<1x1xi32, #blocked> loc(#loc93)
|
| 65 |
+
%tmp5_13 = arith.trunci %tmp3_12 : tensor<1x1xi64, #blocked1> to tensor<1x1xi32, #blocked1> loc(#loc93)
|
| 66 |
+
%1 = tt.addptr %out_ptr1, %xoffset : !tt.ptr<i32>, i32 loc(#loc23)
|
| 67 |
+
%2 = tt.splat %1 : !tt.ptr<i32> -> tensor<1x1x!tt.ptr<i32>, #blocked> loc(#loc24)
|
| 68 |
+
%3 = tt.splat %xmask : i1 -> tensor<1x1xi1, #blocked> loc(#loc24)
|
| 69 |
+
tt.store %2, %tmp5, %3 : tensor<1x1x!tt.ptr<i32>, #blocked> loc(#loc24)
|
| 70 |
+
%r0_mask_14 = tt.splat %r0_numel : i32 -> tensor<1x32xi32, #blocked> loc(#loc94)
|
| 71 |
+
%tmp6 = arith.cmpi sle, %ks0, %c1_i64 : i64 loc(#loc95)
|
| 72 |
+
%tmp6_15 = arith.cmpi sgt, %ks0, %c1_i64 : i64 loc(#loc96)
|
| 73 |
+
%tmp6_16 = arith.extui %tmp6_15 : i1 to i64 loc(#loc97)
|
| 74 |
+
%tmp6_17 = arith.muli %ks0, %tmp6_16 : i64 loc(#loc97)
|
| 75 |
+
%tmp6_18 = arith.extui %tmp6 : i1 to i64 loc(#loc125)
|
| 76 |
+
%tmp6_19 = arith.addi %tmp6_18, %tmp6_17 : i64 loc(#loc98)
|
| 77 |
+
%tmp6_20 = arith.muli %tmp0, %tmp6_19 : i64 loc(#loc100)
|
| 78 |
+
%tmp6_21 = tt.splat %tmp6_20 : i64 -> tensor<1x32xi64, #blocked> loc(#loc126)
|
| 79 |
+
%tmp6_22 = tt.splat %tmp6_20 : i64 -> tensor<1x32xi64, #blocked1> loc(#loc126)
|
| 80 |
+
%tmp6_23 = tt.splat %in_ptr1 : !tt.ptr<i64> -> tensor<1x32x!tt.ptr<i64>, #blocked> loc(#loc102)
|
| 81 |
+
%tmp6_24 = tt.splat %in_ptr1 : !tt.ptr<i64> -> tensor<1x32x!tt.ptr<i64>, #blocked1> loc(#loc102)
|
| 82 |
+
%tmp6_25 = tt.splat %xmask : i1 -> tensor<1x32xi1, #blocked> loc(#loc127)
|
| 83 |
+
%tmp9 = tt.broadcast %tmp5 : tensor<1x1xi32, #blocked> -> tensor<1x32xi32, #blocked> loc(#loc104)
|
| 84 |
+
%tmp9_26 = tt.broadcast %tmp5_13 : tensor<1x1xi32, #blocked1> -> tensor<1x32xi32, #blocked1> loc(#loc104)
|
| 85 |
+
%tmp11 = tt.splat %ks0 : i64 -> tensor<1x32xi64, #blocked> loc(#loc105)
|
| 86 |
+
%tmp11_27 = tt.splat %ks0 : i64 -> tensor<1x32xi64, #blocked1> loc(#loc105)
|
| 87 |
+
%tmp12 = arith.addi %ks0, %c1_i64 : i64 loc(#loc106)
|
| 88 |
+
%tmp13 = tt.splat %tmp12 : i64 -> tensor<1x32xi64, #blocked> loc(#loc107)
|
| 89 |
+
%tmp13_28 = tt.splat %tmp12 : i64 -> tensor<1x32xi64, #blocked1> loc(#loc107)
|
| 90 |
+
%4 = arith.addi %ks1, %c127_i64 : i64 loc(#loc39)
|
| 91 |
+
%quot = arith.divsi %4, %c128_i64 : i64 loc(#loc128)
|
| 92 |
+
%remainder = arith.remsi %4, %c128_i64 : i64 loc(#loc129)
|
| 93 |
+
%fixed = arith.cmpi ne, %remainder, %c0_i64 : i64 loc(#loc130)
|
| 94 |
+
%fixed_29 = arith.subi %quot, %c1_i64 : i64 loc(#loc131)
|
| 95 |
+
%fixed_30 = arith.select %fixed, %fixed_29, %quot : i64 loc(#loc132)
|
| 96 |
+
%5 = arith.cmpi slt, %4, %c0_i64 : i64 loc(#loc113)
|
| 97 |
+
%6 = arith.select %5, %fixed_30, %quot : i64 loc(#loc114)
|
| 98 |
+
%7 = arith.addi %6, %c1_i64 : i64 loc(#loc48)
|
| 99 |
+
%8 = tt.splat %7 : i64 -> tensor<1x32xi64, #blocked1> loc(#loc49)
|
| 100 |
+
%9 = tt.splat %out_ptr2 : !tt.ptr<i32> -> tensor<1x32x!tt.ptr<i32>, #blocked1> loc(#loc50)
|
| 101 |
+
%10 = tt.splat %tmp0 : i64 -> tensor<1x32xi64, #blocked> loc(#loc51)
|
| 102 |
+
%11 = tt.splat %tmp0_7 : i64 -> tensor<1x32xi64, #blocked> loc(#loc115)
|
| 103 |
+
%12 = tt.splat %out_ptr3 : !tt.ptr<i32> -> tensor<1x32x!tt.ptr<i32>, #blocked> loc(#loc54)
|
| 104 |
+
scf.for %r0_offset = %c0_i32 to %r0_numel step %c32_i32 : i32 {
|
| 105 |
+
%r0_index = tt.splat %r0_offset : i32 -> tensor<1x32xi32, #blocked> loc(#loc116)
|
| 106 |
+
%r0_index_31 = tt.splat %r0_offset : i32 -> tensor<1x32xi32, #blocked1> loc(#loc116)
|
| 107 |
+
%r0_index_32 = arith.addi %r0_index, %r0_base_5 : tensor<1x32xi32, #blocked> loc(#loc116)
|
| 108 |
+
%r0_index_33 = arith.addi %r0_index_31, %r0_base_6 : tensor<1x32xi32, #blocked1> loc(#loc116)
|
| 109 |
+
%r0_mask_34 = arith.cmpi slt, %r0_index_32, %r0_mask_14 : tensor<1x32xi32, #blocked> loc(#loc94)
|
| 110 |
+
%r0_mask_35 = arith.cmpi slt, %r0_index_33, %r0_mask : tensor<1x32xi32, #blocked1> loc(#loc94)
|
| 111 |
+
%tmp6_36 = arith.extsi %r0_index_32 : tensor<1x32xi32, #blocked> to tensor<1x32xi64, #blocked> loc(#loc101)
|
| 112 |
+
%tmp6_37 = arith.extsi %r0_index_33 : tensor<1x32xi32, #blocked1> to tensor<1x32xi64, #blocked1> loc(#loc101)
|
| 113 |
+
%tmp6_38 = arith.addi %tmp6_36, %tmp6_21 : tensor<1x32xi64, #blocked> loc(#loc101)
|
| 114 |
+
%tmp6_39 = arith.addi %tmp6_37, %tmp6_22 : tensor<1x32xi64, #blocked1> loc(#loc101)
|
| 115 |
+
%tmp6_40 = tt.addptr %tmp6_23, %tmp6_38 : tensor<1x32x!tt.ptr<i64>, #blocked>, tensor<1x32xi64, #blocked> loc(#loc102)
|
| 116 |
+
%tmp6_41 = tt.addptr %tmp6_24, %tmp6_39 : tensor<1x32x!tt.ptr<i64>, #blocked1>, tensor<1x32xi64, #blocked1> loc(#loc102)
|
| 117 |
+
%tmp6_42 = arith.andi %r0_mask_34, %tmp6_25 : tensor<1x32xi1, #blocked> loc(#loc103)
|
| 118 |
+
%tmp6_43 = arith.andi %r0_mask_35, %tmp0_10 : tensor<1x32xi1, #blocked1> loc(#loc103)
|
| 119 |
+
%tmp6_44 = tt.load %tmp6_40, %tmp6_42, %cst evictionPolicy = evict_first : tensor<1x32x!tt.ptr<i64>, #blocked> loc(#loc117)
|
| 120 |
+
%tmp6_45 = tt.load %tmp6_41, %tmp6_43, %cst_0 evictionPolicy = evict_first : tensor<1x32x!tt.ptr<i64>, #blocked1> loc(#loc117)
|
| 121 |
+
%tmp7 = arith.trunci %tmp6_44 : tensor<1x32xi64, #blocked> to tensor<1x32xi32, #blocked> loc(#loc118)
|
| 122 |
+
%tmp7_46 = arith.trunci %tmp6_45 : tensor<1x32xi64, #blocked1> to tensor<1x32xi32, #blocked1> loc(#loc118)
|
| 123 |
+
%tmp9_47 = arith.cmpi slt, %r0_index_32, %tmp9 : tensor<1x32xi32, #blocked> loc(#loc104)
|
| 124 |
+
%tmp9_48 = arith.cmpi slt, %r0_index_33, %tmp9_26 : tensor<1x32xi32, #blocked1> loc(#loc104)
|
| 125 |
+
%tmp11_49 = arith.extsi %tmp7 : tensor<1x32xi32, #blocked> to tensor<1x32xi64, #blocked> loc(#loc105)
|
| 126 |
+
%tmp11_50 = arith.extsi %tmp7_46 : tensor<1x32xi32, #blocked1> to tensor<1x32xi64, #blocked1> loc(#loc105)
|
| 127 |
+
%tmp11_51 = arith.select %tmp9_47, %tmp11_49, %tmp11 : tensor<1x32xi1, #blocked>, tensor<1x32xi64, #blocked> loc(#loc105)
|
| 128 |
+
%tmp11_52 = arith.select %tmp9_48, %tmp11_50, %tmp11_27 : tensor<1x32xi1, #blocked1>, tensor<1x32xi64, #blocked1> loc(#loc105)
|
| 129 |
+
%tmp13_53 = arith.addi %tmp11_51, %tmp13 : tensor<1x32xi64, #blocked> loc(#loc107)
|
| 130 |
+
%tmp13_54 = arith.addi %tmp11_52, %tmp13_28 : tensor<1x32xi64, #blocked1> loc(#loc107)
|
| 131 |
+
%tmp14 = arith.cmpi slt, %tmp11_51, %cst : tensor<1x32xi64, #blocked> loc(#loc119)
|
| 132 |
+
%tmp14_55 = arith.cmpi slt, %tmp11_52, %cst_0 : tensor<1x32xi64, #blocked1> loc(#loc119)
|
| 133 |
+
%tmp15 = arith.select %tmp14, %tmp13_53, %tmp11_51 : tensor<1x32xi1, #blocked>, tensor<1x32xi64, #blocked> loc(#loc120)
|
| 134 |
+
%tmp15_56 = arith.select %tmp14_55, %tmp13_54, %tmp11_52 : tensor<1x32xi1, #blocked1>, tensor<1x32xi64, #blocked1> loc(#loc120)
|
| 135 |
+
%13 = arith.cmpi sge, %tmp15_56, %cst_0 : tensor<1x32xi64, #blocked1> loc(#loc61)
|
| 136 |
+
%14 = arith.cmpi slt, %tmp15_56, %8 : tensor<1x32xi64, #blocked1> loc(#loc49)
|
| 137 |
+
%15 = arith.andi %13, %14 : tensor<1x32xi1, #blocked1> loc(#loc62)
|
| 138 |
+
%16 = arith.xori %tmp6_43, %cst_1 : tensor<1x32xi1, #blocked1> loc(#loc63)
|
| 139 |
+
%17 = arith.ori %15, %16 : tensor<1x32xi1, #blocked1> loc(#loc64)
|
| 140 |
+
tt.assert %17, "index out of bounds: 0 <= tmp15 < 1 + (triton_helpers.div_floor_integer(127 + ks1, 128))" : tensor<1x32xi1, #blocked1> loc(#loc65)
|
| 141 |
+
%18 = tt.addptr %9, %tmp6_39 : tensor<1x32x!tt.ptr<i32>, #blocked1>, tensor<1x32xi64, #blocked1> loc(#loc50)
|
| 142 |
+
tt.store %18, %tmp7_46, %tmp6_43 : tensor<1x32x!tt.ptr<i32>, #blocked1> loc(#loc66)
|
| 143 |
+
%19 = arith.addi %tmp15, %10 : tensor<1x32xi64, #blocked> loc(#loc51)
|
| 144 |
+
%20 = arith.addi %19, %11 : tensor<1x32xi64, #blocked> loc(#loc52)
|
| 145 |
+
%21 = tt.addptr %12, %20 : tensor<1x32x!tt.ptr<i32>, #blocked>, tensor<1x32xi64, #blocked> loc(#loc54)
|
| 146 |
+
tt.store %21, %cst_3, %tmp6_42 : tensor<1x32x!tt.ptr<i32>, #blocked> loc(#loc20)
|
| 147 |
+
} loc(#loc55)
|
| 148 |
+
tt.return loc(#loc67)
|
| 149 |
+
} loc(#loc)
|
| 150 |
+
} loc(#loc)
|
| 151 |
+
#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":22:28)
|
| 152 |
+
#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":24:21)
|
| 153 |
+
#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":25:37)
|
| 154 |
+
#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":31:29)
|
| 155 |
+
#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":35:45)
|
| 156 |
+
#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":35:41)
|
| 157 |
+
#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":35:34)
|
| 158 |
+
#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":35:60)
|
| 159 |
+
#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":29:40)
|
| 160 |
+
#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":30:31)
|
| 161 |
+
#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":35:50)
|
| 162 |
+
#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":36:23)
|
| 163 |
+
#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":38:23)
|
| 164 |
+
#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":39:48)
|
| 165 |
+
#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":39:8)
|
| 166 |
+
#loc17 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36)
|
| 167 |
+
#loc19 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15)
|
| 168 |
+
#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":62:95)
|
| 169 |
+
#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":40:28)
|
| 170 |
+
#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":41:19)
|
| 171 |
+
#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":42:25)
|
| 172 |
+
#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":42:36)
|
| 173 |
+
#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":45:29)
|
| 174 |
+
#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":49:60)
|
| 175 |
+
#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":49:86)
|
| 176 |
+
#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":49:77)
|
| 177 |
+
#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":49:68)
|
| 178 |
+
#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":49:52)
|
| 179 |
+
#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":49:45)
|
| 180 |
+
#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":49:41)
|
| 181 |
+
#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":49:34)
|
| 182 |
+
#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":49:103)
|
| 183 |
+
#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":52:22)
|
| 184 |
+
#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":54:37)
|
| 185 |
+
#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":55:20)
|
| 186 |
+
#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":56:24)
|
| 187 |
+
#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":59:94)
|
| 188 |
+
#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":72:16)
|
| 189 |
+
#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":59:100)
|
| 190 |
+
#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":73:20)
|
| 191 |
+
#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:34)
|
| 192 |
+
#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:44)
|
| 193 |
+
#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:47)
|
| 194 |
+
#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:25)
|
| 195 |
+
#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:47)
|
| 196 |
+
#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":59:55)
|
| 197 |
+
#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":59:50)
|
| 198 |
+
#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":61:29)
|
| 199 |
+
#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":62:53)
|
| 200 |
+
#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":62:58)
|
| 201 |
+
#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":62:62)
|
| 202 |
+
#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":62:29)
|
| 203 |
+
#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":43:40)
|
| 204 |
+
#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":44:31)
|
| 205 |
+
#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":49:93)
|
| 206 |
+
#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":50:23)
|
| 207 |
+
#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":57:24)
|
| 208 |
+
#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":58:39)
|
| 209 |
+
#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":59:32)
|
| 210 |
+
#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":59:42)
|
| 211 |
+
#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":59:112)
|
| 212 |
+
#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":59:110)
|
| 213 |
+
#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":59:130)
|
| 214 |
+
#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":61:94)
|
| 215 |
+
#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":43:4)
|
| 216 |
+
#loc77 = loc("xoffset"(#loc2))
|
| 217 |
+
#loc78 = loc("xmask"(#loc3))
|
| 218 |
+
#loc79 = loc("r0_base"(#loc4))
|
| 219 |
+
#loc80 = loc("r0_mask"(#loc5))
|
| 220 |
+
#loc81 = loc("tmp0"(#loc6))
|
| 221 |
+
#loc82 = loc("tmp0"(#loc7))
|
| 222 |
+
#loc83 = loc("tmp0"(#loc8))
|
| 223 |
+
#loc84 = loc("tmp0"(#loc9))
|
| 224 |
+
#loc85 = loc("_tmp3"(#loc10))
|
| 225 |
+
#loc86 = loc("r0_index"(#loc11))
|
| 226 |
+
#loc87 = loc("tmp0"(#loc12))
|
| 227 |
+
#loc88 = loc("tmp1"(#loc13))
|
| 228 |
+
#loc89 = loc("tmp4"(#loc14))
|
| 229 |
+
#loc90 = loc("_tmp3"(#loc15))
|
| 230 |
+
#loc92 = loc("tmp3"(#loc21))
|
| 231 |
+
#loc93 = loc("tmp5"(#loc22))
|
| 232 |
+
#loc94 = loc("r0_mask"(#loc25))
|
| 233 |
+
#loc95 = loc("tmp6"(#loc26))
|
| 234 |
+
#loc96 = loc("tmp6"(#loc27))
|
| 235 |
+
#loc97 = loc("tmp6"(#loc28))
|
| 236 |
+
#loc98 = loc("tmp6"(#loc29))
|
| 237 |
+
#loc99 = loc("tmp6"(#loc30))
|
| 238 |
+
#loc100 = loc("tmp6"(#loc31))
|
| 239 |
+
#loc101 = loc("tmp6"(#loc32))
|
| 240 |
+
#loc102 = loc("tmp6"(#loc33))
|
| 241 |
+
#loc103 = loc("tmp6"(#loc34))
|
| 242 |
+
#loc104 = loc("tmp9"(#loc35))
|
| 243 |
+
#loc105 = loc("tmp11"(#loc36))
|
| 244 |
+
#loc106 = loc("tmp12"(#loc37))
|
| 245 |
+
#loc107 = loc("tmp13"(#loc38))
|
| 246 |
+
#loc108 = loc("quot"(#loc40))
|
| 247 |
+
#loc109 = loc("remainder"(#loc42))
|
| 248 |
+
#loc110 = loc("fixed"(#loc43))
|
| 249 |
+
#loc111 = loc("fixed"(#loc44))
|
| 250 |
+
#loc112 = loc("fixed"(#loc45))
|
| 251 |
+
#loc113 = loc(callsite(#loc46 at #loc41))
|
| 252 |
+
#loc114 = loc(callsite(#loc47 at #loc41))
|
| 253 |
+
#loc115 = loc(fused[#loc52, #loc53])
|
| 254 |
+
#loc116 = loc("r0_index"(#loc56))
|
| 255 |
+
#loc117 = loc("tmp6"(#loc57))
|
| 256 |
+
#loc118 = loc("tmp7"(#loc58))
|
| 257 |
+
#loc119 = loc("tmp14"(#loc59))
|
| 258 |
+
#loc120 = loc("tmp15"(#loc60))
|
| 259 |
+
#loc121 = loc(fused[#loc82, #loc81])
|
| 260 |
+
#loc122 = loc(fused[#loc84, #loc78])
|
| 261 |
+
#loc123 = loc(callsite(#loc17 at #loc91))
|
| 262 |
+
#loc125 = loc(fused[#loc98, #loc99])
|
| 263 |
+
#loc126 = loc(fused[#loc101, #loc100])
|
| 264 |
+
#loc127 = loc(fused[#loc103, #loc78])
|
| 265 |
+
#loc128 = loc(callsite(#loc108 at #loc41))
|
| 266 |
+
#loc129 = loc(callsite(#loc109 at #loc41))
|
| 267 |
+
#loc130 = loc(callsite(#loc110 at #loc41))
|
| 268 |
+
#loc131 = loc(callsite(#loc111 at #loc41))
|
| 269 |
+
#loc132 = loc(callsite(#loc112 at #loc41))
|
| 270 |
+
#loc133 = loc(callsite(#loc19 at #loc123))
|
SpecForge-ext/cache/compiled_kernels/triton/7/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ttir
ADDED
|
@@ -0,0 +1,246 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":18:0)
|
| 2 |
+
#loc1 = loc(unknown)
|
| 3 |
+
#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":40:25)
|
| 4 |
+
#loc69 = loc("in_ptr0"(#loc))
|
| 5 |
+
#loc70 = loc("in_ptr1"(#loc))
|
| 6 |
+
#loc71 = loc("out_ptr1"(#loc))
|
| 7 |
+
#loc72 = loc("out_ptr2"(#loc))
|
| 8 |
+
#loc73 = loc("out_ptr3"(#loc))
|
| 9 |
+
#loc74 = loc("ks0"(#loc))
|
| 10 |
+
#loc75 = loc("ks1"(#loc))
|
| 11 |
+
#loc76 = loc("xnumel"(#loc))
|
| 12 |
+
#loc77 = loc("r0_numel"(#loc))
|
| 13 |
+
#loc93 = loc("tmp3"(#loc19))
|
| 14 |
+
#loc126 = loc(callsite(#loc1 at #loc93))
|
| 15 |
+
module {
|
| 16 |
+
tt.func public @triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2(%in_ptr0: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<i64> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr1: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %out_ptr2: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %out_ptr3: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("out_ptr3"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 loc("r0_numel"(#loc))) attributes {noinline = false} {
|
| 17 |
+
%c128_i64 = arith.constant 128 : i64 loc(#loc1)
|
| 18 |
+
%c0_i64 = arith.constant 0 : i64 loc(#loc1)
|
| 19 |
+
%cst = arith.constant dense<0> : tensor<1x32xi32> loc(#loc1)
|
| 20 |
+
%c32_i32 = arith.constant 32 : i32 loc(#loc1)
|
| 21 |
+
%c0_i32 = arith.constant 0 : i32 loc(#loc1)
|
| 22 |
+
%cst_0 = arith.constant dense<1> : tensor<1x32xi32> loc(#loc1)
|
| 23 |
+
%cst_1 = arith.constant dense<true> : tensor<1x32xi1> loc(#loc1)
|
| 24 |
+
%c127_i64 = arith.constant 127 : i64 loc(#loc1)
|
| 25 |
+
%c1_i64 = arith.constant 1 : i64 loc(#loc1)
|
| 26 |
+
%cst_2 = arith.constant dense<0> : tensor<1x32xi64> loc(#loc1)
|
| 27 |
+
%xoffset = tt.get_program_id x : i32 loc(#loc78)
|
| 28 |
+
%xmask = arith.cmpi slt, %xoffset, %c32_i32 : i32 loc(#loc79)
|
| 29 |
+
%xmask_3 = tt.splat %xmask : i1 -> tensor<1x1xi1> loc(#loc79)
|
| 30 |
+
%r0_base = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc80)
|
| 31 |
+
%r0_base_4 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<32xi32> -> tensor<1x32xi32> loc(#loc81)
|
| 32 |
+
%_tmp3 = scf.for %r0_offset = %c0_i32 to %r0_numel step %c32_i32 iter_args(%_tmp3_6 = %cst_2) -> (tensor<1x32xi64>) : i32 {
|
| 33 |
+
%r0_index = tt.splat %r0_offset : i32 -> tensor<1x32xi32> loc(#loc83)
|
| 34 |
+
%r0_index_7 = arith.addi %r0_index, %r0_base_4 : tensor<1x32xi32> loc(#loc83)
|
| 35 |
+
%r0_mask = tt.splat %r0_numel : i32 -> tensor<1x32xi32> loc(#loc84)
|
| 36 |
+
%r0_mask_8 = arith.cmpi slt, %r0_index_7, %r0_mask : tensor<1x32xi32> loc(#loc84)
|
| 37 |
+
%tmp0 = arith.extsi %xoffset : i32 to i64 loc(#loc85)
|
| 38 |
+
%tmp0_9 = arith.muli %ks0, %tmp0 : i64 loc(#loc85)
|
| 39 |
+
%tmp0_10 = arith.extsi %r0_index_7 : tensor<1x32xi32> to tensor<1x32xi64> loc(#loc86)
|
| 40 |
+
%tmp0_11 = tt.splat %tmp0_9 : i64 -> tensor<1x32xi64> loc(#loc123)
|
| 41 |
+
%tmp0_12 = arith.addi %tmp0_10, %tmp0_11 : tensor<1x32xi64> loc(#loc86)
|
| 42 |
+
%tmp0_13 = tt.splat %in_ptr0 : !tt.ptr<i32> -> tensor<1x32x!tt.ptr<i32>> loc(#loc87)
|
| 43 |
+
%tmp0_14 = tt.addptr %tmp0_13, %tmp0_12 : tensor<1x32x!tt.ptr<i32>>, tensor<1x32xi64> loc(#loc87)
|
| 44 |
+
%tmp0_15 = tt.splat %xmask : i1 -> tensor<1x32xi1> loc(#loc124)
|
| 45 |
+
%tmp0_16 = arith.andi %r0_mask_8, %tmp0_15 : tensor<1x32xi1> loc(#loc88)
|
| 46 |
+
%tmp0_17 = tt.load %tmp0_14, %tmp0_16, %cst evictionPolicy = evict_first : tensor<1x32x!tt.ptr<i32>> loc(#loc89)
|
| 47 |
+
%tmp1 = arith.extsi %tmp0_17 : tensor<1x32xi32> to tensor<1x32xi64> loc(#loc90)
|
| 48 |
+
%tmp4 = arith.addi %_tmp3_6, %tmp1 : tensor<1x32xi64> loc(#loc91)
|
| 49 |
+
%_tmp3_18 = arith.select %tmp0_16, %tmp4, %_tmp3_6 : tensor<1x32xi1>, tensor<1x32xi64> loc(#loc92)
|
| 50 |
+
scf.yield %_tmp3_18 : tensor<1x32xi64> loc(#loc17)
|
| 51 |
+
} loc(#loc82)
|
| 52 |
+
%tmp3 = "tt.reduce"(%_tmp3) <{axis = 1 : i32}> ({
|
| 53 |
+
^bb0(%tmp3_6: i64 loc(callsite(#loc1 at #loc93)), %tmp3_7: i64 loc(callsite(#loc1 at #loc93))):
|
| 54 |
+
%tmp3_8 = arith.addi %tmp3_6, %tmp3_7 : i64 loc(#loc135)
|
| 55 |
+
tt.reduce.return %tmp3_8 : i64 loc(#loc125)
|
| 56 |
+
}) : (tensor<1x32xi64>) -> tensor<1xi64> loc(#loc125)
|
| 57 |
+
%tmp3_5 = tt.expand_dims %tmp3 {axis = 1 : i32} : tensor<1xi64> -> tensor<1x1xi64> loc(#loc94)
|
| 58 |
+
%tmp5 = arith.trunci %tmp3_5 : tensor<1x1xi64> to tensor<1x1xi32> loc(#loc95)
|
| 59 |
+
%0 = tt.addptr %out_ptr1, %xoffset : !tt.ptr<i32>, i32 loc(#loc23)
|
| 60 |
+
%1 = tt.splat %0 : !tt.ptr<i32> -> tensor<1x1x!tt.ptr<i32>> loc(#loc23)
|
| 61 |
+
tt.store %1, %tmp5, %xmask_3 : tensor<1x1x!tt.ptr<i32>> loc(#loc24)
|
| 62 |
+
scf.for %r0_offset = %c0_i32 to %r0_numel step %c32_i32 : i32 {
|
| 63 |
+
%r0_index = tt.splat %r0_offset : i32 -> tensor<1x32xi32> loc(#loc96)
|
| 64 |
+
%r0_index_6 = arith.addi %r0_index, %r0_base_4 : tensor<1x32xi32> loc(#loc96)
|
| 65 |
+
%r0_mask = tt.splat %r0_numel : i32 -> tensor<1x32xi32> loc(#loc97)
|
| 66 |
+
%r0_mask_7 = arith.cmpi slt, %r0_index_6, %r0_mask : tensor<1x32xi32> loc(#loc97)
|
| 67 |
+
%tmp6 = arith.cmpi sle, %ks0, %c1_i64 : i64 loc(#loc98)
|
| 68 |
+
%tmp6_8 = arith.cmpi sgt, %ks0, %c1_i64 : i64 loc(#loc99)
|
| 69 |
+
%tmp6_9 = arith.extui %tmp6_8 : i1 to i64 loc(#loc100)
|
| 70 |
+
%tmp6_10 = arith.muli %ks0, %tmp6_9 : i64 loc(#loc100)
|
| 71 |
+
%tmp6_11 = arith.extui %tmp6 : i1 to i64 loc(#loc127)
|
| 72 |
+
%tmp6_12 = arith.addi %tmp6_11, %tmp6_10 : i64 loc(#loc101)
|
| 73 |
+
%tmp6_13 = arith.extsi %xoffset : i32 to i64 loc(#loc103)
|
| 74 |
+
%tmp6_14 = arith.muli %tmp6_13, %tmp6_12 : i64 loc(#loc103)
|
| 75 |
+
%tmp6_15 = arith.extsi %r0_index_6 : tensor<1x32xi32> to tensor<1x32xi64> loc(#loc104)
|
| 76 |
+
%tmp6_16 = tt.splat %tmp6_14 : i64 -> tensor<1x32xi64> loc(#loc128)
|
| 77 |
+
%tmp6_17 = arith.addi %tmp6_15, %tmp6_16 : tensor<1x32xi64> loc(#loc104)
|
| 78 |
+
%tmp6_18 = tt.splat %in_ptr1 : !tt.ptr<i64> -> tensor<1x32x!tt.ptr<i64>> loc(#loc105)
|
| 79 |
+
%tmp6_19 = tt.addptr %tmp6_18, %tmp6_17 : tensor<1x32x!tt.ptr<i64>>, tensor<1x32xi64> loc(#loc105)
|
| 80 |
+
%tmp6_20 = tt.splat %xmask : i1 -> tensor<1x32xi1> loc(#loc129)
|
| 81 |
+
%tmp6_21 = arith.andi %r0_mask_7, %tmp6_20 : tensor<1x32xi1> loc(#loc106)
|
| 82 |
+
%tmp6_22 = tt.load %tmp6_19, %tmp6_21, %cst_2 evictionPolicy = evict_first : tensor<1x32x!tt.ptr<i64>> loc(#loc107)
|
| 83 |
+
%tmp7 = arith.trunci %tmp6_22 : tensor<1x32xi64> to tensor<1x32xi32> loc(#loc108)
|
| 84 |
+
%tmp9 = tt.broadcast %tmp5 : tensor<1x1xi32> -> tensor<1x32xi32> loc(#loc109)
|
| 85 |
+
%tmp9_23 = arith.cmpi slt, %r0_index_6, %tmp9 : tensor<1x32xi32> loc(#loc109)
|
| 86 |
+
%tmp11 = arith.extsi %tmp7 : tensor<1x32xi32> to tensor<1x32xi64> loc(#loc110)
|
| 87 |
+
%tmp11_24 = tt.splat %ks0 : i64 -> tensor<1x32xi64> loc(#loc110)
|
| 88 |
+
%tmp11_25 = arith.select %tmp9_23, %tmp11, %tmp11_24 : tensor<1x32xi1>, tensor<1x32xi64> loc(#loc110)
|
| 89 |
+
%tmp12 = arith.addi %ks0, %c1_i64 : i64 loc(#loc111)
|
| 90 |
+
%tmp13 = tt.splat %tmp12 : i64 -> tensor<1x32xi64> loc(#loc112)
|
| 91 |
+
%tmp13_26 = arith.addi %tmp11_25, %tmp13 : tensor<1x32xi64> loc(#loc112)
|
| 92 |
+
%tmp14 = arith.cmpi slt, %tmp11_25, %cst_2 : tensor<1x32xi64> loc(#loc113)
|
| 93 |
+
%tmp15 = arith.select %tmp14, %tmp13_26, %tmp11_25 : tensor<1x32xi1>, tensor<1x32xi64> loc(#loc114)
|
| 94 |
+
%2 = arith.cmpi sge, %tmp15, %cst_2 : tensor<1x32xi64> loc(#loc45)
|
| 95 |
+
%3 = arith.addi %ks1, %c127_i64 : i64 loc(#loc46)
|
| 96 |
+
%quot = arith.divsi %3, %c128_i64 : i64 loc(#loc130)
|
| 97 |
+
%remainder = arith.remsi %3, %c128_i64 : i64 loc(#loc131)
|
| 98 |
+
%fixed = arith.cmpi ne, %remainder, %c0_i64 : i64 loc(#loc132)
|
| 99 |
+
%fixed_27 = arith.subi %quot, %c1_i64 : i64 loc(#loc133)
|
| 100 |
+
%fixed_28 = arith.select %fixed, %fixed_27, %quot : i64 loc(#loc134)
|
| 101 |
+
%4 = arith.cmpi slt, %3, %c0_i64 : i64 loc(#loc120)
|
| 102 |
+
%5 = arith.select %4, %fixed_28, %quot : i64 loc(#loc121)
|
| 103 |
+
%6 = arith.addi %5, %c1_i64 : i64 loc(#loc55)
|
| 104 |
+
%7 = tt.splat %6 : i64 -> tensor<1x32xi64> loc(#loc56)
|
| 105 |
+
%8 = arith.cmpi slt, %tmp15, %7 : tensor<1x32xi64> loc(#loc56)
|
| 106 |
+
%9 = arith.andi %2, %8 : tensor<1x32xi1> loc(#loc57)
|
| 107 |
+
%10 = arith.xori %tmp6_21, %cst_1 : tensor<1x32xi1> loc(#loc58)
|
| 108 |
+
%11 = arith.ori %9, %10 : tensor<1x32xi1> loc(#loc59)
|
| 109 |
+
tt.assert %11, "index out of bounds: 0 <= tmp15 < 1 + (triton_helpers.div_floor_integer(127 + ks1, 128))" : tensor<1x32xi1> loc(#loc60)
|
| 110 |
+
%12 = tt.splat %out_ptr2 : !tt.ptr<i32> -> tensor<1x32x!tt.ptr<i32>> loc(#loc61)
|
| 111 |
+
%13 = tt.addptr %12, %tmp6_17 : tensor<1x32x!tt.ptr<i32>>, tensor<1x32xi64> loc(#loc61)
|
| 112 |
+
tt.store %13, %tmp7, %tmp6_21 : tensor<1x32x!tt.ptr<i32>> loc(#loc62)
|
| 113 |
+
%14 = tt.splat %tmp6_13 : i64 -> tensor<1x32xi64> loc(#loc63)
|
| 114 |
+
%15 = arith.addi %tmp15, %14 : tensor<1x32xi64> loc(#loc63)
|
| 115 |
+
%16 = arith.muli %ks0, %tmp6_13 : i64 loc(#loc64)
|
| 116 |
+
%17 = tt.splat %16 : i64 -> tensor<1x32xi64> loc(#loc122)
|
| 117 |
+
%18 = arith.addi %15, %17 : tensor<1x32xi64> loc(#loc65)
|
| 118 |
+
%19 = tt.splat %out_ptr3 : !tt.ptr<i32> -> tensor<1x32x!tt.ptr<i32>> loc(#loc66)
|
| 119 |
+
%20 = tt.addptr %19, %18 : tensor<1x32x!tt.ptr<i32>>, tensor<1x32xi64> loc(#loc66)
|
| 120 |
+
tt.store %20, %cst_0, %tmp6_21 : tensor<1x32x!tt.ptr<i32>> loc(#loc67)
|
| 121 |
+
} loc(#loc25)
|
| 122 |
+
tt.return loc(#loc68)
|
| 123 |
+
} loc(#loc)
|
| 124 |
+
} loc(#loc)
|
| 125 |
+
#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":22:28)
|
| 126 |
+
#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":24:21)
|
| 127 |
+
#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":25:27)
|
| 128 |
+
#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":25:37)
|
| 129 |
+
#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":29:40)
|
| 130 |
+
#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":30:31)
|
| 131 |
+
#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":31:29)
|
| 132 |
+
#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":35:45)
|
| 133 |
+
#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":35:41)
|
| 134 |
+
#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":35:34)
|
| 135 |
+
#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":35:60)
|
| 136 |
+
#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":35:50)
|
| 137 |
+
#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":36:23)
|
| 138 |
+
#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":38:23)
|
| 139 |
+
#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":39:48)
|
| 140 |
+
#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":39:8)
|
| 141 |
+
#loc18 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36)
|
| 142 |
+
#loc20 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15)
|
| 143 |
+
#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":40:28)
|
| 144 |
+
#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":41:19)
|
| 145 |
+
#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":42:25)
|
| 146 |
+
#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":42:36)
|
| 147 |
+
#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":43:40)
|
| 148 |
+
#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":44:31)
|
| 149 |
+
#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":45:29)
|
| 150 |
+
#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":49:60)
|
| 151 |
+
#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":49:86)
|
| 152 |
+
#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":49:77)
|
| 153 |
+
#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":49:68)
|
| 154 |
+
#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":49:52)
|
| 155 |
+
#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":49:45)
|
| 156 |
+
#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":49:41)
|
| 157 |
+
#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":49:34)
|
| 158 |
+
#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":49:103)
|
| 159 |
+
#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":49:93)
|
| 160 |
+
#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":50:23)
|
| 161 |
+
#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":52:22)
|
| 162 |
+
#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":54:37)
|
| 163 |
+
#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":55:20)
|
| 164 |
+
#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":56:24)
|
| 165 |
+
#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":57:24)
|
| 166 |
+
#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":58:39)
|
| 167 |
+
#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":59:32)
|
| 168 |
+
#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":59:94)
|
| 169 |
+
#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":72:16)
|
| 170 |
+
#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":59:100)
|
| 171 |
+
#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":73:20)
|
| 172 |
+
#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:34)
|
| 173 |
+
#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:44)
|
| 174 |
+
#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:47)
|
| 175 |
+
#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:25)
|
| 176 |
+
#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:47)
|
| 177 |
+
#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":59:55)
|
| 178 |
+
#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":59:50)
|
| 179 |
+
#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":59:42)
|
| 180 |
+
#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":59:112)
|
| 181 |
+
#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":59:110)
|
| 182 |
+
#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":59:130)
|
| 183 |
+
#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":61:29)
|
| 184 |
+
#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":61:94)
|
| 185 |
+
#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":62:53)
|
| 186 |
+
#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":62:62)
|
| 187 |
+
#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":62:58)
|
| 188 |
+
#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":62:29)
|
| 189 |
+
#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":62:95)
|
| 190 |
+
#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ge/cge7ppvv65hasrwq33aa5yjnt4twdzvw3pak2x4bu7n7x2hyexmv.py":43:4)
|
| 191 |
+
#loc78 = loc("xoffset"(#loc2))
|
| 192 |
+
#loc79 = loc("xmask"(#loc3))
|
| 193 |
+
#loc80 = loc("r0_base"(#loc4))
|
| 194 |
+
#loc81 = loc("r0_base"(#loc5))
|
| 195 |
+
#loc82 = loc("_tmp3"(#loc6))
|
| 196 |
+
#loc83 = loc("r0_index"(#loc7))
|
| 197 |
+
#loc84 = loc("r0_mask"(#loc8))
|
| 198 |
+
#loc85 = loc("tmp0"(#loc9))
|
| 199 |
+
#loc86 = loc("tmp0"(#loc10))
|
| 200 |
+
#loc87 = loc("tmp0"(#loc11))
|
| 201 |
+
#loc88 = loc("tmp0"(#loc12))
|
| 202 |
+
#loc89 = loc("tmp0"(#loc13))
|
| 203 |
+
#loc90 = loc("tmp1"(#loc14))
|
| 204 |
+
#loc91 = loc("tmp4"(#loc15))
|
| 205 |
+
#loc92 = loc("_tmp3"(#loc16))
|
| 206 |
+
#loc94 = loc("tmp3"(#loc21))
|
| 207 |
+
#loc95 = loc("tmp5"(#loc22))
|
| 208 |
+
#loc96 = loc("r0_index"(#loc26))
|
| 209 |
+
#loc97 = loc("r0_mask"(#loc27))
|
| 210 |
+
#loc98 = loc("tmp6"(#loc28))
|
| 211 |
+
#loc99 = loc("tmp6"(#loc29))
|
| 212 |
+
#loc100 = loc("tmp6"(#loc30))
|
| 213 |
+
#loc101 = loc("tmp6"(#loc31))
|
| 214 |
+
#loc102 = loc("tmp6"(#loc32))
|
| 215 |
+
#loc103 = loc("tmp6"(#loc33))
|
| 216 |
+
#loc104 = loc("tmp6"(#loc34))
|
| 217 |
+
#loc105 = loc("tmp6"(#loc35))
|
| 218 |
+
#loc106 = loc("tmp6"(#loc36))
|
| 219 |
+
#loc107 = loc("tmp6"(#loc37))
|
| 220 |
+
#loc108 = loc("tmp7"(#loc38))
|
| 221 |
+
#loc109 = loc("tmp9"(#loc39))
|
| 222 |
+
#loc110 = loc("tmp11"(#loc40))
|
| 223 |
+
#loc111 = loc("tmp12"(#loc41))
|
| 224 |
+
#loc112 = loc("tmp13"(#loc42))
|
| 225 |
+
#loc113 = loc("tmp14"(#loc43))
|
| 226 |
+
#loc114 = loc("tmp15"(#loc44))
|
| 227 |
+
#loc115 = loc("quot"(#loc47))
|
| 228 |
+
#loc116 = loc("remainder"(#loc49))
|
| 229 |
+
#loc117 = loc("fixed"(#loc50))
|
| 230 |
+
#loc118 = loc("fixed"(#loc51))
|
| 231 |
+
#loc119 = loc("fixed"(#loc52))
|
| 232 |
+
#loc120 = loc(callsite(#loc53 at #loc48))
|
| 233 |
+
#loc121 = loc(callsite(#loc54 at #loc48))
|
| 234 |
+
#loc122 = loc(fused[#loc65, #loc64])
|
| 235 |
+
#loc123 = loc(fused[#loc86, #loc85])
|
| 236 |
+
#loc124 = loc(fused[#loc88, #loc79])
|
| 237 |
+
#loc125 = loc(callsite(#loc18 at #loc93))
|
| 238 |
+
#loc127 = loc(fused[#loc101, #loc102])
|
| 239 |
+
#loc128 = loc(fused[#loc104, #loc103])
|
| 240 |
+
#loc129 = loc(fused[#loc106, #loc79])
|
| 241 |
+
#loc130 = loc(callsite(#loc115 at #loc48))
|
| 242 |
+
#loc131 = loc(callsite(#loc116 at #loc48))
|
| 243 |
+
#loc132 = loc(callsite(#loc117 at #loc48))
|
| 244 |
+
#loc133 = loc(callsite(#loc118 at #loc48))
|
| 245 |
+
#loc134 = loc(callsite(#loc119 at #loc48))
|
| 246 |
+
#loc135 = loc(callsite(#loc20 at #loc125))
|
SpecForge-ext/cache/compiled_kernels/triton/7/A7DYCXJM4X5DHYLAIRTU6BFB3S5UCV3W4C27BWQBJGXYAG3NWQWA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"hash": "07c7815d2ce5fa33e16044674f04a1dcbb415776e0b5f0da0149af801b6db42c", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 2048, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3"}
|
SpecForge-ext/cache/compiled_kernels/triton/7/A7DYCXJM4X5DHYLAIRTU6BFB3S5UCV3W4C27BWQBJGXYAG3NWQWA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.source
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
SpecForge-ext/cache/compiled_kernels/triton/7/A7DYCXJM4X5DHYLAIRTU6BFB3S5UCV3W4C27BWQBJGXYAG3NWQWA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttgir
ADDED
|
@@ -0,0 +1,841 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [1, 4], order = [0, 1]}>
|
| 2 |
+
#blocked1 = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [8, 4], warpsPerCTA = [4, 1], order = [1, 0]}>
|
| 3 |
+
#linear = #ttg.linear<{register = [[0, 4], [0, 8]], lane = [[1, 0], [2, 0], [4, 0], [8, 0], [16, 0]], warp = [[0, 1], [0, 2]], block = []}>
|
| 4 |
+
#linear1 = #ttg.linear<{register = [[2, 0, 0], [4, 0, 0]], lane = [[8, 0, 0], [16, 0, 0], [32, 0, 0], [64, 0, 0], [128, 0, 0]], warp = [[0, 1, 0], [1, 0, 0]], block = []}>
|
| 5 |
+
#linear2 = #ttg.linear<{register = [[1, 0, 0], [2, 0, 0]], lane = [[4, 0, 0], [8, 0, 0], [16, 0, 0], [32, 0, 0], [64, 0, 0]], warp = [[0, 0, 1], [0, 1, 0]], block = []}>
|
| 6 |
+
#linear3 = #ttg.linear<{register = [[0, 1, 0], [1, 0, 0]], lane = [[2, 0, 0], [4, 0, 0], [8, 0, 0], [16, 0, 0], [32, 0, 0]], warp = [[0, 0, 1], [0, 0, 2]], block = []}>
|
| 7 |
+
#linear4 = #ttg.linear<{register = [[0, 0, 4], [0, 1, 0]], lane = [[1, 0, 0], [2, 0, 0], [4, 0, 0], [8, 0, 0], [16, 0, 0]], warp = [[0, 0, 1], [0, 0, 2]], block = []}>
|
| 8 |
+
#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":18:0)
|
| 9 |
+
#loc1 = loc(unknown)
|
| 10 |
+
#loc19 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":662:12)
|
| 11 |
+
#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":41:67)
|
| 12 |
+
#loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":634:73)
|
| 13 |
+
#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:51)
|
| 14 |
+
#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:53)
|
| 15 |
+
#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:50)
|
| 16 |
+
#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:51)
|
| 17 |
+
#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":45:26)
|
| 18 |
+
#loc77 = loc("in_ptr0"(#loc))
|
| 19 |
+
#loc78 = loc("out_ptr2"(#loc))
|
| 20 |
+
#loc79 = loc("out_ptr3"(#loc))
|
| 21 |
+
#loc80 = loc("xnumel"(#loc))
|
| 22 |
+
#loc81 = loc("r0_numel"(#loc))
|
| 23 |
+
#loc99 = loc(callsite(#loc19 at #loc20))
|
| 24 |
+
#loc105 = loc("ileft"(#loc28))
|
| 25 |
+
#loc109 = loc("iright"(#loc33))
|
| 26 |
+
#loc118 = loc("left_idx"(#loc42))
|
| 27 |
+
#loc123 = loc("right_idx"(#loc47))
|
| 28 |
+
#loc143 = loc("tmp11"(#loc67))
|
| 29 |
+
#loc149 = loc(callsite(#loc24 at #loc99))
|
| 30 |
+
#loc153 = loc(callsite(#loc1 at #loc143))
|
| 31 |
+
#loc157 = loc(callsite(#loc105 at #loc149))
|
| 32 |
+
#loc161 = loc(callsite(#loc109 at #loc149))
|
| 33 |
+
#loc169 = loc(callsite(#loc118 at #loc149))
|
| 34 |
+
#loc174 = loc(callsite(#loc123 at #loc149))
|
| 35 |
+
#loc194 = loc(callsite(#loc1 at #loc157))
|
| 36 |
+
#loc196 = loc(callsite(#loc1 at #loc161))
|
| 37 |
+
#loc199 = loc(callsite(#loc1 at #loc169))
|
| 38 |
+
#loc202 = loc(callsite(#loc1 at #loc174))
|
| 39 |
+
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} {
|
| 40 |
+
tt.func public @triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3(%in_ptr0: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr2: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %out_ptr3: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("out_ptr3"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
|
| 41 |
+
%cst = arith.constant dense<0> : tensor<32x16xi32, #linear> loc(#loc1)
|
| 42 |
+
%cst_0 = arith.constant dense<0> : tensor<32x16xi64, #blocked> loc(#loc1)
|
| 43 |
+
%c32_i32 = arith.constant 32 : i32 loc(#loc1)
|
| 44 |
+
%cst_1 = arith.constant dense<32> : tensor<32x1xi32, #blocked> loc(#loc1)
|
| 45 |
+
%cst_2 = arith.constant dense<32> : tensor<32x1xi32, #blocked1> loc(#loc1)
|
| 46 |
+
%cst_3 = arith.constant dense<16> : tensor<32x1xi32, #blocked> loc(#loc1)
|
| 47 |
+
%cst_4 = arith.constant dense<16> : tensor<32x1xi32, #blocked1> loc(#loc1)
|
| 48 |
+
%cst_5 = arith.constant dense<17> : tensor<1x16xi32, #blocked> loc(#loc1)
|
| 49 |
+
%cst_6 = arith.constant dense<272> : tensor<32x1xi32, #blocked> loc(#loc1)
|
| 50 |
+
%cst_7 = arith.constant dense<1> : tensor<1x2x1xi32, #linear1> loc(#loc1)
|
| 51 |
+
%cst_8 = arith.constant dense<1> : tensor<1x2x1xi32, #linear2> loc(#loc1)
|
| 52 |
+
%cst_9 = arith.constant dense<1> : tensor<1x2x1xi32, #linear3> loc(#loc1)
|
| 53 |
+
%cst_10 = arith.constant dense<1> : tensor<1x2x1xi32, #linear4> loc(#loc1)
|
| 54 |
+
%cst_11 = arith.constant dense<0> : tensor<32x16xi32, #blocked> loc(#loc1)
|
| 55 |
+
%xoffset = tt.get_program_id x : i32 loc(#loc82)
|
| 56 |
+
%xoffset_12 = arith.muli %xoffset, %c32_i32 : i32 loc(#loc83)
|
| 57 |
+
%xindex = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc84)
|
| 58 |
+
%xindex_13 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc84)
|
| 59 |
+
%xindex_14 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<32xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<32x1xi32, #blocked> loc(#loc84)
|
| 60 |
+
%xindex_15 = tt.expand_dims %xindex_13 {axis = 1 : i32} : tensor<32xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<32x1xi32, #blocked1> loc(#loc84)
|
| 61 |
+
%xindex_16 = tt.splat %xoffset_12 : i32 -> tensor<32x1xi32, #blocked> loc(#loc85)
|
| 62 |
+
%xindex_17 = tt.splat %xoffset_12 : i32 -> tensor<32x1xi32, #blocked1> loc(#loc85)
|
| 63 |
+
%xindex_18 = arith.addi %xindex_16, %xindex_14 : tensor<32x1xi32, #blocked> loc(#loc85)
|
| 64 |
+
%xindex_19 = arith.addi %xindex_17, %xindex_15 : tensor<32x1xi32, #blocked1> loc(#loc85)
|
| 65 |
+
%xmask = arith.cmpi slt, %xindex_18, %cst_1 : tensor<32x1xi32, #blocked> loc(#loc86)
|
| 66 |
+
%xmask_20 = arith.cmpi slt, %xindex_19, %cst_2 : tensor<32x1xi32, #blocked1> loc(#loc86)
|
| 67 |
+
%r0_index = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc87)
|
| 68 |
+
%r0_index_21 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #linear}>> loc(#loc87)
|
| 69 |
+
%r0_index_22 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc87)
|
| 70 |
+
%r0_index_23 = tt.expand_dims %r0_index {axis = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x16xi32, #blocked> loc(#loc87)
|
| 71 |
+
%r0_index_24 = tt.expand_dims %r0_index_21 {axis = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #linear}>> -> tensor<1x16xi32, #linear> loc(#loc87)
|
| 72 |
+
%r0_index_25 = tt.expand_dims %r0_index_22 {axis = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x16xi32, #blocked1> loc(#loc87)
|
| 73 |
+
%x0 = arith.remsi %xindex_18, %cst_3 : tensor<32x1xi32, #blocked> loc(#loc88)
|
| 74 |
+
%x1 = arith.divsi %xindex_18, %cst_3 : tensor<32x1xi32, #blocked> loc(#loc89)
|
| 75 |
+
%tmp0 = arith.muli %r0_index_23, %cst_5 : tensor<1x16xi32, #blocked> loc(#loc90)
|
| 76 |
+
%tmp0_26 = tt.broadcast %x0 : tensor<32x1xi32, #blocked> -> tensor<32x16xi32, #blocked> loc(#loc91)
|
| 77 |
+
%tmp0_27 = tt.broadcast %tmp0 : tensor<1x16xi32, #blocked> -> tensor<32x16xi32, #blocked> loc(#loc91)
|
| 78 |
+
%tmp0_28 = arith.addi %tmp0_26, %tmp0_27 : tensor<32x16xi32, #blocked> loc(#loc91)
|
| 79 |
+
%tmp0_29 = arith.muli %x1, %cst_6 : tensor<32x1xi32, #blocked> loc(#loc92)
|
| 80 |
+
%tmp0_30 = tt.broadcast %tmp0_29 : tensor<32x1xi32, #blocked> -> tensor<32x16xi32, #blocked> loc(#loc93)
|
| 81 |
+
%tmp0_31 = arith.addi %tmp0_28, %tmp0_30 : tensor<32x16xi32, #blocked> loc(#loc93)
|
| 82 |
+
%tmp0_32 = tt.splat %in_ptr0 : !tt.ptr<i32> -> tensor<32x16x!tt.ptr<i32>, #blocked> loc(#loc94)
|
| 83 |
+
%tmp0_33 = tt.addptr %tmp0_32, %tmp0_31 : tensor<32x16x!tt.ptr<i32>, #blocked>, tensor<32x16xi32, #blocked> loc(#loc94)
|
| 84 |
+
%tmp0_34 = tt.broadcast %xmask : tensor<32x1xi1, #blocked> -> tensor<32x16xi1, #blocked> loc(#loc95)
|
| 85 |
+
%tmp0_35 = tt.broadcast %xmask_20 : tensor<32x1xi1, #blocked1> -> tensor<32x16xi1, #blocked1> loc(#loc95)
|
| 86 |
+
%tmp0_36 = tt.load %tmp0_33, %tmp0_34, %cst_11 : tensor<32x16x!tt.ptr<i32>, #blocked> loc(#loc95)
|
| 87 |
+
%tmp2 = arith.trunci %r0_index_24 : tensor<1x16xi32, #linear> to tensor<1x16xi16, #linear> loc(#loc96)
|
| 88 |
+
%tmp4 = tt.broadcast %tmp2 : tensor<1x16xi16, #linear> -> tensor<32x16xi16, #linear> loc(#loc97)
|
| 89 |
+
%flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #linear2}>}>> loc(#loc146)
|
| 90 |
+
%flip_37 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #linear1}>}>> loc(#loc146)
|
| 91 |
+
%flip_38 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #linear3}>}>> loc(#loc146)
|
| 92 |
+
%flip_39 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #linear4}>}>> loc(#loc146)
|
| 93 |
+
%flip_40 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #linear2}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #linear2}>> loc(#loc146)
|
| 94 |
+
%flip_41 = tt.expand_dims %flip_37 {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #linear1}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #linear1}>> loc(#loc146)
|
| 95 |
+
%flip_42 = tt.expand_dims %flip_38 {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #linear3}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #linear3}>> loc(#loc146)
|
| 96 |
+
%flip_43 = tt.expand_dims %flip_39 {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #linear4}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #linear4}>> loc(#loc146)
|
| 97 |
+
%flip_44 = tt.expand_dims %flip_40 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #linear2}>> -> tensor<1x2x1xi32, #linear2> loc(#loc146)
|
| 98 |
+
%flip_45 = tt.expand_dims %flip_41 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #linear1}>> -> tensor<1x2x1xi32, #linear1> loc(#loc146)
|
| 99 |
+
%flip_46 = tt.expand_dims %flip_42 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #linear3}>> -> tensor<1x2x1xi32, #linear3> loc(#loc146)
|
| 100 |
+
%flip_47 = tt.expand_dims %flip_43 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #linear4}>> -> tensor<1x2x1xi32, #linear4> loc(#loc146)
|
| 101 |
+
%flip_48 = tt.broadcast %flip_44 : tensor<1x2x1xi32, #linear2> -> tensor<128x2x2xi32, #linear2> loc(#loc147)
|
| 102 |
+
%flip_49 = tt.reshape %flip_48 : tensor<128x2x2xi32, #linear2> -> tensor<32x16xi32, #blocked> loc(#loc148)
|
| 103 |
+
%flip_50 = tt.reshape %flip_48 : tensor<128x2x2xi32, #linear2> -> tensor<32x16xi32, #linear> loc(#loc148)
|
| 104 |
+
%y = tt.reshape %tmp0_36 : tensor<32x16xi32, #blocked> -> tensor<256x2x1xi32, #linear1> loc(#loc154)
|
| 105 |
+
%left_mask = arith.subi %cst_7, %flip_45 : tensor<1x2x1xi32, #linear1> loc(#loc155)
|
| 106 |
+
%left_mask_51 = arith.subi %cst_8, %flip_44 : tensor<1x2x1xi32, #linear2> loc(#loc155)
|
| 107 |
+
%left_mask_52 = arith.subi %cst_9, %flip_46 : tensor<1x2x1xi32, #linear3> loc(#loc155)
|
| 108 |
+
%left_mask_53 = arith.subi %cst_10, %flip_47 : tensor<1x2x1xi32, #linear4> loc(#loc155)
|
| 109 |
+
%ileft = tt.broadcast %left_mask : tensor<1x2x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc156)
|
| 110 |
+
%ileft_54 = arith.muli %y, %ileft : tensor<256x2x1xi32, #linear1> loc(#loc156)
|
| 111 |
+
%ileft_55 = "tt.reduce"(%ileft_54) <{axis = 1 : i32}> ({
|
| 112 |
+
^bb0(%ileft_419: i32 loc(callsite(#loc1 at #loc157)), %ileft_420: i32 loc(callsite(#loc1 at #loc157))):
|
| 113 |
+
%ileft_421 = arith.addi %ileft_419, %ileft_420 : i32 loc(#loc203)
|
| 114 |
+
tt.reduce.return %ileft_421 : i32 loc(#loc193)
|
| 115 |
+
}) : (tensor<256x2x1xi32, #linear1>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc193)
|
| 116 |
+
%ileft_56 = tt.expand_dims %ileft_55 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<256x1x1xi32, #linear1> loc(#loc158)
|
| 117 |
+
%ileft_57 = tt.broadcast %ileft_56 : tensor<256x1x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc159)
|
| 118 |
+
%iright = tt.broadcast %flip_45 : tensor<1x2x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc160)
|
| 119 |
+
%iright_58 = arith.muli %y, %iright : tensor<256x2x1xi32, #linear1> loc(#loc160)
|
| 120 |
+
%iright_59 = "tt.reduce"(%iright_58) <{axis = 1 : i32}> ({
|
| 121 |
+
^bb0(%iright_419: i32 loc(callsite(#loc1 at #loc161)), %iright_420: i32 loc(callsite(#loc1 at #loc161))):
|
| 122 |
+
%iright_421 = arith.addi %iright_419, %iright_420 : i32 loc(#loc204)
|
| 123 |
+
tt.reduce.return %iright_421 : i32 loc(#loc195)
|
| 124 |
+
}) : (tensor<256x2x1xi32, #linear1>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc195)
|
| 125 |
+
%iright_60 = tt.expand_dims %iright_59 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<256x1x1xi32, #linear1> loc(#loc162)
|
| 126 |
+
%iright_61 = tt.broadcast %iright_60 : tensor<256x1x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc163)
|
| 127 |
+
%ileft_62 = tt.reshape %ileft_57 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #blocked> loc(#loc164)
|
| 128 |
+
%ileft_63 = tt.reshape %ileft_57 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #linear> loc(#loc164)
|
| 129 |
+
%iright_64 = tt.reshape %iright_61 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #blocked> loc(#loc165)
|
| 130 |
+
%iright_65 = tt.reshape %iright_61 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #linear> loc(#loc165)
|
| 131 |
+
%y_idx = tt.reshape %tmp4 : tensor<32x16xi16, #linear> -> tensor<256x2x1xi16, #linear1> loc(#loc166)
|
| 132 |
+
%left_idx = arith.trunci %left_mask : tensor<1x2x1xi32, #linear1> to tensor<1x2x1xi16, #linear1> loc(#loc167)
|
| 133 |
+
%left_idx_66 = tt.broadcast %left_idx : tensor<1x2x1xi16, #linear1> -> tensor<256x2x1xi16, #linear1> loc(#loc168)
|
| 134 |
+
%left_idx_67 = arith.muli %y_idx, %left_idx_66 : tensor<256x2x1xi16, #linear1> loc(#loc168)
|
| 135 |
+
%input = arith.extsi %left_idx_67 : tensor<256x2x1xi16, #linear1> to tensor<256x2x1xi32, #linear1> loc(#loc197)
|
| 136 |
+
%left_idx_68 = "tt.reduce"(%input) <{axis = 1 : i32}> ({
|
| 137 |
+
^bb0(%left_idx_419: i32 loc(callsite(#loc1 at #loc169)), %left_idx_420: i32 loc(callsite(#loc1 at #loc169))):
|
| 138 |
+
%left_idx_421 = arith.addi %left_idx_419, %left_idx_420 : i32 loc(#loc205)
|
| 139 |
+
tt.reduce.return %left_idx_421 : i32 loc(#loc198)
|
| 140 |
+
}) : (tensor<256x2x1xi32, #linear1>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc198)
|
| 141 |
+
%left_idx_69 = tt.expand_dims %left_idx_68 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<256x1x1xi32, #linear1> loc(#loc170)
|
| 142 |
+
%left_idx_70 = tt.broadcast %left_idx_69 : tensor<256x1x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc171)
|
| 143 |
+
%right_idx = arith.trunci %flip_45 : tensor<1x2x1xi32, #linear1> to tensor<1x2x1xi16, #linear1> loc(#loc172)
|
| 144 |
+
%right_idx_71 = tt.broadcast %right_idx : tensor<1x2x1xi16, #linear1> -> tensor<256x2x1xi16, #linear1> loc(#loc173)
|
| 145 |
+
%right_idx_72 = arith.muli %y_idx, %right_idx_71 : tensor<256x2x1xi16, #linear1> loc(#loc173)
|
| 146 |
+
%input_73 = arith.extsi %right_idx_72 : tensor<256x2x1xi16, #linear1> to tensor<256x2x1xi32, #linear1> loc(#loc200)
|
| 147 |
+
%right_idx_74 = "tt.reduce"(%input_73) <{axis = 1 : i32}> ({
|
| 148 |
+
^bb0(%right_idx_419: i32 loc(callsite(#loc1 at #loc174)), %right_idx_420: i32 loc(callsite(#loc1 at #loc174))):
|
| 149 |
+
%right_idx_421 = arith.addi %right_idx_419, %right_idx_420 : i32 loc(#loc206)
|
| 150 |
+
tt.reduce.return %right_idx_421 : i32 loc(#loc201)
|
| 151 |
+
}) : (tensor<256x2x1xi32, #linear1>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc201)
|
| 152 |
+
%right_idx_75 = tt.expand_dims %right_idx_74 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<256x1x1xi32, #linear1> loc(#loc175)
|
| 153 |
+
%right_idx_76 = tt.broadcast %right_idx_75 : tensor<256x1x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc176)
|
| 154 |
+
%left_idx_77 = tt.reshape %left_idx_70 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #blocked> loc(#loc177)
|
| 155 |
+
%left_idx_78 = tt.reshape %left_idx_70 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #linear> loc(#loc177)
|
| 156 |
+
%right_idx_79 = tt.reshape %right_idx_76 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #blocked> loc(#loc178)
|
| 157 |
+
%right_idx_80 = tt.reshape %right_idx_76 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #linear> loc(#loc178)
|
| 158 |
+
%cond = arith.cmpi slt, %ileft_62, %iright_64 : tensor<32x16xi32, #blocked> loc(#loc179)
|
| 159 |
+
%cond_81 = arith.cmpi slt, %ileft_63, %iright_65 : tensor<32x16xi32, #linear> loc(#loc179)
|
| 160 |
+
%eq = arith.cmpi eq, %ileft_62, %iright_64 : tensor<32x16xi32, #blocked> loc(#loc180)
|
| 161 |
+
%eq_82 = arith.cmpi eq, %ileft_63, %iright_65 : tensor<32x16xi32, #linear> loc(#loc180)
|
| 162 |
+
%cond_83 = arith.cmpi sgt, %left_idx_77, %right_idx_79 : tensor<32x16xi32, #blocked> loc(#loc181)
|
| 163 |
+
%cond_84 = arith.cmpi sgt, %left_idx_78, %right_idx_80 : tensor<32x16xi32, #linear> loc(#loc181)
|
| 164 |
+
%cond_85 = arith.andi %eq, %cond_83 : tensor<32x16xi1, #blocked> loc(#loc182)
|
| 165 |
+
%cond_86 = arith.andi %eq_82, %cond_84 : tensor<32x16xi1, #linear> loc(#loc182)
|
| 166 |
+
%cond_87 = arith.ori %cond, %cond_85 : tensor<32x16xi1, #blocked> loc(#loc183)
|
| 167 |
+
%cond_88 = arith.ori %cond_81, %cond_86 : tensor<32x16xi1, #linear> loc(#loc183)
|
| 168 |
+
%cond_89 = arith.extui %cond_87 : tensor<32x16xi1, #blocked> to tensor<32x16xi32, #blocked> loc(#loc184)
|
| 169 |
+
%cond_90 = arith.extui %cond_88 : tensor<32x16xi1, #linear> to tensor<32x16xi32, #linear> loc(#loc184)
|
| 170 |
+
%cond_91 = arith.xori %cond_89, %flip_49 : tensor<32x16xi32, #blocked> loc(#loc184)
|
| 171 |
+
%cond_92 = arith.xori %cond_90, %flip_50 : tensor<32x16xi32, #linear> loc(#loc184)
|
| 172 |
+
%cond_93 = arith.cmpi ne, %cond_91, %cst_11 : tensor<32x16xi32, #blocked> loc(#loc185)
|
| 173 |
+
%cond_94 = arith.cmpi ne, %cond_92, %cst : tensor<32x16xi32, #linear> loc(#loc185)
|
| 174 |
+
%ret = arith.xori %ileft_62, %iright_64 : tensor<32x16xi32, #blocked> loc(#loc186)
|
| 175 |
+
%ret_95 = arith.select %cond_93, %ret, %cst_11 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc187)
|
| 176 |
+
%ret_96 = arith.xori %tmp0_36, %ret_95 : tensor<32x16xi32, #blocked> loc(#loc188)
|
| 177 |
+
%ret_97 = ttg.convert_layout %ret_96 : tensor<32x16xi32, #blocked> -> tensor<32x16xi32, #linear> loc(#loc188)
|
| 178 |
+
%new_idxs = arith.xori %left_idx_78, %right_idx_80 : tensor<32x16xi32, #linear> loc(#loc189)
|
| 179 |
+
%new_idxs_98 = arith.select %cond_94, %new_idxs, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc190)
|
| 180 |
+
%new_idxs_99 = arith.extsi %tmp2 : tensor<1x16xi16, #linear> to tensor<1x16xi32, #linear> loc(#loc191)
|
| 181 |
+
%new_idxs_100 = tt.broadcast %new_idxs_99 : tensor<1x16xi32, #linear> -> tensor<32x16xi32, #linear> loc(#loc191)
|
| 182 |
+
%new_idxs_101 = arith.xori %new_idxs_100, %new_idxs_98 : tensor<32x16xi32, #linear> loc(#loc191)
|
| 183 |
+
%flip_102 = tt.broadcast %flip_46 : tensor<1x2x1xi32, #linear3> -> tensor<64x2x4xi32, #linear3> loc(#loc147)
|
| 184 |
+
%flip_103 = tt.reshape %flip_102 : tensor<64x2x4xi32, #linear3> -> tensor<32x16xi32, #linear> loc(#loc148)
|
| 185 |
+
%y_104 = tt.reshape %ret_96 : tensor<32x16xi32, #blocked> -> tensor<128x2x2xi32, #linear2> loc(#loc154)
|
| 186 |
+
%ileft_105 = tt.broadcast %left_mask_51 : tensor<1x2x1xi32, #linear2> -> tensor<128x2x2xi32, #linear2> loc(#loc156)
|
| 187 |
+
%ileft_106 = arith.muli %y_104, %ileft_105 : tensor<128x2x2xi32, #linear2> loc(#loc156)
|
| 188 |
+
%ileft_107 = "tt.reduce"(%ileft_106) <{axis = 1 : i32}> ({
|
| 189 |
+
^bb0(%ileft_419: i32 loc(callsite(#loc1 at #loc157)), %ileft_420: i32 loc(callsite(#loc1 at #loc157))):
|
| 190 |
+
%ileft_421 = arith.addi %ileft_419, %ileft_420 : i32 loc(#loc203)
|
| 191 |
+
tt.reduce.return %ileft_421 : i32 loc(#loc193)
|
| 192 |
+
}) : (tensor<128x2x2xi32, #linear2>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc193)
|
| 193 |
+
%ileft_108 = tt.expand_dims %ileft_107 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<128x1x2xi32, #linear2> loc(#loc158)
|
| 194 |
+
%ileft_109 = tt.broadcast %ileft_108 : tensor<128x1x2xi32, #linear2> -> tensor<128x2x2xi32, #linear2> loc(#loc159)
|
| 195 |
+
%iright_110 = arith.muli %y_104, %flip_48 : tensor<128x2x2xi32, #linear2> loc(#loc160)
|
| 196 |
+
%iright_111 = "tt.reduce"(%iright_110) <{axis = 1 : i32}> ({
|
| 197 |
+
^bb0(%iright_419: i32 loc(callsite(#loc1 at #loc161)), %iright_420: i32 loc(callsite(#loc1 at #loc161))):
|
| 198 |
+
%iright_421 = arith.addi %iright_419, %iright_420 : i32 loc(#loc204)
|
| 199 |
+
tt.reduce.return %iright_421 : i32 loc(#loc195)
|
| 200 |
+
}) : (tensor<128x2x2xi32, #linear2>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc195)
|
| 201 |
+
%iright_112 = tt.expand_dims %iright_111 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<128x1x2xi32, #linear2> loc(#loc162)
|
| 202 |
+
%iright_113 = tt.broadcast %iright_112 : tensor<128x1x2xi32, #linear2> -> tensor<128x2x2xi32, #linear2> loc(#loc163)
|
| 203 |
+
%ileft_114 = tt.reshape %ileft_109 : tensor<128x2x2xi32, #linear2> -> tensor<32x16xi32, #linear> loc(#loc164)
|
| 204 |
+
%iright_115 = tt.reshape %iright_113 : tensor<128x2x2xi32, #linear2> -> tensor<32x16xi32, #linear> loc(#loc165)
|
| 205 |
+
%y_idx_116 = tt.reshape %new_idxs_101 : tensor<32x16xi32, #linear> -> tensor<128x2x2xi32, #linear2> loc(#loc166)
|
| 206 |
+
%left_idx_117 = arith.muli %y_idx_116, %ileft_105 : tensor<128x2x2xi32, #linear2> loc(#loc168)
|
| 207 |
+
%left_idx_118 = "tt.reduce"(%left_idx_117) <{axis = 1 : i32}> ({
|
| 208 |
+
^bb0(%left_idx_419: i32 loc(callsite(#loc1 at #loc169)), %left_idx_420: i32 loc(callsite(#loc1 at #loc169))):
|
| 209 |
+
%left_idx_421 = arith.addi %left_idx_419, %left_idx_420 : i32 loc(#loc205)
|
| 210 |
+
tt.reduce.return %left_idx_421 : i32 loc(#loc198)
|
| 211 |
+
}) : (tensor<128x2x2xi32, #linear2>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc198)
|
| 212 |
+
%left_idx_119 = tt.expand_dims %left_idx_118 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<128x1x2xi32, #linear2> loc(#loc170)
|
| 213 |
+
%left_idx_120 = tt.broadcast %left_idx_119 : tensor<128x1x2xi32, #linear2> -> tensor<128x2x2xi32, #linear2> loc(#loc171)
|
| 214 |
+
%right_idx_121 = arith.muli %y_idx_116, %flip_48 : tensor<128x2x2xi32, #linear2> loc(#loc173)
|
| 215 |
+
%right_idx_122 = "tt.reduce"(%right_idx_121) <{axis = 1 : i32}> ({
|
| 216 |
+
^bb0(%right_idx_419: i32 loc(callsite(#loc1 at #loc174)), %right_idx_420: i32 loc(callsite(#loc1 at #loc174))):
|
| 217 |
+
%right_idx_421 = arith.addi %right_idx_419, %right_idx_420 : i32 loc(#loc206)
|
| 218 |
+
tt.reduce.return %right_idx_421 : i32 loc(#loc201)
|
| 219 |
+
}) : (tensor<128x2x2xi32, #linear2>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc201)
|
| 220 |
+
%right_idx_123 = tt.expand_dims %right_idx_122 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<128x1x2xi32, #linear2> loc(#loc175)
|
| 221 |
+
%right_idx_124 = tt.broadcast %right_idx_123 : tensor<128x1x2xi32, #linear2> -> tensor<128x2x2xi32, #linear2> loc(#loc176)
|
| 222 |
+
%left_idx_125 = tt.reshape %left_idx_120 : tensor<128x2x2xi32, #linear2> -> tensor<32x16xi32, #linear> loc(#loc177)
|
| 223 |
+
%right_idx_126 = tt.reshape %right_idx_124 : tensor<128x2x2xi32, #linear2> -> tensor<32x16xi32, #linear> loc(#loc178)
|
| 224 |
+
%cond_127 = arith.cmpi slt, %ileft_114, %iright_115 : tensor<32x16xi32, #linear> loc(#loc179)
|
| 225 |
+
%eq_128 = arith.cmpi eq, %ileft_114, %iright_115 : tensor<32x16xi32, #linear> loc(#loc180)
|
| 226 |
+
%cond_129 = arith.cmpi sgt, %left_idx_125, %right_idx_126 : tensor<32x16xi32, #linear> loc(#loc181)
|
| 227 |
+
%cond_130 = arith.andi %eq_128, %cond_129 : tensor<32x16xi1, #linear> loc(#loc182)
|
| 228 |
+
%cond_131 = arith.ori %cond_127, %cond_130 : tensor<32x16xi1, #linear> loc(#loc183)
|
| 229 |
+
%cond_132 = arith.extui %cond_131 : tensor<32x16xi1, #linear> to tensor<32x16xi32, #linear> loc(#loc184)
|
| 230 |
+
%cond_133 = arith.xori %cond_132, %flip_103 : tensor<32x16xi32, #linear> loc(#loc184)
|
| 231 |
+
%cond_134 = arith.cmpi ne, %cond_133, %cst : tensor<32x16xi32, #linear> loc(#loc185)
|
| 232 |
+
%ret_135 = arith.xori %ileft_114, %iright_115 : tensor<32x16xi32, #linear> loc(#loc186)
|
| 233 |
+
%ret_136 = arith.select %cond_134, %ret_135, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc187)
|
| 234 |
+
%ret_137 = arith.xori %ret_97, %ret_136 : tensor<32x16xi32, #linear> loc(#loc188)
|
| 235 |
+
%new_idxs_138 = arith.xori %left_idx_125, %right_idx_126 : tensor<32x16xi32, #linear> loc(#loc189)
|
| 236 |
+
%new_idxs_139 = arith.select %cond_134, %new_idxs_138, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc190)
|
| 237 |
+
%new_idxs_140 = arith.xori %new_idxs_101, %new_idxs_139 : tensor<32x16xi32, #linear> loc(#loc191)
|
| 238 |
+
%y_141 = tt.reshape %ret_137 : tensor<32x16xi32, #linear> -> tensor<256x2x1xi32, #linear1> loc(#loc154)
|
| 239 |
+
%ileft_142 = arith.muli %y_141, %ileft : tensor<256x2x1xi32, #linear1> loc(#loc156)
|
| 240 |
+
%ileft_143 = "tt.reduce"(%ileft_142) <{axis = 1 : i32}> ({
|
| 241 |
+
^bb0(%ileft_419: i32 loc(callsite(#loc1 at #loc157)), %ileft_420: i32 loc(callsite(#loc1 at #loc157))):
|
| 242 |
+
%ileft_421 = arith.addi %ileft_419, %ileft_420 : i32 loc(#loc203)
|
| 243 |
+
tt.reduce.return %ileft_421 : i32 loc(#loc193)
|
| 244 |
+
}) : (tensor<256x2x1xi32, #linear1>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc193)
|
| 245 |
+
%ileft_144 = tt.expand_dims %ileft_143 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<256x1x1xi32, #linear1> loc(#loc158)
|
| 246 |
+
%ileft_145 = tt.broadcast %ileft_144 : tensor<256x1x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc159)
|
| 247 |
+
%iright_146 = arith.muli %y_141, %iright : tensor<256x2x1xi32, #linear1> loc(#loc160)
|
| 248 |
+
%iright_147 = "tt.reduce"(%iright_146) <{axis = 1 : i32}> ({
|
| 249 |
+
^bb0(%iright_419: i32 loc(callsite(#loc1 at #loc161)), %iright_420: i32 loc(callsite(#loc1 at #loc161))):
|
| 250 |
+
%iright_421 = arith.addi %iright_419, %iright_420 : i32 loc(#loc204)
|
| 251 |
+
tt.reduce.return %iright_421 : i32 loc(#loc195)
|
| 252 |
+
}) : (tensor<256x2x1xi32, #linear1>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc195)
|
| 253 |
+
%iright_148 = tt.expand_dims %iright_147 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<256x1x1xi32, #linear1> loc(#loc162)
|
| 254 |
+
%iright_149 = tt.broadcast %iright_148 : tensor<256x1x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc163)
|
| 255 |
+
%ileft_150 = tt.reshape %ileft_145 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #linear> loc(#loc164)
|
| 256 |
+
%iright_151 = tt.reshape %iright_149 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #linear> loc(#loc165)
|
| 257 |
+
%y_idx_152 = tt.reshape %new_idxs_140 : tensor<32x16xi32, #linear> -> tensor<256x2x1xi32, #linear1> loc(#loc166)
|
| 258 |
+
%left_idx_153 = arith.muli %y_idx_152, %ileft : tensor<256x2x1xi32, #linear1> loc(#loc168)
|
| 259 |
+
%left_idx_154 = "tt.reduce"(%left_idx_153) <{axis = 1 : i32}> ({
|
| 260 |
+
^bb0(%left_idx_419: i32 loc(callsite(#loc1 at #loc169)), %left_idx_420: i32 loc(callsite(#loc1 at #loc169))):
|
| 261 |
+
%left_idx_421 = arith.addi %left_idx_419, %left_idx_420 : i32 loc(#loc205)
|
| 262 |
+
tt.reduce.return %left_idx_421 : i32 loc(#loc198)
|
| 263 |
+
}) : (tensor<256x2x1xi32, #linear1>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc198)
|
| 264 |
+
%left_idx_155 = tt.expand_dims %left_idx_154 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<256x1x1xi32, #linear1> loc(#loc170)
|
| 265 |
+
%left_idx_156 = tt.broadcast %left_idx_155 : tensor<256x1x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc171)
|
| 266 |
+
%right_idx_157 = arith.muli %y_idx_152, %iright : tensor<256x2x1xi32, #linear1> loc(#loc173)
|
| 267 |
+
%right_idx_158 = "tt.reduce"(%right_idx_157) <{axis = 1 : i32}> ({
|
| 268 |
+
^bb0(%right_idx_419: i32 loc(callsite(#loc1 at #loc174)), %right_idx_420: i32 loc(callsite(#loc1 at #loc174))):
|
| 269 |
+
%right_idx_421 = arith.addi %right_idx_419, %right_idx_420 : i32 loc(#loc206)
|
| 270 |
+
tt.reduce.return %right_idx_421 : i32 loc(#loc201)
|
| 271 |
+
}) : (tensor<256x2x1xi32, #linear1>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc201)
|
| 272 |
+
%right_idx_159 = tt.expand_dims %right_idx_158 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<256x1x1xi32, #linear1> loc(#loc175)
|
| 273 |
+
%right_idx_160 = tt.broadcast %right_idx_159 : tensor<256x1x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc176)
|
| 274 |
+
%left_idx_161 = tt.reshape %left_idx_156 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #linear> loc(#loc177)
|
| 275 |
+
%right_idx_162 = tt.reshape %right_idx_160 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #linear> loc(#loc178)
|
| 276 |
+
%cond_163 = arith.cmpi slt, %ileft_150, %iright_151 : tensor<32x16xi32, #linear> loc(#loc179)
|
| 277 |
+
%eq_164 = arith.cmpi eq, %ileft_150, %iright_151 : tensor<32x16xi32, #linear> loc(#loc180)
|
| 278 |
+
%cond_165 = arith.cmpi sgt, %left_idx_161, %right_idx_162 : tensor<32x16xi32, #linear> loc(#loc181)
|
| 279 |
+
%cond_166 = arith.andi %eq_164, %cond_165 : tensor<32x16xi1, #linear> loc(#loc182)
|
| 280 |
+
%cond_167 = arith.ori %cond_163, %cond_166 : tensor<32x16xi1, #linear> loc(#loc183)
|
| 281 |
+
%cond_168 = arith.extui %cond_167 : tensor<32x16xi1, #linear> to tensor<32x16xi32, #linear> loc(#loc184)
|
| 282 |
+
%cond_169 = arith.xori %cond_168, %flip_103 : tensor<32x16xi32, #linear> loc(#loc184)
|
| 283 |
+
%cond_170 = arith.cmpi ne, %cond_169, %cst : tensor<32x16xi32, #linear> loc(#loc185)
|
| 284 |
+
%ret_171 = arith.xori %ileft_150, %iright_151 : tensor<32x16xi32, #linear> loc(#loc186)
|
| 285 |
+
%ret_172 = arith.select %cond_170, %ret_171, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc187)
|
| 286 |
+
%ret_173 = arith.xori %ret_137, %ret_172 : tensor<32x16xi32, #linear> loc(#loc188)
|
| 287 |
+
%new_idxs_174 = arith.xori %left_idx_161, %right_idx_162 : tensor<32x16xi32, #linear> loc(#loc189)
|
| 288 |
+
%new_idxs_175 = arith.select %cond_170, %new_idxs_174, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc190)
|
| 289 |
+
%new_idxs_176 = arith.xori %new_idxs_140, %new_idxs_175 : tensor<32x16xi32, #linear> loc(#loc191)
|
| 290 |
+
%flip_177 = tt.broadcast %flip_47 : tensor<1x2x1xi32, #linear4> -> tensor<32x2x8xi32, #linear4> loc(#loc147)
|
| 291 |
+
%flip_178 = tt.reshape %flip_177 : tensor<32x2x8xi32, #linear4> -> tensor<32x16xi32, #linear> loc(#loc148)
|
| 292 |
+
%y_179 = tt.reshape %ret_173 : tensor<32x16xi32, #linear> -> tensor<64x2x4xi32, #linear3> loc(#loc154)
|
| 293 |
+
%ileft_180 = tt.broadcast %left_mask_52 : tensor<1x2x1xi32, #linear3> -> tensor<64x2x4xi32, #linear3> loc(#loc156)
|
| 294 |
+
%ileft_181 = arith.muli %y_179, %ileft_180 : tensor<64x2x4xi32, #linear3> loc(#loc156)
|
| 295 |
+
%ileft_182 = "tt.reduce"(%ileft_181) <{axis = 1 : i32}> ({
|
| 296 |
+
^bb0(%ileft_419: i32 loc(callsite(#loc1 at #loc157)), %ileft_420: i32 loc(callsite(#loc1 at #loc157))):
|
| 297 |
+
%ileft_421 = arith.addi %ileft_419, %ileft_420 : i32 loc(#loc203)
|
| 298 |
+
tt.reduce.return %ileft_421 : i32 loc(#loc193)
|
| 299 |
+
}) : (tensor<64x2x4xi32, #linear3>) -> tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> loc(#loc193)
|
| 300 |
+
%ileft_183 = tt.expand_dims %ileft_182 {axis = 1 : i32} : tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> -> tensor<64x1x4xi32, #linear3> loc(#loc158)
|
| 301 |
+
%ileft_184 = tt.broadcast %ileft_183 : tensor<64x1x4xi32, #linear3> -> tensor<64x2x4xi32, #linear3> loc(#loc159)
|
| 302 |
+
%iright_185 = arith.muli %y_179, %flip_102 : tensor<64x2x4xi32, #linear3> loc(#loc160)
|
| 303 |
+
%iright_186 = "tt.reduce"(%iright_185) <{axis = 1 : i32}> ({
|
| 304 |
+
^bb0(%iright_419: i32 loc(callsite(#loc1 at #loc161)), %iright_420: i32 loc(callsite(#loc1 at #loc161))):
|
| 305 |
+
%iright_421 = arith.addi %iright_419, %iright_420 : i32 loc(#loc204)
|
| 306 |
+
tt.reduce.return %iright_421 : i32 loc(#loc195)
|
| 307 |
+
}) : (tensor<64x2x4xi32, #linear3>) -> tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> loc(#loc195)
|
| 308 |
+
%iright_187 = tt.expand_dims %iright_186 {axis = 1 : i32} : tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> -> tensor<64x1x4xi32, #linear3> loc(#loc162)
|
| 309 |
+
%iright_188 = tt.broadcast %iright_187 : tensor<64x1x4xi32, #linear3> -> tensor<64x2x4xi32, #linear3> loc(#loc163)
|
| 310 |
+
%ileft_189 = tt.reshape %ileft_184 : tensor<64x2x4xi32, #linear3> -> tensor<32x16xi32, #linear> loc(#loc164)
|
| 311 |
+
%iright_190 = tt.reshape %iright_188 : tensor<64x2x4xi32, #linear3> -> tensor<32x16xi32, #linear> loc(#loc165)
|
| 312 |
+
%y_idx_191 = tt.reshape %new_idxs_176 : tensor<32x16xi32, #linear> -> tensor<64x2x4xi32, #linear3> loc(#loc166)
|
| 313 |
+
%left_idx_192 = arith.muli %y_idx_191, %ileft_180 : tensor<64x2x4xi32, #linear3> loc(#loc168)
|
| 314 |
+
%left_idx_193 = "tt.reduce"(%left_idx_192) <{axis = 1 : i32}> ({
|
| 315 |
+
^bb0(%left_idx_419: i32 loc(callsite(#loc1 at #loc169)), %left_idx_420: i32 loc(callsite(#loc1 at #loc169))):
|
| 316 |
+
%left_idx_421 = arith.addi %left_idx_419, %left_idx_420 : i32 loc(#loc205)
|
| 317 |
+
tt.reduce.return %left_idx_421 : i32 loc(#loc198)
|
| 318 |
+
}) : (tensor<64x2x4xi32, #linear3>) -> tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> loc(#loc198)
|
| 319 |
+
%left_idx_194 = tt.expand_dims %left_idx_193 {axis = 1 : i32} : tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> -> tensor<64x1x4xi32, #linear3> loc(#loc170)
|
| 320 |
+
%left_idx_195 = tt.broadcast %left_idx_194 : tensor<64x1x4xi32, #linear3> -> tensor<64x2x4xi32, #linear3> loc(#loc171)
|
| 321 |
+
%right_idx_196 = arith.muli %y_idx_191, %flip_102 : tensor<64x2x4xi32, #linear3> loc(#loc173)
|
| 322 |
+
%right_idx_197 = "tt.reduce"(%right_idx_196) <{axis = 1 : i32}> ({
|
| 323 |
+
^bb0(%right_idx_419: i32 loc(callsite(#loc1 at #loc174)), %right_idx_420: i32 loc(callsite(#loc1 at #loc174))):
|
| 324 |
+
%right_idx_421 = arith.addi %right_idx_419, %right_idx_420 : i32 loc(#loc206)
|
| 325 |
+
tt.reduce.return %right_idx_421 : i32 loc(#loc201)
|
| 326 |
+
}) : (tensor<64x2x4xi32, #linear3>) -> tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> loc(#loc201)
|
| 327 |
+
%right_idx_198 = tt.expand_dims %right_idx_197 {axis = 1 : i32} : tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> -> tensor<64x1x4xi32, #linear3> loc(#loc175)
|
| 328 |
+
%right_idx_199 = tt.broadcast %right_idx_198 : tensor<64x1x4xi32, #linear3> -> tensor<64x2x4xi32, #linear3> loc(#loc176)
|
| 329 |
+
%left_idx_200 = tt.reshape %left_idx_195 : tensor<64x2x4xi32, #linear3> -> tensor<32x16xi32, #linear> loc(#loc177)
|
| 330 |
+
%right_idx_201 = tt.reshape %right_idx_199 : tensor<64x2x4xi32, #linear3> -> tensor<32x16xi32, #linear> loc(#loc178)
|
| 331 |
+
%cond_202 = arith.cmpi slt, %ileft_189, %iright_190 : tensor<32x16xi32, #linear> loc(#loc179)
|
| 332 |
+
%eq_203 = arith.cmpi eq, %ileft_189, %iright_190 : tensor<32x16xi32, #linear> loc(#loc180)
|
| 333 |
+
%cond_204 = arith.cmpi sgt, %left_idx_200, %right_idx_201 : tensor<32x16xi32, #linear> loc(#loc181)
|
| 334 |
+
%cond_205 = arith.andi %eq_203, %cond_204 : tensor<32x16xi1, #linear> loc(#loc182)
|
| 335 |
+
%cond_206 = arith.ori %cond_202, %cond_205 : tensor<32x16xi1, #linear> loc(#loc183)
|
| 336 |
+
%cond_207 = arith.extui %cond_206 : tensor<32x16xi1, #linear> to tensor<32x16xi32, #linear> loc(#loc184)
|
| 337 |
+
%cond_208 = arith.xori %cond_207, %flip_178 : tensor<32x16xi32, #linear> loc(#loc184)
|
| 338 |
+
%cond_209 = arith.cmpi ne, %cond_208, %cst : tensor<32x16xi32, #linear> loc(#loc185)
|
| 339 |
+
%ret_210 = arith.xori %ileft_189, %iright_190 : tensor<32x16xi32, #linear> loc(#loc186)
|
| 340 |
+
%ret_211 = arith.select %cond_209, %ret_210, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc187)
|
| 341 |
+
%ret_212 = arith.xori %ret_173, %ret_211 : tensor<32x16xi32, #linear> loc(#loc188)
|
| 342 |
+
%new_idxs_213 = arith.xori %left_idx_200, %right_idx_201 : tensor<32x16xi32, #linear> loc(#loc189)
|
| 343 |
+
%new_idxs_214 = arith.select %cond_209, %new_idxs_213, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc190)
|
| 344 |
+
%new_idxs_215 = arith.xori %new_idxs_176, %new_idxs_214 : tensor<32x16xi32, #linear> loc(#loc191)
|
| 345 |
+
%y_216 = tt.reshape %ret_212 : tensor<32x16xi32, #linear> -> tensor<128x2x2xi32, #linear2> loc(#loc154)
|
| 346 |
+
%ileft_217 = arith.muli %y_216, %ileft_105 : tensor<128x2x2xi32, #linear2> loc(#loc156)
|
| 347 |
+
%ileft_218 = "tt.reduce"(%ileft_217) <{axis = 1 : i32}> ({
|
| 348 |
+
^bb0(%ileft_419: i32 loc(callsite(#loc1 at #loc157)), %ileft_420: i32 loc(callsite(#loc1 at #loc157))):
|
| 349 |
+
%ileft_421 = arith.addi %ileft_419, %ileft_420 : i32 loc(#loc203)
|
| 350 |
+
tt.reduce.return %ileft_421 : i32 loc(#loc193)
|
| 351 |
+
}) : (tensor<128x2x2xi32, #linear2>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc193)
|
| 352 |
+
%ileft_219 = tt.expand_dims %ileft_218 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<128x1x2xi32, #linear2> loc(#loc158)
|
| 353 |
+
%ileft_220 = tt.broadcast %ileft_219 : tensor<128x1x2xi32, #linear2> -> tensor<128x2x2xi32, #linear2> loc(#loc159)
|
| 354 |
+
%iright_221 = arith.muli %y_216, %flip_48 : tensor<128x2x2xi32, #linear2> loc(#loc160)
|
| 355 |
+
%iright_222 = "tt.reduce"(%iright_221) <{axis = 1 : i32}> ({
|
| 356 |
+
^bb0(%iright_419: i32 loc(callsite(#loc1 at #loc161)), %iright_420: i32 loc(callsite(#loc1 at #loc161))):
|
| 357 |
+
%iright_421 = arith.addi %iright_419, %iright_420 : i32 loc(#loc204)
|
| 358 |
+
tt.reduce.return %iright_421 : i32 loc(#loc195)
|
| 359 |
+
}) : (tensor<128x2x2xi32, #linear2>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc195)
|
| 360 |
+
%iright_223 = tt.expand_dims %iright_222 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<128x1x2xi32, #linear2> loc(#loc162)
|
| 361 |
+
%iright_224 = tt.broadcast %iright_223 : tensor<128x1x2xi32, #linear2> -> tensor<128x2x2xi32, #linear2> loc(#loc163)
|
| 362 |
+
%ileft_225 = tt.reshape %ileft_220 : tensor<128x2x2xi32, #linear2> -> tensor<32x16xi32, #linear> loc(#loc164)
|
| 363 |
+
%iright_226 = tt.reshape %iright_224 : tensor<128x2x2xi32, #linear2> -> tensor<32x16xi32, #linear> loc(#loc165)
|
| 364 |
+
%y_idx_227 = tt.reshape %new_idxs_215 : tensor<32x16xi32, #linear> -> tensor<128x2x2xi32, #linear2> loc(#loc166)
|
| 365 |
+
%left_idx_228 = arith.muli %y_idx_227, %ileft_105 : tensor<128x2x2xi32, #linear2> loc(#loc168)
|
| 366 |
+
%left_idx_229 = "tt.reduce"(%left_idx_228) <{axis = 1 : i32}> ({
|
| 367 |
+
^bb0(%left_idx_419: i32 loc(callsite(#loc1 at #loc169)), %left_idx_420: i32 loc(callsite(#loc1 at #loc169))):
|
| 368 |
+
%left_idx_421 = arith.addi %left_idx_419, %left_idx_420 : i32 loc(#loc205)
|
| 369 |
+
tt.reduce.return %left_idx_421 : i32 loc(#loc198)
|
| 370 |
+
}) : (tensor<128x2x2xi32, #linear2>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc198)
|
| 371 |
+
%left_idx_230 = tt.expand_dims %left_idx_229 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<128x1x2xi32, #linear2> loc(#loc170)
|
| 372 |
+
%left_idx_231 = tt.broadcast %left_idx_230 : tensor<128x1x2xi32, #linear2> -> tensor<128x2x2xi32, #linear2> loc(#loc171)
|
| 373 |
+
%right_idx_232 = arith.muli %y_idx_227, %flip_48 : tensor<128x2x2xi32, #linear2> loc(#loc173)
|
| 374 |
+
%right_idx_233 = "tt.reduce"(%right_idx_232) <{axis = 1 : i32}> ({
|
| 375 |
+
^bb0(%right_idx_419: i32 loc(callsite(#loc1 at #loc174)), %right_idx_420: i32 loc(callsite(#loc1 at #loc174))):
|
| 376 |
+
%right_idx_421 = arith.addi %right_idx_419, %right_idx_420 : i32 loc(#loc206)
|
| 377 |
+
tt.reduce.return %right_idx_421 : i32 loc(#loc201)
|
| 378 |
+
}) : (tensor<128x2x2xi32, #linear2>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc201)
|
| 379 |
+
%right_idx_234 = tt.expand_dims %right_idx_233 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<128x1x2xi32, #linear2> loc(#loc175)
|
| 380 |
+
%right_idx_235 = tt.broadcast %right_idx_234 : tensor<128x1x2xi32, #linear2> -> tensor<128x2x2xi32, #linear2> loc(#loc176)
|
| 381 |
+
%left_idx_236 = tt.reshape %left_idx_231 : tensor<128x2x2xi32, #linear2> -> tensor<32x16xi32, #linear> loc(#loc177)
|
| 382 |
+
%right_idx_237 = tt.reshape %right_idx_235 : tensor<128x2x2xi32, #linear2> -> tensor<32x16xi32, #linear> loc(#loc178)
|
| 383 |
+
%cond_238 = arith.cmpi slt, %ileft_225, %iright_226 : tensor<32x16xi32, #linear> loc(#loc179)
|
| 384 |
+
%eq_239 = arith.cmpi eq, %ileft_225, %iright_226 : tensor<32x16xi32, #linear> loc(#loc180)
|
| 385 |
+
%cond_240 = arith.cmpi sgt, %left_idx_236, %right_idx_237 : tensor<32x16xi32, #linear> loc(#loc181)
|
| 386 |
+
%cond_241 = arith.andi %eq_239, %cond_240 : tensor<32x16xi1, #linear> loc(#loc182)
|
| 387 |
+
%cond_242 = arith.ori %cond_238, %cond_241 : tensor<32x16xi1, #linear> loc(#loc183)
|
| 388 |
+
%cond_243 = arith.extui %cond_242 : tensor<32x16xi1, #linear> to tensor<32x16xi32, #linear> loc(#loc184)
|
| 389 |
+
%cond_244 = arith.xori %cond_243, %flip_178 : tensor<32x16xi32, #linear> loc(#loc184)
|
| 390 |
+
%cond_245 = arith.cmpi ne, %cond_244, %cst : tensor<32x16xi32, #linear> loc(#loc185)
|
| 391 |
+
%ret_246 = arith.xori %ileft_225, %iright_226 : tensor<32x16xi32, #linear> loc(#loc186)
|
| 392 |
+
%ret_247 = arith.select %cond_245, %ret_246, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc187)
|
| 393 |
+
%ret_248 = arith.xori %ret_212, %ret_247 : tensor<32x16xi32, #linear> loc(#loc188)
|
| 394 |
+
%new_idxs_249 = arith.xori %left_idx_236, %right_idx_237 : tensor<32x16xi32, #linear> loc(#loc189)
|
| 395 |
+
%new_idxs_250 = arith.select %cond_245, %new_idxs_249, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc190)
|
| 396 |
+
%new_idxs_251 = arith.xori %new_idxs_215, %new_idxs_250 : tensor<32x16xi32, #linear> loc(#loc191)
|
| 397 |
+
%y_252 = tt.reshape %ret_248 : tensor<32x16xi32, #linear> -> tensor<256x2x1xi32, #linear1> loc(#loc154)
|
| 398 |
+
%ileft_253 = arith.muli %y_252, %ileft : tensor<256x2x1xi32, #linear1> loc(#loc156)
|
| 399 |
+
%ileft_254 = "tt.reduce"(%ileft_253) <{axis = 1 : i32}> ({
|
| 400 |
+
^bb0(%ileft_419: i32 loc(callsite(#loc1 at #loc157)), %ileft_420: i32 loc(callsite(#loc1 at #loc157))):
|
| 401 |
+
%ileft_421 = arith.addi %ileft_419, %ileft_420 : i32 loc(#loc203)
|
| 402 |
+
tt.reduce.return %ileft_421 : i32 loc(#loc193)
|
| 403 |
+
}) : (tensor<256x2x1xi32, #linear1>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc193)
|
| 404 |
+
%ileft_255 = tt.expand_dims %ileft_254 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<256x1x1xi32, #linear1> loc(#loc158)
|
| 405 |
+
%ileft_256 = tt.broadcast %ileft_255 : tensor<256x1x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc159)
|
| 406 |
+
%iright_257 = arith.muli %y_252, %iright : tensor<256x2x1xi32, #linear1> loc(#loc160)
|
| 407 |
+
%iright_258 = "tt.reduce"(%iright_257) <{axis = 1 : i32}> ({
|
| 408 |
+
^bb0(%iright_419: i32 loc(callsite(#loc1 at #loc161)), %iright_420: i32 loc(callsite(#loc1 at #loc161))):
|
| 409 |
+
%iright_421 = arith.addi %iright_419, %iright_420 : i32 loc(#loc204)
|
| 410 |
+
tt.reduce.return %iright_421 : i32 loc(#loc195)
|
| 411 |
+
}) : (tensor<256x2x1xi32, #linear1>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc195)
|
| 412 |
+
%iright_259 = tt.expand_dims %iright_258 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<256x1x1xi32, #linear1> loc(#loc162)
|
| 413 |
+
%iright_260 = tt.broadcast %iright_259 : tensor<256x1x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc163)
|
| 414 |
+
%ileft_261 = tt.reshape %ileft_256 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #linear> loc(#loc164)
|
| 415 |
+
%iright_262 = tt.reshape %iright_260 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #linear> loc(#loc165)
|
| 416 |
+
%y_idx_263 = tt.reshape %new_idxs_251 : tensor<32x16xi32, #linear> -> tensor<256x2x1xi32, #linear1> loc(#loc166)
|
| 417 |
+
%left_idx_264 = arith.muli %y_idx_263, %ileft : tensor<256x2x1xi32, #linear1> loc(#loc168)
|
| 418 |
+
%left_idx_265 = "tt.reduce"(%left_idx_264) <{axis = 1 : i32}> ({
|
| 419 |
+
^bb0(%left_idx_419: i32 loc(callsite(#loc1 at #loc169)), %left_idx_420: i32 loc(callsite(#loc1 at #loc169))):
|
| 420 |
+
%left_idx_421 = arith.addi %left_idx_419, %left_idx_420 : i32 loc(#loc205)
|
| 421 |
+
tt.reduce.return %left_idx_421 : i32 loc(#loc198)
|
| 422 |
+
}) : (tensor<256x2x1xi32, #linear1>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc198)
|
| 423 |
+
%left_idx_266 = tt.expand_dims %left_idx_265 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<256x1x1xi32, #linear1> loc(#loc170)
|
| 424 |
+
%left_idx_267 = tt.broadcast %left_idx_266 : tensor<256x1x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc171)
|
| 425 |
+
%right_idx_268 = arith.muli %y_idx_263, %iright : tensor<256x2x1xi32, #linear1> loc(#loc173)
|
| 426 |
+
%right_idx_269 = "tt.reduce"(%right_idx_268) <{axis = 1 : i32}> ({
|
| 427 |
+
^bb0(%right_idx_419: i32 loc(callsite(#loc1 at #loc174)), %right_idx_420: i32 loc(callsite(#loc1 at #loc174))):
|
| 428 |
+
%right_idx_421 = arith.addi %right_idx_419, %right_idx_420 : i32 loc(#loc206)
|
| 429 |
+
tt.reduce.return %right_idx_421 : i32 loc(#loc201)
|
| 430 |
+
}) : (tensor<256x2x1xi32, #linear1>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc201)
|
| 431 |
+
%right_idx_270 = tt.expand_dims %right_idx_269 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<256x1x1xi32, #linear1> loc(#loc175)
|
| 432 |
+
%right_idx_271 = tt.broadcast %right_idx_270 : tensor<256x1x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc176)
|
| 433 |
+
%left_idx_272 = tt.reshape %left_idx_267 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #linear> loc(#loc177)
|
| 434 |
+
%right_idx_273 = tt.reshape %right_idx_271 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #linear> loc(#loc178)
|
| 435 |
+
%cond_274 = arith.cmpi slt, %ileft_261, %iright_262 : tensor<32x16xi32, #linear> loc(#loc179)
|
| 436 |
+
%eq_275 = arith.cmpi eq, %ileft_261, %iright_262 : tensor<32x16xi32, #linear> loc(#loc180)
|
| 437 |
+
%cond_276 = arith.cmpi sgt, %left_idx_272, %right_idx_273 : tensor<32x16xi32, #linear> loc(#loc181)
|
| 438 |
+
%cond_277 = arith.andi %eq_275, %cond_276 : tensor<32x16xi1, #linear> loc(#loc182)
|
| 439 |
+
%cond_278 = arith.ori %cond_274, %cond_277 : tensor<32x16xi1, #linear> loc(#loc183)
|
| 440 |
+
%cond_279 = arith.extui %cond_278 : tensor<32x16xi1, #linear> to tensor<32x16xi32, #linear> loc(#loc184)
|
| 441 |
+
%cond_280 = arith.xori %cond_279, %flip_178 : tensor<32x16xi32, #linear> loc(#loc184)
|
| 442 |
+
%cond_281 = arith.cmpi ne, %cond_280, %cst : tensor<32x16xi32, #linear> loc(#loc185)
|
| 443 |
+
%ret_282 = arith.xori %ileft_261, %iright_262 : tensor<32x16xi32, #linear> loc(#loc186)
|
| 444 |
+
%ret_283 = arith.select %cond_281, %ret_282, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc187)
|
| 445 |
+
%ret_284 = arith.xori %ret_248, %ret_283 : tensor<32x16xi32, #linear> loc(#loc188)
|
| 446 |
+
%new_idxs_285 = arith.xori %left_idx_272, %right_idx_273 : tensor<32x16xi32, #linear> loc(#loc189)
|
| 447 |
+
%new_idxs_286 = arith.select %cond_281, %new_idxs_285, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc190)
|
| 448 |
+
%new_idxs_287 = arith.xori %new_idxs_251, %new_idxs_286 : tensor<32x16xi32, #linear> loc(#loc191)
|
| 449 |
+
%y_288 = tt.reshape %ret_284 : tensor<32x16xi32, #linear> -> tensor<32x2x8xi32, #linear4> loc(#loc154)
|
| 450 |
+
%ileft_289 = tt.broadcast %left_mask_53 : tensor<1x2x1xi32, #linear4> -> tensor<32x2x8xi32, #linear4> loc(#loc156)
|
| 451 |
+
%ileft_290 = arith.muli %y_288, %ileft_289 : tensor<32x2x8xi32, #linear4> loc(#loc156)
|
| 452 |
+
%ileft_291 = "tt.reduce"(%ileft_290) <{axis = 1 : i32}> ({
|
| 453 |
+
^bb0(%ileft_419: i32 loc(callsite(#loc1 at #loc157)), %ileft_420: i32 loc(callsite(#loc1 at #loc157))):
|
| 454 |
+
%ileft_421 = arith.addi %ileft_419, %ileft_420 : i32 loc(#loc203)
|
| 455 |
+
tt.reduce.return %ileft_421 : i32 loc(#loc193)
|
| 456 |
+
}) : (tensor<32x2x8xi32, #linear4>) -> tensor<32x8xi32, #ttg.slice<{dim = 1, parent = #linear4}>> loc(#loc193)
|
| 457 |
+
%ileft_292 = tt.expand_dims %ileft_291 {axis = 1 : i32} : tensor<32x8xi32, #ttg.slice<{dim = 1, parent = #linear4}>> -> tensor<32x1x8xi32, #linear4> loc(#loc158)
|
| 458 |
+
%ileft_293 = tt.broadcast %ileft_292 : tensor<32x1x8xi32, #linear4> -> tensor<32x2x8xi32, #linear4> loc(#loc159)
|
| 459 |
+
%iright_294 = arith.muli %y_288, %flip_177 : tensor<32x2x8xi32, #linear4> loc(#loc160)
|
| 460 |
+
%iright_295 = "tt.reduce"(%iright_294) <{axis = 1 : i32}> ({
|
| 461 |
+
^bb0(%iright_419: i32 loc(callsite(#loc1 at #loc161)), %iright_420: i32 loc(callsite(#loc1 at #loc161))):
|
| 462 |
+
%iright_421 = arith.addi %iright_419, %iright_420 : i32 loc(#loc204)
|
| 463 |
+
tt.reduce.return %iright_421 : i32 loc(#loc195)
|
| 464 |
+
}) : (tensor<32x2x8xi32, #linear4>) -> tensor<32x8xi32, #ttg.slice<{dim = 1, parent = #linear4}>> loc(#loc195)
|
| 465 |
+
%iright_296 = tt.expand_dims %iright_295 {axis = 1 : i32} : tensor<32x8xi32, #ttg.slice<{dim = 1, parent = #linear4}>> -> tensor<32x1x8xi32, #linear4> loc(#loc162)
|
| 466 |
+
%iright_297 = tt.broadcast %iright_296 : tensor<32x1x8xi32, #linear4> -> tensor<32x2x8xi32, #linear4> loc(#loc163)
|
| 467 |
+
%ileft_298 = tt.reshape %ileft_293 : tensor<32x2x8xi32, #linear4> -> tensor<32x16xi32, #linear> loc(#loc164)
|
| 468 |
+
%iright_299 = tt.reshape %iright_297 : tensor<32x2x8xi32, #linear4> -> tensor<32x16xi32, #linear> loc(#loc165)
|
| 469 |
+
%y_idx_300 = tt.reshape %new_idxs_287 : tensor<32x16xi32, #linear> -> tensor<32x2x8xi32, #linear4> loc(#loc166)
|
| 470 |
+
%left_idx_301 = arith.muli %y_idx_300, %ileft_289 : tensor<32x2x8xi32, #linear4> loc(#loc168)
|
| 471 |
+
%left_idx_302 = "tt.reduce"(%left_idx_301) <{axis = 1 : i32}> ({
|
| 472 |
+
^bb0(%left_idx_419: i32 loc(callsite(#loc1 at #loc169)), %left_idx_420: i32 loc(callsite(#loc1 at #loc169))):
|
| 473 |
+
%left_idx_421 = arith.addi %left_idx_419, %left_idx_420 : i32 loc(#loc205)
|
| 474 |
+
tt.reduce.return %left_idx_421 : i32 loc(#loc198)
|
| 475 |
+
}) : (tensor<32x2x8xi32, #linear4>) -> tensor<32x8xi32, #ttg.slice<{dim = 1, parent = #linear4}>> loc(#loc198)
|
| 476 |
+
%left_idx_303 = tt.expand_dims %left_idx_302 {axis = 1 : i32} : tensor<32x8xi32, #ttg.slice<{dim = 1, parent = #linear4}>> -> tensor<32x1x8xi32, #linear4> loc(#loc170)
|
| 477 |
+
%left_idx_304 = tt.broadcast %left_idx_303 : tensor<32x1x8xi32, #linear4> -> tensor<32x2x8xi32, #linear4> loc(#loc171)
|
| 478 |
+
%right_idx_305 = arith.muli %y_idx_300, %flip_177 : tensor<32x2x8xi32, #linear4> loc(#loc173)
|
| 479 |
+
%right_idx_306 = "tt.reduce"(%right_idx_305) <{axis = 1 : i32}> ({
|
| 480 |
+
^bb0(%right_idx_419: i32 loc(callsite(#loc1 at #loc174)), %right_idx_420: i32 loc(callsite(#loc1 at #loc174))):
|
| 481 |
+
%right_idx_421 = arith.addi %right_idx_419, %right_idx_420 : i32 loc(#loc206)
|
| 482 |
+
tt.reduce.return %right_idx_421 : i32 loc(#loc201)
|
| 483 |
+
}) : (tensor<32x2x8xi32, #linear4>) -> tensor<32x8xi32, #ttg.slice<{dim = 1, parent = #linear4}>> loc(#loc201)
|
| 484 |
+
%right_idx_307 = tt.expand_dims %right_idx_306 {axis = 1 : i32} : tensor<32x8xi32, #ttg.slice<{dim = 1, parent = #linear4}>> -> tensor<32x1x8xi32, #linear4> loc(#loc175)
|
| 485 |
+
%right_idx_308 = tt.broadcast %right_idx_307 : tensor<32x1x8xi32, #linear4> -> tensor<32x2x8xi32, #linear4> loc(#loc176)
|
| 486 |
+
%left_idx_309 = tt.reshape %left_idx_304 : tensor<32x2x8xi32, #linear4> -> tensor<32x16xi32, #linear> loc(#loc177)
|
| 487 |
+
%right_idx_310 = tt.reshape %right_idx_308 : tensor<32x2x8xi32, #linear4> -> tensor<32x16xi32, #linear> loc(#loc178)
|
| 488 |
+
%cond_311 = arith.cmpi slt, %ileft_298, %iright_299 : tensor<32x16xi32, #linear> loc(#loc179)
|
| 489 |
+
%eq_312 = arith.cmpi eq, %ileft_298, %iright_299 : tensor<32x16xi32, #linear> loc(#loc180)
|
| 490 |
+
%cond_313 = arith.cmpi sgt, %left_idx_309, %right_idx_310 : tensor<32x16xi32, #linear> loc(#loc181)
|
| 491 |
+
%cond_314 = arith.andi %eq_312, %cond_313 : tensor<32x16xi1, #linear> loc(#loc182)
|
| 492 |
+
%cond_315 = arith.ori %cond_311, %cond_314 : tensor<32x16xi1, #linear> loc(#loc183)
|
| 493 |
+
%ret_316 = arith.xori %ileft_298, %iright_299 : tensor<32x16xi32, #linear> loc(#loc186)
|
| 494 |
+
%ret_317 = arith.select %cond_315, %ret_316, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc187)
|
| 495 |
+
%ret_318 = arith.xori %ret_284, %ret_317 : tensor<32x16xi32, #linear> loc(#loc188)
|
| 496 |
+
%new_idxs_319 = arith.xori %left_idx_309, %right_idx_310 : tensor<32x16xi32, #linear> loc(#loc189)
|
| 497 |
+
%new_idxs_320 = arith.select %cond_315, %new_idxs_319, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc190)
|
| 498 |
+
%new_idxs_321 = arith.xori %new_idxs_287, %new_idxs_320 : tensor<32x16xi32, #linear> loc(#loc191)
|
| 499 |
+
%y_322 = tt.reshape %ret_318 : tensor<32x16xi32, #linear> -> tensor<64x2x4xi32, #linear3> loc(#loc154)
|
| 500 |
+
%ileft_323 = arith.muli %y_322, %ileft_180 : tensor<64x2x4xi32, #linear3> loc(#loc156)
|
| 501 |
+
%ileft_324 = "tt.reduce"(%ileft_323) <{axis = 1 : i32}> ({
|
| 502 |
+
^bb0(%ileft_419: i32 loc(callsite(#loc1 at #loc157)), %ileft_420: i32 loc(callsite(#loc1 at #loc157))):
|
| 503 |
+
%ileft_421 = arith.addi %ileft_419, %ileft_420 : i32 loc(#loc203)
|
| 504 |
+
tt.reduce.return %ileft_421 : i32 loc(#loc193)
|
| 505 |
+
}) : (tensor<64x2x4xi32, #linear3>) -> tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> loc(#loc193)
|
| 506 |
+
%ileft_325 = tt.expand_dims %ileft_324 {axis = 1 : i32} : tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> -> tensor<64x1x4xi32, #linear3> loc(#loc158)
|
| 507 |
+
%ileft_326 = tt.broadcast %ileft_325 : tensor<64x1x4xi32, #linear3> -> tensor<64x2x4xi32, #linear3> loc(#loc159)
|
| 508 |
+
%iright_327 = arith.muli %y_322, %flip_102 : tensor<64x2x4xi32, #linear3> loc(#loc160)
|
| 509 |
+
%iright_328 = "tt.reduce"(%iright_327) <{axis = 1 : i32}> ({
|
| 510 |
+
^bb0(%iright_419: i32 loc(callsite(#loc1 at #loc161)), %iright_420: i32 loc(callsite(#loc1 at #loc161))):
|
| 511 |
+
%iright_421 = arith.addi %iright_419, %iright_420 : i32 loc(#loc204)
|
| 512 |
+
tt.reduce.return %iright_421 : i32 loc(#loc195)
|
| 513 |
+
}) : (tensor<64x2x4xi32, #linear3>) -> tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> loc(#loc195)
|
| 514 |
+
%iright_329 = tt.expand_dims %iright_328 {axis = 1 : i32} : tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> -> tensor<64x1x4xi32, #linear3> loc(#loc162)
|
| 515 |
+
%iright_330 = tt.broadcast %iright_329 : tensor<64x1x4xi32, #linear3> -> tensor<64x2x4xi32, #linear3> loc(#loc163)
|
| 516 |
+
%ileft_331 = tt.reshape %ileft_326 : tensor<64x2x4xi32, #linear3> -> tensor<32x16xi32, #linear> loc(#loc164)
|
| 517 |
+
%iright_332 = tt.reshape %iright_330 : tensor<64x2x4xi32, #linear3> -> tensor<32x16xi32, #linear> loc(#loc165)
|
| 518 |
+
%y_idx_333 = tt.reshape %new_idxs_321 : tensor<32x16xi32, #linear> -> tensor<64x2x4xi32, #linear3> loc(#loc166)
|
| 519 |
+
%left_idx_334 = arith.muli %y_idx_333, %ileft_180 : tensor<64x2x4xi32, #linear3> loc(#loc168)
|
| 520 |
+
%left_idx_335 = "tt.reduce"(%left_idx_334) <{axis = 1 : i32}> ({
|
| 521 |
+
^bb0(%left_idx_419: i32 loc(callsite(#loc1 at #loc169)), %left_idx_420: i32 loc(callsite(#loc1 at #loc169))):
|
| 522 |
+
%left_idx_421 = arith.addi %left_idx_419, %left_idx_420 : i32 loc(#loc205)
|
| 523 |
+
tt.reduce.return %left_idx_421 : i32 loc(#loc198)
|
| 524 |
+
}) : (tensor<64x2x4xi32, #linear3>) -> tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> loc(#loc198)
|
| 525 |
+
%left_idx_336 = tt.expand_dims %left_idx_335 {axis = 1 : i32} : tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> -> tensor<64x1x4xi32, #linear3> loc(#loc170)
|
| 526 |
+
%left_idx_337 = tt.broadcast %left_idx_336 : tensor<64x1x4xi32, #linear3> -> tensor<64x2x4xi32, #linear3> loc(#loc171)
|
| 527 |
+
%right_idx_338 = arith.muli %y_idx_333, %flip_102 : tensor<64x2x4xi32, #linear3> loc(#loc173)
|
| 528 |
+
%right_idx_339 = "tt.reduce"(%right_idx_338) <{axis = 1 : i32}> ({
|
| 529 |
+
^bb0(%right_idx_419: i32 loc(callsite(#loc1 at #loc174)), %right_idx_420: i32 loc(callsite(#loc1 at #loc174))):
|
| 530 |
+
%right_idx_421 = arith.addi %right_idx_419, %right_idx_420 : i32 loc(#loc206)
|
| 531 |
+
tt.reduce.return %right_idx_421 : i32 loc(#loc201)
|
| 532 |
+
}) : (tensor<64x2x4xi32, #linear3>) -> tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> loc(#loc201)
|
| 533 |
+
%right_idx_340 = tt.expand_dims %right_idx_339 {axis = 1 : i32} : tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> -> tensor<64x1x4xi32, #linear3> loc(#loc175)
|
| 534 |
+
%right_idx_341 = tt.broadcast %right_idx_340 : tensor<64x1x4xi32, #linear3> -> tensor<64x2x4xi32, #linear3> loc(#loc176)
|
| 535 |
+
%left_idx_342 = tt.reshape %left_idx_337 : tensor<64x2x4xi32, #linear3> -> tensor<32x16xi32, #linear> loc(#loc177)
|
| 536 |
+
%right_idx_343 = tt.reshape %right_idx_341 : tensor<64x2x4xi32, #linear3> -> tensor<32x16xi32, #linear> loc(#loc178)
|
| 537 |
+
%cond_344 = arith.cmpi slt, %ileft_331, %iright_332 : tensor<32x16xi32, #linear> loc(#loc179)
|
| 538 |
+
%eq_345 = arith.cmpi eq, %ileft_331, %iright_332 : tensor<32x16xi32, #linear> loc(#loc180)
|
| 539 |
+
%cond_346 = arith.cmpi sgt, %left_idx_342, %right_idx_343 : tensor<32x16xi32, #linear> loc(#loc181)
|
| 540 |
+
%cond_347 = arith.andi %eq_345, %cond_346 : tensor<32x16xi1, #linear> loc(#loc182)
|
| 541 |
+
%cond_348 = arith.ori %cond_344, %cond_347 : tensor<32x16xi1, #linear> loc(#loc183)
|
| 542 |
+
%ret_349 = arith.xori %ileft_331, %iright_332 : tensor<32x16xi32, #linear> loc(#loc186)
|
| 543 |
+
%ret_350 = arith.select %cond_348, %ret_349, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc187)
|
| 544 |
+
%ret_351 = arith.xori %ret_318, %ret_350 : tensor<32x16xi32, #linear> loc(#loc188)
|
| 545 |
+
%new_idxs_352 = arith.xori %left_idx_342, %right_idx_343 : tensor<32x16xi32, #linear> loc(#loc189)
|
| 546 |
+
%new_idxs_353 = arith.select %cond_348, %new_idxs_352, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc190)
|
| 547 |
+
%new_idxs_354 = arith.xori %new_idxs_321, %new_idxs_353 : tensor<32x16xi32, #linear> loc(#loc191)
|
| 548 |
+
%y_355 = tt.reshape %ret_351 : tensor<32x16xi32, #linear> -> tensor<128x2x2xi32, #linear2> loc(#loc154)
|
| 549 |
+
%ileft_356 = arith.muli %y_355, %ileft_105 : tensor<128x2x2xi32, #linear2> loc(#loc156)
|
| 550 |
+
%ileft_357 = "tt.reduce"(%ileft_356) <{axis = 1 : i32}> ({
|
| 551 |
+
^bb0(%ileft_419: i32 loc(callsite(#loc1 at #loc157)), %ileft_420: i32 loc(callsite(#loc1 at #loc157))):
|
| 552 |
+
%ileft_421 = arith.addi %ileft_419, %ileft_420 : i32 loc(#loc203)
|
| 553 |
+
tt.reduce.return %ileft_421 : i32 loc(#loc193)
|
| 554 |
+
}) : (tensor<128x2x2xi32, #linear2>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc193)
|
| 555 |
+
%ileft_358 = tt.expand_dims %ileft_357 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<128x1x2xi32, #linear2> loc(#loc158)
|
| 556 |
+
%ileft_359 = tt.broadcast %ileft_358 : tensor<128x1x2xi32, #linear2> -> tensor<128x2x2xi32, #linear2> loc(#loc159)
|
| 557 |
+
%iright_360 = arith.muli %y_355, %flip_48 : tensor<128x2x2xi32, #linear2> loc(#loc160)
|
| 558 |
+
%iright_361 = "tt.reduce"(%iright_360) <{axis = 1 : i32}> ({
|
| 559 |
+
^bb0(%iright_419: i32 loc(callsite(#loc1 at #loc161)), %iright_420: i32 loc(callsite(#loc1 at #loc161))):
|
| 560 |
+
%iright_421 = arith.addi %iright_419, %iright_420 : i32 loc(#loc204)
|
| 561 |
+
tt.reduce.return %iright_421 : i32 loc(#loc195)
|
| 562 |
+
}) : (tensor<128x2x2xi32, #linear2>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc195)
|
| 563 |
+
%iright_362 = tt.expand_dims %iright_361 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<128x1x2xi32, #linear2> loc(#loc162)
|
| 564 |
+
%iright_363 = tt.broadcast %iright_362 : tensor<128x1x2xi32, #linear2> -> tensor<128x2x2xi32, #linear2> loc(#loc163)
|
| 565 |
+
%ileft_364 = tt.reshape %ileft_359 : tensor<128x2x2xi32, #linear2> -> tensor<32x16xi32, #linear> loc(#loc164)
|
| 566 |
+
%iright_365 = tt.reshape %iright_363 : tensor<128x2x2xi32, #linear2> -> tensor<32x16xi32, #linear> loc(#loc165)
|
| 567 |
+
%y_idx_366 = tt.reshape %new_idxs_354 : tensor<32x16xi32, #linear> -> tensor<128x2x2xi32, #linear2> loc(#loc166)
|
| 568 |
+
%left_idx_367 = arith.muli %y_idx_366, %ileft_105 : tensor<128x2x2xi32, #linear2> loc(#loc168)
|
| 569 |
+
%left_idx_368 = "tt.reduce"(%left_idx_367) <{axis = 1 : i32}> ({
|
| 570 |
+
^bb0(%left_idx_419: i32 loc(callsite(#loc1 at #loc169)), %left_idx_420: i32 loc(callsite(#loc1 at #loc169))):
|
| 571 |
+
%left_idx_421 = arith.addi %left_idx_419, %left_idx_420 : i32 loc(#loc205)
|
| 572 |
+
tt.reduce.return %left_idx_421 : i32 loc(#loc198)
|
| 573 |
+
}) : (tensor<128x2x2xi32, #linear2>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc198)
|
| 574 |
+
%left_idx_369 = tt.expand_dims %left_idx_368 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<128x1x2xi32, #linear2> loc(#loc170)
|
| 575 |
+
%left_idx_370 = tt.broadcast %left_idx_369 : tensor<128x1x2xi32, #linear2> -> tensor<128x2x2xi32, #linear2> loc(#loc171)
|
| 576 |
+
%right_idx_371 = arith.muli %y_idx_366, %flip_48 : tensor<128x2x2xi32, #linear2> loc(#loc173)
|
| 577 |
+
%right_idx_372 = "tt.reduce"(%right_idx_371) <{axis = 1 : i32}> ({
|
| 578 |
+
^bb0(%right_idx_419: i32 loc(callsite(#loc1 at #loc174)), %right_idx_420: i32 loc(callsite(#loc1 at #loc174))):
|
| 579 |
+
%right_idx_421 = arith.addi %right_idx_419, %right_idx_420 : i32 loc(#loc206)
|
| 580 |
+
tt.reduce.return %right_idx_421 : i32 loc(#loc201)
|
| 581 |
+
}) : (tensor<128x2x2xi32, #linear2>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc201)
|
| 582 |
+
%right_idx_373 = tt.expand_dims %right_idx_372 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<128x1x2xi32, #linear2> loc(#loc175)
|
| 583 |
+
%right_idx_374 = tt.broadcast %right_idx_373 : tensor<128x1x2xi32, #linear2> -> tensor<128x2x2xi32, #linear2> loc(#loc176)
|
| 584 |
+
%left_idx_375 = tt.reshape %left_idx_370 : tensor<128x2x2xi32, #linear2> -> tensor<32x16xi32, #linear> loc(#loc177)
|
| 585 |
+
%right_idx_376 = tt.reshape %right_idx_374 : tensor<128x2x2xi32, #linear2> -> tensor<32x16xi32, #linear> loc(#loc178)
|
| 586 |
+
%cond_377 = arith.cmpi slt, %ileft_364, %iright_365 : tensor<32x16xi32, #linear> loc(#loc179)
|
| 587 |
+
%eq_378 = arith.cmpi eq, %ileft_364, %iright_365 : tensor<32x16xi32, #linear> loc(#loc180)
|
| 588 |
+
%cond_379 = arith.cmpi sgt, %left_idx_375, %right_idx_376 : tensor<32x16xi32, #linear> loc(#loc181)
|
| 589 |
+
%cond_380 = arith.andi %eq_378, %cond_379 : tensor<32x16xi1, #linear> loc(#loc182)
|
| 590 |
+
%cond_381 = arith.ori %cond_377, %cond_380 : tensor<32x16xi1, #linear> loc(#loc183)
|
| 591 |
+
%ret_382 = arith.xori %ileft_364, %iright_365 : tensor<32x16xi32, #linear> loc(#loc186)
|
| 592 |
+
%ret_383 = arith.select %cond_381, %ret_382, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc187)
|
| 593 |
+
%ret_384 = arith.xori %ret_351, %ret_383 : tensor<32x16xi32, #linear> loc(#loc188)
|
| 594 |
+
%new_idxs_385 = arith.xori %left_idx_375, %right_idx_376 : tensor<32x16xi32, #linear> loc(#loc189)
|
| 595 |
+
%new_idxs_386 = arith.select %cond_381, %new_idxs_385, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc190)
|
| 596 |
+
%new_idxs_387 = arith.xori %new_idxs_354, %new_idxs_386 : tensor<32x16xi32, #linear> loc(#loc191)
|
| 597 |
+
%y_388 = tt.reshape %ret_384 : tensor<32x16xi32, #linear> -> tensor<256x2x1xi32, #linear1> loc(#loc154)
|
| 598 |
+
%ileft_389 = arith.muli %y_388, %ileft : tensor<256x2x1xi32, #linear1> loc(#loc156)
|
| 599 |
+
%ileft_390 = "tt.reduce"(%ileft_389) <{axis = 1 : i32}> ({
|
| 600 |
+
^bb0(%ileft_419: i32 loc(callsite(#loc1 at #loc157)), %ileft_420: i32 loc(callsite(#loc1 at #loc157))):
|
| 601 |
+
%ileft_421 = arith.addi %ileft_419, %ileft_420 : i32 loc(#loc203)
|
| 602 |
+
tt.reduce.return %ileft_421 : i32 loc(#loc193)
|
| 603 |
+
}) : (tensor<256x2x1xi32, #linear1>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc193)
|
| 604 |
+
%ileft_391 = tt.expand_dims %ileft_390 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<256x1x1xi32, #linear1> loc(#loc158)
|
| 605 |
+
%ileft_392 = tt.broadcast %ileft_391 : tensor<256x1x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc159)
|
| 606 |
+
%iright_393 = arith.muli %y_388, %iright : tensor<256x2x1xi32, #linear1> loc(#loc160)
|
| 607 |
+
%iright_394 = "tt.reduce"(%iright_393) <{axis = 1 : i32}> ({
|
| 608 |
+
^bb0(%iright_419: i32 loc(callsite(#loc1 at #loc161)), %iright_420: i32 loc(callsite(#loc1 at #loc161))):
|
| 609 |
+
%iright_421 = arith.addi %iright_419, %iright_420 : i32 loc(#loc204)
|
| 610 |
+
tt.reduce.return %iright_421 : i32 loc(#loc195)
|
| 611 |
+
}) : (tensor<256x2x1xi32, #linear1>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc195)
|
| 612 |
+
%iright_395 = tt.expand_dims %iright_394 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<256x1x1xi32, #linear1> loc(#loc162)
|
| 613 |
+
%iright_396 = tt.broadcast %iright_395 : tensor<256x1x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc163)
|
| 614 |
+
%ileft_397 = tt.reshape %ileft_392 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #linear> loc(#loc164)
|
| 615 |
+
%iright_398 = tt.reshape %iright_396 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #linear> loc(#loc165)
|
| 616 |
+
%y_idx_399 = tt.reshape %new_idxs_387 : tensor<32x16xi32, #linear> -> tensor<256x2x1xi32, #linear1> loc(#loc166)
|
| 617 |
+
%left_idx_400 = arith.muli %y_idx_399, %ileft : tensor<256x2x1xi32, #linear1> loc(#loc168)
|
| 618 |
+
%left_idx_401 = "tt.reduce"(%left_idx_400) <{axis = 1 : i32}> ({
|
| 619 |
+
^bb0(%left_idx_419: i32 loc(callsite(#loc1 at #loc169)), %left_idx_420: i32 loc(callsite(#loc1 at #loc169))):
|
| 620 |
+
%left_idx_421 = arith.addi %left_idx_419, %left_idx_420 : i32 loc(#loc205)
|
| 621 |
+
tt.reduce.return %left_idx_421 : i32 loc(#loc198)
|
| 622 |
+
}) : (tensor<256x2x1xi32, #linear1>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc198)
|
| 623 |
+
%left_idx_402 = tt.expand_dims %left_idx_401 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<256x1x1xi32, #linear1> loc(#loc170)
|
| 624 |
+
%left_idx_403 = tt.broadcast %left_idx_402 : tensor<256x1x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc171)
|
| 625 |
+
%right_idx_404 = arith.muli %y_idx_399, %iright : tensor<256x2x1xi32, #linear1> loc(#loc173)
|
| 626 |
+
%right_idx_405 = "tt.reduce"(%right_idx_404) <{axis = 1 : i32}> ({
|
| 627 |
+
^bb0(%right_idx_419: i32 loc(callsite(#loc1 at #loc174)), %right_idx_420: i32 loc(callsite(#loc1 at #loc174))):
|
| 628 |
+
%right_idx_421 = arith.addi %right_idx_419, %right_idx_420 : i32 loc(#loc206)
|
| 629 |
+
tt.reduce.return %right_idx_421 : i32 loc(#loc201)
|
| 630 |
+
}) : (tensor<256x2x1xi32, #linear1>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc201)
|
| 631 |
+
%right_idx_406 = tt.expand_dims %right_idx_405 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<256x1x1xi32, #linear1> loc(#loc175)
|
| 632 |
+
%right_idx_407 = tt.broadcast %right_idx_406 : tensor<256x1x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc176)
|
| 633 |
+
%left_idx_408 = tt.reshape %left_idx_403 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #linear> loc(#loc177)
|
| 634 |
+
%right_idx_409 = tt.reshape %right_idx_407 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #linear> loc(#loc178)
|
| 635 |
+
%cond_410 = arith.cmpi slt, %ileft_397, %iright_398 : tensor<32x16xi32, #linear> loc(#loc179)
|
| 636 |
+
%eq_411 = arith.cmpi eq, %ileft_397, %iright_398 : tensor<32x16xi32, #linear> loc(#loc180)
|
| 637 |
+
%cond_412 = arith.cmpi sgt, %left_idx_408, %right_idx_409 : tensor<32x16xi32, #linear> loc(#loc181)
|
| 638 |
+
%cond_413 = arith.andi %eq_411, %cond_412 : tensor<32x16xi1, #linear> loc(#loc182)
|
| 639 |
+
%cond_414 = arith.ori %cond_410, %cond_413 : tensor<32x16xi1, #linear> loc(#loc183)
|
| 640 |
+
%new_idxs_415 = arith.xori %left_idx_408, %right_idx_409 : tensor<32x16xi32, #linear> loc(#loc189)
|
| 641 |
+
%new_idxs_416 = arith.select %cond_414, %new_idxs_415, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc190)
|
| 642 |
+
%new_idxs_417 = arith.xori %new_idxs_387, %new_idxs_416 : tensor<32x16xi32, #linear> loc(#loc191)
|
| 643 |
+
%tmp7 = arith.extsi %tmp0_36 : tensor<32x16xi32, #blocked> to tensor<32x16xi64, #blocked> loc(#loc141)
|
| 644 |
+
%tmp10 = arith.select %tmp0_34, %tmp7, %cst_0 : tensor<32x16xi1, #blocked>, tensor<32x16xi64, #blocked> loc(#loc142)
|
| 645 |
+
%tmp11 = "tt.reduce"(%tmp10) <{axis = 1 : i32}> ({
|
| 646 |
+
^bb0(%tmp11_419: i64 loc(callsite(#loc1 at #loc143)), %tmp11_420: i64 loc(callsite(#loc1 at #loc143))):
|
| 647 |
+
%tmp11_421 = arith.addi %tmp11_419, %tmp11_420 : i64 loc(#loc192)
|
| 648 |
+
tt.reduce.return %tmp11_421 : i64 loc(#loc152)
|
| 649 |
+
}) : (tensor<32x16xi64, #blocked>) -> tensor<32xi64, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc152)
|
| 650 |
+
%tmp11_418 = tt.expand_dims %tmp11 {axis = 1 : i32} : tensor<32xi64, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<32x1xi64, #blocked> loc(#loc144)
|
| 651 |
+
%tmp14 = arith.trunci %tmp11_418 : tensor<32x1xi64, #blocked> to tensor<32x1xi32, #blocked> loc(#loc145)
|
| 652 |
+
%0 = arith.muli %xindex_19, %cst_4 : tensor<32x1xi32, #blocked1> loc(#loc70)
|
| 653 |
+
%1 = tt.broadcast %r0_index_25 : tensor<1x16xi32, #blocked1> -> tensor<32x16xi32, #blocked1> loc(#loc71)
|
| 654 |
+
%2 = tt.broadcast %0 : tensor<32x1xi32, #blocked1> -> tensor<32x16xi32, #blocked1> loc(#loc71)
|
| 655 |
+
%3 = arith.addi %1, %2 : tensor<32x16xi32, #blocked1> loc(#loc71)
|
| 656 |
+
%4 = tt.splat %out_ptr2 : !tt.ptr<i32> -> tensor<32x16x!tt.ptr<i32>, #blocked1> loc(#loc72)
|
| 657 |
+
%5 = tt.addptr %4, %3 : tensor<32x16x!tt.ptr<i32>, #blocked1>, tensor<32x16xi32, #blocked1> loc(#loc72)
|
| 658 |
+
%6 = ttg.convert_layout %new_idxs_417 : tensor<32x16xi32, #linear> -> tensor<32x16xi32, #blocked1> loc(#loc73)
|
| 659 |
+
tt.store %5, %6, %tmp0_35 : tensor<32x16x!tt.ptr<i32>, #blocked1> loc(#loc73)
|
| 660 |
+
%7 = tt.splat %out_ptr3 : !tt.ptr<i32> -> tensor<32x1x!tt.ptr<i32>, #blocked> loc(#loc74)
|
| 661 |
+
%8 = tt.addptr %7, %xindex_18 : tensor<32x1x!tt.ptr<i32>, #blocked>, tensor<32x1xi32, #blocked> loc(#loc74)
|
| 662 |
+
tt.store %8, %tmp14, %xmask : tensor<32x1x!tt.ptr<i32>, #blocked> loc(#loc75)
|
| 663 |
+
tt.return loc(#loc76)
|
| 664 |
+
} loc(#loc)
|
| 665 |
+
} loc(#loc)
|
| 666 |
+
#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":24:28)
|
| 667 |
+
#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":24:33)
|
| 668 |
+
#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":25:44)
|
| 669 |
+
#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":25:23)
|
| 670 |
+
#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":26:21)
|
| 671 |
+
#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":27:38)
|
| 672 |
+
#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":33:19)
|
| 673 |
+
#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":34:19)
|
| 674 |
+
#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":36:38)
|
| 675 |
+
#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":36:35)
|
| 676 |
+
#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":36:49)
|
| 677 |
+
#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":36:45)
|
| 678 |
+
#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":36:30)
|
| 679 |
+
#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":36:54)
|
| 680 |
+
#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":38:19)
|
| 681 |
+
#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":40:33)
|
| 682 |
+
#loc18 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:44)
|
| 683 |
+
#loc21 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:60)
|
| 684 |
+
#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:68)
|
| 685 |
+
#loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":533:22)
|
| 686 |
+
#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":537:21)
|
| 687 |
+
#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:40)
|
| 688 |
+
#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36)
|
| 689 |
+
#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15)
|
| 690 |
+
#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:65)
|
| 691 |
+
#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:78)
|
| 692 |
+
#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:41)
|
| 693 |
+
#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:67)
|
| 694 |
+
#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:80)
|
| 695 |
+
#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":540:30)
|
| 696 |
+
#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":541:32)
|
| 697 |
+
#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":546:29)
|
| 698 |
+
#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:36)
|
| 699 |
+
#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:23)
|
| 700 |
+
#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":290:25)
|
| 701 |
+
#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:53)
|
| 702 |
+
#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:66)
|
| 703 |
+
#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:37)
|
| 704 |
+
#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:23)
|
| 705 |
+
#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:54)
|
| 706 |
+
#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:67)
|
| 707 |
+
#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":553:36)
|
| 708 |
+
#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":554:38)
|
| 709 |
+
#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":574:22)
|
| 710 |
+
#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":591:21)
|
| 711 |
+
#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:40)
|
| 712 |
+
#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:29)
|
| 713 |
+
#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:23)
|
| 714 |
+
#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:19)
|
| 715 |
+
#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:28)
|
| 716 |
+
#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:38)
|
| 717 |
+
#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:46)
|
| 718 |
+
#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:15)
|
| 719 |
+
#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:48)
|
| 720 |
+
#loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:59)
|
| 721 |
+
#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:22)
|
| 722 |
+
#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":42:19)
|
| 723 |
+
#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":44:34)
|
| 724 |
+
#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":45:29)
|
| 725 |
+
#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":48:21)
|
| 726 |
+
#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":49:35)
|
| 727 |
+
#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":49:32)
|
| 728 |
+
#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":49:25)
|
| 729 |
+
#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":49:47)
|
| 730 |
+
#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":50:25)
|
| 731 |
+
#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":50:37)
|
| 732 |
+
#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":50:4)
|
| 733 |
+
#loc82 = loc("xoffset"(#loc2))
|
| 734 |
+
#loc83 = loc("xoffset"(#loc3))
|
| 735 |
+
#loc84 = loc("xindex"(#loc4))
|
| 736 |
+
#loc85 = loc("xindex"(#loc5))
|
| 737 |
+
#loc86 = loc("xmask"(#loc6))
|
| 738 |
+
#loc87 = loc("r0_index"(#loc7))
|
| 739 |
+
#loc88 = loc("x0"(#loc8))
|
| 740 |
+
#loc89 = loc("x1"(#loc9))
|
| 741 |
+
#loc90 = loc("tmp0"(#loc10))
|
| 742 |
+
#loc91 = loc("tmp0"(#loc11))
|
| 743 |
+
#loc92 = loc("tmp0"(#loc12))
|
| 744 |
+
#loc93 = loc("tmp0"(#loc13))
|
| 745 |
+
#loc94 = loc("tmp0"(#loc14))
|
| 746 |
+
#loc95 = loc("tmp0"(#loc15))
|
| 747 |
+
#loc96 = loc("tmp2"(#loc16))
|
| 748 |
+
#loc97 = loc("tmp4"(#loc17))
|
| 749 |
+
#loc98 = loc("flip"(#loc18))
|
| 750 |
+
#loc100 = loc("flip"(#loc21))
|
| 751 |
+
#loc101 = loc("flip"(#loc22))
|
| 752 |
+
#loc102 = loc("y"(#loc23))
|
| 753 |
+
#loc103 = loc("left_mask"(#loc25))
|
| 754 |
+
#loc104 = loc("ileft"(#loc26))
|
| 755 |
+
#loc106 = loc("ileft"(#loc30))
|
| 756 |
+
#loc107 = loc("ileft"(#loc31))
|
| 757 |
+
#loc108 = loc("iright"(#loc32))
|
| 758 |
+
#loc110 = loc("iright"(#loc34))
|
| 759 |
+
#loc111 = loc("iright"(#loc35))
|
| 760 |
+
#loc112 = loc("ileft"(#loc36))
|
| 761 |
+
#loc113 = loc("iright"(#loc37))
|
| 762 |
+
#loc114 = loc("y_idx"(#loc38))
|
| 763 |
+
#loc115 = loc("left_idx"(#loc39))
|
| 764 |
+
#loc116 = loc("left_idx"(#loc40))
|
| 765 |
+
#loc117 = loc("input"(#loc41))
|
| 766 |
+
#loc119 = loc("left_idx"(#loc43))
|
| 767 |
+
#loc120 = loc("left_idx"(#loc44))
|
| 768 |
+
#loc121 = loc("right_idx"(#loc45))
|
| 769 |
+
#loc122 = loc("right_idx"(#loc46))
|
| 770 |
+
#loc124 = loc("right_idx"(#loc48))
|
| 771 |
+
#loc125 = loc("right_idx"(#loc49))
|
| 772 |
+
#loc126 = loc("left_idx"(#loc50))
|
| 773 |
+
#loc127 = loc("right_idx"(#loc51))
|
| 774 |
+
#loc128 = loc("cond"(#loc52))
|
| 775 |
+
#loc129 = loc("eq"(#loc53))
|
| 776 |
+
#loc130 = loc("cond"(#loc54))
|
| 777 |
+
#loc131 = loc("cond"(#loc55))
|
| 778 |
+
#loc132 = loc("cond"(#loc56))
|
| 779 |
+
#loc133 = loc("cond"(#loc57))
|
| 780 |
+
#loc134 = loc("cond"(#loc58))
|
| 781 |
+
#loc135 = loc("ret"(#loc59))
|
| 782 |
+
#loc136 = loc("ret"(#loc60))
|
| 783 |
+
#loc137 = loc("ret"(#loc61))
|
| 784 |
+
#loc138 = loc("new_idxs"(#loc62))
|
| 785 |
+
#loc139 = loc("new_idxs"(#loc63))
|
| 786 |
+
#loc140 = loc("new_idxs"(#loc64))
|
| 787 |
+
#loc141 = loc("tmp7"(#loc65))
|
| 788 |
+
#loc142 = loc("tmp10"(#loc66))
|
| 789 |
+
#loc144 = loc("tmp11"(#loc68))
|
| 790 |
+
#loc145 = loc("tmp14"(#loc69))
|
| 791 |
+
#loc146 = loc(callsite(#loc98 at #loc99))
|
| 792 |
+
#loc147 = loc(callsite(#loc100 at #loc99))
|
| 793 |
+
#loc148 = loc(callsite(#loc101 at #loc99))
|
| 794 |
+
#loc150 = loc("cond"(#loc128))
|
| 795 |
+
#loc151 = loc("eq"(#loc129))
|
| 796 |
+
#loc152 = loc(callsite(#loc27 at #loc143))
|
| 797 |
+
#loc154 = loc(callsite(#loc102 at #loc149))
|
| 798 |
+
#loc155 = loc(callsite(#loc103 at #loc149))
|
| 799 |
+
#loc156 = loc(callsite(#loc104 at #loc149))
|
| 800 |
+
#loc158 = loc(callsite(#loc106 at #loc149))
|
| 801 |
+
#loc159 = loc(callsite(#loc107 at #loc149))
|
| 802 |
+
#loc160 = loc(callsite(#loc108 at #loc149))
|
| 803 |
+
#loc162 = loc(callsite(#loc110 at #loc149))
|
| 804 |
+
#loc163 = loc(callsite(#loc111 at #loc149))
|
| 805 |
+
#loc164 = loc(callsite(#loc112 at #loc149))
|
| 806 |
+
#loc165 = loc(callsite(#loc113 at #loc149))
|
| 807 |
+
#loc166 = loc(callsite(#loc114 at #loc149))
|
| 808 |
+
#loc167 = loc(callsite(#loc115 at #loc149))
|
| 809 |
+
#loc168 = loc(callsite(#loc116 at #loc149))
|
| 810 |
+
#loc170 = loc(callsite(#loc119 at #loc149))
|
| 811 |
+
#loc171 = loc(callsite(#loc120 at #loc149))
|
| 812 |
+
#loc172 = loc(callsite(#loc121 at #loc149))
|
| 813 |
+
#loc173 = loc(callsite(#loc122 at #loc149))
|
| 814 |
+
#loc175 = loc(callsite(#loc124 at #loc149))
|
| 815 |
+
#loc176 = loc(callsite(#loc125 at #loc149))
|
| 816 |
+
#loc177 = loc(callsite(#loc126 at #loc149))
|
| 817 |
+
#loc178 = loc(callsite(#loc127 at #loc149))
|
| 818 |
+
#loc179 = loc(callsite(#loc150 at #loc149))
|
| 819 |
+
#loc180 = loc(callsite(#loc151 at #loc149))
|
| 820 |
+
#loc181 = loc(callsite(#loc130 at #loc149))
|
| 821 |
+
#loc182 = loc(callsite(#loc131 at #loc149))
|
| 822 |
+
#loc183 = loc(callsite(#loc132 at #loc149))
|
| 823 |
+
#loc184 = loc(callsite(#loc133 at #loc149))
|
| 824 |
+
#loc185 = loc(callsite(#loc134 at #loc149))
|
| 825 |
+
#loc186 = loc(callsite(#loc135 at #loc149))
|
| 826 |
+
#loc187 = loc(callsite(#loc136 at #loc149))
|
| 827 |
+
#loc188 = loc(callsite(#loc137 at #loc149))
|
| 828 |
+
#loc189 = loc(callsite(#loc138 at #loc149))
|
| 829 |
+
#loc190 = loc(callsite(#loc139 at #loc149))
|
| 830 |
+
#loc191 = loc(callsite(#loc140 at #loc149))
|
| 831 |
+
#loc192 = loc(callsite(#loc29 at #loc152))
|
| 832 |
+
#loc193 = loc(callsite(#loc27 at #loc157))
|
| 833 |
+
#loc195 = loc(callsite(#loc27 at #loc161))
|
| 834 |
+
#loc197 = loc(callsite(#loc117 at #loc169))
|
| 835 |
+
#loc198 = loc(callsite(#loc27 at #loc169))
|
| 836 |
+
#loc200 = loc(callsite(#loc117 at #loc174))
|
| 837 |
+
#loc201 = loc(callsite(#loc27 at #loc174))
|
| 838 |
+
#loc203 = loc(callsite(#loc29 at #loc193))
|
| 839 |
+
#loc204 = loc(callsite(#loc29 at #loc195))
|
| 840 |
+
#loc205 = loc(callsite(#loc29 at #loc198))
|
| 841 |
+
#loc206 = loc(callsite(#loc29 at #loc201))
|
SpecForge-ext/cache/compiled_kernels/triton/7/A7DYCXJM4X5DHYLAIRTU6BFB3S5UCV3W4C27BWQBJGXYAG3NWQWA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttir
ADDED
|
@@ -0,0 +1,799 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":18:0)
|
| 2 |
+
#loc1 = loc(unknown)
|
| 3 |
+
#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":41:67)
|
| 4 |
+
#loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":662:12)
|
| 5 |
+
#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":634:73)
|
| 6 |
+
#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:51)
|
| 7 |
+
#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:53)
|
| 8 |
+
#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:50)
|
| 9 |
+
#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:51)
|
| 10 |
+
#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":45:26)
|
| 11 |
+
#loc80 = loc("in_ptr0"(#loc))
|
| 12 |
+
#loc81 = loc("out_ptr2"(#loc))
|
| 13 |
+
#loc82 = loc("out_ptr3"(#loc))
|
| 14 |
+
#loc83 = loc("xnumel"(#loc))
|
| 15 |
+
#loc84 = loc("r0_numel"(#loc))
|
| 16 |
+
#loc106 = loc(callsite(#loc23 at #loc2))
|
| 17 |
+
#loc113 = loc("ileft"(#loc32))
|
| 18 |
+
#loc117 = loc("iright"(#loc37))
|
| 19 |
+
#loc126 = loc("left_idx"(#loc46))
|
| 20 |
+
#loc131 = loc("right_idx"(#loc51))
|
| 21 |
+
#loc150 = loc("tmp11"(#loc70))
|
| 22 |
+
#loc157 = loc(callsite(#loc28 at #loc106))
|
| 23 |
+
#loc161 = loc(callsite(#loc1 at #loc150))
|
| 24 |
+
#loc165 = loc(callsite(#loc113 at #loc157))
|
| 25 |
+
#loc169 = loc(callsite(#loc117 at #loc157))
|
| 26 |
+
#loc177 = loc(callsite(#loc126 at #loc157))
|
| 27 |
+
#loc182 = loc(callsite(#loc131 at #loc157))
|
| 28 |
+
#loc202 = loc(callsite(#loc1 at #loc165))
|
| 29 |
+
#loc204 = loc(callsite(#loc1 at #loc169))
|
| 30 |
+
#loc207 = loc(callsite(#loc1 at #loc177))
|
| 31 |
+
#loc210 = loc(callsite(#loc1 at #loc182))
|
| 32 |
+
module {
|
| 33 |
+
tt.func public @triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3(%in_ptr0: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr2: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %out_ptr3: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("out_ptr3"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
|
| 34 |
+
%cst = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc85)
|
| 35 |
+
%cst_0 = arith.constant dense<0> : tensor<32x16xi32> loc(#loc1)
|
| 36 |
+
%tmp10 = arith.constant dense<0> : tensor<32x16xi64> loc(#loc86)
|
| 37 |
+
%tmp0 = arith.constant dense<272> : tensor<32x1xi32> loc(#loc87)
|
| 38 |
+
%tmp0_1 = arith.constant dense<17> : tensor<1x16xi32> loc(#loc88)
|
| 39 |
+
%cst_2 = arith.constant dense<16> : tensor<32x1xi32> loc(#loc1)
|
| 40 |
+
%xmask = arith.constant dense<32> : tensor<32x1xi32> loc(#loc89)
|
| 41 |
+
%c32_i32 = arith.constant 32 : i32 loc(#loc1)
|
| 42 |
+
%xoffset = tt.get_program_id x : i32 loc(#loc90)
|
| 43 |
+
%xoffset_3 = arith.muli %xoffset, %c32_i32 : i32 loc(#loc91)
|
| 44 |
+
%xindex = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc92)
|
| 45 |
+
%xindex_4 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<32xi32> -> tensor<32x1xi32> loc(#loc93)
|
| 46 |
+
%xindex_5 = tt.splat %xoffset_3 : i32 -> tensor<32x1xi32> loc(#loc94)
|
| 47 |
+
%xindex_6 = arith.addi %xindex_5, %xindex_4 : tensor<32x1xi32> loc(#loc94)
|
| 48 |
+
%xmask_7 = arith.cmpi slt, %xindex_6, %xmask : tensor<32x1xi32> loc(#loc89)
|
| 49 |
+
%r0_index = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32> loc(#loc95)
|
| 50 |
+
%r0_index_8 = tt.expand_dims %r0_index {axis = 0 : i32} : tensor<16xi32> -> tensor<1x16xi32> loc(#loc96)
|
| 51 |
+
%x0 = arith.remsi %xindex_6, %cst_2 : tensor<32x1xi32> loc(#loc97)
|
| 52 |
+
%x1 = arith.divsi %xindex_6, %cst_2 : tensor<32x1xi32> loc(#loc98)
|
| 53 |
+
%tmp0_9 = arith.muli %r0_index_8, %tmp0_1 : tensor<1x16xi32> loc(#loc88)
|
| 54 |
+
%tmp0_10 = tt.broadcast %x0 : tensor<32x1xi32> -> tensor<32x16xi32> loc(#loc99)
|
| 55 |
+
%tmp0_11 = tt.broadcast %tmp0_9 : tensor<1x16xi32> -> tensor<32x16xi32> loc(#loc99)
|
| 56 |
+
%tmp0_12 = arith.addi %tmp0_10, %tmp0_11 : tensor<32x16xi32> loc(#loc99)
|
| 57 |
+
%tmp0_13 = arith.muli %x1, %tmp0 : tensor<32x1xi32> loc(#loc87)
|
| 58 |
+
%tmp0_14 = tt.broadcast %tmp0_13 : tensor<32x1xi32> -> tensor<32x16xi32> loc(#loc100)
|
| 59 |
+
%tmp0_15 = arith.addi %tmp0_12, %tmp0_14 : tensor<32x16xi32> loc(#loc100)
|
| 60 |
+
%tmp0_16 = tt.splat %in_ptr0 : !tt.ptr<i32> -> tensor<32x16x!tt.ptr<i32>> loc(#loc101)
|
| 61 |
+
%tmp0_17 = tt.addptr %tmp0_16, %tmp0_15 : tensor<32x16x!tt.ptr<i32>>, tensor<32x16xi32> loc(#loc101)
|
| 62 |
+
%tmp0_18 = tt.broadcast %xmask_7 : tensor<32x1xi1> -> tensor<32x16xi1> loc(#loc102)
|
| 63 |
+
%tmp0_19 = tt.load %tmp0_17, %tmp0_18, %cst_0 : tensor<32x16x!tt.ptr<i32>> loc(#loc102)
|
| 64 |
+
%tmp2 = arith.trunci %r0_index_8 : tensor<1x16xi32> to tensor<1x16xi16> loc(#loc103)
|
| 65 |
+
%tmp4 = tt.broadcast %tmp2 : tensor<1x16xi16> -> tensor<32x16xi16> loc(#loc104)
|
| 66 |
+
%flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc153)
|
| 67 |
+
%flip_20 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc154)
|
| 68 |
+
%flip_21 = tt.expand_dims %flip_20 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc154)
|
| 69 |
+
%flip_22 = tt.broadcast %flip_21 : tensor<1x2x1xi32> -> tensor<128x2x2xi32> loc(#loc155)
|
| 70 |
+
%flip_23 = tt.reshape %flip_22 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc156)
|
| 71 |
+
%y = tt.reshape %tmp0_19 : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc162)
|
| 72 |
+
%left_mask = arith.subi %cst, %flip_21 : tensor<1x2x1xi32> loc(#loc163)
|
| 73 |
+
%ileft = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<256x2x1xi32> loc(#loc164)
|
| 74 |
+
%ileft_24 = arith.muli %y, %ileft : tensor<256x2x1xi32> loc(#loc164)
|
| 75 |
+
%ileft_25 = "tt.reduce"(%ileft_24) <{axis = 1 : i32}> ({
|
| 76 |
+
^bb0(%ileft_377: i32 loc(callsite(#loc1 at #loc165)), %ileft_378: i32 loc(callsite(#loc1 at #loc165))):
|
| 77 |
+
%ileft_379 = arith.addi %ileft_377, %ileft_378 : i32 loc(#loc211)
|
| 78 |
+
tt.reduce.return %ileft_379 : i32 loc(#loc201)
|
| 79 |
+
}) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc201)
|
| 80 |
+
%ileft_26 = tt.expand_dims %ileft_25 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc166)
|
| 81 |
+
%ileft_27 = tt.broadcast %ileft_26 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc167)
|
| 82 |
+
%iright = tt.broadcast %flip_21 : tensor<1x2x1xi32> -> tensor<256x2x1xi32> loc(#loc168)
|
| 83 |
+
%iright_28 = arith.muli %y, %iright : tensor<256x2x1xi32> loc(#loc168)
|
| 84 |
+
%iright_29 = "tt.reduce"(%iright_28) <{axis = 1 : i32}> ({
|
| 85 |
+
^bb0(%iright_377: i32 loc(callsite(#loc1 at #loc169)), %iright_378: i32 loc(callsite(#loc1 at #loc169))):
|
| 86 |
+
%iright_379 = arith.addi %iright_377, %iright_378 : i32 loc(#loc212)
|
| 87 |
+
tt.reduce.return %iright_379 : i32 loc(#loc203)
|
| 88 |
+
}) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc203)
|
| 89 |
+
%iright_30 = tt.expand_dims %iright_29 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc170)
|
| 90 |
+
%iright_31 = tt.broadcast %iright_30 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc171)
|
| 91 |
+
%ileft_32 = tt.reshape %ileft_27 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc172)
|
| 92 |
+
%iright_33 = tt.reshape %iright_31 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc173)
|
| 93 |
+
%y_idx = tt.reshape %tmp4 : tensor<32x16xi16> -> tensor<256x2x1xi16> loc(#loc174)
|
| 94 |
+
%left_idx = arith.trunci %left_mask : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc175)
|
| 95 |
+
%left_idx_34 = tt.broadcast %left_idx : tensor<1x2x1xi16> -> tensor<256x2x1xi16> loc(#loc176)
|
| 96 |
+
%left_idx_35 = arith.muli %y_idx, %left_idx_34 : tensor<256x2x1xi16> loc(#loc176)
|
| 97 |
+
%input = arith.extsi %left_idx_35 : tensor<256x2x1xi16> to tensor<256x2x1xi32> loc(#loc205)
|
| 98 |
+
%left_idx_36 = "tt.reduce"(%input) <{axis = 1 : i32}> ({
|
| 99 |
+
^bb0(%left_idx_377: i32 loc(callsite(#loc1 at #loc177)), %left_idx_378: i32 loc(callsite(#loc1 at #loc177))):
|
| 100 |
+
%left_idx_379 = arith.addi %left_idx_377, %left_idx_378 : i32 loc(#loc213)
|
| 101 |
+
tt.reduce.return %left_idx_379 : i32 loc(#loc206)
|
| 102 |
+
}) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc206)
|
| 103 |
+
%left_idx_37 = tt.expand_dims %left_idx_36 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc178)
|
| 104 |
+
%left_idx_38 = tt.broadcast %left_idx_37 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc179)
|
| 105 |
+
%right_idx = arith.trunci %flip_21 : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc180)
|
| 106 |
+
%right_idx_39 = tt.broadcast %right_idx : tensor<1x2x1xi16> -> tensor<256x2x1xi16> loc(#loc181)
|
| 107 |
+
%right_idx_40 = arith.muli %y_idx, %right_idx_39 : tensor<256x2x1xi16> loc(#loc181)
|
| 108 |
+
%input_41 = arith.extsi %right_idx_40 : tensor<256x2x1xi16> to tensor<256x2x1xi32> loc(#loc208)
|
| 109 |
+
%right_idx_42 = "tt.reduce"(%input_41) <{axis = 1 : i32}> ({
|
| 110 |
+
^bb0(%right_idx_377: i32 loc(callsite(#loc1 at #loc182)), %right_idx_378: i32 loc(callsite(#loc1 at #loc182))):
|
| 111 |
+
%right_idx_379 = arith.addi %right_idx_377, %right_idx_378 : i32 loc(#loc214)
|
| 112 |
+
tt.reduce.return %right_idx_379 : i32 loc(#loc209)
|
| 113 |
+
}) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc209)
|
| 114 |
+
%right_idx_43 = tt.expand_dims %right_idx_42 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc183)
|
| 115 |
+
%right_idx_44 = tt.broadcast %right_idx_43 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc184)
|
| 116 |
+
%left_idx_45 = tt.reshape %left_idx_38 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc185)
|
| 117 |
+
%right_idx_46 = tt.reshape %right_idx_44 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc186)
|
| 118 |
+
%cond = arith.cmpi slt, %ileft_32, %iright_33 : tensor<32x16xi32> loc(#loc187)
|
| 119 |
+
%eq = arith.cmpi eq, %ileft_32, %iright_33 : tensor<32x16xi32> loc(#loc188)
|
| 120 |
+
%cond_47 = arith.cmpi sgt, %left_idx_45, %right_idx_46 : tensor<32x16xi32> loc(#loc189)
|
| 121 |
+
%cond_48 = arith.andi %eq, %cond_47 : tensor<32x16xi1> loc(#loc190)
|
| 122 |
+
%cond_49 = arith.ori %cond, %cond_48 : tensor<32x16xi1> loc(#loc191)
|
| 123 |
+
%cond_50 = arith.extui %cond_49 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc192)
|
| 124 |
+
%cond_51 = arith.xori %cond_50, %flip_23 : tensor<32x16xi32> loc(#loc192)
|
| 125 |
+
%cond_52 = arith.cmpi ne, %cond_51, %cst_0 : tensor<32x16xi32> loc(#loc193)
|
| 126 |
+
%ret = arith.xori %ileft_32, %iright_33 : tensor<32x16xi32> loc(#loc194)
|
| 127 |
+
%ret_53 = arith.select %cond_52, %ret, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc195)
|
| 128 |
+
%ret_54 = arith.xori %tmp0_19, %ret_53 : tensor<32x16xi32> loc(#loc196)
|
| 129 |
+
%new_idxs = arith.xori %left_idx_45, %right_idx_46 : tensor<32x16xi32> loc(#loc197)
|
| 130 |
+
%new_idxs_55 = arith.select %cond_52, %new_idxs, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc198)
|
| 131 |
+
%new_idxs_56 = arith.extsi %tmp2 : tensor<1x16xi16> to tensor<1x16xi32> loc(#loc199)
|
| 132 |
+
%new_idxs_57 = tt.broadcast %new_idxs_56 : tensor<1x16xi32> -> tensor<32x16xi32> loc(#loc199)
|
| 133 |
+
%new_idxs_58 = arith.xori %new_idxs_57, %new_idxs_55 : tensor<32x16xi32> loc(#loc199)
|
| 134 |
+
%flip_59 = tt.broadcast %flip_21 : tensor<1x2x1xi32> -> tensor<64x2x4xi32> loc(#loc155)
|
| 135 |
+
%flip_60 = tt.reshape %flip_59 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc156)
|
| 136 |
+
%y_61 = tt.reshape %ret_54 : tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc162)
|
| 137 |
+
%ileft_62 = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<128x2x2xi32> loc(#loc164)
|
| 138 |
+
%ileft_63 = arith.muli %y_61, %ileft_62 : tensor<128x2x2xi32> loc(#loc164)
|
| 139 |
+
%ileft_64 = "tt.reduce"(%ileft_63) <{axis = 1 : i32}> ({
|
| 140 |
+
^bb0(%ileft_377: i32 loc(callsite(#loc1 at #loc165)), %ileft_378: i32 loc(callsite(#loc1 at #loc165))):
|
| 141 |
+
%ileft_379 = arith.addi %ileft_377, %ileft_378 : i32 loc(#loc211)
|
| 142 |
+
tt.reduce.return %ileft_379 : i32 loc(#loc201)
|
| 143 |
+
}) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc201)
|
| 144 |
+
%ileft_65 = tt.expand_dims %ileft_64 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc166)
|
| 145 |
+
%ileft_66 = tt.broadcast %ileft_65 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc167)
|
| 146 |
+
%iright_67 = arith.muli %y_61, %flip_22 : tensor<128x2x2xi32> loc(#loc168)
|
| 147 |
+
%iright_68 = "tt.reduce"(%iright_67) <{axis = 1 : i32}> ({
|
| 148 |
+
^bb0(%iright_377: i32 loc(callsite(#loc1 at #loc169)), %iright_378: i32 loc(callsite(#loc1 at #loc169))):
|
| 149 |
+
%iright_379 = arith.addi %iright_377, %iright_378 : i32 loc(#loc212)
|
| 150 |
+
tt.reduce.return %iright_379 : i32 loc(#loc203)
|
| 151 |
+
}) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc203)
|
| 152 |
+
%iright_69 = tt.expand_dims %iright_68 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc170)
|
| 153 |
+
%iright_70 = tt.broadcast %iright_69 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc171)
|
| 154 |
+
%ileft_71 = tt.reshape %ileft_66 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc172)
|
| 155 |
+
%iright_72 = tt.reshape %iright_70 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc173)
|
| 156 |
+
%y_idx_73 = tt.reshape %new_idxs_58 : tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc174)
|
| 157 |
+
%left_idx_74 = arith.muli %y_idx_73, %ileft_62 : tensor<128x2x2xi32> loc(#loc176)
|
| 158 |
+
%left_idx_75 = "tt.reduce"(%left_idx_74) <{axis = 1 : i32}> ({
|
| 159 |
+
^bb0(%left_idx_377: i32 loc(callsite(#loc1 at #loc177)), %left_idx_378: i32 loc(callsite(#loc1 at #loc177))):
|
| 160 |
+
%left_idx_379 = arith.addi %left_idx_377, %left_idx_378 : i32 loc(#loc213)
|
| 161 |
+
tt.reduce.return %left_idx_379 : i32 loc(#loc206)
|
| 162 |
+
}) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc206)
|
| 163 |
+
%left_idx_76 = tt.expand_dims %left_idx_75 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc178)
|
| 164 |
+
%left_idx_77 = tt.broadcast %left_idx_76 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc179)
|
| 165 |
+
%right_idx_78 = arith.muli %y_idx_73, %flip_22 : tensor<128x2x2xi32> loc(#loc181)
|
| 166 |
+
%right_idx_79 = "tt.reduce"(%right_idx_78) <{axis = 1 : i32}> ({
|
| 167 |
+
^bb0(%right_idx_377: i32 loc(callsite(#loc1 at #loc182)), %right_idx_378: i32 loc(callsite(#loc1 at #loc182))):
|
| 168 |
+
%right_idx_379 = arith.addi %right_idx_377, %right_idx_378 : i32 loc(#loc214)
|
| 169 |
+
tt.reduce.return %right_idx_379 : i32 loc(#loc209)
|
| 170 |
+
}) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc209)
|
| 171 |
+
%right_idx_80 = tt.expand_dims %right_idx_79 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc183)
|
| 172 |
+
%right_idx_81 = tt.broadcast %right_idx_80 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc184)
|
| 173 |
+
%left_idx_82 = tt.reshape %left_idx_77 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc185)
|
| 174 |
+
%right_idx_83 = tt.reshape %right_idx_81 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc186)
|
| 175 |
+
%cond_84 = arith.cmpi slt, %ileft_71, %iright_72 : tensor<32x16xi32> loc(#loc187)
|
| 176 |
+
%eq_85 = arith.cmpi eq, %ileft_71, %iright_72 : tensor<32x16xi32> loc(#loc188)
|
| 177 |
+
%cond_86 = arith.cmpi sgt, %left_idx_82, %right_idx_83 : tensor<32x16xi32> loc(#loc189)
|
| 178 |
+
%cond_87 = arith.andi %eq_85, %cond_86 : tensor<32x16xi1> loc(#loc190)
|
| 179 |
+
%cond_88 = arith.ori %cond_84, %cond_87 : tensor<32x16xi1> loc(#loc191)
|
| 180 |
+
%cond_89 = arith.extui %cond_88 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc192)
|
| 181 |
+
%cond_90 = arith.xori %cond_89, %flip_60 : tensor<32x16xi32> loc(#loc192)
|
| 182 |
+
%cond_91 = arith.cmpi ne, %cond_90, %cst_0 : tensor<32x16xi32> loc(#loc193)
|
| 183 |
+
%ret_92 = arith.xori %ileft_71, %iright_72 : tensor<32x16xi32> loc(#loc194)
|
| 184 |
+
%ret_93 = arith.select %cond_91, %ret_92, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc195)
|
| 185 |
+
%ret_94 = arith.xori %ret_54, %ret_93 : tensor<32x16xi32> loc(#loc196)
|
| 186 |
+
%new_idxs_95 = arith.xori %left_idx_82, %right_idx_83 : tensor<32x16xi32> loc(#loc197)
|
| 187 |
+
%new_idxs_96 = arith.select %cond_91, %new_idxs_95, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc198)
|
| 188 |
+
%new_idxs_97 = arith.xori %new_idxs_58, %new_idxs_96 : tensor<32x16xi32> loc(#loc199)
|
| 189 |
+
%y_98 = tt.reshape %ret_94 : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc162)
|
| 190 |
+
%ileft_99 = arith.muli %y_98, %ileft : tensor<256x2x1xi32> loc(#loc164)
|
| 191 |
+
%ileft_100 = "tt.reduce"(%ileft_99) <{axis = 1 : i32}> ({
|
| 192 |
+
^bb0(%ileft_377: i32 loc(callsite(#loc1 at #loc165)), %ileft_378: i32 loc(callsite(#loc1 at #loc165))):
|
| 193 |
+
%ileft_379 = arith.addi %ileft_377, %ileft_378 : i32 loc(#loc211)
|
| 194 |
+
tt.reduce.return %ileft_379 : i32 loc(#loc201)
|
| 195 |
+
}) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc201)
|
| 196 |
+
%ileft_101 = tt.expand_dims %ileft_100 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc166)
|
| 197 |
+
%ileft_102 = tt.broadcast %ileft_101 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc167)
|
| 198 |
+
%iright_103 = arith.muli %y_98, %iright : tensor<256x2x1xi32> loc(#loc168)
|
| 199 |
+
%iright_104 = "tt.reduce"(%iright_103) <{axis = 1 : i32}> ({
|
| 200 |
+
^bb0(%iright_377: i32 loc(callsite(#loc1 at #loc169)), %iright_378: i32 loc(callsite(#loc1 at #loc169))):
|
| 201 |
+
%iright_379 = arith.addi %iright_377, %iright_378 : i32 loc(#loc212)
|
| 202 |
+
tt.reduce.return %iright_379 : i32 loc(#loc203)
|
| 203 |
+
}) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc203)
|
| 204 |
+
%iright_105 = tt.expand_dims %iright_104 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc170)
|
| 205 |
+
%iright_106 = tt.broadcast %iright_105 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc171)
|
| 206 |
+
%ileft_107 = tt.reshape %ileft_102 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc172)
|
| 207 |
+
%iright_108 = tt.reshape %iright_106 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc173)
|
| 208 |
+
%y_idx_109 = tt.reshape %new_idxs_97 : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc174)
|
| 209 |
+
%left_idx_110 = arith.muli %y_idx_109, %ileft : tensor<256x2x1xi32> loc(#loc176)
|
| 210 |
+
%left_idx_111 = "tt.reduce"(%left_idx_110) <{axis = 1 : i32}> ({
|
| 211 |
+
^bb0(%left_idx_377: i32 loc(callsite(#loc1 at #loc177)), %left_idx_378: i32 loc(callsite(#loc1 at #loc177))):
|
| 212 |
+
%left_idx_379 = arith.addi %left_idx_377, %left_idx_378 : i32 loc(#loc213)
|
| 213 |
+
tt.reduce.return %left_idx_379 : i32 loc(#loc206)
|
| 214 |
+
}) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc206)
|
| 215 |
+
%left_idx_112 = tt.expand_dims %left_idx_111 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc178)
|
| 216 |
+
%left_idx_113 = tt.broadcast %left_idx_112 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc179)
|
| 217 |
+
%right_idx_114 = arith.muli %y_idx_109, %iright : tensor<256x2x1xi32> loc(#loc181)
|
| 218 |
+
%right_idx_115 = "tt.reduce"(%right_idx_114) <{axis = 1 : i32}> ({
|
| 219 |
+
^bb0(%right_idx_377: i32 loc(callsite(#loc1 at #loc182)), %right_idx_378: i32 loc(callsite(#loc1 at #loc182))):
|
| 220 |
+
%right_idx_379 = arith.addi %right_idx_377, %right_idx_378 : i32 loc(#loc214)
|
| 221 |
+
tt.reduce.return %right_idx_379 : i32 loc(#loc209)
|
| 222 |
+
}) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc209)
|
| 223 |
+
%right_idx_116 = tt.expand_dims %right_idx_115 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc183)
|
| 224 |
+
%right_idx_117 = tt.broadcast %right_idx_116 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc184)
|
| 225 |
+
%left_idx_118 = tt.reshape %left_idx_113 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc185)
|
| 226 |
+
%right_idx_119 = tt.reshape %right_idx_117 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc186)
|
| 227 |
+
%cond_120 = arith.cmpi slt, %ileft_107, %iright_108 : tensor<32x16xi32> loc(#loc187)
|
| 228 |
+
%eq_121 = arith.cmpi eq, %ileft_107, %iright_108 : tensor<32x16xi32> loc(#loc188)
|
| 229 |
+
%cond_122 = arith.cmpi sgt, %left_idx_118, %right_idx_119 : tensor<32x16xi32> loc(#loc189)
|
| 230 |
+
%cond_123 = arith.andi %eq_121, %cond_122 : tensor<32x16xi1> loc(#loc190)
|
| 231 |
+
%cond_124 = arith.ori %cond_120, %cond_123 : tensor<32x16xi1> loc(#loc191)
|
| 232 |
+
%cond_125 = arith.extui %cond_124 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc192)
|
| 233 |
+
%cond_126 = arith.xori %cond_125, %flip_60 : tensor<32x16xi32> loc(#loc192)
|
| 234 |
+
%cond_127 = arith.cmpi ne, %cond_126, %cst_0 : tensor<32x16xi32> loc(#loc193)
|
| 235 |
+
%ret_128 = arith.xori %ileft_107, %iright_108 : tensor<32x16xi32> loc(#loc194)
|
| 236 |
+
%ret_129 = arith.select %cond_127, %ret_128, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc195)
|
| 237 |
+
%ret_130 = arith.xori %ret_94, %ret_129 : tensor<32x16xi32> loc(#loc196)
|
| 238 |
+
%new_idxs_131 = arith.xori %left_idx_118, %right_idx_119 : tensor<32x16xi32> loc(#loc197)
|
| 239 |
+
%new_idxs_132 = arith.select %cond_127, %new_idxs_131, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc198)
|
| 240 |
+
%new_idxs_133 = arith.xori %new_idxs_97, %new_idxs_132 : tensor<32x16xi32> loc(#loc199)
|
| 241 |
+
%flip_134 = tt.broadcast %flip_21 : tensor<1x2x1xi32> -> tensor<32x2x8xi32> loc(#loc155)
|
| 242 |
+
%flip_135 = tt.reshape %flip_134 : tensor<32x2x8xi32> -> tensor<32x16xi32> loc(#loc156)
|
| 243 |
+
%y_136 = tt.reshape %ret_130 : tensor<32x16xi32> -> tensor<64x2x4xi32> loc(#loc162)
|
| 244 |
+
%ileft_137 = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<64x2x4xi32> loc(#loc164)
|
| 245 |
+
%ileft_138 = arith.muli %y_136, %ileft_137 : tensor<64x2x4xi32> loc(#loc164)
|
| 246 |
+
%ileft_139 = "tt.reduce"(%ileft_138) <{axis = 1 : i32}> ({
|
| 247 |
+
^bb0(%ileft_377: i32 loc(callsite(#loc1 at #loc165)), %ileft_378: i32 loc(callsite(#loc1 at #loc165))):
|
| 248 |
+
%ileft_379 = arith.addi %ileft_377, %ileft_378 : i32 loc(#loc211)
|
| 249 |
+
tt.reduce.return %ileft_379 : i32 loc(#loc201)
|
| 250 |
+
}) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc201)
|
| 251 |
+
%ileft_140 = tt.expand_dims %ileft_139 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc166)
|
| 252 |
+
%ileft_141 = tt.broadcast %ileft_140 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc167)
|
| 253 |
+
%iright_142 = arith.muli %y_136, %flip_59 : tensor<64x2x4xi32> loc(#loc168)
|
| 254 |
+
%iright_143 = "tt.reduce"(%iright_142) <{axis = 1 : i32}> ({
|
| 255 |
+
^bb0(%iright_377: i32 loc(callsite(#loc1 at #loc169)), %iright_378: i32 loc(callsite(#loc1 at #loc169))):
|
| 256 |
+
%iright_379 = arith.addi %iright_377, %iright_378 : i32 loc(#loc212)
|
| 257 |
+
tt.reduce.return %iright_379 : i32 loc(#loc203)
|
| 258 |
+
}) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc203)
|
| 259 |
+
%iright_144 = tt.expand_dims %iright_143 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc170)
|
| 260 |
+
%iright_145 = tt.broadcast %iright_144 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc171)
|
| 261 |
+
%ileft_146 = tt.reshape %ileft_141 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc172)
|
| 262 |
+
%iright_147 = tt.reshape %iright_145 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc173)
|
| 263 |
+
%y_idx_148 = tt.reshape %new_idxs_133 : tensor<32x16xi32> -> tensor<64x2x4xi32> loc(#loc174)
|
| 264 |
+
%left_idx_149 = arith.muli %y_idx_148, %ileft_137 : tensor<64x2x4xi32> loc(#loc176)
|
| 265 |
+
%left_idx_150 = "tt.reduce"(%left_idx_149) <{axis = 1 : i32}> ({
|
| 266 |
+
^bb0(%left_idx_377: i32 loc(callsite(#loc1 at #loc177)), %left_idx_378: i32 loc(callsite(#loc1 at #loc177))):
|
| 267 |
+
%left_idx_379 = arith.addi %left_idx_377, %left_idx_378 : i32 loc(#loc213)
|
| 268 |
+
tt.reduce.return %left_idx_379 : i32 loc(#loc206)
|
| 269 |
+
}) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc206)
|
| 270 |
+
%left_idx_151 = tt.expand_dims %left_idx_150 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc178)
|
| 271 |
+
%left_idx_152 = tt.broadcast %left_idx_151 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc179)
|
| 272 |
+
%right_idx_153 = arith.muli %y_idx_148, %flip_59 : tensor<64x2x4xi32> loc(#loc181)
|
| 273 |
+
%right_idx_154 = "tt.reduce"(%right_idx_153) <{axis = 1 : i32}> ({
|
| 274 |
+
^bb0(%right_idx_377: i32 loc(callsite(#loc1 at #loc182)), %right_idx_378: i32 loc(callsite(#loc1 at #loc182))):
|
| 275 |
+
%right_idx_379 = arith.addi %right_idx_377, %right_idx_378 : i32 loc(#loc214)
|
| 276 |
+
tt.reduce.return %right_idx_379 : i32 loc(#loc209)
|
| 277 |
+
}) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc209)
|
| 278 |
+
%right_idx_155 = tt.expand_dims %right_idx_154 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc183)
|
| 279 |
+
%right_idx_156 = tt.broadcast %right_idx_155 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc184)
|
| 280 |
+
%left_idx_157 = tt.reshape %left_idx_152 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc185)
|
| 281 |
+
%right_idx_158 = tt.reshape %right_idx_156 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc186)
|
| 282 |
+
%cond_159 = arith.cmpi slt, %ileft_146, %iright_147 : tensor<32x16xi32> loc(#loc187)
|
| 283 |
+
%eq_160 = arith.cmpi eq, %ileft_146, %iright_147 : tensor<32x16xi32> loc(#loc188)
|
| 284 |
+
%cond_161 = arith.cmpi sgt, %left_idx_157, %right_idx_158 : tensor<32x16xi32> loc(#loc189)
|
| 285 |
+
%cond_162 = arith.andi %eq_160, %cond_161 : tensor<32x16xi1> loc(#loc190)
|
| 286 |
+
%cond_163 = arith.ori %cond_159, %cond_162 : tensor<32x16xi1> loc(#loc191)
|
| 287 |
+
%cond_164 = arith.extui %cond_163 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc192)
|
| 288 |
+
%cond_165 = arith.xori %cond_164, %flip_135 : tensor<32x16xi32> loc(#loc192)
|
| 289 |
+
%cond_166 = arith.cmpi ne, %cond_165, %cst_0 : tensor<32x16xi32> loc(#loc193)
|
| 290 |
+
%ret_167 = arith.xori %ileft_146, %iright_147 : tensor<32x16xi32> loc(#loc194)
|
| 291 |
+
%ret_168 = arith.select %cond_166, %ret_167, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc195)
|
| 292 |
+
%ret_169 = arith.xori %ret_130, %ret_168 : tensor<32x16xi32> loc(#loc196)
|
| 293 |
+
%new_idxs_170 = arith.xori %left_idx_157, %right_idx_158 : tensor<32x16xi32> loc(#loc197)
|
| 294 |
+
%new_idxs_171 = arith.select %cond_166, %new_idxs_170, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc198)
|
| 295 |
+
%new_idxs_172 = arith.xori %new_idxs_133, %new_idxs_171 : tensor<32x16xi32> loc(#loc199)
|
| 296 |
+
%y_173 = tt.reshape %ret_169 : tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc162)
|
| 297 |
+
%ileft_174 = arith.muli %y_173, %ileft_62 : tensor<128x2x2xi32> loc(#loc164)
|
| 298 |
+
%ileft_175 = "tt.reduce"(%ileft_174) <{axis = 1 : i32}> ({
|
| 299 |
+
^bb0(%ileft_377: i32 loc(callsite(#loc1 at #loc165)), %ileft_378: i32 loc(callsite(#loc1 at #loc165))):
|
| 300 |
+
%ileft_379 = arith.addi %ileft_377, %ileft_378 : i32 loc(#loc211)
|
| 301 |
+
tt.reduce.return %ileft_379 : i32 loc(#loc201)
|
| 302 |
+
}) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc201)
|
| 303 |
+
%ileft_176 = tt.expand_dims %ileft_175 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc166)
|
| 304 |
+
%ileft_177 = tt.broadcast %ileft_176 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc167)
|
| 305 |
+
%iright_178 = arith.muli %y_173, %flip_22 : tensor<128x2x2xi32> loc(#loc168)
|
| 306 |
+
%iright_179 = "tt.reduce"(%iright_178) <{axis = 1 : i32}> ({
|
| 307 |
+
^bb0(%iright_377: i32 loc(callsite(#loc1 at #loc169)), %iright_378: i32 loc(callsite(#loc1 at #loc169))):
|
| 308 |
+
%iright_379 = arith.addi %iright_377, %iright_378 : i32 loc(#loc212)
|
| 309 |
+
tt.reduce.return %iright_379 : i32 loc(#loc203)
|
| 310 |
+
}) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc203)
|
| 311 |
+
%iright_180 = tt.expand_dims %iright_179 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc170)
|
| 312 |
+
%iright_181 = tt.broadcast %iright_180 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc171)
|
| 313 |
+
%ileft_182 = tt.reshape %ileft_177 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc172)
|
| 314 |
+
%iright_183 = tt.reshape %iright_181 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc173)
|
| 315 |
+
%y_idx_184 = tt.reshape %new_idxs_172 : tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc174)
|
| 316 |
+
%left_idx_185 = arith.muli %y_idx_184, %ileft_62 : tensor<128x2x2xi32> loc(#loc176)
|
| 317 |
+
%left_idx_186 = "tt.reduce"(%left_idx_185) <{axis = 1 : i32}> ({
|
| 318 |
+
^bb0(%left_idx_377: i32 loc(callsite(#loc1 at #loc177)), %left_idx_378: i32 loc(callsite(#loc1 at #loc177))):
|
| 319 |
+
%left_idx_379 = arith.addi %left_idx_377, %left_idx_378 : i32 loc(#loc213)
|
| 320 |
+
tt.reduce.return %left_idx_379 : i32 loc(#loc206)
|
| 321 |
+
}) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc206)
|
| 322 |
+
%left_idx_187 = tt.expand_dims %left_idx_186 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc178)
|
| 323 |
+
%left_idx_188 = tt.broadcast %left_idx_187 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc179)
|
| 324 |
+
%right_idx_189 = arith.muli %y_idx_184, %flip_22 : tensor<128x2x2xi32> loc(#loc181)
|
| 325 |
+
%right_idx_190 = "tt.reduce"(%right_idx_189) <{axis = 1 : i32}> ({
|
| 326 |
+
^bb0(%right_idx_377: i32 loc(callsite(#loc1 at #loc182)), %right_idx_378: i32 loc(callsite(#loc1 at #loc182))):
|
| 327 |
+
%right_idx_379 = arith.addi %right_idx_377, %right_idx_378 : i32 loc(#loc214)
|
| 328 |
+
tt.reduce.return %right_idx_379 : i32 loc(#loc209)
|
| 329 |
+
}) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc209)
|
| 330 |
+
%right_idx_191 = tt.expand_dims %right_idx_190 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc183)
|
| 331 |
+
%right_idx_192 = tt.broadcast %right_idx_191 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc184)
|
| 332 |
+
%left_idx_193 = tt.reshape %left_idx_188 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc185)
|
| 333 |
+
%right_idx_194 = tt.reshape %right_idx_192 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc186)
|
| 334 |
+
%cond_195 = arith.cmpi slt, %ileft_182, %iright_183 : tensor<32x16xi32> loc(#loc187)
|
| 335 |
+
%eq_196 = arith.cmpi eq, %ileft_182, %iright_183 : tensor<32x16xi32> loc(#loc188)
|
| 336 |
+
%cond_197 = arith.cmpi sgt, %left_idx_193, %right_idx_194 : tensor<32x16xi32> loc(#loc189)
|
| 337 |
+
%cond_198 = arith.andi %eq_196, %cond_197 : tensor<32x16xi1> loc(#loc190)
|
| 338 |
+
%cond_199 = arith.ori %cond_195, %cond_198 : tensor<32x16xi1> loc(#loc191)
|
| 339 |
+
%cond_200 = arith.extui %cond_199 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc192)
|
| 340 |
+
%cond_201 = arith.xori %cond_200, %flip_135 : tensor<32x16xi32> loc(#loc192)
|
| 341 |
+
%cond_202 = arith.cmpi ne, %cond_201, %cst_0 : tensor<32x16xi32> loc(#loc193)
|
| 342 |
+
%ret_203 = arith.xori %ileft_182, %iright_183 : tensor<32x16xi32> loc(#loc194)
|
| 343 |
+
%ret_204 = arith.select %cond_202, %ret_203, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc195)
|
| 344 |
+
%ret_205 = arith.xori %ret_169, %ret_204 : tensor<32x16xi32> loc(#loc196)
|
| 345 |
+
%new_idxs_206 = arith.xori %left_idx_193, %right_idx_194 : tensor<32x16xi32> loc(#loc197)
|
| 346 |
+
%new_idxs_207 = arith.select %cond_202, %new_idxs_206, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc198)
|
| 347 |
+
%new_idxs_208 = arith.xori %new_idxs_172, %new_idxs_207 : tensor<32x16xi32> loc(#loc199)
|
| 348 |
+
%y_209 = tt.reshape %ret_205 : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc162)
|
| 349 |
+
%ileft_210 = arith.muli %y_209, %ileft : tensor<256x2x1xi32> loc(#loc164)
|
| 350 |
+
%ileft_211 = "tt.reduce"(%ileft_210) <{axis = 1 : i32}> ({
|
| 351 |
+
^bb0(%ileft_377: i32 loc(callsite(#loc1 at #loc165)), %ileft_378: i32 loc(callsite(#loc1 at #loc165))):
|
| 352 |
+
%ileft_379 = arith.addi %ileft_377, %ileft_378 : i32 loc(#loc211)
|
| 353 |
+
tt.reduce.return %ileft_379 : i32 loc(#loc201)
|
| 354 |
+
}) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc201)
|
| 355 |
+
%ileft_212 = tt.expand_dims %ileft_211 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc166)
|
| 356 |
+
%ileft_213 = tt.broadcast %ileft_212 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc167)
|
| 357 |
+
%iright_214 = arith.muli %y_209, %iright : tensor<256x2x1xi32> loc(#loc168)
|
| 358 |
+
%iright_215 = "tt.reduce"(%iright_214) <{axis = 1 : i32}> ({
|
| 359 |
+
^bb0(%iright_377: i32 loc(callsite(#loc1 at #loc169)), %iright_378: i32 loc(callsite(#loc1 at #loc169))):
|
| 360 |
+
%iright_379 = arith.addi %iright_377, %iright_378 : i32 loc(#loc212)
|
| 361 |
+
tt.reduce.return %iright_379 : i32 loc(#loc203)
|
| 362 |
+
}) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc203)
|
| 363 |
+
%iright_216 = tt.expand_dims %iright_215 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc170)
|
| 364 |
+
%iright_217 = tt.broadcast %iright_216 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc171)
|
| 365 |
+
%ileft_218 = tt.reshape %ileft_213 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc172)
|
| 366 |
+
%iright_219 = tt.reshape %iright_217 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc173)
|
| 367 |
+
%y_idx_220 = tt.reshape %new_idxs_208 : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc174)
|
| 368 |
+
%left_idx_221 = arith.muli %y_idx_220, %ileft : tensor<256x2x1xi32> loc(#loc176)
|
| 369 |
+
%left_idx_222 = "tt.reduce"(%left_idx_221) <{axis = 1 : i32}> ({
|
| 370 |
+
^bb0(%left_idx_377: i32 loc(callsite(#loc1 at #loc177)), %left_idx_378: i32 loc(callsite(#loc1 at #loc177))):
|
| 371 |
+
%left_idx_379 = arith.addi %left_idx_377, %left_idx_378 : i32 loc(#loc213)
|
| 372 |
+
tt.reduce.return %left_idx_379 : i32 loc(#loc206)
|
| 373 |
+
}) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc206)
|
| 374 |
+
%left_idx_223 = tt.expand_dims %left_idx_222 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc178)
|
| 375 |
+
%left_idx_224 = tt.broadcast %left_idx_223 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc179)
|
| 376 |
+
%right_idx_225 = arith.muli %y_idx_220, %iright : tensor<256x2x1xi32> loc(#loc181)
|
| 377 |
+
%right_idx_226 = "tt.reduce"(%right_idx_225) <{axis = 1 : i32}> ({
|
| 378 |
+
^bb0(%right_idx_377: i32 loc(callsite(#loc1 at #loc182)), %right_idx_378: i32 loc(callsite(#loc1 at #loc182))):
|
| 379 |
+
%right_idx_379 = arith.addi %right_idx_377, %right_idx_378 : i32 loc(#loc214)
|
| 380 |
+
tt.reduce.return %right_idx_379 : i32 loc(#loc209)
|
| 381 |
+
}) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc209)
|
| 382 |
+
%right_idx_227 = tt.expand_dims %right_idx_226 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc183)
|
| 383 |
+
%right_idx_228 = tt.broadcast %right_idx_227 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc184)
|
| 384 |
+
%left_idx_229 = tt.reshape %left_idx_224 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc185)
|
| 385 |
+
%right_idx_230 = tt.reshape %right_idx_228 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc186)
|
| 386 |
+
%cond_231 = arith.cmpi slt, %ileft_218, %iright_219 : tensor<32x16xi32> loc(#loc187)
|
| 387 |
+
%eq_232 = arith.cmpi eq, %ileft_218, %iright_219 : tensor<32x16xi32> loc(#loc188)
|
| 388 |
+
%cond_233 = arith.cmpi sgt, %left_idx_229, %right_idx_230 : tensor<32x16xi32> loc(#loc189)
|
| 389 |
+
%cond_234 = arith.andi %eq_232, %cond_233 : tensor<32x16xi1> loc(#loc190)
|
| 390 |
+
%cond_235 = arith.ori %cond_231, %cond_234 : tensor<32x16xi1> loc(#loc191)
|
| 391 |
+
%cond_236 = arith.extui %cond_235 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc192)
|
| 392 |
+
%cond_237 = arith.xori %cond_236, %flip_135 : tensor<32x16xi32> loc(#loc192)
|
| 393 |
+
%cond_238 = arith.cmpi ne, %cond_237, %cst_0 : tensor<32x16xi32> loc(#loc193)
|
| 394 |
+
%ret_239 = arith.xori %ileft_218, %iright_219 : tensor<32x16xi32> loc(#loc194)
|
| 395 |
+
%ret_240 = arith.select %cond_238, %ret_239, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc195)
|
| 396 |
+
%ret_241 = arith.xori %ret_205, %ret_240 : tensor<32x16xi32> loc(#loc196)
|
| 397 |
+
%new_idxs_242 = arith.xori %left_idx_229, %right_idx_230 : tensor<32x16xi32> loc(#loc197)
|
| 398 |
+
%new_idxs_243 = arith.select %cond_238, %new_idxs_242, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc198)
|
| 399 |
+
%new_idxs_244 = arith.xori %new_idxs_208, %new_idxs_243 : tensor<32x16xi32> loc(#loc199)
|
| 400 |
+
%y_245 = tt.reshape %ret_241 : tensor<32x16xi32> -> tensor<32x2x8xi32> loc(#loc162)
|
| 401 |
+
%ileft_246 = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<32x2x8xi32> loc(#loc164)
|
| 402 |
+
%ileft_247 = arith.muli %y_245, %ileft_246 : tensor<32x2x8xi32> loc(#loc164)
|
| 403 |
+
%ileft_248 = "tt.reduce"(%ileft_247) <{axis = 1 : i32}> ({
|
| 404 |
+
^bb0(%ileft_377: i32 loc(callsite(#loc1 at #loc165)), %ileft_378: i32 loc(callsite(#loc1 at #loc165))):
|
| 405 |
+
%ileft_379 = arith.addi %ileft_377, %ileft_378 : i32 loc(#loc211)
|
| 406 |
+
tt.reduce.return %ileft_379 : i32 loc(#loc201)
|
| 407 |
+
}) : (tensor<32x2x8xi32>) -> tensor<32x8xi32> loc(#loc201)
|
| 408 |
+
%ileft_249 = tt.expand_dims %ileft_248 {axis = 1 : i32} : tensor<32x8xi32> -> tensor<32x1x8xi32> loc(#loc166)
|
| 409 |
+
%ileft_250 = tt.broadcast %ileft_249 : tensor<32x1x8xi32> -> tensor<32x2x8xi32> loc(#loc167)
|
| 410 |
+
%iright_251 = arith.muli %y_245, %flip_134 : tensor<32x2x8xi32> loc(#loc168)
|
| 411 |
+
%iright_252 = "tt.reduce"(%iright_251) <{axis = 1 : i32}> ({
|
| 412 |
+
^bb0(%iright_377: i32 loc(callsite(#loc1 at #loc169)), %iright_378: i32 loc(callsite(#loc1 at #loc169))):
|
| 413 |
+
%iright_379 = arith.addi %iright_377, %iright_378 : i32 loc(#loc212)
|
| 414 |
+
tt.reduce.return %iright_379 : i32 loc(#loc203)
|
| 415 |
+
}) : (tensor<32x2x8xi32>) -> tensor<32x8xi32> loc(#loc203)
|
| 416 |
+
%iright_253 = tt.expand_dims %iright_252 {axis = 1 : i32} : tensor<32x8xi32> -> tensor<32x1x8xi32> loc(#loc170)
|
| 417 |
+
%iright_254 = tt.broadcast %iright_253 : tensor<32x1x8xi32> -> tensor<32x2x8xi32> loc(#loc171)
|
| 418 |
+
%ileft_255 = tt.reshape %ileft_250 : tensor<32x2x8xi32> -> tensor<32x16xi32> loc(#loc172)
|
| 419 |
+
%iright_256 = tt.reshape %iright_254 : tensor<32x2x8xi32> -> tensor<32x16xi32> loc(#loc173)
|
| 420 |
+
%y_idx_257 = tt.reshape %new_idxs_244 : tensor<32x16xi32> -> tensor<32x2x8xi32> loc(#loc174)
|
| 421 |
+
%left_idx_258 = arith.muli %y_idx_257, %ileft_246 : tensor<32x2x8xi32> loc(#loc176)
|
| 422 |
+
%left_idx_259 = "tt.reduce"(%left_idx_258) <{axis = 1 : i32}> ({
|
| 423 |
+
^bb0(%left_idx_377: i32 loc(callsite(#loc1 at #loc177)), %left_idx_378: i32 loc(callsite(#loc1 at #loc177))):
|
| 424 |
+
%left_idx_379 = arith.addi %left_idx_377, %left_idx_378 : i32 loc(#loc213)
|
| 425 |
+
tt.reduce.return %left_idx_379 : i32 loc(#loc206)
|
| 426 |
+
}) : (tensor<32x2x8xi32>) -> tensor<32x8xi32> loc(#loc206)
|
| 427 |
+
%left_idx_260 = tt.expand_dims %left_idx_259 {axis = 1 : i32} : tensor<32x8xi32> -> tensor<32x1x8xi32> loc(#loc178)
|
| 428 |
+
%left_idx_261 = tt.broadcast %left_idx_260 : tensor<32x1x8xi32> -> tensor<32x2x8xi32> loc(#loc179)
|
| 429 |
+
%right_idx_262 = arith.muli %y_idx_257, %flip_134 : tensor<32x2x8xi32> loc(#loc181)
|
| 430 |
+
%right_idx_263 = "tt.reduce"(%right_idx_262) <{axis = 1 : i32}> ({
|
| 431 |
+
^bb0(%right_idx_377: i32 loc(callsite(#loc1 at #loc182)), %right_idx_378: i32 loc(callsite(#loc1 at #loc182))):
|
| 432 |
+
%right_idx_379 = arith.addi %right_idx_377, %right_idx_378 : i32 loc(#loc214)
|
| 433 |
+
tt.reduce.return %right_idx_379 : i32 loc(#loc209)
|
| 434 |
+
}) : (tensor<32x2x8xi32>) -> tensor<32x8xi32> loc(#loc209)
|
| 435 |
+
%right_idx_264 = tt.expand_dims %right_idx_263 {axis = 1 : i32} : tensor<32x8xi32> -> tensor<32x1x8xi32> loc(#loc183)
|
| 436 |
+
%right_idx_265 = tt.broadcast %right_idx_264 : tensor<32x1x8xi32> -> tensor<32x2x8xi32> loc(#loc184)
|
| 437 |
+
%left_idx_266 = tt.reshape %left_idx_261 : tensor<32x2x8xi32> -> tensor<32x16xi32> loc(#loc185)
|
| 438 |
+
%right_idx_267 = tt.reshape %right_idx_265 : tensor<32x2x8xi32> -> tensor<32x16xi32> loc(#loc186)
|
| 439 |
+
%cond_268 = arith.cmpi slt, %ileft_255, %iright_256 : tensor<32x16xi32> loc(#loc187)
|
| 440 |
+
%eq_269 = arith.cmpi eq, %ileft_255, %iright_256 : tensor<32x16xi32> loc(#loc188)
|
| 441 |
+
%cond_270 = arith.cmpi sgt, %left_idx_266, %right_idx_267 : tensor<32x16xi32> loc(#loc189)
|
| 442 |
+
%cond_271 = arith.andi %eq_269, %cond_270 : tensor<32x16xi1> loc(#loc190)
|
| 443 |
+
%cond_272 = arith.ori %cond_268, %cond_271 : tensor<32x16xi1> loc(#loc191)
|
| 444 |
+
%ret_273 = arith.xori %ileft_255, %iright_256 : tensor<32x16xi32> loc(#loc194)
|
| 445 |
+
%ret_274 = arith.select %cond_272, %ret_273, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc195)
|
| 446 |
+
%ret_275 = arith.xori %ret_241, %ret_274 : tensor<32x16xi32> loc(#loc196)
|
| 447 |
+
%new_idxs_276 = arith.xori %left_idx_266, %right_idx_267 : tensor<32x16xi32> loc(#loc197)
|
| 448 |
+
%new_idxs_277 = arith.select %cond_272, %new_idxs_276, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc198)
|
| 449 |
+
%new_idxs_278 = arith.xori %new_idxs_244, %new_idxs_277 : tensor<32x16xi32> loc(#loc199)
|
| 450 |
+
%y_279 = tt.reshape %ret_275 : tensor<32x16xi32> -> tensor<64x2x4xi32> loc(#loc162)
|
| 451 |
+
%ileft_280 = arith.muli %y_279, %ileft_137 : tensor<64x2x4xi32> loc(#loc164)
|
| 452 |
+
%ileft_281 = "tt.reduce"(%ileft_280) <{axis = 1 : i32}> ({
|
| 453 |
+
^bb0(%ileft_377: i32 loc(callsite(#loc1 at #loc165)), %ileft_378: i32 loc(callsite(#loc1 at #loc165))):
|
| 454 |
+
%ileft_379 = arith.addi %ileft_377, %ileft_378 : i32 loc(#loc211)
|
| 455 |
+
tt.reduce.return %ileft_379 : i32 loc(#loc201)
|
| 456 |
+
}) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc201)
|
| 457 |
+
%ileft_282 = tt.expand_dims %ileft_281 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc166)
|
| 458 |
+
%ileft_283 = tt.broadcast %ileft_282 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc167)
|
| 459 |
+
%iright_284 = arith.muli %y_279, %flip_59 : tensor<64x2x4xi32> loc(#loc168)
|
| 460 |
+
%iright_285 = "tt.reduce"(%iright_284) <{axis = 1 : i32}> ({
|
| 461 |
+
^bb0(%iright_377: i32 loc(callsite(#loc1 at #loc169)), %iright_378: i32 loc(callsite(#loc1 at #loc169))):
|
| 462 |
+
%iright_379 = arith.addi %iright_377, %iright_378 : i32 loc(#loc212)
|
| 463 |
+
tt.reduce.return %iright_379 : i32 loc(#loc203)
|
| 464 |
+
}) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc203)
|
| 465 |
+
%iright_286 = tt.expand_dims %iright_285 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc170)
|
| 466 |
+
%iright_287 = tt.broadcast %iright_286 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc171)
|
| 467 |
+
%ileft_288 = tt.reshape %ileft_283 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc172)
|
| 468 |
+
%iright_289 = tt.reshape %iright_287 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc173)
|
| 469 |
+
%y_idx_290 = tt.reshape %new_idxs_278 : tensor<32x16xi32> -> tensor<64x2x4xi32> loc(#loc174)
|
| 470 |
+
%left_idx_291 = arith.muli %y_idx_290, %ileft_137 : tensor<64x2x4xi32> loc(#loc176)
|
| 471 |
+
%left_idx_292 = "tt.reduce"(%left_idx_291) <{axis = 1 : i32}> ({
|
| 472 |
+
^bb0(%left_idx_377: i32 loc(callsite(#loc1 at #loc177)), %left_idx_378: i32 loc(callsite(#loc1 at #loc177))):
|
| 473 |
+
%left_idx_379 = arith.addi %left_idx_377, %left_idx_378 : i32 loc(#loc213)
|
| 474 |
+
tt.reduce.return %left_idx_379 : i32 loc(#loc206)
|
| 475 |
+
}) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc206)
|
| 476 |
+
%left_idx_293 = tt.expand_dims %left_idx_292 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc178)
|
| 477 |
+
%left_idx_294 = tt.broadcast %left_idx_293 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc179)
|
| 478 |
+
%right_idx_295 = arith.muli %y_idx_290, %flip_59 : tensor<64x2x4xi32> loc(#loc181)
|
| 479 |
+
%right_idx_296 = "tt.reduce"(%right_idx_295) <{axis = 1 : i32}> ({
|
| 480 |
+
^bb0(%right_idx_377: i32 loc(callsite(#loc1 at #loc182)), %right_idx_378: i32 loc(callsite(#loc1 at #loc182))):
|
| 481 |
+
%right_idx_379 = arith.addi %right_idx_377, %right_idx_378 : i32 loc(#loc214)
|
| 482 |
+
tt.reduce.return %right_idx_379 : i32 loc(#loc209)
|
| 483 |
+
}) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc209)
|
| 484 |
+
%right_idx_297 = tt.expand_dims %right_idx_296 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc183)
|
| 485 |
+
%right_idx_298 = tt.broadcast %right_idx_297 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc184)
|
| 486 |
+
%left_idx_299 = tt.reshape %left_idx_294 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc185)
|
| 487 |
+
%right_idx_300 = tt.reshape %right_idx_298 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc186)
|
| 488 |
+
%cond_301 = arith.cmpi slt, %ileft_288, %iright_289 : tensor<32x16xi32> loc(#loc187)
|
| 489 |
+
%eq_302 = arith.cmpi eq, %ileft_288, %iright_289 : tensor<32x16xi32> loc(#loc188)
|
| 490 |
+
%cond_303 = arith.cmpi sgt, %left_idx_299, %right_idx_300 : tensor<32x16xi32> loc(#loc189)
|
| 491 |
+
%cond_304 = arith.andi %eq_302, %cond_303 : tensor<32x16xi1> loc(#loc190)
|
| 492 |
+
%cond_305 = arith.ori %cond_301, %cond_304 : tensor<32x16xi1> loc(#loc191)
|
| 493 |
+
%ret_306 = arith.xori %ileft_288, %iright_289 : tensor<32x16xi32> loc(#loc194)
|
| 494 |
+
%ret_307 = arith.select %cond_305, %ret_306, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc195)
|
| 495 |
+
%ret_308 = arith.xori %ret_275, %ret_307 : tensor<32x16xi32> loc(#loc196)
|
| 496 |
+
%new_idxs_309 = arith.xori %left_idx_299, %right_idx_300 : tensor<32x16xi32> loc(#loc197)
|
| 497 |
+
%new_idxs_310 = arith.select %cond_305, %new_idxs_309, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc198)
|
| 498 |
+
%new_idxs_311 = arith.xori %new_idxs_278, %new_idxs_310 : tensor<32x16xi32> loc(#loc199)
|
| 499 |
+
%y_312 = tt.reshape %ret_308 : tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc162)
|
| 500 |
+
%ileft_313 = arith.muli %y_312, %ileft_62 : tensor<128x2x2xi32> loc(#loc164)
|
| 501 |
+
%ileft_314 = "tt.reduce"(%ileft_313) <{axis = 1 : i32}> ({
|
| 502 |
+
^bb0(%ileft_377: i32 loc(callsite(#loc1 at #loc165)), %ileft_378: i32 loc(callsite(#loc1 at #loc165))):
|
| 503 |
+
%ileft_379 = arith.addi %ileft_377, %ileft_378 : i32 loc(#loc211)
|
| 504 |
+
tt.reduce.return %ileft_379 : i32 loc(#loc201)
|
| 505 |
+
}) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc201)
|
| 506 |
+
%ileft_315 = tt.expand_dims %ileft_314 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc166)
|
| 507 |
+
%ileft_316 = tt.broadcast %ileft_315 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc167)
|
| 508 |
+
%iright_317 = arith.muli %y_312, %flip_22 : tensor<128x2x2xi32> loc(#loc168)
|
| 509 |
+
%iright_318 = "tt.reduce"(%iright_317) <{axis = 1 : i32}> ({
|
| 510 |
+
^bb0(%iright_377: i32 loc(callsite(#loc1 at #loc169)), %iright_378: i32 loc(callsite(#loc1 at #loc169))):
|
| 511 |
+
%iright_379 = arith.addi %iright_377, %iright_378 : i32 loc(#loc212)
|
| 512 |
+
tt.reduce.return %iright_379 : i32 loc(#loc203)
|
| 513 |
+
}) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc203)
|
| 514 |
+
%iright_319 = tt.expand_dims %iright_318 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc170)
|
| 515 |
+
%iright_320 = tt.broadcast %iright_319 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc171)
|
| 516 |
+
%ileft_321 = tt.reshape %ileft_316 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc172)
|
| 517 |
+
%iright_322 = tt.reshape %iright_320 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc173)
|
| 518 |
+
%y_idx_323 = tt.reshape %new_idxs_311 : tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc174)
|
| 519 |
+
%left_idx_324 = arith.muli %y_idx_323, %ileft_62 : tensor<128x2x2xi32> loc(#loc176)
|
| 520 |
+
%left_idx_325 = "tt.reduce"(%left_idx_324) <{axis = 1 : i32}> ({
|
| 521 |
+
^bb0(%left_idx_377: i32 loc(callsite(#loc1 at #loc177)), %left_idx_378: i32 loc(callsite(#loc1 at #loc177))):
|
| 522 |
+
%left_idx_379 = arith.addi %left_idx_377, %left_idx_378 : i32 loc(#loc213)
|
| 523 |
+
tt.reduce.return %left_idx_379 : i32 loc(#loc206)
|
| 524 |
+
}) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc206)
|
| 525 |
+
%left_idx_326 = tt.expand_dims %left_idx_325 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc178)
|
| 526 |
+
%left_idx_327 = tt.broadcast %left_idx_326 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc179)
|
| 527 |
+
%right_idx_328 = arith.muli %y_idx_323, %flip_22 : tensor<128x2x2xi32> loc(#loc181)
|
| 528 |
+
%right_idx_329 = "tt.reduce"(%right_idx_328) <{axis = 1 : i32}> ({
|
| 529 |
+
^bb0(%right_idx_377: i32 loc(callsite(#loc1 at #loc182)), %right_idx_378: i32 loc(callsite(#loc1 at #loc182))):
|
| 530 |
+
%right_idx_379 = arith.addi %right_idx_377, %right_idx_378 : i32 loc(#loc214)
|
| 531 |
+
tt.reduce.return %right_idx_379 : i32 loc(#loc209)
|
| 532 |
+
}) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc209)
|
| 533 |
+
%right_idx_330 = tt.expand_dims %right_idx_329 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc183)
|
| 534 |
+
%right_idx_331 = tt.broadcast %right_idx_330 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc184)
|
| 535 |
+
%left_idx_332 = tt.reshape %left_idx_327 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc185)
|
| 536 |
+
%right_idx_333 = tt.reshape %right_idx_331 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc186)
|
| 537 |
+
%cond_334 = arith.cmpi slt, %ileft_321, %iright_322 : tensor<32x16xi32> loc(#loc187)
|
| 538 |
+
%eq_335 = arith.cmpi eq, %ileft_321, %iright_322 : tensor<32x16xi32> loc(#loc188)
|
| 539 |
+
%cond_336 = arith.cmpi sgt, %left_idx_332, %right_idx_333 : tensor<32x16xi32> loc(#loc189)
|
| 540 |
+
%cond_337 = arith.andi %eq_335, %cond_336 : tensor<32x16xi1> loc(#loc190)
|
| 541 |
+
%cond_338 = arith.ori %cond_334, %cond_337 : tensor<32x16xi1> loc(#loc191)
|
| 542 |
+
%ret_339 = arith.xori %ileft_321, %iright_322 : tensor<32x16xi32> loc(#loc194)
|
| 543 |
+
%ret_340 = arith.select %cond_338, %ret_339, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc195)
|
| 544 |
+
%ret_341 = arith.xori %ret_308, %ret_340 : tensor<32x16xi32> loc(#loc196)
|
| 545 |
+
%new_idxs_342 = arith.xori %left_idx_332, %right_idx_333 : tensor<32x16xi32> loc(#loc197)
|
| 546 |
+
%new_idxs_343 = arith.select %cond_338, %new_idxs_342, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc198)
|
| 547 |
+
%new_idxs_344 = arith.xori %new_idxs_311, %new_idxs_343 : tensor<32x16xi32> loc(#loc199)
|
| 548 |
+
%y_345 = tt.reshape %ret_341 : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc162)
|
| 549 |
+
%ileft_346 = arith.muli %y_345, %ileft : tensor<256x2x1xi32> loc(#loc164)
|
| 550 |
+
%ileft_347 = "tt.reduce"(%ileft_346) <{axis = 1 : i32}> ({
|
| 551 |
+
^bb0(%ileft_377: i32 loc(callsite(#loc1 at #loc165)), %ileft_378: i32 loc(callsite(#loc1 at #loc165))):
|
| 552 |
+
%ileft_379 = arith.addi %ileft_377, %ileft_378 : i32 loc(#loc211)
|
| 553 |
+
tt.reduce.return %ileft_379 : i32 loc(#loc201)
|
| 554 |
+
}) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc201)
|
| 555 |
+
%ileft_348 = tt.expand_dims %ileft_347 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc166)
|
| 556 |
+
%ileft_349 = tt.broadcast %ileft_348 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc167)
|
| 557 |
+
%iright_350 = arith.muli %y_345, %iright : tensor<256x2x1xi32> loc(#loc168)
|
| 558 |
+
%iright_351 = "tt.reduce"(%iright_350) <{axis = 1 : i32}> ({
|
| 559 |
+
^bb0(%iright_377: i32 loc(callsite(#loc1 at #loc169)), %iright_378: i32 loc(callsite(#loc1 at #loc169))):
|
| 560 |
+
%iright_379 = arith.addi %iright_377, %iright_378 : i32 loc(#loc212)
|
| 561 |
+
tt.reduce.return %iright_379 : i32 loc(#loc203)
|
| 562 |
+
}) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc203)
|
| 563 |
+
%iright_352 = tt.expand_dims %iright_351 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc170)
|
| 564 |
+
%iright_353 = tt.broadcast %iright_352 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc171)
|
| 565 |
+
%ileft_354 = tt.reshape %ileft_349 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc172)
|
| 566 |
+
%iright_355 = tt.reshape %iright_353 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc173)
|
| 567 |
+
%y_idx_356 = tt.reshape %new_idxs_344 : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc174)
|
| 568 |
+
%left_idx_357 = arith.muli %y_idx_356, %ileft : tensor<256x2x1xi32> loc(#loc176)
|
| 569 |
+
%left_idx_358 = "tt.reduce"(%left_idx_357) <{axis = 1 : i32}> ({
|
| 570 |
+
^bb0(%left_idx_377: i32 loc(callsite(#loc1 at #loc177)), %left_idx_378: i32 loc(callsite(#loc1 at #loc177))):
|
| 571 |
+
%left_idx_379 = arith.addi %left_idx_377, %left_idx_378 : i32 loc(#loc213)
|
| 572 |
+
tt.reduce.return %left_idx_379 : i32 loc(#loc206)
|
| 573 |
+
}) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc206)
|
| 574 |
+
%left_idx_359 = tt.expand_dims %left_idx_358 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc178)
|
| 575 |
+
%left_idx_360 = tt.broadcast %left_idx_359 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc179)
|
| 576 |
+
%right_idx_361 = arith.muli %y_idx_356, %iright : tensor<256x2x1xi32> loc(#loc181)
|
| 577 |
+
%right_idx_362 = "tt.reduce"(%right_idx_361) <{axis = 1 : i32}> ({
|
| 578 |
+
^bb0(%right_idx_377: i32 loc(callsite(#loc1 at #loc182)), %right_idx_378: i32 loc(callsite(#loc1 at #loc182))):
|
| 579 |
+
%right_idx_379 = arith.addi %right_idx_377, %right_idx_378 : i32 loc(#loc214)
|
| 580 |
+
tt.reduce.return %right_idx_379 : i32 loc(#loc209)
|
| 581 |
+
}) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc209)
|
| 582 |
+
%right_idx_363 = tt.expand_dims %right_idx_362 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc183)
|
| 583 |
+
%right_idx_364 = tt.broadcast %right_idx_363 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc184)
|
| 584 |
+
%left_idx_365 = tt.reshape %left_idx_360 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc185)
|
| 585 |
+
%right_idx_366 = tt.reshape %right_idx_364 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc186)
|
| 586 |
+
%cond_367 = arith.cmpi slt, %ileft_354, %iright_355 : tensor<32x16xi32> loc(#loc187)
|
| 587 |
+
%eq_368 = arith.cmpi eq, %ileft_354, %iright_355 : tensor<32x16xi32> loc(#loc188)
|
| 588 |
+
%cond_369 = arith.cmpi sgt, %left_idx_365, %right_idx_366 : tensor<32x16xi32> loc(#loc189)
|
| 589 |
+
%cond_370 = arith.andi %eq_368, %cond_369 : tensor<32x16xi1> loc(#loc190)
|
| 590 |
+
%cond_371 = arith.ori %cond_367, %cond_370 : tensor<32x16xi1> loc(#loc191)
|
| 591 |
+
%new_idxs_372 = arith.xori %left_idx_365, %right_idx_366 : tensor<32x16xi32> loc(#loc197)
|
| 592 |
+
%new_idxs_373 = arith.select %cond_371, %new_idxs_372, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc198)
|
| 593 |
+
%new_idxs_374 = arith.xori %new_idxs_344, %new_idxs_373 : tensor<32x16xi32> loc(#loc199)
|
| 594 |
+
%tmp7 = arith.extsi %tmp0_19 : tensor<32x16xi32> to tensor<32x16xi64> loc(#loc149)
|
| 595 |
+
%tmp10_375 = arith.select %tmp0_18, %tmp7, %tmp10 : tensor<32x16xi1>, tensor<32x16xi64> loc(#loc86)
|
| 596 |
+
%tmp11 = "tt.reduce"(%tmp10_375) <{axis = 1 : i32}> ({
|
| 597 |
+
^bb0(%tmp11_377: i64 loc(callsite(#loc1 at #loc150)), %tmp11_378: i64 loc(callsite(#loc1 at #loc150))):
|
| 598 |
+
%tmp11_379 = arith.addi %tmp11_377, %tmp11_378 : i64 loc(#loc200)
|
| 599 |
+
tt.reduce.return %tmp11_379 : i64 loc(#loc160)
|
| 600 |
+
}) : (tensor<32x16xi64>) -> tensor<32xi64> loc(#loc160)
|
| 601 |
+
%tmp11_376 = tt.expand_dims %tmp11 {axis = 1 : i32} : tensor<32xi64> -> tensor<32x1xi64> loc(#loc151)
|
| 602 |
+
%tmp14 = arith.trunci %tmp11_376 : tensor<32x1xi64> to tensor<32x1xi32> loc(#loc152)
|
| 603 |
+
%0 = arith.muli %xindex_6, %cst_2 : tensor<32x1xi32> loc(#loc73)
|
| 604 |
+
%1 = tt.broadcast %r0_index_8 : tensor<1x16xi32> -> tensor<32x16xi32> loc(#loc74)
|
| 605 |
+
%2 = tt.broadcast %0 : tensor<32x1xi32> -> tensor<32x16xi32> loc(#loc74)
|
| 606 |
+
%3 = arith.addi %1, %2 : tensor<32x16xi32> loc(#loc74)
|
| 607 |
+
%4 = tt.splat %out_ptr2 : !tt.ptr<i32> -> tensor<32x16x!tt.ptr<i32>> loc(#loc75)
|
| 608 |
+
%5 = tt.addptr %4, %3 : tensor<32x16x!tt.ptr<i32>>, tensor<32x16xi32> loc(#loc75)
|
| 609 |
+
tt.store %5, %new_idxs_374, %tmp0_18 : tensor<32x16x!tt.ptr<i32>> loc(#loc76)
|
| 610 |
+
%6 = tt.splat %out_ptr3 : !tt.ptr<i32> -> tensor<32x1x!tt.ptr<i32>> loc(#loc77)
|
| 611 |
+
%7 = tt.addptr %6, %xindex_6 : tensor<32x1x!tt.ptr<i32>>, tensor<32x1xi32> loc(#loc77)
|
| 612 |
+
tt.store %7, %tmp14, %xmask_7 : tensor<32x1x!tt.ptr<i32>> loc(#loc78)
|
| 613 |
+
tt.return loc(#loc79)
|
| 614 |
+
} loc(#loc)
|
| 615 |
+
} loc(#loc)
|
| 616 |
+
#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":44:34)
|
| 617 |
+
#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":36:49)
|
| 618 |
+
#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":36:38)
|
| 619 |
+
#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":26:21)
|
| 620 |
+
#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":24:28)
|
| 621 |
+
#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":24:33)
|
| 622 |
+
#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":25:36)
|
| 623 |
+
#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":25:44)
|
| 624 |
+
#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":25:23)
|
| 625 |
+
#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":27:28)
|
| 626 |
+
#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":27:38)
|
| 627 |
+
#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":33:19)
|
| 628 |
+
#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":34:19)
|
| 629 |
+
#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":36:35)
|
| 630 |
+
#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":36:45)
|
| 631 |
+
#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":36:30)
|
| 632 |
+
#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":36:54)
|
| 633 |
+
#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":38:19)
|
| 634 |
+
#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":40:33)
|
| 635 |
+
#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:41)
|
| 636 |
+
#loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:44)
|
| 637 |
+
#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:60)
|
| 638 |
+
#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:68)
|
| 639 |
+
#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":533:22)
|
| 640 |
+
#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":537:21)
|
| 641 |
+
#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:40)
|
| 642 |
+
#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36)
|
| 643 |
+
#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15)
|
| 644 |
+
#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:65)
|
| 645 |
+
#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:78)
|
| 646 |
+
#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:41)
|
| 647 |
+
#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:67)
|
| 648 |
+
#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:80)
|
| 649 |
+
#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":540:30)
|
| 650 |
+
#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":541:32)
|
| 651 |
+
#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":546:29)
|
| 652 |
+
#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:36)
|
| 653 |
+
#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:23)
|
| 654 |
+
#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":290:25)
|
| 655 |
+
#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:53)
|
| 656 |
+
#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:66)
|
| 657 |
+
#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:37)
|
| 658 |
+
#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:23)
|
| 659 |
+
#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:54)
|
| 660 |
+
#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:67)
|
| 661 |
+
#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":553:36)
|
| 662 |
+
#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":554:38)
|
| 663 |
+
#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":574:22)
|
| 664 |
+
#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":591:21)
|
| 665 |
+
#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:40)
|
| 666 |
+
#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:29)
|
| 667 |
+
#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:23)
|
| 668 |
+
#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:19)
|
| 669 |
+
#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:28)
|
| 670 |
+
#loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:38)
|
| 671 |
+
#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:46)
|
| 672 |
+
#loc65 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:15)
|
| 673 |
+
#loc66 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:48)
|
| 674 |
+
#loc67 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:59)
|
| 675 |
+
#loc68 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:22)
|
| 676 |
+
#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":42:19)
|
| 677 |
+
#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":45:29)
|
| 678 |
+
#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":48:21)
|
| 679 |
+
#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":49:35)
|
| 680 |
+
#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":49:32)
|
| 681 |
+
#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":49:25)
|
| 682 |
+
#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":49:47)
|
| 683 |
+
#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":50:25)
|
| 684 |
+
#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":50:37)
|
| 685 |
+
#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pe/cpecz443wnneogc65oicauauoytwy7k6ryeyv24laczmux6pdi2b.py":50:4)
|
| 686 |
+
#loc85 = loc(callsite(#loc1 at #loc2))
|
| 687 |
+
#loc86 = loc("tmp10"(#loc3))
|
| 688 |
+
#loc87 = loc("tmp0"(#loc4))
|
| 689 |
+
#loc88 = loc("tmp0"(#loc5))
|
| 690 |
+
#loc89 = loc("xmask"(#loc6))
|
| 691 |
+
#loc90 = loc("xoffset"(#loc7))
|
| 692 |
+
#loc91 = loc("xoffset"(#loc8))
|
| 693 |
+
#loc92 = loc("xindex"(#loc9))
|
| 694 |
+
#loc93 = loc("xindex"(#loc10))
|
| 695 |
+
#loc94 = loc("xindex"(#loc11))
|
| 696 |
+
#loc95 = loc("r0_index"(#loc12))
|
| 697 |
+
#loc96 = loc("r0_index"(#loc13))
|
| 698 |
+
#loc97 = loc("x0"(#loc14))
|
| 699 |
+
#loc98 = loc("x1"(#loc15))
|
| 700 |
+
#loc99 = loc("tmp0"(#loc16))
|
| 701 |
+
#loc100 = loc("tmp0"(#loc17))
|
| 702 |
+
#loc101 = loc("tmp0"(#loc18))
|
| 703 |
+
#loc102 = loc("tmp0"(#loc19))
|
| 704 |
+
#loc103 = loc("tmp2"(#loc20))
|
| 705 |
+
#loc104 = loc("tmp4"(#loc21))
|
| 706 |
+
#loc105 = loc("flip"(#loc22))
|
| 707 |
+
#loc107 = loc("flip"(#loc24))
|
| 708 |
+
#loc108 = loc("flip"(#loc25))
|
| 709 |
+
#loc109 = loc("flip"(#loc26))
|
| 710 |
+
#loc110 = loc("y"(#loc27))
|
| 711 |
+
#loc111 = loc("left_mask"(#loc29))
|
| 712 |
+
#loc112 = loc("ileft"(#loc30))
|
| 713 |
+
#loc114 = loc("ileft"(#loc34))
|
| 714 |
+
#loc115 = loc("ileft"(#loc35))
|
| 715 |
+
#loc116 = loc("iright"(#loc36))
|
| 716 |
+
#loc118 = loc("iright"(#loc38))
|
| 717 |
+
#loc119 = loc("iright"(#loc39))
|
| 718 |
+
#loc120 = loc("ileft"(#loc40))
|
| 719 |
+
#loc121 = loc("iright"(#loc41))
|
| 720 |
+
#loc122 = loc("y_idx"(#loc42))
|
| 721 |
+
#loc123 = loc("left_idx"(#loc43))
|
| 722 |
+
#loc124 = loc("left_idx"(#loc44))
|
| 723 |
+
#loc125 = loc("input"(#loc45))
|
| 724 |
+
#loc127 = loc("left_idx"(#loc47))
|
| 725 |
+
#loc128 = loc("left_idx"(#loc48))
|
| 726 |
+
#loc129 = loc("right_idx"(#loc49))
|
| 727 |
+
#loc130 = loc("right_idx"(#loc50))
|
| 728 |
+
#loc132 = loc("right_idx"(#loc52))
|
| 729 |
+
#loc133 = loc("right_idx"(#loc53))
|
| 730 |
+
#loc134 = loc("left_idx"(#loc54))
|
| 731 |
+
#loc135 = loc("right_idx"(#loc55))
|
| 732 |
+
#loc136 = loc("cond"(#loc56))
|
| 733 |
+
#loc137 = loc("eq"(#loc57))
|
| 734 |
+
#loc138 = loc("cond"(#loc58))
|
| 735 |
+
#loc139 = loc("cond"(#loc59))
|
| 736 |
+
#loc140 = loc("cond"(#loc60))
|
| 737 |
+
#loc141 = loc("cond"(#loc61))
|
| 738 |
+
#loc142 = loc("cond"(#loc62))
|
| 739 |
+
#loc143 = loc("ret"(#loc63))
|
| 740 |
+
#loc144 = loc("ret"(#loc64))
|
| 741 |
+
#loc145 = loc("ret"(#loc65))
|
| 742 |
+
#loc146 = loc("new_idxs"(#loc66))
|
| 743 |
+
#loc147 = loc("new_idxs"(#loc67))
|
| 744 |
+
#loc148 = loc("new_idxs"(#loc68))
|
| 745 |
+
#loc149 = loc("tmp7"(#loc69))
|
| 746 |
+
#loc151 = loc("tmp11"(#loc71))
|
| 747 |
+
#loc152 = loc("tmp14"(#loc72))
|
| 748 |
+
#loc153 = loc(callsite(#loc105 at #loc106))
|
| 749 |
+
#loc154 = loc(callsite(#loc107 at #loc106))
|
| 750 |
+
#loc155 = loc(callsite(#loc108 at #loc106))
|
| 751 |
+
#loc156 = loc(callsite(#loc109 at #loc106))
|
| 752 |
+
#loc158 = loc("cond"(#loc136))
|
| 753 |
+
#loc159 = loc("eq"(#loc137))
|
| 754 |
+
#loc160 = loc(callsite(#loc31 at #loc150))
|
| 755 |
+
#loc162 = loc(callsite(#loc110 at #loc157))
|
| 756 |
+
#loc163 = loc(callsite(#loc111 at #loc157))
|
| 757 |
+
#loc164 = loc(callsite(#loc112 at #loc157))
|
| 758 |
+
#loc166 = loc(callsite(#loc114 at #loc157))
|
| 759 |
+
#loc167 = loc(callsite(#loc115 at #loc157))
|
| 760 |
+
#loc168 = loc(callsite(#loc116 at #loc157))
|
| 761 |
+
#loc170 = loc(callsite(#loc118 at #loc157))
|
| 762 |
+
#loc171 = loc(callsite(#loc119 at #loc157))
|
| 763 |
+
#loc172 = loc(callsite(#loc120 at #loc157))
|
| 764 |
+
#loc173 = loc(callsite(#loc121 at #loc157))
|
| 765 |
+
#loc174 = loc(callsite(#loc122 at #loc157))
|
| 766 |
+
#loc175 = loc(callsite(#loc123 at #loc157))
|
| 767 |
+
#loc176 = loc(callsite(#loc124 at #loc157))
|
| 768 |
+
#loc178 = loc(callsite(#loc127 at #loc157))
|
| 769 |
+
#loc179 = loc(callsite(#loc128 at #loc157))
|
| 770 |
+
#loc180 = loc(callsite(#loc129 at #loc157))
|
| 771 |
+
#loc181 = loc(callsite(#loc130 at #loc157))
|
| 772 |
+
#loc183 = loc(callsite(#loc132 at #loc157))
|
| 773 |
+
#loc184 = loc(callsite(#loc133 at #loc157))
|
| 774 |
+
#loc185 = loc(callsite(#loc134 at #loc157))
|
| 775 |
+
#loc186 = loc(callsite(#loc135 at #loc157))
|
| 776 |
+
#loc187 = loc(callsite(#loc158 at #loc157))
|
| 777 |
+
#loc188 = loc(callsite(#loc159 at #loc157))
|
| 778 |
+
#loc189 = loc(callsite(#loc138 at #loc157))
|
| 779 |
+
#loc190 = loc(callsite(#loc139 at #loc157))
|
| 780 |
+
#loc191 = loc(callsite(#loc140 at #loc157))
|
| 781 |
+
#loc192 = loc(callsite(#loc141 at #loc157))
|
| 782 |
+
#loc193 = loc(callsite(#loc142 at #loc157))
|
| 783 |
+
#loc194 = loc(callsite(#loc143 at #loc157))
|
| 784 |
+
#loc195 = loc(callsite(#loc144 at #loc157))
|
| 785 |
+
#loc196 = loc(callsite(#loc145 at #loc157))
|
| 786 |
+
#loc197 = loc(callsite(#loc146 at #loc157))
|
| 787 |
+
#loc198 = loc(callsite(#loc147 at #loc157))
|
| 788 |
+
#loc199 = loc(callsite(#loc148 at #loc157))
|
| 789 |
+
#loc200 = loc(callsite(#loc33 at #loc160))
|
| 790 |
+
#loc201 = loc(callsite(#loc31 at #loc165))
|
| 791 |
+
#loc203 = loc(callsite(#loc31 at #loc169))
|
| 792 |
+
#loc205 = loc(callsite(#loc125 at #loc177))
|
| 793 |
+
#loc206 = loc(callsite(#loc31 at #loc177))
|
| 794 |
+
#loc208 = loc(callsite(#loc125 at #loc182))
|
| 795 |
+
#loc209 = loc(callsite(#loc31 at #loc182))
|
| 796 |
+
#loc211 = loc(callsite(#loc33 at #loc201))
|
| 797 |
+
#loc212 = loc(callsite(#loc33 at #loc203))
|
| 798 |
+
#loc213 = loc(callsite(#loc33 at #loc206))
|
| 799 |
+
#loc214 = loc(callsite(#loc33 at #loc209))
|
SpecForge-ext/cache/compiled_kernels/triton/7/BZGMO2ROUNNVUMEUFCQSSF6FGNZS2MEIPAEECDHYOJVVX3KB75AA/__grp__triton_red_fused_argmax_1.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"child_paths": {"triton_red_fused_argmax_1.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/BZGMO2ROUNNVUMEUFCQSSF6FGNZS2MEIPAEECDHYOJVVX3KB75AA/triton_red_fused_argmax_1.source", "triton_red_fused_argmax_1.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/BZGMO2ROUNNVUMEUFCQSSF6FGNZS2MEIPAEECDHYOJVVX3KB75AA/triton_red_fused_argmax_1.ttir", "triton_red_fused_argmax_1.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/BZGMO2ROUNNVUMEUFCQSSF6FGNZS2MEIPAEECDHYOJVVX3KB75AA/triton_red_fused_argmax_1.ttgir", "triton_red_fused_argmax_1.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/BZGMO2ROUNNVUMEUFCQSSF6FGNZS2MEIPAEECDHYOJVVX3KB75AA/triton_red_fused_argmax_1.llir", "triton_red_fused_argmax_1.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/BZGMO2ROUNNVUMEUFCQSSF6FGNZS2MEIPAEECDHYOJVVX3KB75AA/triton_red_fused_argmax_1.ptx", "triton_red_fused_argmax_1.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/BZGMO2ROUNNVUMEUFCQSSF6FGNZS2MEIPAEECDHYOJVVX3KB75AA/triton_red_fused_argmax_1.cubin", "triton_red_fused_argmax_1.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/BZGMO2ROUNNVUMEUFCQSSF6FGNZS2MEIPAEECDHYOJVVX3KB75AA/triton_red_fused_argmax_1.json"}}
|
SpecForge-ext/cache/compiled_kernels/triton/7/BZGMO2ROUNNVUMEUFCQSSF6FGNZS2MEIPAEECDHYOJVVX3KB75AA/triton_red_fused_argmax_1.cubin
ADDED
|
Binary file (72.6 kB). View file
|
|
|
SpecForge-ext/cache/compiled_kernels/triton/7/BZGMO2ROUNNVUMEUFCQSSF6FGNZS2MEIPAEECDHYOJVVX3KB75AA/triton_red_fused_argmax_1.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"hash": "0e4cc76a2ea35b5a309428a12917c533732d30887808410cf8726b5bed41ff40", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 16, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 1024, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused_argmax_1"}
|
SpecForge-ext/cache/compiled_kernels/triton/7/BZGMO2ROUNNVUMEUFCQSSF6FGNZS2MEIPAEECDHYOJVVX3KB75AA/triton_red_fused_argmax_1.llir
ADDED
|
@@ -0,0 +1,1166 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
; ModuleID = 'LLVMDialectModule'
|
| 2 |
+
source_filename = "LLVMDialectModule"
|
| 3 |
+
target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64"
|
| 4 |
+
|
| 5 |
+
@global_smem = external addrspace(3) global [0 x i8], align 16
|
| 6 |
+
|
| 7 |
+
; Function Attrs: nounwind
|
| 8 |
+
define ptx_kernel void @triton_red_fused_argmax_1(ptr addrspace(1) %0, ptr addrspace(1) %1, i64 %2, i64 %3, i32 %4, i32 %5, ptr addrspace(1) readnone captures(none) %6, ptr addrspace(1) readnone captures(none) %7) local_unnamed_addr #0 !dbg !4 {
|
| 9 |
+
%9 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7
|
| 10 |
+
%10 = shl i32 %9, 6, !dbg !8
|
| 11 |
+
%11 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9
|
| 12 |
+
%12 = and i32 %11, 448, !dbg !9
|
| 13 |
+
%13 = and i32 %11, 63, !dbg !9
|
| 14 |
+
%14 = lshr exact i32 %12, 6, !dbg !9
|
| 15 |
+
%15 = or disjoint i32 %14, 8, !dbg !9
|
| 16 |
+
%16 = or disjoint i32 %14, 16, !dbg !9
|
| 17 |
+
%17 = or disjoint i32 %14, 24, !dbg !9
|
| 18 |
+
%18 = insertelement <4 x i32> poison, i32 %14, i64 0, !dbg !9
|
| 19 |
+
%19 = shufflevector <4 x i32> %18, <4 x i32> poison, <4 x i32> zeroinitializer, !dbg !9
|
| 20 |
+
%20 = or disjoint <4 x i32> %19, <i32 56, i32 48, i32 40, i32 32>, !dbg !9
|
| 21 |
+
%21 = insertelement <8 x i32> poison, i32 %17, i64 4, !dbg !10
|
| 22 |
+
%22 = insertelement <8 x i32> %21, i32 %16, i64 5, !dbg !10
|
| 23 |
+
%23 = insertelement <8 x i32> %22, i32 %15, i64 6, !dbg !10
|
| 24 |
+
%24 = insertelement <8 x i32> %23, i32 %14, i64 7, !dbg !10
|
| 25 |
+
%25 = shufflevector <4 x i32> %20, <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !10
|
| 26 |
+
%26 = shufflevector <8 x i32> %25, <8 x i32> %24, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>, !dbg !10
|
| 27 |
+
%27 = insertelement <8 x i32> poison, i32 %10, i64 0, !dbg !10
|
| 28 |
+
%28 = shufflevector <8 x i32> %27, <8 x i32> poison, <8 x i32> zeroinitializer, !dbg !10
|
| 29 |
+
%29 = or disjoint <8 x i32> %26, %28, !dbg !10
|
| 30 |
+
%30 = insertelement <8 x i32> poison, i32 %4, i64 0, !dbg !11
|
| 31 |
+
%31 = shufflevector <8 x i32> %30, <8 x i32> poison, <8 x i32> zeroinitializer, !dbg !11
|
| 32 |
+
%32 = icmp slt <8 x i32> %29, %31, !dbg !11
|
| 33 |
+
%33 = extractelement <8 x i32> %29, i64 7, !dbg !12
|
| 34 |
+
%34 = sext i32 %33 to i64, !dbg !12
|
| 35 |
+
%35 = extractelement <8 x i32> %29, i64 6, !dbg !12
|
| 36 |
+
%36 = sext i32 %35 to i64, !dbg !12
|
| 37 |
+
%37 = extractelement <8 x i32> %29, i64 5, !dbg !12
|
| 38 |
+
%38 = sext i32 %37 to i64, !dbg !12
|
| 39 |
+
%39 = extractelement <8 x i32> %29, i64 4, !dbg !12
|
| 40 |
+
%40 = sext i32 %39 to i64, !dbg !12
|
| 41 |
+
%41 = extractelement <8 x i32> %29, i64 3, !dbg !12
|
| 42 |
+
%42 = sext i32 %41 to i64, !dbg !12
|
| 43 |
+
%43 = extractelement <8 x i32> %29, i64 2, !dbg !12
|
| 44 |
+
%44 = sext i32 %43 to i64, !dbg !12
|
| 45 |
+
%45 = extractelement <8 x i32> %29, i64 1, !dbg !12
|
| 46 |
+
%46 = sext i32 %45 to i64, !dbg !12
|
| 47 |
+
%47 = extractelement <8 x i32> %29, i64 0, !dbg !12
|
| 48 |
+
%48 = sext i32 %47 to i64, !dbg !12
|
| 49 |
+
%.frozen = freeze i64 %34, !dbg !13
|
| 50 |
+
%.frozen70 = freeze i64 %2, !dbg !13
|
| 51 |
+
%49 = sdiv i64 %.frozen, %.frozen70, !dbg !13
|
| 52 |
+
%50 = mul i64 %49, %.frozen70, !dbg !12
|
| 53 |
+
%.decomposed = sub i64 %.frozen, %50, !dbg !12
|
| 54 |
+
%.frozen71 = freeze i64 %36, !dbg !13
|
| 55 |
+
%.frozen72 = freeze i64 %2, !dbg !13
|
| 56 |
+
%51 = sdiv i64 %.frozen71, %.frozen72, !dbg !13
|
| 57 |
+
%52 = mul i64 %51, %.frozen72, !dbg !12
|
| 58 |
+
%.decomposed73 = sub i64 %.frozen71, %52, !dbg !12
|
| 59 |
+
%.frozen74 = freeze i64 %38, !dbg !13
|
| 60 |
+
%.frozen75 = freeze i64 %2, !dbg !13
|
| 61 |
+
%53 = sdiv i64 %.frozen74, %.frozen75, !dbg !13
|
| 62 |
+
%54 = mul i64 %53, %.frozen75, !dbg !12
|
| 63 |
+
%.decomposed76 = sub i64 %.frozen74, %54, !dbg !12
|
| 64 |
+
%.frozen77 = freeze i64 %40, !dbg !13
|
| 65 |
+
%.frozen78 = freeze i64 %2, !dbg !13
|
| 66 |
+
%55 = sdiv i64 %.frozen77, %.frozen78, !dbg !13
|
| 67 |
+
%56 = mul i64 %55, %.frozen78, !dbg !12
|
| 68 |
+
%.decomposed79 = sub i64 %.frozen77, %56, !dbg !12
|
| 69 |
+
%.frozen80 = freeze i64 %42, !dbg !13
|
| 70 |
+
%.frozen81 = freeze i64 %2, !dbg !13
|
| 71 |
+
%57 = sdiv i64 %.frozen80, %.frozen81, !dbg !13
|
| 72 |
+
%58 = mul i64 %57, %.frozen81, !dbg !12
|
| 73 |
+
%.decomposed82 = sub i64 %.frozen80, %58, !dbg !12
|
| 74 |
+
%.frozen83 = freeze i64 %44, !dbg !13
|
| 75 |
+
%.frozen84 = freeze i64 %2, !dbg !13
|
| 76 |
+
%59 = sdiv i64 %.frozen83, %.frozen84, !dbg !13
|
| 77 |
+
%60 = mul i64 %59, %.frozen84, !dbg !12
|
| 78 |
+
%.decomposed85 = sub i64 %.frozen83, %60, !dbg !12
|
| 79 |
+
%.frozen86 = freeze i64 %46, !dbg !13
|
| 80 |
+
%.frozen87 = freeze i64 %2, !dbg !13
|
| 81 |
+
%61 = sdiv i64 %.frozen86, %.frozen87, !dbg !13
|
| 82 |
+
%62 = mul i64 %61, %.frozen87, !dbg !12
|
| 83 |
+
%.decomposed88 = sub i64 %.frozen86, %62, !dbg !12
|
| 84 |
+
%.frozen89 = freeze i64 %48, !dbg !13
|
| 85 |
+
%.frozen90 = freeze i64 %2, !dbg !13
|
| 86 |
+
%63 = sdiv i64 %.frozen89, %.frozen90, !dbg !13
|
| 87 |
+
%64 = mul i64 %63, %.frozen90, !dbg !12
|
| 88 |
+
%.decomposed91 = sub i64 %.frozen89, %64, !dbg !12
|
| 89 |
+
%65 = mul i64 %49, %3, !dbg !14
|
| 90 |
+
%66 = mul i64 %51, %3, !dbg !14
|
| 91 |
+
%67 = mul i64 %53, %3, !dbg !14
|
| 92 |
+
%68 = mul i64 %55, %3, !dbg !14
|
| 93 |
+
%69 = mul i64 %57, %3, !dbg !14
|
| 94 |
+
%70 = mul i64 %59, %3, !dbg !14
|
| 95 |
+
%71 = mul i64 %61, %3, !dbg !14
|
| 96 |
+
%72 = mul i64 %63, %3, !dbg !14
|
| 97 |
+
%.idx = mul nsw i64 %.decomposed, 128000
|
| 98 |
+
%73 = getelementptr i8, ptr addrspace(1) %0, i64 %.idx
|
| 99 |
+
%invariant.gep = getelementptr float, ptr addrspace(1) %73, i64 %65, !dbg !15
|
| 100 |
+
%.idx1 = mul nsw i64 %.decomposed73, 128000
|
| 101 |
+
%74 = getelementptr i8, ptr addrspace(1) %0, i64 %.idx1
|
| 102 |
+
%invariant.gep9 = getelementptr float, ptr addrspace(1) %74, i64 %66, !dbg !15
|
| 103 |
+
%.idx2 = mul nsw i64 %.decomposed76, 128000
|
| 104 |
+
%75 = getelementptr i8, ptr addrspace(1) %0, i64 %.idx2
|
| 105 |
+
%invariant.gep11 = getelementptr float, ptr addrspace(1) %75, i64 %67, !dbg !15
|
| 106 |
+
%.idx3 = mul nsw i64 %.decomposed79, 128000
|
| 107 |
+
%76 = getelementptr i8, ptr addrspace(1) %0, i64 %.idx3
|
| 108 |
+
%invariant.gep13 = getelementptr float, ptr addrspace(1) %76, i64 %68, !dbg !15
|
| 109 |
+
%.idx4 = mul nsw i64 %.decomposed82, 128000
|
| 110 |
+
%77 = getelementptr i8, ptr addrspace(1) %0, i64 %.idx4
|
| 111 |
+
%invariant.gep15 = getelementptr float, ptr addrspace(1) %77, i64 %69, !dbg !15
|
| 112 |
+
%.idx5 = mul nsw i64 %.decomposed85, 128000
|
| 113 |
+
%78 = getelementptr i8, ptr addrspace(1) %0, i64 %.idx5
|
| 114 |
+
%invariant.gep17 = getelementptr float, ptr addrspace(1) %78, i64 %70, !dbg !15
|
| 115 |
+
%.idx6 = mul nsw i64 %.decomposed88, 128000
|
| 116 |
+
%79 = getelementptr i8, ptr addrspace(1) %0, i64 %.idx6
|
| 117 |
+
%invariant.gep19 = getelementptr float, ptr addrspace(1) %79, i64 %71, !dbg !15
|
| 118 |
+
%.idx7 = mul nsw i64 %.decomposed91, 128000
|
| 119 |
+
%80 = getelementptr i8, ptr addrspace(1) %0, i64 %.idx7
|
| 120 |
+
%invariant.gep21 = getelementptr float, ptr addrspace(1) %80, i64 %72, !dbg !15
|
| 121 |
+
%81 = zext nneg i32 %13 to i64, !dbg !15
|
| 122 |
+
%82 = extractelement <8 x i1> %32, i64 0, !dbg !16
|
| 123 |
+
%83 = extractelement <8 x i1> %32, i64 1, !dbg !16
|
| 124 |
+
%84 = extractelement <8 x i1> %32, i64 2, !dbg !16
|
| 125 |
+
%85 = extractelement <8 x i1> %32, i64 3, !dbg !16
|
| 126 |
+
%86 = extractelement <8 x i1> %32, i64 4, !dbg !16
|
| 127 |
+
%87 = extractelement <8 x i1> %32, i64 5, !dbg !16
|
| 128 |
+
%88 = extractelement <8 x i1> %32, i64 6, !dbg !16
|
| 129 |
+
%89 = extractelement <8 x i1> %32, i64 7, !dbg !16
|
| 130 |
+
br label %90, !dbg !15
|
| 131 |
+
|
| 132 |
+
90: ; preds = %8, %90
|
| 133 |
+
%indvars.iv = phi i64 [ 0, %8 ], [ %indvars.iv.next, %90 ]
|
| 134 |
+
%91 = phi <8 x float> [ splat (float 0xFFF0000000000000), %8 ], [ %139, %90 ]
|
| 135 |
+
%92 = phi <8 x i32> [ splat (i32 2147483647), %8 ], [ %140, %90 ]
|
| 136 |
+
%93 = or disjoint i64 %indvars.iv, %81, !dbg !17
|
| 137 |
+
%gep = getelementptr float, ptr addrspace(1) %invariant.gep, i64 %93, !dbg !18
|
| 138 |
+
%gep10 = getelementptr float, ptr addrspace(1) %invariant.gep9, i64 %93, !dbg !18
|
| 139 |
+
%gep12 = getelementptr float, ptr addrspace(1) %invariant.gep11, i64 %93, !dbg !18
|
| 140 |
+
%gep14 = getelementptr float, ptr addrspace(1) %invariant.gep13, i64 %93, !dbg !18
|
| 141 |
+
%gep16 = getelementptr float, ptr addrspace(1) %invariant.gep15, i64 %93, !dbg !18
|
| 142 |
+
%gep18 = getelementptr float, ptr addrspace(1) %invariant.gep17, i64 %93, !dbg !18
|
| 143 |
+
%gep20 = getelementptr float, ptr addrspace(1) %invariant.gep19, i64 %93, !dbg !18
|
| 144 |
+
%gep22 = getelementptr float, ptr addrspace(1) %invariant.gep21, i64 %93, !dbg !18
|
| 145 |
+
%94 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !16
|
| 146 |
+
%95 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %gep, i64 %94, i1 %89) #4, !dbg !16
|
| 147 |
+
%96 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !16
|
| 148 |
+
%97 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %gep10, i64 %96, i1 %88) #4, !dbg !16
|
| 149 |
+
%98 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !16
|
| 150 |
+
%99 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %gep12, i64 %98, i1 %87) #4, !dbg !16
|
| 151 |
+
%100 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !16
|
| 152 |
+
%101 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %gep14, i64 %100, i1 %86) #4, !dbg !16
|
| 153 |
+
%102 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !16
|
| 154 |
+
%103 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %gep16, i64 %102, i1 %85) #4, !dbg !16
|
| 155 |
+
%104 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !16
|
| 156 |
+
%105 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %gep18, i64 %104, i1 %84) #4, !dbg !16
|
| 157 |
+
%106 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !16
|
| 158 |
+
%107 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %gep20, i64 %106, i1 %83) #4, !dbg !16
|
| 159 |
+
%108 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !16
|
| 160 |
+
%109 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %gep22, i64 %108, i1 %82) #4, !dbg !16
|
| 161 |
+
%110 = fcmp uno <8 x float> %91, zeroinitializer, !dbg !19
|
| 162 |
+
%111 = trunc nuw nsw i64 %93 to i32, !dbg !23
|
| 163 |
+
%112 = insertelement <8 x i32> poison, i32 %109, i64 0, !dbg !16
|
| 164 |
+
%113 = insertelement <8 x i32> %112, i32 %107, i64 1, !dbg !16
|
| 165 |
+
%114 = insertelement <8 x i32> %113, i32 %105, i64 2, !dbg !16
|
| 166 |
+
%115 = insertelement <8 x i32> %114, i32 %103, i64 3, !dbg !16
|
| 167 |
+
%116 = insertelement <8 x i32> %115, i32 %101, i64 4, !dbg !16
|
| 168 |
+
%117 = insertelement <8 x i32> %116, i32 %99, i64 5, !dbg !16
|
| 169 |
+
%118 = insertelement <8 x i32> %117, i32 %97, i64 6, !dbg !16
|
| 170 |
+
%119 = insertelement <8 x i32> %118, i32 %95, i64 7, !dbg !16
|
| 171 |
+
%120 = bitcast <8 x i32> %119 to <8 x float>, !dbg !16
|
| 172 |
+
%121 = fcmp ogt <8 x float> %91, %120, !dbg !24
|
| 173 |
+
%122 = fcmp oeq <8 x float> %91, %120, !dbg !25
|
| 174 |
+
%123 = fcmp uno <8 x float> %120, zeroinitializer, !dbg !26
|
| 175 |
+
%124 = xor <8 x i1> %123, splat (i1 true), !dbg !27
|
| 176 |
+
%125 = and <8 x i1> %110, %124, !dbg !28
|
| 177 |
+
%126 = or <8 x i1> %121, %125, !dbg !29
|
| 178 |
+
%127 = and <8 x i1> %110, %123, !dbg !30
|
| 179 |
+
%128 = or <8 x i1> %122, %127, !dbg !31
|
| 180 |
+
%129 = insertelement <8 x i64> poison, i64 %93, i64 0, !dbg !32
|
| 181 |
+
%130 = shufflevector <8 x i64> %129, <8 x i64> poison, <8 x i32> zeroinitializer, !dbg !32
|
| 182 |
+
%131 = sext <8 x i32> %92 to <8 x i64>, !dbg !32
|
| 183 |
+
%132 = icmp sgt <8 x i64> %130, %131, !dbg !32
|
| 184 |
+
%133 = and <8 x i1> %132, %128, !dbg !33
|
| 185 |
+
%134 = or <8 x i1> %126, %133, !dbg !34
|
| 186 |
+
%135 = select <8 x i1> %134, <8 x float> %91, <8 x float> %120, !dbg !35
|
| 187 |
+
%136 = insertelement <8 x i32> poison, i32 %111, i64 0, !dbg !23
|
| 188 |
+
%137 = shufflevector <8 x i32> %136, <8 x i32> poison, <8 x i32> zeroinitializer, !dbg !23
|
| 189 |
+
%138 = select <8 x i1> %134, <8 x i32> %92, <8 x i32> %137, !dbg !23
|
| 190 |
+
%139 = select <8 x i1> %32, <8 x float> %135, <8 x float> %91, !dbg !36
|
| 191 |
+
%140 = select <8 x i1> %32, <8 x i32> %138, <8 x i32> %92, !dbg !37
|
| 192 |
+
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 64, !dbg !15
|
| 193 |
+
%141 = icmp samesign ult i64 %indvars.iv, 31936, !dbg !15
|
| 194 |
+
br i1 %141, label %90, label %142, !dbg !15
|
| 195 |
+
|
| 196 |
+
142: ; preds = %90
|
| 197 |
+
%143 = or disjoint i32 %10, %13, !dbg !10
|
| 198 |
+
%144 = icmp slt i32 %143, %4, !dbg !11
|
| 199 |
+
%145 = and i32 %11, 31, !dbg !9
|
| 200 |
+
%146 = lshr i32 %11, 5, !dbg !9
|
| 201 |
+
%147 = extractelement <8 x float> %139, i64 7, !dbg !38
|
| 202 |
+
%148 = bitcast float %147 to i32, !dbg !38
|
| 203 |
+
%149 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %148, i32 16, i32 31), !dbg !38
|
| 204 |
+
%150 = bitcast i32 %149 to float, !dbg !38
|
| 205 |
+
%151 = extractelement <8 x i32> %140, i64 7, !dbg !38
|
| 206 |
+
%152 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %151, i32 16, i32 31), !dbg !38
|
| 207 |
+
%153 = fcmp ogt float %147, %150, !dbg !40
|
| 208 |
+
%154 = fcmp oeq float %147, %150, !dbg !41
|
| 209 |
+
%155 = fcmp uno <8 x float> %139, zeroinitializer, !dbg !42
|
| 210 |
+
%156 = fcmp uno float %150, 0.000000e+00, !dbg !43
|
| 211 |
+
%157 = xor i1 %156, true, !dbg !44
|
| 212 |
+
%158 = extractelement <8 x i1> %155, i64 7, !dbg !45
|
| 213 |
+
%159 = and i1 %158, %157, !dbg !46
|
| 214 |
+
%160 = or i1 %153, %159, !dbg !47
|
| 215 |
+
%161 = and i1 %158, %156, !dbg !45
|
| 216 |
+
%162 = or i1 %154, %161, !dbg !48
|
| 217 |
+
%163 = icmp slt i32 %151, %152, !dbg !49
|
| 218 |
+
%164 = and i1 %163, %162, !dbg !50
|
| 219 |
+
%165 = or i1 %160, %164, !dbg !51
|
| 220 |
+
%166 = select i1 %165, float %147, float %150, !dbg !52
|
| 221 |
+
%167 = select i1 %165, i32 %151, i32 %152, !dbg !53
|
| 222 |
+
%168 = bitcast float %166 to i32, !dbg !38
|
| 223 |
+
%169 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %168, i32 8, i32 31), !dbg !38
|
| 224 |
+
%170 = bitcast i32 %169 to float, !dbg !38
|
| 225 |
+
%171 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %167, i32 8, i32 31), !dbg !38
|
| 226 |
+
%172 = fcmp ogt float %166, %170, !dbg !40
|
| 227 |
+
%173 = fcmp oeq float %166, %170, !dbg !41
|
| 228 |
+
%174 = fcmp uno float %166, 0.000000e+00, !dbg !42
|
| 229 |
+
%175 = fcmp uno float %170, 0.000000e+00, !dbg !43
|
| 230 |
+
%176 = xor i1 %175, true, !dbg !44
|
| 231 |
+
%177 = and i1 %174, %176, !dbg !46
|
| 232 |
+
%178 = or i1 %172, %177, !dbg !47
|
| 233 |
+
%179 = and i1 %175, %174, !dbg !45
|
| 234 |
+
%180 = or i1 %173, %179, !dbg !48
|
| 235 |
+
%181 = icmp slt i32 %167, %171, !dbg !49
|
| 236 |
+
%182 = and i1 %181, %180, !dbg !50
|
| 237 |
+
%183 = or i1 %178, %182, !dbg !51
|
| 238 |
+
%184 = select i1 %183, float %166, float %170, !dbg !52
|
| 239 |
+
%185 = select i1 %183, i32 %167, i32 %171, !dbg !53
|
| 240 |
+
%186 = bitcast float %184 to i32, !dbg !38
|
| 241 |
+
%187 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %186, i32 4, i32 31), !dbg !38
|
| 242 |
+
%188 = bitcast i32 %187 to float, !dbg !38
|
| 243 |
+
%189 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %185, i32 4, i32 31), !dbg !38
|
| 244 |
+
%190 = fcmp ogt float %184, %188, !dbg !40
|
| 245 |
+
%191 = fcmp oeq float %184, %188, !dbg !41
|
| 246 |
+
%192 = fcmp uno float %184, 0.000000e+00, !dbg !42
|
| 247 |
+
%193 = fcmp uno float %188, 0.000000e+00, !dbg !43
|
| 248 |
+
%194 = xor i1 %193, true, !dbg !44
|
| 249 |
+
%195 = and i1 %192, %194, !dbg !46
|
| 250 |
+
%196 = or i1 %190, %195, !dbg !47
|
| 251 |
+
%197 = and i1 %193, %192, !dbg !45
|
| 252 |
+
%198 = or i1 %191, %197, !dbg !48
|
| 253 |
+
%199 = icmp slt i32 %185, %189, !dbg !49
|
| 254 |
+
%200 = and i1 %199, %198, !dbg !50
|
| 255 |
+
%201 = or i1 %196, %200, !dbg !51
|
| 256 |
+
%202 = select i1 %201, float %184, float %188, !dbg !52
|
| 257 |
+
%203 = select i1 %201, i32 %185, i32 %189, !dbg !53
|
| 258 |
+
%204 = bitcast float %202 to i32, !dbg !38
|
| 259 |
+
%205 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %204, i32 2, i32 31), !dbg !38
|
| 260 |
+
%206 = bitcast i32 %205 to float, !dbg !38
|
| 261 |
+
%207 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %203, i32 2, i32 31), !dbg !38
|
| 262 |
+
%208 = fcmp ogt float %202, %206, !dbg !40
|
| 263 |
+
%209 = fcmp oeq float %202, %206, !dbg !41
|
| 264 |
+
%210 = fcmp uno float %202, 0.000000e+00, !dbg !42
|
| 265 |
+
%211 = fcmp uno float %206, 0.000000e+00, !dbg !43
|
| 266 |
+
%212 = xor i1 %211, true, !dbg !44
|
| 267 |
+
%213 = and i1 %210, %212, !dbg !46
|
| 268 |
+
%214 = or i1 %208, %213, !dbg !47
|
| 269 |
+
%215 = and i1 %211, %210, !dbg !45
|
| 270 |
+
%216 = or i1 %209, %215, !dbg !48
|
| 271 |
+
%217 = icmp slt i32 %203, %207, !dbg !49
|
| 272 |
+
%218 = and i1 %217, %216, !dbg !50
|
| 273 |
+
%219 = or i1 %214, %218, !dbg !51
|
| 274 |
+
%220 = select i1 %219, float %202, float %206, !dbg !52
|
| 275 |
+
%221 = select i1 %219, i32 %203, i32 %207, !dbg !53
|
| 276 |
+
%222 = bitcast float %220 to i32, !dbg !38
|
| 277 |
+
%223 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %222, i32 1, i32 31), !dbg !38
|
| 278 |
+
%224 = bitcast i32 %223 to float, !dbg !38
|
| 279 |
+
%225 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %221, i32 1, i32 31), !dbg !38
|
| 280 |
+
%226 = fcmp ogt float %220, %224, !dbg !40
|
| 281 |
+
%227 = fcmp oeq float %220, %224, !dbg !41
|
| 282 |
+
%228 = fcmp uno float %220, 0.000000e+00, !dbg !42
|
| 283 |
+
%229 = fcmp uno float %224, 0.000000e+00, !dbg !43
|
| 284 |
+
%230 = xor i1 %229, true, !dbg !44
|
| 285 |
+
%231 = and i1 %228, %230, !dbg !46
|
| 286 |
+
%232 = or i1 %226, %231, !dbg !47
|
| 287 |
+
%233 = and i1 %229, %228, !dbg !45
|
| 288 |
+
%234 = or i1 %227, %233, !dbg !48
|
| 289 |
+
%235 = icmp slt i32 %221, %225, !dbg !49
|
| 290 |
+
%236 = and i1 %235, %234, !dbg !50
|
| 291 |
+
%237 = or i1 %232, %236, !dbg !51
|
| 292 |
+
%238 = select i1 %237, i32 %221, i32 %225, !dbg !53
|
| 293 |
+
%239 = extractelement <8 x float> %139, i64 6, !dbg !38
|
| 294 |
+
%240 = bitcast float %239 to i32, !dbg !38
|
| 295 |
+
%241 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %240, i32 16, i32 31), !dbg !38
|
| 296 |
+
%242 = bitcast i32 %241 to float, !dbg !38
|
| 297 |
+
%243 = extractelement <8 x i32> %140, i64 6, !dbg !38
|
| 298 |
+
%244 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %243, i32 16, i32 31), !dbg !38
|
| 299 |
+
%245 = fcmp ogt float %239, %242, !dbg !40
|
| 300 |
+
%246 = fcmp oeq float %239, %242, !dbg !41
|
| 301 |
+
%247 = fcmp uno float %242, 0.000000e+00, !dbg !43
|
| 302 |
+
%248 = xor i1 %247, true, !dbg !44
|
| 303 |
+
%249 = extractelement <8 x i1> %155, i64 6, !dbg !45
|
| 304 |
+
%250 = and i1 %249, %248, !dbg !46
|
| 305 |
+
%251 = or i1 %245, %250, !dbg !47
|
| 306 |
+
%252 = and i1 %249, %247, !dbg !45
|
| 307 |
+
%253 = or i1 %246, %252, !dbg !48
|
| 308 |
+
%254 = icmp slt i32 %243, %244, !dbg !49
|
| 309 |
+
%255 = and i1 %254, %253, !dbg !50
|
| 310 |
+
%256 = or i1 %251, %255, !dbg !51
|
| 311 |
+
%257 = select i1 %256, float %239, float %242, !dbg !52
|
| 312 |
+
%258 = select i1 %256, i32 %243, i32 %244, !dbg !53
|
| 313 |
+
%259 = bitcast float %257 to i32, !dbg !38
|
| 314 |
+
%260 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %259, i32 8, i32 31), !dbg !38
|
| 315 |
+
%261 = bitcast i32 %260 to float, !dbg !38
|
| 316 |
+
%262 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %258, i32 8, i32 31), !dbg !38
|
| 317 |
+
%263 = fcmp ogt float %257, %261, !dbg !40
|
| 318 |
+
%264 = fcmp oeq float %257, %261, !dbg !41
|
| 319 |
+
%265 = fcmp uno float %257, 0.000000e+00, !dbg !42
|
| 320 |
+
%266 = fcmp uno float %261, 0.000000e+00, !dbg !43
|
| 321 |
+
%267 = xor i1 %266, true, !dbg !44
|
| 322 |
+
%268 = and i1 %265, %267, !dbg !46
|
| 323 |
+
%269 = or i1 %263, %268, !dbg !47
|
| 324 |
+
%270 = and i1 %266, %265, !dbg !45
|
| 325 |
+
%271 = or i1 %264, %270, !dbg !48
|
| 326 |
+
%272 = icmp slt i32 %258, %262, !dbg !49
|
| 327 |
+
%273 = and i1 %272, %271, !dbg !50
|
| 328 |
+
%274 = or i1 %269, %273, !dbg !51
|
| 329 |
+
%275 = select i1 %274, float %257, float %261, !dbg !52
|
| 330 |
+
%276 = select i1 %274, i32 %258, i32 %262, !dbg !53
|
| 331 |
+
%277 = bitcast float %275 to i32, !dbg !38
|
| 332 |
+
%278 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %277, i32 4, i32 31), !dbg !38
|
| 333 |
+
%279 = bitcast i32 %278 to float, !dbg !38
|
| 334 |
+
%280 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %276, i32 4, i32 31), !dbg !38
|
| 335 |
+
%281 = fcmp ogt float %275, %279, !dbg !40
|
| 336 |
+
%282 = fcmp oeq float %275, %279, !dbg !41
|
| 337 |
+
%283 = fcmp uno float %275, 0.000000e+00, !dbg !42
|
| 338 |
+
%284 = fcmp uno float %279, 0.000000e+00, !dbg !43
|
| 339 |
+
%285 = xor i1 %284, true, !dbg !44
|
| 340 |
+
%286 = and i1 %283, %285, !dbg !46
|
| 341 |
+
%287 = or i1 %281, %286, !dbg !47
|
| 342 |
+
%288 = and i1 %284, %283, !dbg !45
|
| 343 |
+
%289 = or i1 %282, %288, !dbg !48
|
| 344 |
+
%290 = icmp slt i32 %276, %280, !dbg !49
|
| 345 |
+
%291 = and i1 %290, %289, !dbg !50
|
| 346 |
+
%292 = or i1 %287, %291, !dbg !51
|
| 347 |
+
%293 = select i1 %292, float %275, float %279, !dbg !52
|
| 348 |
+
%294 = select i1 %292, i32 %276, i32 %280, !dbg !53
|
| 349 |
+
%295 = bitcast float %293 to i32, !dbg !38
|
| 350 |
+
%296 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %295, i32 2, i32 31), !dbg !38
|
| 351 |
+
%297 = bitcast i32 %296 to float, !dbg !38
|
| 352 |
+
%298 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %294, i32 2, i32 31), !dbg !38
|
| 353 |
+
%299 = fcmp ogt float %293, %297, !dbg !40
|
| 354 |
+
%300 = fcmp oeq float %293, %297, !dbg !41
|
| 355 |
+
%301 = fcmp uno float %293, 0.000000e+00, !dbg !42
|
| 356 |
+
%302 = fcmp uno float %297, 0.000000e+00, !dbg !43
|
| 357 |
+
%303 = xor i1 %302, true, !dbg !44
|
| 358 |
+
%304 = and i1 %301, %303, !dbg !46
|
| 359 |
+
%305 = or i1 %299, %304, !dbg !47
|
| 360 |
+
%306 = and i1 %302, %301, !dbg !45
|
| 361 |
+
%307 = or i1 %300, %306, !dbg !48
|
| 362 |
+
%308 = icmp slt i32 %294, %298, !dbg !49
|
| 363 |
+
%309 = and i1 %308, %307, !dbg !50
|
| 364 |
+
%310 = or i1 %305, %309, !dbg !51
|
| 365 |
+
%311 = select i1 %310, float %293, float %297, !dbg !52
|
| 366 |
+
%312 = select i1 %310, i32 %294, i32 %298, !dbg !53
|
| 367 |
+
%313 = bitcast float %311 to i32, !dbg !38
|
| 368 |
+
%314 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %313, i32 1, i32 31), !dbg !38
|
| 369 |
+
%315 = bitcast i32 %314 to float, !dbg !38
|
| 370 |
+
%316 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %312, i32 1, i32 31), !dbg !38
|
| 371 |
+
%317 = fcmp ogt float %311, %315, !dbg !40
|
| 372 |
+
%318 = fcmp oeq float %311, %315, !dbg !41
|
| 373 |
+
%319 = fcmp uno float %311, 0.000000e+00, !dbg !42
|
| 374 |
+
%320 = fcmp uno float %315, 0.000000e+00, !dbg !43
|
| 375 |
+
%321 = xor i1 %320, true, !dbg !44
|
| 376 |
+
%322 = and i1 %319, %321, !dbg !46
|
| 377 |
+
%323 = or i1 %317, %322, !dbg !47
|
| 378 |
+
%324 = and i1 %320, %319, !dbg !45
|
| 379 |
+
%325 = or i1 %318, %324, !dbg !48
|
| 380 |
+
%326 = icmp slt i32 %312, %316, !dbg !49
|
| 381 |
+
%327 = and i1 %326, %325, !dbg !50
|
| 382 |
+
%328 = or i1 %323, %327, !dbg !51
|
| 383 |
+
%329 = select i1 %328, i32 %312, i32 %316, !dbg !53
|
| 384 |
+
%330 = extractelement <8 x float> %139, i64 5, !dbg !38
|
| 385 |
+
%331 = bitcast float %330 to i32, !dbg !38
|
| 386 |
+
%332 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %331, i32 16, i32 31), !dbg !38
|
| 387 |
+
%333 = bitcast i32 %332 to float, !dbg !38
|
| 388 |
+
%334 = extractelement <8 x i32> %140, i64 5, !dbg !38
|
| 389 |
+
%335 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %334, i32 16, i32 31), !dbg !38
|
| 390 |
+
%336 = fcmp ogt float %330, %333, !dbg !40
|
| 391 |
+
%337 = fcmp oeq float %330, %333, !dbg !41
|
| 392 |
+
%338 = fcmp uno float %333, 0.000000e+00, !dbg !43
|
| 393 |
+
%339 = xor i1 %338, true, !dbg !44
|
| 394 |
+
%340 = extractelement <8 x i1> %155, i64 5, !dbg !45
|
| 395 |
+
%341 = and i1 %340, %339, !dbg !46
|
| 396 |
+
%342 = or i1 %336, %341, !dbg !47
|
| 397 |
+
%343 = and i1 %340, %338, !dbg !45
|
| 398 |
+
%344 = or i1 %337, %343, !dbg !48
|
| 399 |
+
%345 = icmp slt i32 %334, %335, !dbg !49
|
| 400 |
+
%346 = and i1 %345, %344, !dbg !50
|
| 401 |
+
%347 = or i1 %342, %346, !dbg !51
|
| 402 |
+
%348 = select i1 %347, float %330, float %333, !dbg !52
|
| 403 |
+
%349 = select i1 %347, i32 %334, i32 %335, !dbg !53
|
| 404 |
+
%350 = bitcast float %348 to i32, !dbg !38
|
| 405 |
+
%351 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %350, i32 8, i32 31), !dbg !38
|
| 406 |
+
%352 = bitcast i32 %351 to float, !dbg !38
|
| 407 |
+
%353 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %349, i32 8, i32 31), !dbg !38
|
| 408 |
+
%354 = fcmp ogt float %348, %352, !dbg !40
|
| 409 |
+
%355 = fcmp oeq float %348, %352, !dbg !41
|
| 410 |
+
%356 = fcmp uno float %348, 0.000000e+00, !dbg !42
|
| 411 |
+
%357 = fcmp uno float %352, 0.000000e+00, !dbg !43
|
| 412 |
+
%358 = xor i1 %357, true, !dbg !44
|
| 413 |
+
%359 = and i1 %356, %358, !dbg !46
|
| 414 |
+
%360 = or i1 %354, %359, !dbg !47
|
| 415 |
+
%361 = and i1 %357, %356, !dbg !45
|
| 416 |
+
%362 = or i1 %355, %361, !dbg !48
|
| 417 |
+
%363 = icmp slt i32 %349, %353, !dbg !49
|
| 418 |
+
%364 = and i1 %363, %362, !dbg !50
|
| 419 |
+
%365 = or i1 %360, %364, !dbg !51
|
| 420 |
+
%366 = select i1 %365, float %348, float %352, !dbg !52
|
| 421 |
+
%367 = select i1 %365, i32 %349, i32 %353, !dbg !53
|
| 422 |
+
%368 = bitcast float %366 to i32, !dbg !38
|
| 423 |
+
%369 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %368, i32 4, i32 31), !dbg !38
|
| 424 |
+
%370 = bitcast i32 %369 to float, !dbg !38
|
| 425 |
+
%371 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %367, i32 4, i32 31), !dbg !38
|
| 426 |
+
%372 = fcmp ogt float %366, %370, !dbg !40
|
| 427 |
+
%373 = fcmp oeq float %366, %370, !dbg !41
|
| 428 |
+
%374 = fcmp uno float %366, 0.000000e+00, !dbg !42
|
| 429 |
+
%375 = fcmp uno float %370, 0.000000e+00, !dbg !43
|
| 430 |
+
%376 = xor i1 %375, true, !dbg !44
|
| 431 |
+
%377 = and i1 %374, %376, !dbg !46
|
| 432 |
+
%378 = or i1 %372, %377, !dbg !47
|
| 433 |
+
%379 = and i1 %375, %374, !dbg !45
|
| 434 |
+
%380 = or i1 %373, %379, !dbg !48
|
| 435 |
+
%381 = icmp slt i32 %367, %371, !dbg !49
|
| 436 |
+
%382 = and i1 %381, %380, !dbg !50
|
| 437 |
+
%383 = or i1 %378, %382, !dbg !51
|
| 438 |
+
%384 = select i1 %383, float %366, float %370, !dbg !52
|
| 439 |
+
%385 = select i1 %383, i32 %367, i32 %371, !dbg !53
|
| 440 |
+
%386 = bitcast float %384 to i32, !dbg !38
|
| 441 |
+
%387 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %386, i32 2, i32 31), !dbg !38
|
| 442 |
+
%388 = bitcast i32 %387 to float, !dbg !38
|
| 443 |
+
%389 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %385, i32 2, i32 31), !dbg !38
|
| 444 |
+
%390 = fcmp ogt float %384, %388, !dbg !40
|
| 445 |
+
%391 = fcmp oeq float %384, %388, !dbg !41
|
| 446 |
+
%392 = fcmp uno float %384, 0.000000e+00, !dbg !42
|
| 447 |
+
%393 = fcmp uno float %388, 0.000000e+00, !dbg !43
|
| 448 |
+
%394 = xor i1 %393, true, !dbg !44
|
| 449 |
+
%395 = and i1 %392, %394, !dbg !46
|
| 450 |
+
%396 = or i1 %390, %395, !dbg !47
|
| 451 |
+
%397 = and i1 %393, %392, !dbg !45
|
| 452 |
+
%398 = or i1 %391, %397, !dbg !48
|
| 453 |
+
%399 = icmp slt i32 %385, %389, !dbg !49
|
| 454 |
+
%400 = and i1 %399, %398, !dbg !50
|
| 455 |
+
%401 = or i1 %396, %400, !dbg !51
|
| 456 |
+
%402 = select i1 %401, float %384, float %388, !dbg !52
|
| 457 |
+
%403 = select i1 %401, i32 %385, i32 %389, !dbg !53
|
| 458 |
+
%404 = bitcast float %402 to i32, !dbg !38
|
| 459 |
+
%405 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %404, i32 1, i32 31), !dbg !38
|
| 460 |
+
%406 = bitcast i32 %405 to float, !dbg !38
|
| 461 |
+
%407 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %403, i32 1, i32 31), !dbg !38
|
| 462 |
+
%408 = fcmp ogt float %402, %406, !dbg !40
|
| 463 |
+
%409 = fcmp oeq float %402, %406, !dbg !41
|
| 464 |
+
%410 = fcmp uno float %402, 0.000000e+00, !dbg !42
|
| 465 |
+
%411 = fcmp uno float %406, 0.000000e+00, !dbg !43
|
| 466 |
+
%412 = xor i1 %411, true, !dbg !44
|
| 467 |
+
%413 = and i1 %410, %412, !dbg !46
|
| 468 |
+
%414 = or i1 %408, %413, !dbg !47
|
| 469 |
+
%415 = and i1 %411, %410, !dbg !45
|
| 470 |
+
%416 = or i1 %409, %415, !dbg !48
|
| 471 |
+
%417 = icmp slt i32 %403, %407, !dbg !49
|
| 472 |
+
%418 = and i1 %417, %416, !dbg !50
|
| 473 |
+
%419 = or i1 %414, %418, !dbg !51
|
| 474 |
+
%420 = select i1 %419, i32 %403, i32 %407, !dbg !53
|
| 475 |
+
%421 = extractelement <8 x float> %139, i64 4, !dbg !38
|
| 476 |
+
%422 = bitcast float %421 to i32, !dbg !38
|
| 477 |
+
%423 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %422, i32 16, i32 31), !dbg !38
|
| 478 |
+
%424 = bitcast i32 %423 to float, !dbg !38
|
| 479 |
+
%425 = extractelement <8 x i32> %140, i64 4, !dbg !38
|
| 480 |
+
%426 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %425, i32 16, i32 31), !dbg !38
|
| 481 |
+
%427 = fcmp ogt float %421, %424, !dbg !40
|
| 482 |
+
%428 = fcmp oeq float %421, %424, !dbg !41
|
| 483 |
+
%429 = fcmp uno float %424, 0.000000e+00, !dbg !43
|
| 484 |
+
%430 = xor i1 %429, true, !dbg !44
|
| 485 |
+
%431 = extractelement <8 x i1> %155, i64 4, !dbg !45
|
| 486 |
+
%432 = and i1 %431, %430, !dbg !46
|
| 487 |
+
%433 = or i1 %427, %432, !dbg !47
|
| 488 |
+
%434 = and i1 %431, %429, !dbg !45
|
| 489 |
+
%435 = or i1 %428, %434, !dbg !48
|
| 490 |
+
%436 = icmp slt i32 %425, %426, !dbg !49
|
| 491 |
+
%437 = and i1 %436, %435, !dbg !50
|
| 492 |
+
%438 = or i1 %433, %437, !dbg !51
|
| 493 |
+
%439 = select i1 %438, float %421, float %424, !dbg !52
|
| 494 |
+
%440 = select i1 %438, i32 %425, i32 %426, !dbg !53
|
| 495 |
+
%441 = bitcast float %439 to i32, !dbg !38
|
| 496 |
+
%442 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %441, i32 8, i32 31), !dbg !38
|
| 497 |
+
%443 = bitcast i32 %442 to float, !dbg !38
|
| 498 |
+
%444 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %440, i32 8, i32 31), !dbg !38
|
| 499 |
+
%445 = fcmp ogt float %439, %443, !dbg !40
|
| 500 |
+
%446 = fcmp oeq float %439, %443, !dbg !41
|
| 501 |
+
%447 = fcmp uno float %439, 0.000000e+00, !dbg !42
|
| 502 |
+
%448 = fcmp uno float %443, 0.000000e+00, !dbg !43
|
| 503 |
+
%449 = xor i1 %448, true, !dbg !44
|
| 504 |
+
%450 = and i1 %447, %449, !dbg !46
|
| 505 |
+
%451 = or i1 %445, %450, !dbg !47
|
| 506 |
+
%452 = and i1 %448, %447, !dbg !45
|
| 507 |
+
%453 = or i1 %446, %452, !dbg !48
|
| 508 |
+
%454 = icmp slt i32 %440, %444, !dbg !49
|
| 509 |
+
%455 = and i1 %454, %453, !dbg !50
|
| 510 |
+
%456 = or i1 %451, %455, !dbg !51
|
| 511 |
+
%457 = select i1 %456, float %439, float %443, !dbg !52
|
| 512 |
+
%458 = select i1 %456, i32 %440, i32 %444, !dbg !53
|
| 513 |
+
%459 = bitcast float %457 to i32, !dbg !38
|
| 514 |
+
%460 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %459, i32 4, i32 31), !dbg !38
|
| 515 |
+
%461 = bitcast i32 %460 to float, !dbg !38
|
| 516 |
+
%462 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %458, i32 4, i32 31), !dbg !38
|
| 517 |
+
%463 = fcmp ogt float %457, %461, !dbg !40
|
| 518 |
+
%464 = fcmp oeq float %457, %461, !dbg !41
|
| 519 |
+
%465 = fcmp uno float %457, 0.000000e+00, !dbg !42
|
| 520 |
+
%466 = fcmp uno float %461, 0.000000e+00, !dbg !43
|
| 521 |
+
%467 = xor i1 %466, true, !dbg !44
|
| 522 |
+
%468 = and i1 %465, %467, !dbg !46
|
| 523 |
+
%469 = or i1 %463, %468, !dbg !47
|
| 524 |
+
%470 = and i1 %466, %465, !dbg !45
|
| 525 |
+
%471 = or i1 %464, %470, !dbg !48
|
| 526 |
+
%472 = icmp slt i32 %458, %462, !dbg !49
|
| 527 |
+
%473 = and i1 %472, %471, !dbg !50
|
| 528 |
+
%474 = or i1 %469, %473, !dbg !51
|
| 529 |
+
%475 = select i1 %474, float %457, float %461, !dbg !52
|
| 530 |
+
%476 = select i1 %474, i32 %458, i32 %462, !dbg !53
|
| 531 |
+
%477 = bitcast float %475 to i32, !dbg !38
|
| 532 |
+
%478 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %477, i32 2, i32 31), !dbg !38
|
| 533 |
+
%479 = bitcast i32 %478 to float, !dbg !38
|
| 534 |
+
%480 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %476, i32 2, i32 31), !dbg !38
|
| 535 |
+
%481 = fcmp ogt float %475, %479, !dbg !40
|
| 536 |
+
%482 = fcmp oeq float %475, %479, !dbg !41
|
| 537 |
+
%483 = fcmp uno float %475, 0.000000e+00, !dbg !42
|
| 538 |
+
%484 = fcmp uno float %479, 0.000000e+00, !dbg !43
|
| 539 |
+
%485 = xor i1 %484, true, !dbg !44
|
| 540 |
+
%486 = and i1 %483, %485, !dbg !46
|
| 541 |
+
%487 = or i1 %481, %486, !dbg !47
|
| 542 |
+
%488 = and i1 %484, %483, !dbg !45
|
| 543 |
+
%489 = or i1 %482, %488, !dbg !48
|
| 544 |
+
%490 = icmp slt i32 %476, %480, !dbg !49
|
| 545 |
+
%491 = and i1 %490, %489, !dbg !50
|
| 546 |
+
%492 = or i1 %487, %491, !dbg !51
|
| 547 |
+
%493 = select i1 %492, float %475, float %479, !dbg !52
|
| 548 |
+
%494 = select i1 %492, i32 %476, i32 %480, !dbg !53
|
| 549 |
+
%495 = bitcast float %493 to i32, !dbg !38
|
| 550 |
+
%496 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %495, i32 1, i32 31), !dbg !38
|
| 551 |
+
%497 = bitcast i32 %496 to float, !dbg !38
|
| 552 |
+
%498 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %494, i32 1, i32 31), !dbg !38
|
| 553 |
+
%499 = fcmp ogt float %493, %497, !dbg !40
|
| 554 |
+
%500 = fcmp oeq float %493, %497, !dbg !41
|
| 555 |
+
%501 = fcmp uno float %493, 0.000000e+00, !dbg !42
|
| 556 |
+
%502 = fcmp uno float %497, 0.000000e+00, !dbg !43
|
| 557 |
+
%503 = xor i1 %502, true, !dbg !44
|
| 558 |
+
%504 = and i1 %501, %503, !dbg !46
|
| 559 |
+
%505 = or i1 %499, %504, !dbg !47
|
| 560 |
+
%506 = and i1 %502, %501, !dbg !45
|
| 561 |
+
%507 = or i1 %500, %506, !dbg !48
|
| 562 |
+
%508 = icmp slt i32 %494, %498, !dbg !49
|
| 563 |
+
%509 = and i1 %508, %507, !dbg !50
|
| 564 |
+
%510 = or i1 %505, %509, !dbg !51
|
| 565 |
+
%511 = select i1 %510, i32 %494, i32 %498, !dbg !53
|
| 566 |
+
%512 = extractelement <8 x float> %139, i64 3, !dbg !38
|
| 567 |
+
%513 = bitcast float %512 to i32, !dbg !38
|
| 568 |
+
%514 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %513, i32 16, i32 31), !dbg !38
|
| 569 |
+
%515 = bitcast i32 %514 to float, !dbg !38
|
| 570 |
+
%516 = extractelement <8 x i32> %140, i64 3, !dbg !38
|
| 571 |
+
%517 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %516, i32 16, i32 31), !dbg !38
|
| 572 |
+
%518 = fcmp ogt float %512, %515, !dbg !40
|
| 573 |
+
%519 = fcmp oeq float %512, %515, !dbg !41
|
| 574 |
+
%520 = fcmp uno float %515, 0.000000e+00, !dbg !43
|
| 575 |
+
%521 = xor i1 %520, true, !dbg !44
|
| 576 |
+
%522 = extractelement <8 x i1> %155, i64 3, !dbg !45
|
| 577 |
+
%523 = and i1 %522, %521, !dbg !46
|
| 578 |
+
%524 = or i1 %518, %523, !dbg !47
|
| 579 |
+
%525 = and i1 %522, %520, !dbg !45
|
| 580 |
+
%526 = or i1 %519, %525, !dbg !48
|
| 581 |
+
%527 = icmp slt i32 %516, %517, !dbg !49
|
| 582 |
+
%528 = and i1 %527, %526, !dbg !50
|
| 583 |
+
%529 = or i1 %524, %528, !dbg !51
|
| 584 |
+
%530 = select i1 %529, float %512, float %515, !dbg !52
|
| 585 |
+
%531 = select i1 %529, i32 %516, i32 %517, !dbg !53
|
| 586 |
+
%532 = bitcast float %530 to i32, !dbg !38
|
| 587 |
+
%533 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %532, i32 8, i32 31), !dbg !38
|
| 588 |
+
%534 = bitcast i32 %533 to float, !dbg !38
|
| 589 |
+
%535 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %531, i32 8, i32 31), !dbg !38
|
| 590 |
+
%536 = fcmp ogt float %530, %534, !dbg !40
|
| 591 |
+
%537 = fcmp oeq float %530, %534, !dbg !41
|
| 592 |
+
%538 = fcmp uno float %530, 0.000000e+00, !dbg !42
|
| 593 |
+
%539 = fcmp uno float %534, 0.000000e+00, !dbg !43
|
| 594 |
+
%540 = xor i1 %539, true, !dbg !44
|
| 595 |
+
%541 = and i1 %538, %540, !dbg !46
|
| 596 |
+
%542 = or i1 %536, %541, !dbg !47
|
| 597 |
+
%543 = and i1 %539, %538, !dbg !45
|
| 598 |
+
%544 = or i1 %537, %543, !dbg !48
|
| 599 |
+
%545 = icmp slt i32 %531, %535, !dbg !49
|
| 600 |
+
%546 = and i1 %545, %544, !dbg !50
|
| 601 |
+
%547 = or i1 %542, %546, !dbg !51
|
| 602 |
+
%548 = select i1 %547, float %530, float %534, !dbg !52
|
| 603 |
+
%549 = select i1 %547, i32 %531, i32 %535, !dbg !53
|
| 604 |
+
%550 = bitcast float %548 to i32, !dbg !38
|
| 605 |
+
%551 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %550, i32 4, i32 31), !dbg !38
|
| 606 |
+
%552 = bitcast i32 %551 to float, !dbg !38
|
| 607 |
+
%553 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %549, i32 4, i32 31), !dbg !38
|
| 608 |
+
%554 = fcmp ogt float %548, %552, !dbg !40
|
| 609 |
+
%555 = fcmp oeq float %548, %552, !dbg !41
|
| 610 |
+
%556 = fcmp uno float %548, 0.000000e+00, !dbg !42
|
| 611 |
+
%557 = fcmp uno float %552, 0.000000e+00, !dbg !43
|
| 612 |
+
%558 = xor i1 %557, true, !dbg !44
|
| 613 |
+
%559 = and i1 %556, %558, !dbg !46
|
| 614 |
+
%560 = or i1 %554, %559, !dbg !47
|
| 615 |
+
%561 = and i1 %557, %556, !dbg !45
|
| 616 |
+
%562 = or i1 %555, %561, !dbg !48
|
| 617 |
+
%563 = icmp slt i32 %549, %553, !dbg !49
|
| 618 |
+
%564 = and i1 %563, %562, !dbg !50
|
| 619 |
+
%565 = or i1 %560, %564, !dbg !51
|
| 620 |
+
%566 = select i1 %565, float %548, float %552, !dbg !52
|
| 621 |
+
%567 = select i1 %565, i32 %549, i32 %553, !dbg !53
|
| 622 |
+
%568 = bitcast float %566 to i32, !dbg !38
|
| 623 |
+
%569 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %568, i32 2, i32 31), !dbg !38
|
| 624 |
+
%570 = bitcast i32 %569 to float, !dbg !38
|
| 625 |
+
%571 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %567, i32 2, i32 31), !dbg !38
|
| 626 |
+
%572 = fcmp ogt float %566, %570, !dbg !40
|
| 627 |
+
%573 = fcmp oeq float %566, %570, !dbg !41
|
| 628 |
+
%574 = fcmp uno float %566, 0.000000e+00, !dbg !42
|
| 629 |
+
%575 = fcmp uno float %570, 0.000000e+00, !dbg !43
|
| 630 |
+
%576 = xor i1 %575, true, !dbg !44
|
| 631 |
+
%577 = and i1 %574, %576, !dbg !46
|
| 632 |
+
%578 = or i1 %572, %577, !dbg !47
|
| 633 |
+
%579 = and i1 %575, %574, !dbg !45
|
| 634 |
+
%580 = or i1 %573, %579, !dbg !48
|
| 635 |
+
%581 = icmp slt i32 %567, %571, !dbg !49
|
| 636 |
+
%582 = and i1 %581, %580, !dbg !50
|
| 637 |
+
%583 = or i1 %578, %582, !dbg !51
|
| 638 |
+
%584 = select i1 %583, float %566, float %570, !dbg !52
|
| 639 |
+
%585 = select i1 %583, i32 %567, i32 %571, !dbg !53
|
| 640 |
+
%586 = bitcast float %584 to i32, !dbg !38
|
| 641 |
+
%587 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %586, i32 1, i32 31), !dbg !38
|
| 642 |
+
%588 = bitcast i32 %587 to float, !dbg !38
|
| 643 |
+
%589 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %585, i32 1, i32 31), !dbg !38
|
| 644 |
+
%590 = fcmp ogt float %584, %588, !dbg !40
|
| 645 |
+
%591 = fcmp oeq float %584, %588, !dbg !41
|
| 646 |
+
%592 = fcmp uno float %584, 0.000000e+00, !dbg !42
|
| 647 |
+
%593 = fcmp uno float %588, 0.000000e+00, !dbg !43
|
| 648 |
+
%594 = xor i1 %593, true, !dbg !44
|
| 649 |
+
%595 = and i1 %592, %594, !dbg !46
|
| 650 |
+
%596 = or i1 %590, %595, !dbg !47
|
| 651 |
+
%597 = and i1 %593, %592, !dbg !45
|
| 652 |
+
%598 = or i1 %591, %597, !dbg !48
|
| 653 |
+
%599 = icmp slt i32 %585, %589, !dbg !49
|
| 654 |
+
%600 = and i1 %599, %598, !dbg !50
|
| 655 |
+
%601 = or i1 %596, %600, !dbg !51
|
| 656 |
+
%602 = select i1 %601, i32 %585, i32 %589, !dbg !53
|
| 657 |
+
%603 = extractelement <8 x float> %139, i64 2, !dbg !38
|
| 658 |
+
%604 = bitcast float %603 to i32, !dbg !38
|
| 659 |
+
%605 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %604, i32 16, i32 31), !dbg !38
|
| 660 |
+
%606 = bitcast i32 %605 to float, !dbg !38
|
| 661 |
+
%607 = extractelement <8 x i32> %140, i64 2, !dbg !38
|
| 662 |
+
%608 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %607, i32 16, i32 31), !dbg !38
|
| 663 |
+
%609 = fcmp ogt float %603, %606, !dbg !40
|
| 664 |
+
%610 = fcmp oeq float %603, %606, !dbg !41
|
| 665 |
+
%611 = fcmp uno float %606, 0.000000e+00, !dbg !43
|
| 666 |
+
%612 = xor i1 %611, true, !dbg !44
|
| 667 |
+
%613 = extractelement <8 x i1> %155, i64 2, !dbg !45
|
| 668 |
+
%614 = and i1 %613, %612, !dbg !46
|
| 669 |
+
%615 = or i1 %609, %614, !dbg !47
|
| 670 |
+
%616 = and i1 %613, %611, !dbg !45
|
| 671 |
+
%617 = or i1 %610, %616, !dbg !48
|
| 672 |
+
%618 = icmp slt i32 %607, %608, !dbg !49
|
| 673 |
+
%619 = and i1 %618, %617, !dbg !50
|
| 674 |
+
%620 = or i1 %615, %619, !dbg !51
|
| 675 |
+
%621 = select i1 %620, float %603, float %606, !dbg !52
|
| 676 |
+
%622 = select i1 %620, i32 %607, i32 %608, !dbg !53
|
| 677 |
+
%623 = bitcast float %621 to i32, !dbg !38
|
| 678 |
+
%624 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %623, i32 8, i32 31), !dbg !38
|
| 679 |
+
%625 = bitcast i32 %624 to float, !dbg !38
|
| 680 |
+
%626 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %622, i32 8, i32 31), !dbg !38
|
| 681 |
+
%627 = fcmp ogt float %621, %625, !dbg !40
|
| 682 |
+
%628 = fcmp oeq float %621, %625, !dbg !41
|
| 683 |
+
%629 = fcmp uno float %621, 0.000000e+00, !dbg !42
|
| 684 |
+
%630 = fcmp uno float %625, 0.000000e+00, !dbg !43
|
| 685 |
+
%631 = xor i1 %630, true, !dbg !44
|
| 686 |
+
%632 = and i1 %629, %631, !dbg !46
|
| 687 |
+
%633 = or i1 %627, %632, !dbg !47
|
| 688 |
+
%634 = and i1 %630, %629, !dbg !45
|
| 689 |
+
%635 = or i1 %628, %634, !dbg !48
|
| 690 |
+
%636 = icmp slt i32 %622, %626, !dbg !49
|
| 691 |
+
%637 = and i1 %636, %635, !dbg !50
|
| 692 |
+
%638 = or i1 %633, %637, !dbg !51
|
| 693 |
+
%639 = select i1 %638, float %621, float %625, !dbg !52
|
| 694 |
+
%640 = select i1 %638, i32 %622, i32 %626, !dbg !53
|
| 695 |
+
%641 = bitcast float %639 to i32, !dbg !38
|
| 696 |
+
%642 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %641, i32 4, i32 31), !dbg !38
|
| 697 |
+
%643 = bitcast i32 %642 to float, !dbg !38
|
| 698 |
+
%644 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %640, i32 4, i32 31), !dbg !38
|
| 699 |
+
%645 = fcmp ogt float %639, %643, !dbg !40
|
| 700 |
+
%646 = fcmp oeq float %639, %643, !dbg !41
|
| 701 |
+
%647 = fcmp uno float %639, 0.000000e+00, !dbg !42
|
| 702 |
+
%648 = fcmp uno float %643, 0.000000e+00, !dbg !43
|
| 703 |
+
%649 = xor i1 %648, true, !dbg !44
|
| 704 |
+
%650 = and i1 %647, %649, !dbg !46
|
| 705 |
+
%651 = or i1 %645, %650, !dbg !47
|
| 706 |
+
%652 = and i1 %648, %647, !dbg !45
|
| 707 |
+
%653 = or i1 %646, %652, !dbg !48
|
| 708 |
+
%654 = icmp slt i32 %640, %644, !dbg !49
|
| 709 |
+
%655 = and i1 %654, %653, !dbg !50
|
| 710 |
+
%656 = or i1 %651, %655, !dbg !51
|
| 711 |
+
%657 = select i1 %656, float %639, float %643, !dbg !52
|
| 712 |
+
%658 = select i1 %656, i32 %640, i32 %644, !dbg !53
|
| 713 |
+
%659 = bitcast float %657 to i32, !dbg !38
|
| 714 |
+
%660 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %659, i32 2, i32 31), !dbg !38
|
| 715 |
+
%661 = bitcast i32 %660 to float, !dbg !38
|
| 716 |
+
%662 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %658, i32 2, i32 31), !dbg !38
|
| 717 |
+
%663 = fcmp ogt float %657, %661, !dbg !40
|
| 718 |
+
%664 = fcmp oeq float %657, %661, !dbg !41
|
| 719 |
+
%665 = fcmp uno float %657, 0.000000e+00, !dbg !42
|
| 720 |
+
%666 = fcmp uno float %661, 0.000000e+00, !dbg !43
|
| 721 |
+
%667 = xor i1 %666, true, !dbg !44
|
| 722 |
+
%668 = and i1 %665, %667, !dbg !46
|
| 723 |
+
%669 = or i1 %663, %668, !dbg !47
|
| 724 |
+
%670 = and i1 %666, %665, !dbg !45
|
| 725 |
+
%671 = or i1 %664, %670, !dbg !48
|
| 726 |
+
%672 = icmp slt i32 %658, %662, !dbg !49
|
| 727 |
+
%673 = and i1 %672, %671, !dbg !50
|
| 728 |
+
%674 = or i1 %669, %673, !dbg !51
|
| 729 |
+
%675 = select i1 %674, float %657, float %661, !dbg !52
|
| 730 |
+
%676 = select i1 %674, i32 %658, i32 %662, !dbg !53
|
| 731 |
+
%677 = bitcast float %675 to i32, !dbg !38
|
| 732 |
+
%678 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %677, i32 1, i32 31), !dbg !38
|
| 733 |
+
%679 = bitcast i32 %678 to float, !dbg !38
|
| 734 |
+
%680 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %676, i32 1, i32 31), !dbg !38
|
| 735 |
+
%681 = fcmp ogt float %675, %679, !dbg !40
|
| 736 |
+
%682 = fcmp oeq float %675, %679, !dbg !41
|
| 737 |
+
%683 = fcmp uno float %675, 0.000000e+00, !dbg !42
|
| 738 |
+
%684 = fcmp uno float %679, 0.000000e+00, !dbg !43
|
| 739 |
+
%685 = xor i1 %684, true, !dbg !44
|
| 740 |
+
%686 = and i1 %683, %685, !dbg !46
|
| 741 |
+
%687 = or i1 %681, %686, !dbg !47
|
| 742 |
+
%688 = and i1 %684, %683, !dbg !45
|
| 743 |
+
%689 = or i1 %682, %688, !dbg !48
|
| 744 |
+
%690 = icmp slt i32 %676, %680, !dbg !49
|
| 745 |
+
%691 = and i1 %690, %689, !dbg !50
|
| 746 |
+
%692 = or i1 %687, %691, !dbg !51
|
| 747 |
+
%693 = select i1 %692, i32 %676, i32 %680, !dbg !53
|
| 748 |
+
%694 = extractelement <8 x float> %139, i64 1, !dbg !38
|
| 749 |
+
%695 = bitcast float %694 to i32, !dbg !38
|
| 750 |
+
%696 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %695, i32 16, i32 31), !dbg !38
|
| 751 |
+
%697 = bitcast i32 %696 to float, !dbg !38
|
| 752 |
+
%698 = extractelement <8 x i32> %140, i64 1, !dbg !38
|
| 753 |
+
%699 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %698, i32 16, i32 31), !dbg !38
|
| 754 |
+
%700 = fcmp ogt float %694, %697, !dbg !40
|
| 755 |
+
%701 = fcmp oeq float %694, %697, !dbg !41
|
| 756 |
+
%702 = fcmp uno float %697, 0.000000e+00, !dbg !43
|
| 757 |
+
%703 = xor i1 %702, true, !dbg !44
|
| 758 |
+
%704 = extractelement <8 x i1> %155, i64 1, !dbg !45
|
| 759 |
+
%705 = and i1 %704, %703, !dbg !46
|
| 760 |
+
%706 = or i1 %700, %705, !dbg !47
|
| 761 |
+
%707 = and i1 %704, %702, !dbg !45
|
| 762 |
+
%708 = or i1 %701, %707, !dbg !48
|
| 763 |
+
%709 = icmp slt i32 %698, %699, !dbg !49
|
| 764 |
+
%710 = and i1 %709, %708, !dbg !50
|
| 765 |
+
%711 = or i1 %706, %710, !dbg !51
|
| 766 |
+
%712 = select i1 %711, float %694, float %697, !dbg !52
|
| 767 |
+
%713 = select i1 %711, i32 %698, i32 %699, !dbg !53
|
| 768 |
+
%714 = bitcast float %712 to i32, !dbg !38
|
| 769 |
+
%715 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %714, i32 8, i32 31), !dbg !38
|
| 770 |
+
%716 = bitcast i32 %715 to float, !dbg !38
|
| 771 |
+
%717 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %713, i32 8, i32 31), !dbg !38
|
| 772 |
+
%718 = fcmp ogt float %712, %716, !dbg !40
|
| 773 |
+
%719 = fcmp oeq float %712, %716, !dbg !41
|
| 774 |
+
%720 = fcmp uno float %712, 0.000000e+00, !dbg !42
|
| 775 |
+
%721 = fcmp uno float %716, 0.000000e+00, !dbg !43
|
| 776 |
+
%722 = xor i1 %721, true, !dbg !44
|
| 777 |
+
%723 = and i1 %720, %722, !dbg !46
|
| 778 |
+
%724 = or i1 %718, %723, !dbg !47
|
| 779 |
+
%725 = and i1 %721, %720, !dbg !45
|
| 780 |
+
%726 = or i1 %719, %725, !dbg !48
|
| 781 |
+
%727 = icmp slt i32 %713, %717, !dbg !49
|
| 782 |
+
%728 = and i1 %727, %726, !dbg !50
|
| 783 |
+
%729 = or i1 %724, %728, !dbg !51
|
| 784 |
+
%730 = select i1 %729, float %712, float %716, !dbg !52
|
| 785 |
+
%731 = select i1 %729, i32 %713, i32 %717, !dbg !53
|
| 786 |
+
%732 = bitcast float %730 to i32, !dbg !38
|
| 787 |
+
%733 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %732, i32 4, i32 31), !dbg !38
|
| 788 |
+
%734 = bitcast i32 %733 to float, !dbg !38
|
| 789 |
+
%735 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %731, i32 4, i32 31), !dbg !38
|
| 790 |
+
%736 = fcmp ogt float %730, %734, !dbg !40
|
| 791 |
+
%737 = fcmp oeq float %730, %734, !dbg !41
|
| 792 |
+
%738 = fcmp uno float %730, 0.000000e+00, !dbg !42
|
| 793 |
+
%739 = fcmp uno float %734, 0.000000e+00, !dbg !43
|
| 794 |
+
%740 = xor i1 %739, true, !dbg !44
|
| 795 |
+
%741 = and i1 %738, %740, !dbg !46
|
| 796 |
+
%742 = or i1 %736, %741, !dbg !47
|
| 797 |
+
%743 = and i1 %739, %738, !dbg !45
|
| 798 |
+
%744 = or i1 %737, %743, !dbg !48
|
| 799 |
+
%745 = icmp slt i32 %731, %735, !dbg !49
|
| 800 |
+
%746 = and i1 %745, %744, !dbg !50
|
| 801 |
+
%747 = or i1 %742, %746, !dbg !51
|
| 802 |
+
%748 = select i1 %747, float %730, float %734, !dbg !52
|
| 803 |
+
%749 = select i1 %747, i32 %731, i32 %735, !dbg !53
|
| 804 |
+
%750 = bitcast float %748 to i32, !dbg !38
|
| 805 |
+
%751 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %750, i32 2, i32 31), !dbg !38
|
| 806 |
+
%752 = bitcast i32 %751 to float, !dbg !38
|
| 807 |
+
%753 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %749, i32 2, i32 31), !dbg !38
|
| 808 |
+
%754 = fcmp ogt float %748, %752, !dbg !40
|
| 809 |
+
%755 = fcmp oeq float %748, %752, !dbg !41
|
| 810 |
+
%756 = fcmp uno float %748, 0.000000e+00, !dbg !42
|
| 811 |
+
%757 = fcmp uno float %752, 0.000000e+00, !dbg !43
|
| 812 |
+
%758 = xor i1 %757, true, !dbg !44
|
| 813 |
+
%759 = and i1 %756, %758, !dbg !46
|
| 814 |
+
%760 = or i1 %754, %759, !dbg !47
|
| 815 |
+
%761 = and i1 %757, %756, !dbg !45
|
| 816 |
+
%762 = or i1 %755, %761, !dbg !48
|
| 817 |
+
%763 = icmp slt i32 %749, %753, !dbg !49
|
| 818 |
+
%764 = and i1 %763, %762, !dbg !50
|
| 819 |
+
%765 = or i1 %760, %764, !dbg !51
|
| 820 |
+
%766 = select i1 %765, float %748, float %752, !dbg !52
|
| 821 |
+
%767 = select i1 %765, i32 %749, i32 %753, !dbg !53
|
| 822 |
+
%768 = bitcast float %766 to i32, !dbg !38
|
| 823 |
+
%769 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %768, i32 1, i32 31), !dbg !38
|
| 824 |
+
%770 = bitcast i32 %769 to float, !dbg !38
|
| 825 |
+
%771 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %767, i32 1, i32 31), !dbg !38
|
| 826 |
+
%772 = fcmp ogt float %766, %770, !dbg !40
|
| 827 |
+
%773 = fcmp oeq float %766, %770, !dbg !41
|
| 828 |
+
%774 = fcmp uno float %766, 0.000000e+00, !dbg !42
|
| 829 |
+
%775 = fcmp uno float %770, 0.000000e+00, !dbg !43
|
| 830 |
+
%776 = xor i1 %775, true, !dbg !44
|
| 831 |
+
%777 = and i1 %774, %776, !dbg !46
|
| 832 |
+
%778 = or i1 %772, %777, !dbg !47
|
| 833 |
+
%779 = and i1 %775, %774, !dbg !45
|
| 834 |
+
%780 = or i1 %773, %779, !dbg !48
|
| 835 |
+
%781 = icmp slt i32 %767, %771, !dbg !49
|
| 836 |
+
%782 = and i1 %781, %780, !dbg !50
|
| 837 |
+
%783 = or i1 %778, %782, !dbg !51
|
| 838 |
+
%784 = select i1 %783, i32 %767, i32 %771, !dbg !53
|
| 839 |
+
%785 = extractelement <8 x float> %139, i64 0, !dbg !38
|
| 840 |
+
%786 = bitcast float %785 to i32, !dbg !38
|
| 841 |
+
%787 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %786, i32 16, i32 31), !dbg !38
|
| 842 |
+
%788 = bitcast i32 %787 to float, !dbg !38
|
| 843 |
+
%789 = extractelement <8 x i32> %140, i64 0, !dbg !38
|
| 844 |
+
%790 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %789, i32 16, i32 31), !dbg !38
|
| 845 |
+
%791 = fcmp ogt float %785, %788, !dbg !40
|
| 846 |
+
%792 = fcmp oeq float %785, %788, !dbg !41
|
| 847 |
+
%793 = fcmp uno float %788, 0.000000e+00, !dbg !43
|
| 848 |
+
%794 = xor i1 %793, true, !dbg !44
|
| 849 |
+
%795 = extractelement <8 x i1> %155, i64 0, !dbg !45
|
| 850 |
+
%796 = and i1 %795, %794, !dbg !46
|
| 851 |
+
%797 = or i1 %791, %796, !dbg !47
|
| 852 |
+
%798 = and i1 %795, %793, !dbg !45
|
| 853 |
+
%799 = or i1 %792, %798, !dbg !48
|
| 854 |
+
%800 = icmp slt i32 %789, %790, !dbg !49
|
| 855 |
+
%801 = and i1 %800, %799, !dbg !50
|
| 856 |
+
%802 = or i1 %797, %801, !dbg !51
|
| 857 |
+
%803 = select i1 %802, float %785, float %788, !dbg !52
|
| 858 |
+
%804 = select i1 %802, i32 %789, i32 %790, !dbg !53
|
| 859 |
+
%805 = bitcast float %803 to i32, !dbg !38
|
| 860 |
+
%806 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %805, i32 8, i32 31), !dbg !38
|
| 861 |
+
%807 = bitcast i32 %806 to float, !dbg !38
|
| 862 |
+
%808 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %804, i32 8, i32 31), !dbg !38
|
| 863 |
+
%809 = fcmp ogt float %803, %807, !dbg !40
|
| 864 |
+
%810 = fcmp oeq float %803, %807, !dbg !41
|
| 865 |
+
%811 = fcmp uno float %803, 0.000000e+00, !dbg !42
|
| 866 |
+
%812 = fcmp uno float %807, 0.000000e+00, !dbg !43
|
| 867 |
+
%813 = xor i1 %812, true, !dbg !44
|
| 868 |
+
%814 = and i1 %811, %813, !dbg !46
|
| 869 |
+
%815 = or i1 %809, %814, !dbg !47
|
| 870 |
+
%816 = and i1 %812, %811, !dbg !45
|
| 871 |
+
%817 = or i1 %810, %816, !dbg !48
|
| 872 |
+
%818 = icmp slt i32 %804, %808, !dbg !49
|
| 873 |
+
%819 = and i1 %818, %817, !dbg !50
|
| 874 |
+
%820 = or i1 %815, %819, !dbg !51
|
| 875 |
+
%821 = select i1 %820, float %803, float %807, !dbg !52
|
| 876 |
+
%822 = select i1 %820, i32 %804, i32 %808, !dbg !53
|
| 877 |
+
%823 = bitcast float %821 to i32, !dbg !38
|
| 878 |
+
%824 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %823, i32 4, i32 31), !dbg !38
|
| 879 |
+
%825 = bitcast i32 %824 to float, !dbg !38
|
| 880 |
+
%826 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %822, i32 4, i32 31), !dbg !38
|
| 881 |
+
%827 = fcmp ogt float %821, %825, !dbg !40
|
| 882 |
+
%828 = fcmp oeq float %821, %825, !dbg !41
|
| 883 |
+
%829 = fcmp uno float %821, 0.000000e+00, !dbg !42
|
| 884 |
+
%830 = fcmp uno float %825, 0.000000e+00, !dbg !43
|
| 885 |
+
%831 = xor i1 %830, true, !dbg !44
|
| 886 |
+
%832 = and i1 %829, %831, !dbg !46
|
| 887 |
+
%833 = or i1 %827, %832, !dbg !47
|
| 888 |
+
%834 = and i1 %830, %829, !dbg !45
|
| 889 |
+
%835 = or i1 %828, %834, !dbg !48
|
| 890 |
+
%836 = icmp slt i32 %822, %826, !dbg !49
|
| 891 |
+
%837 = and i1 %836, %835, !dbg !50
|
| 892 |
+
%838 = or i1 %833, %837, !dbg !51
|
| 893 |
+
%839 = select i1 %838, float %821, float %825, !dbg !52
|
| 894 |
+
%840 = select i1 %838, i32 %822, i32 %826, !dbg !53
|
| 895 |
+
%841 = bitcast float %839 to i32, !dbg !38
|
| 896 |
+
%842 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %841, i32 2, i32 31), !dbg !38
|
| 897 |
+
%843 = bitcast i32 %842 to float, !dbg !38
|
| 898 |
+
%844 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %840, i32 2, i32 31), !dbg !38
|
| 899 |
+
%845 = fcmp ogt float %839, %843, !dbg !40
|
| 900 |
+
%846 = fcmp oeq float %839, %843, !dbg !41
|
| 901 |
+
%847 = fcmp uno float %839, 0.000000e+00, !dbg !42
|
| 902 |
+
%848 = fcmp uno float %843, 0.000000e+00, !dbg !43
|
| 903 |
+
%849 = xor i1 %848, true, !dbg !44
|
| 904 |
+
%850 = and i1 %847, %849, !dbg !46
|
| 905 |
+
%851 = or i1 %845, %850, !dbg !47
|
| 906 |
+
%852 = and i1 %848, %847, !dbg !45
|
| 907 |
+
%853 = or i1 %846, %852, !dbg !48
|
| 908 |
+
%854 = icmp slt i32 %840, %844, !dbg !49
|
| 909 |
+
%855 = and i1 %854, %853, !dbg !50
|
| 910 |
+
%856 = or i1 %851, %855, !dbg !51
|
| 911 |
+
%857 = select i1 %856, float %839, float %843, !dbg !52
|
| 912 |
+
%858 = select i1 %856, i32 %840, i32 %844, !dbg !53
|
| 913 |
+
%859 = bitcast float %857 to i32, !dbg !38
|
| 914 |
+
%860 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %859, i32 1, i32 31), !dbg !38
|
| 915 |
+
%861 = bitcast i32 %860 to float, !dbg !38
|
| 916 |
+
%862 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %858, i32 1, i32 31), !dbg !38
|
| 917 |
+
%863 = fcmp ogt float %857, %861, !dbg !40
|
| 918 |
+
%864 = fcmp oeq float %857, %861, !dbg !41
|
| 919 |
+
%865 = fcmp uno float %857, 0.000000e+00, !dbg !42
|
| 920 |
+
%866 = fcmp uno float %861, 0.000000e+00, !dbg !43
|
| 921 |
+
%867 = xor i1 %866, true, !dbg !44
|
| 922 |
+
%868 = and i1 %865, %867, !dbg !46
|
| 923 |
+
%869 = or i1 %863, %868, !dbg !47
|
| 924 |
+
%870 = and i1 %866, %865, !dbg !45
|
| 925 |
+
%871 = or i1 %864, %870, !dbg !48
|
| 926 |
+
%872 = icmp slt i32 %858, %862, !dbg !49
|
| 927 |
+
%873 = and i1 %872, %871, !dbg !50
|
| 928 |
+
%874 = or i1 %869, %873, !dbg !51
|
| 929 |
+
%875 = select i1 %874, i32 %858, i32 %862, !dbg !53
|
| 930 |
+
%876 = and i32 %146, 1, !dbg !38
|
| 931 |
+
%877 = icmp eq i32 %145, 0, !dbg !38
|
| 932 |
+
%878 = lshr exact i32 %12, 5, !dbg !38
|
| 933 |
+
%879 = or disjoint i32 %878, %876, !dbg !38
|
| 934 |
+
%880 = getelementptr float, ptr addrspace(3) @global_smem, i32 %879, !dbg !38
|
| 935 |
+
%881 = select i1 %237, i32 %222, i32 %223, !dbg !52
|
| 936 |
+
%882 = insertelement <1 x i32> poison, i32 %881, i64 0, !dbg !38
|
| 937 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %880, <1 x i32> %882, i1 %877) #4, !dbg !38
|
| 938 |
+
%883 = getelementptr i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 512), i32 %879, !dbg !38
|
| 939 |
+
%884 = insertelement <1 x i32> poison, i32 %238, i64 0, !dbg !38
|
| 940 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %883, <1 x i32> %884, i1 %877) #4, !dbg !38
|
| 941 |
+
%885 = shl nuw nsw i32 %15, 1, !dbg !38
|
| 942 |
+
%886 = or disjoint i32 %885, %876, !dbg !38
|
| 943 |
+
%887 = getelementptr float, ptr addrspace(3) @global_smem, i32 %886, !dbg !38
|
| 944 |
+
%888 = select i1 %328, i32 %313, i32 %314, !dbg !52
|
| 945 |
+
%889 = insertelement <1 x i32> poison, i32 %888, i64 0, !dbg !38
|
| 946 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %887, <1 x i32> %889, i1 %877) #4, !dbg !38
|
| 947 |
+
%890 = getelementptr i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 512), i32 %886, !dbg !38
|
| 948 |
+
%891 = insertelement <1 x i32> poison, i32 %329, i64 0, !dbg !38
|
| 949 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %890, <1 x i32> %891, i1 %877) #4, !dbg !38
|
| 950 |
+
%892 = shl nuw nsw i32 %16, 1, !dbg !38
|
| 951 |
+
%893 = or disjoint i32 %892, %876, !dbg !38
|
| 952 |
+
%894 = getelementptr float, ptr addrspace(3) @global_smem, i32 %893, !dbg !38
|
| 953 |
+
%895 = select i1 %419, i32 %404, i32 %405, !dbg !52
|
| 954 |
+
%896 = insertelement <1 x i32> poison, i32 %895, i64 0, !dbg !38
|
| 955 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %894, <1 x i32> %896, i1 %877) #4, !dbg !38
|
| 956 |
+
%897 = getelementptr i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 512), i32 %893, !dbg !38
|
| 957 |
+
%898 = insertelement <1 x i32> poison, i32 %420, i64 0, !dbg !38
|
| 958 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %897, <1 x i32> %898, i1 %877) #4, !dbg !38
|
| 959 |
+
%899 = shl nuw nsw i32 %17, 1, !dbg !38
|
| 960 |
+
%900 = or disjoint i32 %899, %876, !dbg !38
|
| 961 |
+
%901 = getelementptr float, ptr addrspace(3) @global_smem, i32 %900, !dbg !38
|
| 962 |
+
%902 = select i1 %510, i32 %495, i32 %496, !dbg !52
|
| 963 |
+
%903 = insertelement <1 x i32> poison, i32 %902, i64 0, !dbg !38
|
| 964 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %901, <1 x i32> %903, i1 %877) #4, !dbg !38
|
| 965 |
+
%904 = getelementptr i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 512), i32 %900, !dbg !38
|
| 966 |
+
%905 = insertelement <1 x i32> poison, i32 %511, i64 0, !dbg !38
|
| 967 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %904, <1 x i32> %905, i1 %877) #4, !dbg !38
|
| 968 |
+
%906 = extractelement <4 x i32> %20, i64 3, !dbg !38
|
| 969 |
+
%907 = shl nuw nsw i32 %906, 1, !dbg !38
|
| 970 |
+
%908 = or disjoint i32 %907, %876, !dbg !38
|
| 971 |
+
%909 = getelementptr float, ptr addrspace(3) @global_smem, i32 %908, !dbg !38
|
| 972 |
+
%910 = select i1 %601, i32 %586, i32 %587, !dbg !52
|
| 973 |
+
%911 = insertelement <1 x i32> poison, i32 %910, i64 0, !dbg !38
|
| 974 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %909, <1 x i32> %911, i1 %877) #4, !dbg !38
|
| 975 |
+
%912 = getelementptr i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 512), i32 %908, !dbg !38
|
| 976 |
+
%913 = insertelement <1 x i32> poison, i32 %602, i64 0, !dbg !38
|
| 977 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %912, <1 x i32> %913, i1 %877) #4, !dbg !38
|
| 978 |
+
%914 = extractelement <4 x i32> %20, i64 2, !dbg !38
|
| 979 |
+
%915 = shl nuw nsw i32 %914, 1, !dbg !38
|
| 980 |
+
%916 = or disjoint i32 %915, %876, !dbg !38
|
| 981 |
+
%917 = getelementptr float, ptr addrspace(3) @global_smem, i32 %916, !dbg !38
|
| 982 |
+
%918 = select i1 %692, i32 %677, i32 %678, !dbg !52
|
| 983 |
+
%919 = insertelement <1 x i32> poison, i32 %918, i64 0, !dbg !38
|
| 984 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %917, <1 x i32> %919, i1 %877) #4, !dbg !38
|
| 985 |
+
%920 = getelementptr i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 512), i32 %916, !dbg !38
|
| 986 |
+
%921 = insertelement <1 x i32> poison, i32 %693, i64 0, !dbg !38
|
| 987 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %920, <1 x i32> %921, i1 %877) #4, !dbg !38
|
| 988 |
+
%922 = extractelement <4 x i32> %20, i64 1, !dbg !38
|
| 989 |
+
%923 = shl nuw nsw i32 %922, 1, !dbg !38
|
| 990 |
+
%924 = or disjoint i32 %923, %876, !dbg !38
|
| 991 |
+
%925 = getelementptr float, ptr addrspace(3) @global_smem, i32 %924, !dbg !38
|
| 992 |
+
%926 = select i1 %783, i32 %768, i32 %769, !dbg !52
|
| 993 |
+
%927 = insertelement <1 x i32> poison, i32 %926, i64 0, !dbg !38
|
| 994 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %925, <1 x i32> %927, i1 %877) #4, !dbg !38
|
| 995 |
+
%928 = getelementptr i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 512), i32 %924, !dbg !38
|
| 996 |
+
%929 = insertelement <1 x i32> poison, i32 %784, i64 0, !dbg !38
|
| 997 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %928, <1 x i32> %929, i1 %877) #4, !dbg !38
|
| 998 |
+
%930 = extractelement <4 x i32> %20, i64 0, !dbg !38
|
| 999 |
+
%931 = shl nuw nsw i32 %930, 1, !dbg !38
|
| 1000 |
+
%932 = or disjoint i32 %931, %876, !dbg !38
|
| 1001 |
+
%933 = getelementptr float, ptr addrspace(3) @global_smem, i32 %932, !dbg !38
|
| 1002 |
+
%934 = select i1 %874, i32 %859, i32 %860, !dbg !52
|
| 1003 |
+
%935 = insertelement <1 x i32> poison, i32 %934, i64 0, !dbg !38
|
| 1004 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %933, <1 x i32> %935, i1 %877) #4, !dbg !38
|
| 1005 |
+
%936 = getelementptr i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 512), i32 %932, !dbg !38
|
| 1006 |
+
%937 = insertelement <1 x i32> poison, i32 %875, i64 0, !dbg !38
|
| 1007 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %936, <1 x i32> %937, i1 %877) #4, !dbg !38
|
| 1008 |
+
tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !38
|
| 1009 |
+
%938 = icmp samesign ult i32 %11, 128, !dbg !38
|
| 1010 |
+
%939 = getelementptr float, ptr addrspace(3) @global_smem, i32 %11, !dbg !38
|
| 1011 |
+
%940 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %939, i1 %938) #4, !dbg !38
|
| 1012 |
+
%941 = bitcast i32 %940 to float, !dbg !38
|
| 1013 |
+
%942 = getelementptr i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 512), i32 %11, !dbg !38
|
| 1014 |
+
%943 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %942, i1 %938) #4, !dbg !38
|
| 1015 |
+
%944 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %940, i32 1, i32 31), !dbg !38
|
| 1016 |
+
%945 = bitcast i32 %944 to float, !dbg !38
|
| 1017 |
+
%946 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %943, i32 1, i32 31), !dbg !38
|
| 1018 |
+
%947 = fcmp ogt float %941, %945, !dbg !40
|
| 1019 |
+
%948 = fcmp oeq float %941, %945, !dbg !41
|
| 1020 |
+
%949 = fcmp uno float %941, 0.000000e+00, !dbg !42
|
| 1021 |
+
%950 = fcmp uno float %945, 0.000000e+00, !dbg !43
|
| 1022 |
+
%951 = xor i1 %950, true, !dbg !44
|
| 1023 |
+
%952 = and i1 %949, %951, !dbg !46
|
| 1024 |
+
%953 = or i1 %947, %952, !dbg !47
|
| 1025 |
+
%954 = and i1 %949, %950, !dbg !45
|
| 1026 |
+
%955 = or i1 %948, %954, !dbg !48
|
| 1027 |
+
%956 = icmp slt i32 %943, %946, !dbg !49
|
| 1028 |
+
%957 = and i1 %956, %955, !dbg !50
|
| 1029 |
+
%958 = or i1 %953, %957, !dbg !51
|
| 1030 |
+
%959 = select i1 %958, i32 %943, i32 %946, !dbg !53
|
| 1031 |
+
%960 = and i32 %11, 897, !dbg !38
|
| 1032 |
+
%961 = icmp eq i32 %960, 0, !dbg !38
|
| 1033 |
+
%962 = select i1 %958, i32 %940, i32 %944, !dbg !52
|
| 1034 |
+
%963 = insertelement <1 x i32> poison, i32 %962, i64 0, !dbg !38
|
| 1035 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %939, <1 x i32> %963, i1 %961) #4, !dbg !38
|
| 1036 |
+
%964 = insertelement <1 x i32> poison, i32 %959, i64 0, !dbg !38
|
| 1037 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %942, <1 x i32> %964, i1 %961) #4, !dbg !38
|
| 1038 |
+
tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !38
|
| 1039 |
+
%965 = getelementptr i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 512), i32 %878, !dbg !38
|
| 1040 |
+
%966 = load i32, ptr addrspace(3) %965, align 8, !dbg !38
|
| 1041 |
+
%967 = getelementptr i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 512), i32 %885, !dbg !38
|
| 1042 |
+
%968 = load i32, ptr addrspace(3) %967, align 8, !dbg !38
|
| 1043 |
+
%969 = getelementptr i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 512), i32 %892, !dbg !38
|
| 1044 |
+
%970 = load i32, ptr addrspace(3) %969, align 8, !dbg !38
|
| 1045 |
+
%971 = getelementptr i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 512), i32 %899, !dbg !38
|
| 1046 |
+
%972 = load i32, ptr addrspace(3) %971, align 8, !dbg !38
|
| 1047 |
+
%973 = getelementptr i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 512), i32 %907, !dbg !38
|
| 1048 |
+
%974 = load i32, ptr addrspace(3) %973, align 8, !dbg !38
|
| 1049 |
+
%975 = getelementptr i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 512), i32 %915, !dbg !38
|
| 1050 |
+
%976 = load i32, ptr addrspace(3) %975, align 8, !dbg !38
|
| 1051 |
+
%977 = getelementptr i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 512), i32 %923, !dbg !38
|
| 1052 |
+
%978 = load i32, ptr addrspace(3) %977, align 8, !dbg !38
|
| 1053 |
+
%979 = getelementptr i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 512), i32 %931, !dbg !38
|
| 1054 |
+
%980 = load i32, ptr addrspace(3) %979, align 8, !dbg !38
|
| 1055 |
+
%981 = sext i32 %143 to i64, !dbg !54
|
| 1056 |
+
%982 = getelementptr i64, ptr addrspace(1) %1, i64 %981, !dbg !54
|
| 1057 |
+
tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !55
|
| 1058 |
+
%983 = lshr exact i32 %12, 2, !dbg !55
|
| 1059 |
+
%984 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %983, !dbg !55
|
| 1060 |
+
%985 = insertelement <4 x i32> poison, i32 %966, i64 0, !dbg !55
|
| 1061 |
+
%986 = insertelement <4 x i32> %985, i32 %968, i64 1, !dbg !55
|
| 1062 |
+
%987 = insertelement <4 x i32> %986, i32 %970, i64 2, !dbg !55
|
| 1063 |
+
%988 = insertelement <4 x i32> %987, i32 %972, i64 3, !dbg !55
|
| 1064 |
+
store <4 x i32> %988, ptr addrspace(3) %984, align 16, !dbg !55
|
| 1065 |
+
%989 = getelementptr inbounds nuw i8, ptr addrspace(3) %984, i32 128, !dbg !55
|
| 1066 |
+
%990 = insertelement <4 x i32> poison, i32 %974, i64 0, !dbg !55
|
| 1067 |
+
%991 = insertelement <4 x i32> %990, i32 %976, i64 1, !dbg !55
|
| 1068 |
+
%992 = insertelement <4 x i32> %991, i32 %978, i64 2, !dbg !55
|
| 1069 |
+
%993 = insertelement <4 x i32> %992, i32 %980, i64 3, !dbg !55
|
| 1070 |
+
store <4 x i32> %993, ptr addrspace(3) %989, align 16, !dbg !55
|
| 1071 |
+
tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !55
|
| 1072 |
+
%994 = shl nuw nsw i32 %11, 4, !dbg !55
|
| 1073 |
+
%995 = and i32 %994, 112, !dbg !55
|
| 1074 |
+
%996 = lshr i32 %11, 1, !dbg !55
|
| 1075 |
+
%997 = and i32 %996, 12, !dbg !55
|
| 1076 |
+
%998 = shl nuw nsw i32 %11, 2, !dbg !55
|
| 1077 |
+
%999 = and i32 %998, 128, !dbg !55
|
| 1078 |
+
%1000 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %995, !dbg !55
|
| 1079 |
+
%1001 = getelementptr inbounds nuw i8, ptr addrspace(3) %1000, i32 %999, !dbg !55
|
| 1080 |
+
%1002 = getelementptr inbounds nuw i8, ptr addrspace(3) %1001, i32 %997, !dbg !55
|
| 1081 |
+
%1003 = load i32, ptr addrspace(3) %1002, align 4, !dbg !55
|
| 1082 |
+
%1004 = sext i32 %1003 to i64, !dbg !55
|
| 1083 |
+
%1005 = icmp eq i32 %12, 0, !dbg !55
|
| 1084 |
+
%1006 = and i1 %1005, %144, !dbg !55
|
| 1085 |
+
tail call void asm sideeffect "@$2 st.global.b64 [ $1 + 0 ], { $0 };", "l,l,b"(i64 %1004, ptr addrspace(1) %982, i1 %1006) #4, !dbg !55
|
| 1086 |
+
ret void, !dbg !56
|
| 1087 |
+
}
|
| 1088 |
+
|
| 1089 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
| 1090 |
+
declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
|
| 1091 |
+
|
| 1092 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
| 1093 |
+
declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
|
| 1094 |
+
|
| 1095 |
+
; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
|
| 1096 |
+
declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2
|
| 1097 |
+
|
| 1098 |
+
; Function Attrs: convergent nocallback nounwind
|
| 1099 |
+
declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3
|
| 1100 |
+
|
| 1101 |
+
attributes #0 = { nounwind "nvvm.reqntid"="512" }
|
| 1102 |
+
attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
| 1103 |
+
attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
|
| 1104 |
+
attributes #3 = { convergent nocallback nounwind }
|
| 1105 |
+
attributes #4 = { nounwind }
|
| 1106 |
+
|
| 1107 |
+
!llvm.dbg.cu = !{!0}
|
| 1108 |
+
!llvm.module.flags = !{!2, !3}
|
| 1109 |
+
|
| 1110 |
+
!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
|
| 1111 |
+
!1 = !DIFile(filename: "c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w")
|
| 1112 |
+
!2 = !{i32 2, !"Debug Info Version", i32 3}
|
| 1113 |
+
!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
|
| 1114 |
+
!4 = distinct !DISubprogram(name: "triton_red_fused_argmax_1", linkageName: "triton_red_fused_argmax_1", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
|
| 1115 |
+
!5 = !DISubroutineType(cc: DW_CC_normal, types: !6)
|
| 1116 |
+
!6 = !{}
|
| 1117 |
+
!7 = !DILocation(line: 22, column: 28, scope: !4)
|
| 1118 |
+
!8 = !DILocation(line: 22, column: 33, scope: !4)
|
| 1119 |
+
!9 = !DILocation(line: 23, column: 44, scope: !4)
|
| 1120 |
+
!10 = !DILocation(line: 23, column: 23, scope: !4)
|
| 1121 |
+
!11 = !DILocation(line: 24, column: 21, scope: !4)
|
| 1122 |
+
!12 = !DILocation(line: 27, column: 19, scope: !4)
|
| 1123 |
+
!13 = !DILocation(line: 28, column: 19, scope: !4)
|
| 1124 |
+
!14 = !DILocation(line: 38, column: 56, scope: !4)
|
| 1125 |
+
!15 = !DILocation(line: 32, column: 40, scope: !4)
|
| 1126 |
+
!16 = !DILocation(line: 38, column: 61, scope: !4)
|
| 1127 |
+
!17 = !DILocation(line: 33, column: 31, scope: !4)
|
| 1128 |
+
!18 = !DILocation(line: 38, column: 34, scope: !4)
|
| 1129 |
+
!19 = !DILocation(line: 147, column: 29, scope: !20, inlinedAt: !22)
|
| 1130 |
+
!20 = distinct !DILexicalBlockFile(scope: !4, file: !21, discriminator: 0)
|
| 1131 |
+
!21 = !DIFile(filename: "triton_helpers.py", directory: "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime")
|
| 1132 |
+
!22 = !DILocation(line: 41, column: 38, scope: !4)
|
| 1133 |
+
!23 = !DILocation(line: 155, column: 69, scope: !20, inlinedAt: !22)
|
| 1134 |
+
!24 = !DILocation(line: 144, column: 21, scope: !20, inlinedAt: !22)
|
| 1135 |
+
!25 = !DILocation(line: 145, column: 23, scope: !20, inlinedAt: !22)
|
| 1136 |
+
!26 = !DILocation(line: 148, column: 29, scope: !20, inlinedAt: !22)
|
| 1137 |
+
!27 = !DILocation(line: 149, column: 31, scope: !20, inlinedAt: !22)
|
| 1138 |
+
!28 = !DILocation(line: 149, column: 27, scope: !20, inlinedAt: !22)
|
| 1139 |
+
!29 = !DILocation(line: 149, column: 16, scope: !20, inlinedAt: !22)
|
| 1140 |
+
!30 = !DILocation(line: 151, column: 27, scope: !20, inlinedAt: !22)
|
| 1141 |
+
!31 = !DILocation(line: 151, column: 17, scope: !20, inlinedAt: !22)
|
| 1142 |
+
!32 = !DILocation(line: 154, column: 31, scope: !20, inlinedAt: !22)
|
| 1143 |
+
!33 = !DILocation(line: 154, column: 21, scope: !20, inlinedAt: !22)
|
| 1144 |
+
!34 = !DILocation(line: 154, column: 12, scope: !20, inlinedAt: !22)
|
| 1145 |
+
!35 = !DILocation(line: 155, column: 35, scope: !20, inlinedAt: !22)
|
| 1146 |
+
!36 = !DILocation(line: 43, column: 54, scope: !4)
|
| 1147 |
+
!37 = !DILocation(line: 44, column: 66, scope: !4)
|
| 1148 |
+
!38 = !DILocation(line: 165, column: 42, scope: !20, inlinedAt: !39)
|
| 1149 |
+
!39 = !DILocation(line: 45, column: 75, scope: !4)
|
| 1150 |
+
!40 = !DILocation(line: 144, column: 21, scope: !20, inlinedAt: !39)
|
| 1151 |
+
!41 = !DILocation(line: 145, column: 23, scope: !20, inlinedAt: !39)
|
| 1152 |
+
!42 = !DILocation(line: 147, column: 29, scope: !20, inlinedAt: !39)
|
| 1153 |
+
!43 = !DILocation(line: 148, column: 29, scope: !20, inlinedAt: !39)
|
| 1154 |
+
!44 = !DILocation(line: 149, column: 31, scope: !20, inlinedAt: !39)
|
| 1155 |
+
!45 = !DILocation(line: 151, column: 27, scope: !20, inlinedAt: !39)
|
| 1156 |
+
!46 = !DILocation(line: 149, column: 27, scope: !20, inlinedAt: !39)
|
| 1157 |
+
!47 = !DILocation(line: 149, column: 16, scope: !20, inlinedAt: !39)
|
| 1158 |
+
!48 = !DILocation(line: 151, column: 17, scope: !20, inlinedAt: !39)
|
| 1159 |
+
!49 = !DILocation(line: 154, column: 31, scope: !20, inlinedAt: !39)
|
| 1160 |
+
!50 = !DILocation(line: 154, column: 21, scope: !20, inlinedAt: !39)
|
| 1161 |
+
!51 = !DILocation(line: 154, column: 12, scope: !20, inlinedAt: !39)
|
| 1162 |
+
!52 = !DILocation(line: 155, column: 35, scope: !20, inlinedAt: !39)
|
| 1163 |
+
!53 = !DILocation(line: 155, column: 69, scope: !20, inlinedAt: !39)
|
| 1164 |
+
!54 = !DILocation(line: 47, column: 25, scope: !4)
|
| 1165 |
+
!55 = !DILocation(line: 47, column: 36, scope: !4)
|
| 1166 |
+
!56 = !DILocation(line: 47, column: 4, scope: !4)
|
SpecForge-ext/cache/compiled_kernels/triton/7/BZGMO2ROUNNVUMEUFCQSSF6FGNZS2MEIPAEECDHYOJVVX3KB75AA/triton_red_fused_argmax_1.ptx
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
SpecForge-ext/cache/compiled_kernels/triton/7/BZGMO2ROUNNVUMEUFCQSSF6FGNZS2MEIPAEECDHYOJVVX3KB75AA/triton_red_fused_argmax_1.source
ADDED
|
@@ -0,0 +1,323 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":18:0)
|
| 2 |
+
#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":143:0)
|
| 3 |
+
#loc47 = loc(unknown)
|
| 4 |
+
#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":86:0)
|
| 5 |
+
#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":63:0)
|
| 6 |
+
#loc68 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":164:0)
|
| 7 |
+
#loc72 = loc("in_ptr0"(#loc))
|
| 8 |
+
#loc73 = loc("out_ptr0"(#loc))
|
| 9 |
+
#loc74 = loc("ks0"(#loc))
|
| 10 |
+
#loc75 = loc("ks1"(#loc))
|
| 11 |
+
#loc76 = loc("xnumel"(#loc))
|
| 12 |
+
#loc77 = loc("r0_numel"(#loc))
|
| 13 |
+
#loc106 = loc("a_value"(#loc35))
|
| 14 |
+
#loc107 = loc("a_index"(#loc35))
|
| 15 |
+
#loc108 = loc("b_value"(#loc35))
|
| 16 |
+
#loc109 = loc("b_index"(#loc35))
|
| 17 |
+
#loc122 = loc("x"(#loc55))
|
| 18 |
+
#loc123 = loc("x"(#loc59))
|
| 19 |
+
#loc124 = loc("value"(#loc68))
|
| 20 |
+
#loc125 = loc("index"(#loc68))
|
| 21 |
+
module {
|
| 22 |
+
tt.func public @triton_red_fused_argmax_1(%in_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<i64> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
|
| 23 |
+
%r0_numel_0 = arith.constant 32000 : i32 loc(#loc78)
|
| 24 |
+
%xoffset = tt.get_program_id x : i32 loc(#loc79)
|
| 25 |
+
%xoffset_1 = arith.constant 64 : i32 loc(#loc80)
|
| 26 |
+
%xoffset_2 = arith.constant 64 : i32 loc(#loc80)
|
| 27 |
+
%xoffset_3 = arith.muli %xoffset, %xoffset_2 : i32 loc(#loc80)
|
| 28 |
+
%xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc81)
|
| 29 |
+
%xindex_4 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc82)
|
| 30 |
+
%xindex_5 = tt.splat %xoffset_3 : i32 -> tensor<64x1xi32> loc(#loc83)
|
| 31 |
+
%xindex_6 = arith.addi %xindex_5, %xindex_4 : tensor<64x1xi32> loc(#loc83)
|
| 32 |
+
%xmask = tt.splat %xnumel : i32 -> tensor<64x1xi32> loc(#loc84)
|
| 33 |
+
%xmask_7 = arith.cmpi slt, %xindex_6, %xmask : tensor<64x1xi32> loc(#loc84)
|
| 34 |
+
%r0_base = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc85)
|
| 35 |
+
%r0_base_8 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc86)
|
| 36 |
+
%x0 = arith.extsi %xindex_6 : tensor<64x1xi32> to tensor<64x1xi64> loc(#loc87)
|
| 37 |
+
%x0_9 = tt.splat %ks0 : i64 -> tensor<64x1xi64> loc(#loc87)
|
| 38 |
+
%x0_10 = arith.remsi %x0, %x0_9 : tensor<64x1xi64> loc(#loc87)
|
| 39 |
+
%x1 = arith.extsi %xindex_6 : tensor<64x1xi32> to tensor<64x1xi64> loc(#loc88)
|
| 40 |
+
%x1_11 = tt.splat %ks0 : i64 -> tensor<64x1xi64> loc(#loc88)
|
| 41 |
+
%x1_12 = arith.divsi %x1, %x1_11 : tensor<64x1xi64> loc(#loc88)
|
| 42 |
+
%_tmp2 = arith.constant 0xFF800000 : f32 loc(#loc89)
|
| 43 |
+
%_tmp2_13 = arith.constant dense<0xFF800000> : tensor<64x64xf32> loc(#loc89)
|
| 44 |
+
%_tmp2_index = arith.constant 2147483647 : i32 loc(#loc90)
|
| 45 |
+
%_tmp2_index_14 = arith.constant dense<2147483647> : tensor<64x64xi32> loc(#loc90)
|
| 46 |
+
%c0_i32 = arith.constant 0 : i32 loc(#loc14)
|
| 47 |
+
%c64_i32 = arith.constant 64 : i32 loc(#loc14)
|
| 48 |
+
%0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc14)
|
| 49 |
+
%1 = arith.bitcast %r0_numel_0 : i32 to i32 loc(#loc14)
|
| 50 |
+
%2 = arith.bitcast %c64_i32 : i32 to i32 loc(#loc14)
|
| 51 |
+
%3 = ub.poison : i32 loc(#loc14)
|
| 52 |
+
%_tmp2_index_15:2 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp2_16 = %_tmp2_13, %_tmp2_index_17 = %_tmp2_index_14) -> (tensor<64x64xf32>, tensor<64x64xi32>) : i32 {
|
| 53 |
+
%r0_index = tt.splat %r0_offset : i32 -> tensor<1x64xi32> loc(#loc92)
|
| 54 |
+
%r0_index_18 = arith.addi %r0_index, %r0_base_8 : tensor<1x64xi32> loc(#loc92)
|
| 55 |
+
%r0_mask = arith.constant dense<32000> : tensor<1x64xi32> loc(#loc93)
|
| 56 |
+
%r0_mask_19 = arith.cmpi slt, %r0_index_18, %r0_mask : tensor<1x64xi32> loc(#loc93)
|
| 57 |
+
%tmp0 = arith.constant 32000 : i32 loc(#loc94)
|
| 58 |
+
%tmp0_20 = arith.constant 32000 : i64 loc(#loc94)
|
| 59 |
+
%tmp0_21 = arith.constant dense<32000> : tensor<64x1xi64> loc(#loc94)
|
| 60 |
+
%tmp0_22 = arith.muli %tmp0_21, %x0_10 : tensor<64x1xi64> loc(#loc94)
|
| 61 |
+
%tmp0_23 = arith.extsi %r0_index_18 : tensor<1x64xi32> to tensor<1x64xi64> loc(#loc95)
|
| 62 |
+
%tmp0_24 = tt.broadcast %tmp0_23 : tensor<1x64xi64> -> tensor<64x64xi64> loc(#loc95)
|
| 63 |
+
%tmp0_25 = tt.broadcast %tmp0_22 : tensor<64x1xi64> -> tensor<64x64xi64> loc(#loc95)
|
| 64 |
+
%tmp0_26 = arith.addi %tmp0_24, %tmp0_25 : tensor<64x64xi64> loc(#loc95)
|
| 65 |
+
%tmp0_27 = tt.splat %ks1 : i64 -> tensor<64x1xi64> loc(#loc96)
|
| 66 |
+
%tmp0_28 = arith.muli %tmp0_27, %x1_12 : tensor<64x1xi64> loc(#loc96)
|
| 67 |
+
%tmp0_29 = tt.broadcast %tmp0_28 : tensor<64x1xi64> -> tensor<64x64xi64> loc(#loc97)
|
| 68 |
+
%tmp0_30 = arith.addi %tmp0_26, %tmp0_29 : tensor<64x64xi64> loc(#loc97)
|
| 69 |
+
%tmp0_31 = tt.splat %in_ptr0 : !tt.ptr<f32> -> tensor<64x64x!tt.ptr<f32>> loc(#loc98)
|
| 70 |
+
%tmp0_32 = tt.addptr %tmp0_31, %tmp0_30 : tensor<64x64x!tt.ptr<f32>>, tensor<64x64xi64> loc(#loc98)
|
| 71 |
+
%tmp0_33 = tt.broadcast %r0_mask_19 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc99)
|
| 72 |
+
%tmp0_34 = tt.broadcast %xmask_7 : tensor<64x1xi1> -> tensor<64x64xi1> loc(#loc99)
|
| 73 |
+
%tmp0_35 = arith.andi %tmp0_33, %tmp0_34 : tensor<64x64xi1> loc(#loc99)
|
| 74 |
+
%tmp0_36 = arith.constant 0.000000e+00 : f32 loc(#loc100)
|
| 75 |
+
%tmp0_37 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc100)
|
| 76 |
+
%tmp0_38 = tt.load %tmp0_32, %tmp0_35, %tmp0_37 evictionPolicy = evict_first : tensor<64x64x!tt.ptr<f32>> loc(#loc100)
|
| 77 |
+
%8:2 = tt.call @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32S64_64S_i32S64_64S_fp32S64_64S_i32S1_64S__(%_tmp2_16, %_tmp2_index_17, %tmp0_38, %r0_index_18) : (tensor<64x64xf32>, tensor<64x64xi32>, tensor<64x64xf32>, tensor<1x64xi32>) -> (tensor<64x64xf32>, tensor<64x64xi32>) loc(#loc24)
|
| 78 |
+
%_tmp2_39 = tt.broadcast %r0_mask_19 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc101)
|
| 79 |
+
%_tmp2_40 = tt.broadcast %xmask_7 : tensor<64x1xi1> -> tensor<64x64xi1> loc(#loc101)
|
| 80 |
+
%_tmp2_41 = arith.andi %_tmp2_39, %_tmp2_40 : tensor<64x64xi1> loc(#loc101)
|
| 81 |
+
%_tmp2_42 = arith.select %_tmp2_41, %8#0, %_tmp2_16 : tensor<64x64xi1>, tensor<64x64xf32> loc(#loc102)
|
| 82 |
+
%_tmp2_index_43 = tt.broadcast %r0_mask_19 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc103)
|
| 83 |
+
%_tmp2_index_44 = tt.broadcast %xmask_7 : tensor<64x1xi1> -> tensor<64x64xi1> loc(#loc103)
|
| 84 |
+
%_tmp2_index_45 = arith.andi %_tmp2_index_43, %_tmp2_index_44 : tensor<64x64xi1> loc(#loc103)
|
| 85 |
+
%_tmp2_index_46 = arith.select %_tmp2_index_45, %8#1, %_tmp2_index_17 : tensor<64x64xi1>, tensor<64x64xi32> loc(#loc104)
|
| 86 |
+
scf.yield %_tmp2_42, %_tmp2_index_46 : tensor<64x64xf32>, tensor<64x64xi32> loc(#loc29)
|
| 87 |
+
} loc(#loc126)
|
| 88 |
+
%4:2 = tt.call @"torch._inductor.runtime.triton_helpers.max_with_index__fp32S64_64S_i32S64_64S__(2,)cconstexpr_1_"(%_tmp2_index_15#0, %_tmp2_index_15#1) : (tensor<64x64xf32>, tensor<64x64xi32>) -> (tensor<64xf32>, tensor<64xi32>) loc(#loc30)
|
| 89 |
+
%tmp2 = tt.expand_dims %4#1 {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc105)
|
| 90 |
+
%5 = tt.splat %out_ptr0 : !tt.ptr<i64> -> tensor<64x1x!tt.ptr<i64>> loc(#loc32)
|
| 91 |
+
%6 = tt.addptr %5, %xindex_6 : tensor<64x1x!tt.ptr<i64>>, tensor<64x1xi32> loc(#loc32)
|
| 92 |
+
%7 = arith.extsi %tmp2 : tensor<64x1xi32> to tensor<64x1xi64> loc(#loc33)
|
| 93 |
+
tt.store %6, %7, %xmask_7 : tensor<64x1x!tt.ptr<i64>> loc(#loc33)
|
| 94 |
+
tt.return loc(#loc34)
|
| 95 |
+
} loc(#loc)
|
| 96 |
+
tt.func private @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32S64_64S_i32S64_64S_fp32S64_64S_i32S1_64S__(%a_value: tensor<64x64xf32> loc("a_value"(#loc35)), %a_index: tensor<64x64xi32> loc("a_index"(#loc35)), %b_value: tensor<64x64xf32> loc("b_value"(#loc35)), %b_index: tensor<1x64xi32> loc("b_index"(#loc35))) -> (tensor<64x64xf32>, tensor<64x64xi32>) attributes {noinline = false} {
|
| 97 |
+
%mask = arith.cmpf ogt, %a_value, %b_value : tensor<64x64xf32> loc(#loc127)
|
| 98 |
+
%equal = arith.cmpf oeq, %a_value, %b_value : tensor<64x64xf32> loc(#loc128)
|
| 99 |
+
%0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__fp32S64_64S__(%a_value) : (tensor<64x64xf32>) -> i1 loc(#loc38)
|
| 100 |
+
%1:2 = scf.if %0 -> (tensor<64x64xi1>, tensor<64x64xi1>) {
|
| 101 |
+
%a_isnan = arith.cmpf une, %a_value, %a_value : tensor<64x64xf32> loc(#loc112)
|
| 102 |
+
%b_isnan = arith.cmpf une, %b_value, %b_value : tensor<64x64xf32> loc(#loc113)
|
| 103 |
+
%mask_4 = arith.constant true loc(#loc114)
|
| 104 |
+
%mask_5 = arith.constant dense<true> : tensor<64x64xi1> loc(#loc114)
|
| 105 |
+
%mask_6 = arith.xori %b_isnan, %mask_5 : tensor<64x64xi1> loc(#loc114)
|
| 106 |
+
%mask_7 = arith.andi %a_isnan, %mask_6 : tensor<64x64xi1> loc(#loc115)
|
| 107 |
+
%mask_8 = arith.ori %mask, %mask_7 : tensor<64x64xi1> loc(#loc129)
|
| 108 |
+
%equal_9 = arith.andi %a_isnan, %b_isnan : tensor<64x64xi1> loc(#loc117)
|
| 109 |
+
%equal_10 = arith.ori %equal, %equal_9 : tensor<64x64xi1> loc(#loc130)
|
| 110 |
+
scf.yield %mask_8, %equal_10 : tensor<64x64xi1>, tensor<64x64xi1> loc(#loc130)
|
| 111 |
+
} else {
|
| 112 |
+
scf.yield %mask, %equal : tensor<64x64xi1>, tensor<64x64xi1> loc(#loc47)
|
| 113 |
+
} loc(#loc39)
|
| 114 |
+
%mask_0 = tt.broadcast %b_index : tensor<1x64xi32> -> tensor<64x64xi32> loc(#loc119)
|
| 115 |
+
%mask_1 = arith.cmpi slt, %a_index, %mask_0 : tensor<64x64xi32> loc(#loc119)
|
| 116 |
+
%mask_2 = arith.andi %1#1, %mask_1 : tensor<64x64xi1> loc(#loc120)
|
| 117 |
+
%mask_3 = arith.ori %1#0, %mask_2 : tensor<64x64xi1> loc(#loc121)
|
| 118 |
+
%2 = arith.select %mask_3, %a_value, %b_value : tensor<64x64xi1>, tensor<64x64xf32> loc(#loc51)
|
| 119 |
+
%3 = tt.broadcast %b_index : tensor<1x64xi32> -> tensor<64x64xi32> loc(#loc52)
|
| 120 |
+
%4 = arith.select %mask_3, %a_index, %3 : tensor<64x64xi1>, tensor<64x64xi32> loc(#loc52)
|
| 121 |
+
tt.return %2, %4 : tensor<64x64xf32>, tensor<64x64xi32> loc(#loc53)
|
| 122 |
+
^bb1: // no predecessors
|
| 123 |
+
%5 = ub.poison : tensor<64x64xf32> loc(#loc54)
|
| 124 |
+
%6 = ub.poison : tensor<64x64xi32> loc(#loc54)
|
| 125 |
+
tt.return %5, %6 : tensor<64x64xf32>, tensor<64x64xi32> loc(#loc54)
|
| 126 |
+
} loc(#loc35)
|
| 127 |
+
tt.func private @torch._inductor.runtime.triton_helpers.is_floating__fp32S64_64S__(%x: tensor<64x64xf32> loc("x"(#loc55))) -> i1 attributes {noinline = false} {
|
| 128 |
+
%0 = tt.call @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32S64_64S__(%x) : (tensor<64x64xf32>) -> tensor<64x64xf32> loc(#loc56)
|
| 129 |
+
%true = arith.constant true loc(#loc57)
|
| 130 |
+
tt.return %true : i1 loc(#loc57)
|
| 131 |
+
^bb1: // no predecessors
|
| 132 |
+
%1 = ub.poison : i1 loc(#loc58)
|
| 133 |
+
tt.return %1 : i1 loc(#loc58)
|
| 134 |
+
} loc(#loc55)
|
| 135 |
+
tt.func private @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32S64_64S__(%x: tensor<64x64xf32> loc("x"(#loc59))) -> tensor<64x64xf32> attributes {noinline = false} {
|
| 136 |
+
%0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc60)
|
| 137 |
+
%1 = arith.uitofp %0 : tensor<1xi1> to tensor<1xf32> loc(#loc61)
|
| 138 |
+
%2 = tt.expand_dims %1 {axis = 0 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc61)
|
| 139 |
+
%3 = tt.broadcast %2 : tensor<1x1xf32> -> tensor<64x64xf32> loc(#loc61)
|
| 140 |
+
%4 = arith.addf %x, %3 : tensor<64x64xf32> loc(#loc61)
|
| 141 |
+
tt.return %4 : tensor<64x64xf32> loc(#loc62)
|
| 142 |
+
^bb1: // no predecessors
|
| 143 |
+
%5 = ub.poison : tensor<64x64xf32> loc(#loc63)
|
| 144 |
+
tt.return %5 : tensor<64x64xf32> loc(#loc63)
|
| 145 |
+
} loc(#loc59)
|
| 146 |
+
tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() -> tensor<1xi1> attributes {noinline = false} {
|
| 147 |
+
%false = arith.constant false loc(#loc65)
|
| 148 |
+
%cst = arith.constant dense<false> : tensor<1xi1> loc(#loc65)
|
| 149 |
+
tt.return %cst : tensor<1xi1> loc(#loc66)
|
| 150 |
+
^bb1: // no predecessors
|
| 151 |
+
%0 = ub.poison : tensor<1xi1> loc(#loc67)
|
| 152 |
+
tt.return %0 : tensor<1xi1> loc(#loc67)
|
| 153 |
+
} loc(#loc64)
|
| 154 |
+
tt.func private @"torch._inductor.runtime.triton_helpers.max_with_index__fp32S64_64S_i32S64_64S__(2,)cconstexpr_1_"(%value: tensor<64x64xf32> loc("value"(#loc68)), %index: tensor<64x64xi32> loc("index"(#loc68))) -> (tensor<64xf32>, tensor<64xi32>) attributes {noinline = false} {
|
| 155 |
+
%0:2 = "tt.reduce"(%value, %index) <{axis = 1 : i32}> ({
|
| 156 |
+
^bb0(%arg2: f32 loc(unknown), %arg3: i32 loc(unknown), %arg4: f32 loc(unknown), %arg5: i32 loc(unknown)):
|
| 157 |
+
%3:2 = tt.call @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32_i32_fp32_i32__(%arg2, %arg3, %arg4, %arg5) : (f32, i32, f32, i32) -> (f32, i32) loc(#loc69)
|
| 158 |
+
tt.reduce.return %3#0, %3#1 : f32, i32 loc(#loc69)
|
| 159 |
+
}) : (tensor<64x64xf32>, tensor<64x64xi32>) -> (tensor<64xf32>, tensor<64xi32>) loc(#loc69)
|
| 160 |
+
tt.return %0#0, %0#1 : tensor<64xf32>, tensor<64xi32> loc(#loc70)
|
| 161 |
+
^bb1: // no predecessors
|
| 162 |
+
%1 = ub.poison : tensor<64xf32> loc(#loc71)
|
| 163 |
+
%2 = ub.poison : tensor<64xi32> loc(#loc71)
|
| 164 |
+
tt.return %1, %2 : tensor<64xf32>, tensor<64xi32> loc(#loc71)
|
| 165 |
+
} loc(#loc68)
|
| 166 |
+
tt.func private @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32_i32_fp32_i32__(%a_value: f32 loc("a_value"(#loc35)), %a_index: i32 loc("a_index"(#loc35)), %b_value: f32 loc("b_value"(#loc35)), %b_index: i32 loc("b_index"(#loc35))) -> (f32, i32) attributes {noinline = false} {
|
| 167 |
+
%mask = arith.cmpf ogt, %a_value, %b_value : f32 loc(#loc127)
|
| 168 |
+
%equal = arith.cmpf oeq, %a_value, %b_value : f32 loc(#loc128)
|
| 169 |
+
%0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__fp32__(%a_value) : (f32) -> i1 loc(#loc38)
|
| 170 |
+
%1:2 = scf.if %0 -> (i1, i1) {
|
| 171 |
+
%a_isnan = arith.cmpf une, %a_value, %a_value : f32 loc(#loc112)
|
| 172 |
+
%b_isnan = arith.cmpf une, %b_value, %b_value : f32 loc(#loc113)
|
| 173 |
+
%mask_3 = arith.constant true loc(#loc114)
|
| 174 |
+
%mask_4 = arith.xori %b_isnan, %mask_3 : i1 loc(#loc114)
|
| 175 |
+
%mask_5 = arith.andi %a_isnan, %mask_4 : i1 loc(#loc115)
|
| 176 |
+
%mask_6 = arith.ori %mask, %mask_5 : i1 loc(#loc129)
|
| 177 |
+
%equal_7 = arith.andi %a_isnan, %b_isnan : i1 loc(#loc117)
|
| 178 |
+
%equal_8 = arith.ori %equal, %equal_7 : i1 loc(#loc130)
|
| 179 |
+
scf.yield %mask_6, %equal_8 : i1, i1 loc(#loc130)
|
| 180 |
+
} else {
|
| 181 |
+
scf.yield %mask, %equal : i1, i1 loc(#loc47)
|
| 182 |
+
} loc(#loc39)
|
| 183 |
+
%mask_0 = arith.cmpi slt, %a_index, %b_index : i32 loc(#loc119)
|
| 184 |
+
%mask_1 = arith.andi %1#1, %mask_0 : i1 loc(#loc120)
|
| 185 |
+
%mask_2 = arith.ori %1#0, %mask_1 : i1 loc(#loc121)
|
| 186 |
+
%2 = arith.select %mask_2, %a_value, %b_value : f32 loc(#loc51)
|
| 187 |
+
%3 = arith.select %mask_2, %a_index, %b_index : i32 loc(#loc52)
|
| 188 |
+
tt.return %2, %3 : f32, i32 loc(#loc53)
|
| 189 |
+
^bb1: // no predecessors
|
| 190 |
+
%4 = ub.poison : f32 loc(#loc54)
|
| 191 |
+
%5 = ub.poison : i32 loc(#loc54)
|
| 192 |
+
tt.return %4, %5 : f32, i32 loc(#loc54)
|
| 193 |
+
} loc(#loc35)
|
| 194 |
+
tt.func private @torch._inductor.runtime.triton_helpers.is_floating__fp32__(%x: f32 loc("x"(#loc55))) -> i1 attributes {noinline = false} {
|
| 195 |
+
%0 = tt.call @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32__(%x) : (f32) -> tensor<1xf32> loc(#loc56)
|
| 196 |
+
%true = arith.constant true loc(#loc57)
|
| 197 |
+
tt.return %true : i1 loc(#loc57)
|
| 198 |
+
^bb1: // no predecessors
|
| 199 |
+
%1 = ub.poison : i1 loc(#loc58)
|
| 200 |
+
tt.return %1 : i1 loc(#loc58)
|
| 201 |
+
} loc(#loc55)
|
| 202 |
+
tt.func private @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32__(%x: f32 loc("x"(#loc59))) -> tensor<1xf32> attributes {noinline = false} {
|
| 203 |
+
%0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc60)
|
| 204 |
+
%1 = arith.uitofp %0 : tensor<1xi1> to tensor<1xf32> loc(#loc61)
|
| 205 |
+
%2 = tt.splat %x : f32 -> tensor<1xf32> loc(#loc61)
|
| 206 |
+
%3 = arith.addf %2, %1 : tensor<1xf32> loc(#loc61)
|
| 207 |
+
tt.return %3 : tensor<1xf32> loc(#loc62)
|
| 208 |
+
^bb1: // no predecessors
|
| 209 |
+
%4 = ub.poison : tensor<1xf32> loc(#loc63)
|
| 210 |
+
tt.return %4 : tensor<1xf32> loc(#loc63)
|
| 211 |
+
} loc(#loc59)
|
| 212 |
+
} loc(#loc)
|
| 213 |
+
#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":19:15)
|
| 214 |
+
#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":22:28)
|
| 215 |
+
#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":22:33)
|
| 216 |
+
#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":23:36)
|
| 217 |
+
#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":23:44)
|
| 218 |
+
#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":23:23)
|
| 219 |
+
#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":24:21)
|
| 220 |
+
#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":25:27)
|
| 221 |
+
#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":25:37)
|
| 222 |
+
#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":27:19)
|
| 223 |
+
#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":28:19)
|
| 224 |
+
#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":29:55)
|
| 225 |
+
#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":30:58)
|
| 226 |
+
#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":32:40)
|
| 227 |
+
#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":33:31)
|
| 228 |
+
#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":34:29)
|
| 229 |
+
#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:47)
|
| 230 |
+
#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:41)
|
| 231 |
+
#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:56)
|
| 232 |
+
#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:52)
|
| 233 |
+
#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:34)
|
| 234 |
+
#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:71)
|
| 235 |
+
#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:61)
|
| 236 |
+
#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":41:38)
|
| 237 |
+
#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":43:35)
|
| 238 |
+
#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":43:54)
|
| 239 |
+
#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":44:41)
|
| 240 |
+
#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":44:66)
|
| 241 |
+
#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":44:8)
|
| 242 |
+
#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":45:75)
|
| 243 |
+
#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":46:20)
|
| 244 |
+
#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":47:25)
|
| 245 |
+
#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":47:36)
|
| 246 |
+
#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":47:4)
|
| 247 |
+
#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":144:21)
|
| 248 |
+
#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":145:23)
|
| 249 |
+
#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":146:19)
|
| 250 |
+
#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":146:7)
|
| 251 |
+
#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":147:29)
|
| 252 |
+
#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":148:29)
|
| 253 |
+
#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:31)
|
| 254 |
+
#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:27)
|
| 255 |
+
#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:16)
|
| 256 |
+
#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:27)
|
| 257 |
+
#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:17)
|
| 258 |
+
#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:31)
|
| 259 |
+
#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:21)
|
| 260 |
+
#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:12)
|
| 261 |
+
#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:35)
|
| 262 |
+
#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:69)
|
| 263 |
+
#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:11)
|
| 264 |
+
#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:4)
|
| 265 |
+
#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:29)
|
| 266 |
+
#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:11)
|
| 267 |
+
#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:4)
|
| 268 |
+
#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:30)
|
| 269 |
+
#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:15)
|
| 270 |
+
#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:11)
|
| 271 |
+
#loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:4)
|
| 272 |
+
#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":118:0)
|
| 273 |
+
#loc65 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:31)
|
| 274 |
+
#loc66 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:11)
|
| 275 |
+
#loc67 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:4)
|
| 276 |
+
#loc69 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:42)
|
| 277 |
+
#loc70 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:11)
|
| 278 |
+
#loc71 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:4)
|
| 279 |
+
#loc78 = loc("r0_numel"(#loc1))
|
| 280 |
+
#loc79 = loc("xoffset"(#loc2))
|
| 281 |
+
#loc80 = loc("xoffset"(#loc3))
|
| 282 |
+
#loc81 = loc("xindex"(#loc4))
|
| 283 |
+
#loc82 = loc("xindex"(#loc5))
|
| 284 |
+
#loc83 = loc("xindex"(#loc6))
|
| 285 |
+
#loc84 = loc("xmask"(#loc7))
|
| 286 |
+
#loc85 = loc("r0_base"(#loc8))
|
| 287 |
+
#loc86 = loc("r0_base"(#loc9))
|
| 288 |
+
#loc87 = loc("x0"(#loc10))
|
| 289 |
+
#loc88 = loc("x1"(#loc11))
|
| 290 |
+
#loc89 = loc("_tmp2"(#loc12))
|
| 291 |
+
#loc90 = loc("_tmp2_index"(#loc13))
|
| 292 |
+
#loc91 = loc("_tmp2"(#loc14))
|
| 293 |
+
#loc92 = loc("r0_index"(#loc15))
|
| 294 |
+
#loc93 = loc("r0_mask"(#loc16))
|
| 295 |
+
#loc94 = loc("tmp0"(#loc17))
|
| 296 |
+
#loc95 = loc("tmp0"(#loc18))
|
| 297 |
+
#loc96 = loc("tmp0"(#loc19))
|
| 298 |
+
#loc97 = loc("tmp0"(#loc20))
|
| 299 |
+
#loc98 = loc("tmp0"(#loc21))
|
| 300 |
+
#loc99 = loc("tmp0"(#loc22))
|
| 301 |
+
#loc100 = loc("tmp0"(#loc23))
|
| 302 |
+
#loc101 = loc("_tmp2"(#loc25))
|
| 303 |
+
#loc102 = loc("_tmp2"(#loc26))
|
| 304 |
+
#loc103 = loc("_tmp2_index"(#loc27))
|
| 305 |
+
#loc104 = loc("_tmp2_index"(#loc28))
|
| 306 |
+
#loc105 = loc("tmp2"(#loc31))
|
| 307 |
+
#loc110 = loc("mask"(#loc36))
|
| 308 |
+
#loc111 = loc("equal"(#loc37))
|
| 309 |
+
#loc112 = loc("a_isnan"(#loc40))
|
| 310 |
+
#loc113 = loc("b_isnan"(#loc41))
|
| 311 |
+
#loc114 = loc("mask"(#loc42))
|
| 312 |
+
#loc115 = loc("mask"(#loc43))
|
| 313 |
+
#loc116 = loc("mask"(#loc44))
|
| 314 |
+
#loc117 = loc("equal"(#loc45))
|
| 315 |
+
#loc118 = loc("equal"(#loc46))
|
| 316 |
+
#loc119 = loc("mask"(#loc48))
|
| 317 |
+
#loc120 = loc("mask"(#loc49))
|
| 318 |
+
#loc121 = loc("mask"(#loc50))
|
| 319 |
+
#loc126 = loc("_tmp2_index"(#loc91))
|
| 320 |
+
#loc127 = loc("mask"(#loc110))
|
| 321 |
+
#loc128 = loc("equal"(#loc111))
|
| 322 |
+
#loc129 = loc("mask"(#loc116))
|
| 323 |
+
#loc130 = loc("equal"(#loc118))
|
SpecForge-ext/cache/compiled_kernels/triton/7/BZGMO2ROUNNVUMEUFCQSSF6FGNZS2MEIPAEECDHYOJVVX3KB75AA/triton_red_fused_argmax_1.ttgir
ADDED
|
@@ -0,0 +1,217 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [8, 2], order = [1, 0]}>
|
| 2 |
+
#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 8], order = [0, 1]}>
|
| 3 |
+
#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":18:0)
|
| 4 |
+
#loc1 = loc(unknown)
|
| 5 |
+
#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":45:75)
|
| 6 |
+
#loc44 = loc("in_ptr0"(#loc))
|
| 7 |
+
#loc45 = loc("out_ptr0"(#loc))
|
| 8 |
+
#loc46 = loc("ks0"(#loc))
|
| 9 |
+
#loc47 = loc("ks1"(#loc))
|
| 10 |
+
#loc48 = loc("xnumel"(#loc))
|
| 11 |
+
#loc49 = loc("r0_numel"(#loc))
|
| 12 |
+
#loc85 = loc(callsite(#loc1 at #loc39))
|
| 13 |
+
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 16 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} {
|
| 14 |
+
tt.func public @triton_red_fused_argmax_1(%in_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<i64> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
|
| 15 |
+
%cst = arith.constant dense<32000> : tensor<64x1xi64, #blocked> loc(#loc1)
|
| 16 |
+
%cst_0 = arith.constant dense<0.000000e+00> : tensor<64x64xf32, #blocked> loc(#loc1)
|
| 17 |
+
%c0_i32 = arith.constant 0 : i32 loc(#loc1)
|
| 18 |
+
%c32000_i32 = arith.constant 32000 : i32 loc(#loc1)
|
| 19 |
+
%cst_1 = arith.constant dense<true> : tensor<64x64xi1, #blocked> loc(#loc1)
|
| 20 |
+
%true = arith.constant true loc(#loc1)
|
| 21 |
+
%cst_2 = arith.constant dense<32000> : tensor<1x64xi32, #blocked> loc(#loc1)
|
| 22 |
+
%cst_3 = arith.constant dense<2147483647> : tensor<64x64xi32, #blocked> loc(#loc1)
|
| 23 |
+
%cst_4 = arith.constant dense<0xFF800000> : tensor<64x64xf32, #blocked> loc(#loc1)
|
| 24 |
+
%c64_i32 = arith.constant 64 : i32 loc(#loc1)
|
| 25 |
+
%xoffset = tt.get_program_id x : i32 loc(#loc50)
|
| 26 |
+
%xoffset_5 = arith.muli %xoffset, %c64_i32 : i32 loc(#loc51)
|
| 27 |
+
%xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc52)
|
| 28 |
+
%xindex_6 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc52)
|
| 29 |
+
%xindex_7 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc52)
|
| 30 |
+
%xindex_8 = tt.expand_dims %xindex_6 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<64x1xi32, #blocked1> loc(#loc52)
|
| 31 |
+
%xindex_9 = tt.splat %xoffset_5 : i32 -> tensor<64x1xi32, #blocked> loc(#loc53)
|
| 32 |
+
%xindex_10 = tt.splat %xoffset_5 : i32 -> tensor<64x1xi32, #blocked1> loc(#loc53)
|
| 33 |
+
%xindex_11 = arith.addi %xindex_9, %xindex_7 : tensor<64x1xi32, #blocked> loc(#loc53)
|
| 34 |
+
%xindex_12 = arith.addi %xindex_10, %xindex_8 : tensor<64x1xi32, #blocked1> loc(#loc53)
|
| 35 |
+
%xmask = tt.splat %xnumel : i32 -> tensor<64x1xi32, #blocked> loc(#loc54)
|
| 36 |
+
%xmask_13 = tt.splat %xnumel : i32 -> tensor<64x1xi32, #blocked1> loc(#loc54)
|
| 37 |
+
%xmask_14 = arith.cmpi slt, %xindex_11, %xmask : tensor<64x1xi32, #blocked> loc(#loc54)
|
| 38 |
+
%xmask_15 = arith.cmpi slt, %xindex_12, %xmask_13 : tensor<64x1xi32, #blocked1> loc(#loc54)
|
| 39 |
+
%r0_base = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc55)
|
| 40 |
+
%r0_base_16 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x64xi32, #blocked> loc(#loc55)
|
| 41 |
+
%x0 = arith.extsi %xindex_11 : tensor<64x1xi32, #blocked> to tensor<64x1xi64, #blocked> loc(#loc56)
|
| 42 |
+
%x0_17 = tt.splat %ks0 : i64 -> tensor<64x1xi64, #blocked> loc(#loc56)
|
| 43 |
+
%x0_18 = arith.remsi %x0, %x0_17 : tensor<64x1xi64, #blocked> loc(#loc56)
|
| 44 |
+
%x1 = arith.divsi %x0, %x0_17 : tensor<64x1xi64, #blocked> loc(#loc57)
|
| 45 |
+
%tmp0 = arith.muli %x0_18, %cst : tensor<64x1xi64, #blocked> loc(#loc58)
|
| 46 |
+
%tmp0_19 = tt.broadcast %tmp0 : tensor<64x1xi64, #blocked> -> tensor<64x64xi64, #blocked> loc(#loc59)
|
| 47 |
+
%tmp0_20 = tt.splat %ks1 : i64 -> tensor<64x1xi64, #blocked> loc(#loc60)
|
| 48 |
+
%tmp0_21 = arith.muli %tmp0_20, %x1 : tensor<64x1xi64, #blocked> loc(#loc60)
|
| 49 |
+
%tmp0_22 = tt.broadcast %tmp0_21 : tensor<64x1xi64, #blocked> -> tensor<64x64xi64, #blocked> loc(#loc61)
|
| 50 |
+
%tmp0_23 = tt.splat %in_ptr0 : !tt.ptr<f32> -> tensor<64x64x!tt.ptr<f32>, #blocked> loc(#loc62)
|
| 51 |
+
%tmp0_24 = tt.broadcast %xmask_14 : tensor<64x1xi1, #blocked> -> tensor<64x64xi1, #blocked> loc(#loc63)
|
| 52 |
+
%_tmp2_index:2 = scf.for %r0_offset = %c0_i32 to %c32000_i32 step %c64_i32 iter_args(%_tmp2 = %cst_4, %_tmp2_index_25 = %cst_3) -> (tensor<64x64xf32, #blocked>, tensor<64x64xi32, #blocked>) : i32 {
|
| 53 |
+
%r0_index = tt.splat %r0_offset : i32 -> tensor<1x64xi32, #blocked> loc(#loc65)
|
| 54 |
+
%r0_index_26 = arith.addi %r0_index, %r0_base_16 : tensor<1x64xi32, #blocked> loc(#loc65)
|
| 55 |
+
%r0_mask = arith.cmpi slt, %r0_index_26, %cst_2 : tensor<1x64xi32, #blocked> loc(#loc66)
|
| 56 |
+
%tmp0_27 = arith.extsi %r0_index_26 : tensor<1x64xi32, #blocked> to tensor<1x64xi64, #blocked> loc(#loc59)
|
| 57 |
+
%tmp0_28 = tt.broadcast %tmp0_27 : tensor<1x64xi64, #blocked> -> tensor<64x64xi64, #blocked> loc(#loc59)
|
| 58 |
+
%tmp0_29 = arith.addi %tmp0_28, %tmp0_19 : tensor<64x64xi64, #blocked> loc(#loc59)
|
| 59 |
+
%tmp0_30 = arith.addi %tmp0_29, %tmp0_22 : tensor<64x64xi64, #blocked> loc(#loc61)
|
| 60 |
+
%tmp0_31 = tt.addptr %tmp0_23, %tmp0_30 : tensor<64x64x!tt.ptr<f32>, #blocked>, tensor<64x64xi64, #blocked> loc(#loc62)
|
| 61 |
+
%tmp0_32 = tt.broadcast %r0_mask : tensor<1x64xi1, #blocked> -> tensor<64x64xi1, #blocked> loc(#loc63)
|
| 62 |
+
%tmp0_33 = arith.andi %tmp0_32, %tmp0_24 : tensor<64x64xi1, #blocked> loc(#loc63)
|
| 63 |
+
%tmp0_34 = tt.load %tmp0_31, %tmp0_33, %cst_0 evictionPolicy = evict_first : tensor<64x64x!tt.ptr<f32>, #blocked> loc(#loc67)
|
| 64 |
+
%mask = arith.cmpf ogt, %_tmp2, %tmp0_34 : tensor<64x64xf32, #blocked> loc(#loc110)
|
| 65 |
+
%equal = arith.cmpf oeq, %_tmp2, %tmp0_34 : tensor<64x64xf32, #blocked> loc(#loc111)
|
| 66 |
+
%a_isnan = arith.cmpf une, %_tmp2, %_tmp2 : tensor<64x64xf32, #blocked> loc(#loc90)
|
| 67 |
+
%b_isnan = arith.cmpf une, %tmp0_34, %tmp0_34 : tensor<64x64xf32, #blocked> loc(#loc91)
|
| 68 |
+
%mask_35 = arith.xori %b_isnan, %cst_1 : tensor<64x64xi1, #blocked> loc(#loc92)
|
| 69 |
+
%mask_36 = arith.andi %a_isnan, %mask_35 : tensor<64x64xi1, #blocked> loc(#loc93)
|
| 70 |
+
%mask_37 = arith.ori %mask, %mask_36 : tensor<64x64xi1, #blocked> loc(#loc112)
|
| 71 |
+
%equal_38 = arith.andi %a_isnan, %b_isnan : tensor<64x64xi1, #blocked> loc(#loc95)
|
| 72 |
+
%equal_39 = arith.ori %equal, %equal_38 : tensor<64x64xi1, #blocked> loc(#loc113)
|
| 73 |
+
%mask_40 = tt.broadcast %r0_index_26 : tensor<1x64xi32, #blocked> -> tensor<64x64xi32, #blocked> loc(#loc97)
|
| 74 |
+
%mask_41 = arith.cmpi slt, %_tmp2_index_25, %mask_40 : tensor<64x64xi32, #blocked> loc(#loc97)
|
| 75 |
+
%mask_42 = arith.andi %equal_39, %mask_41 : tensor<64x64xi1, #blocked> loc(#loc98)
|
| 76 |
+
%mask_43 = arith.ori %mask_37, %mask_42 : tensor<64x64xi1, #blocked> loc(#loc99)
|
| 77 |
+
%5 = arith.select %mask_43, %_tmp2, %tmp0_34 : tensor<64x64xi1, #blocked>, tensor<64x64xf32, #blocked> loc(#loc80)
|
| 78 |
+
%6 = arith.select %mask_43, %_tmp2_index_25, %mask_40 : tensor<64x64xi1, #blocked>, tensor<64x64xi32, #blocked> loc(#loc81)
|
| 79 |
+
%_tmp2_44 = arith.select %tmp0_33, %5, %_tmp2 : tensor<64x64xi1, #blocked>, tensor<64x64xf32, #blocked> loc(#loc82)
|
| 80 |
+
%_tmp2_index_45 = arith.select %tmp0_33, %6, %_tmp2_index_25 : tensor<64x64xi1, #blocked>, tensor<64x64xi32, #blocked> loc(#loc83)
|
| 81 |
+
scf.yield %_tmp2_44, %_tmp2_index_45 : tensor<64x64xf32, #blocked>, tensor<64x64xi32, #blocked> loc(#loc37)
|
| 82 |
+
} loc(#loc87)
|
| 83 |
+
%0:2 = "tt.reduce"(%_tmp2_index#0, %_tmp2_index#1) <{axis = 1 : i32}> ({
|
| 84 |
+
^bb0(%arg6: f32 loc(callsite(#loc1 at #loc39)), %arg7: i32 loc(callsite(#loc1 at #loc39)), %arg8: f32 loc(callsite(#loc1 at #loc39)), %arg9: i32 loc(callsite(#loc1 at #loc39))):
|
| 85 |
+
%mask = arith.cmpf ogt, %arg6, %arg8 : f32 loc(#loc114)
|
| 86 |
+
%equal = arith.cmpf oeq, %arg6, %arg8 : f32 loc(#loc115)
|
| 87 |
+
%a_isnan = arith.cmpf une, %arg6, %arg6 : f32 loc(#loc100)
|
| 88 |
+
%b_isnan = arith.cmpf une, %arg8, %arg8 : f32 loc(#loc101)
|
| 89 |
+
%mask_25 = arith.xori %b_isnan, %true : i1 loc(#loc102)
|
| 90 |
+
%mask_26 = arith.andi %a_isnan, %mask_25 : i1 loc(#loc103)
|
| 91 |
+
%mask_27 = arith.ori %mask, %mask_26 : i1 loc(#loc116)
|
| 92 |
+
%equal_28 = arith.andi %a_isnan, %b_isnan : i1 loc(#loc104)
|
| 93 |
+
%equal_29 = arith.ori %equal, %equal_28 : i1 loc(#loc117)
|
| 94 |
+
%mask_30 = arith.cmpi slt, %arg7, %arg9 : i32 loc(#loc105)
|
| 95 |
+
%mask_31 = arith.andi %equal_29, %mask_30 : i1 loc(#loc106)
|
| 96 |
+
%mask_32 = arith.ori %mask_27, %mask_31 : i1 loc(#loc107)
|
| 97 |
+
%5 = arith.select %mask_32, %arg6, %arg8 : f32 loc(#loc108)
|
| 98 |
+
%6 = arith.select %mask_32, %arg7, %arg9 : i32 loc(#loc109)
|
| 99 |
+
tt.reduce.return %5, %6 : f32, i32 loc(#loc84)
|
| 100 |
+
}) : (tensor<64x64xf32, #blocked>, tensor<64x64xi32, #blocked>) -> (tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked}>>, tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>>) loc(#loc84)
|
| 101 |
+
%tmp2 = tt.expand_dims %0#1 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc86)
|
| 102 |
+
%1 = tt.splat %out_ptr0 : !tt.ptr<i64> -> tensor<64x1x!tt.ptr<i64>, #blocked1> loc(#loc41)
|
| 103 |
+
%2 = tt.addptr %1, %xindex_12 : tensor<64x1x!tt.ptr<i64>, #blocked1>, tensor<64x1xi32, #blocked1> loc(#loc41)
|
| 104 |
+
%3 = ttg.convert_layout %tmp2 : tensor<64x1xi32, #blocked> -> tensor<64x1xi32, #blocked1> loc(#loc42)
|
| 105 |
+
%4 = arith.extsi %3 : tensor<64x1xi32, #blocked1> to tensor<64x1xi64, #blocked1> loc(#loc42)
|
| 106 |
+
tt.store %2, %4, %xmask_15 : tensor<64x1x!tt.ptr<i64>, #blocked1> loc(#loc42)
|
| 107 |
+
tt.return loc(#loc43)
|
| 108 |
+
} loc(#loc)
|
| 109 |
+
} loc(#loc)
|
| 110 |
+
#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":22:28)
|
| 111 |
+
#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":22:33)
|
| 112 |
+
#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":23:44)
|
| 113 |
+
#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":23:23)
|
| 114 |
+
#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":24:21)
|
| 115 |
+
#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":25:37)
|
| 116 |
+
#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":27:19)
|
| 117 |
+
#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":28:19)
|
| 118 |
+
#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:47)
|
| 119 |
+
#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:41)
|
| 120 |
+
#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:56)
|
| 121 |
+
#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:52)
|
| 122 |
+
#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:34)
|
| 123 |
+
#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:71)
|
| 124 |
+
#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":32:40)
|
| 125 |
+
#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":33:31)
|
| 126 |
+
#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":34:29)
|
| 127 |
+
#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:61)
|
| 128 |
+
#loc20 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":144:21)
|
| 129 |
+
#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":41:38)
|
| 130 |
+
#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":145:23)
|
| 131 |
+
#loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":147:29)
|
| 132 |
+
#loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":148:29)
|
| 133 |
+
#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:31)
|
| 134 |
+
#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:27)
|
| 135 |
+
#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:16)
|
| 136 |
+
#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:27)
|
| 137 |
+
#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:17)
|
| 138 |
+
#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:31)
|
| 139 |
+
#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:21)
|
| 140 |
+
#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:12)
|
| 141 |
+
#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:35)
|
| 142 |
+
#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:69)
|
| 143 |
+
#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":43:54)
|
| 144 |
+
#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":44:66)
|
| 145 |
+
#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":44:8)
|
| 146 |
+
#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:42)
|
| 147 |
+
#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":46:20)
|
| 148 |
+
#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":47:25)
|
| 149 |
+
#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":47:36)
|
| 150 |
+
#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":47:4)
|
| 151 |
+
#loc50 = loc("xoffset"(#loc2))
|
| 152 |
+
#loc51 = loc("xoffset"(#loc3))
|
| 153 |
+
#loc52 = loc("xindex"(#loc4))
|
| 154 |
+
#loc53 = loc("xindex"(#loc5))
|
| 155 |
+
#loc54 = loc("xmask"(#loc6))
|
| 156 |
+
#loc55 = loc("r0_base"(#loc7))
|
| 157 |
+
#loc56 = loc("x0"(#loc8))
|
| 158 |
+
#loc57 = loc("x1"(#loc9))
|
| 159 |
+
#loc58 = loc("tmp0"(#loc10))
|
| 160 |
+
#loc59 = loc("tmp0"(#loc11))
|
| 161 |
+
#loc60 = loc("tmp0"(#loc12))
|
| 162 |
+
#loc61 = loc("tmp0"(#loc13))
|
| 163 |
+
#loc62 = loc("tmp0"(#loc14))
|
| 164 |
+
#loc63 = loc("tmp0"(#loc15))
|
| 165 |
+
#loc64 = loc("_tmp2"(#loc16))
|
| 166 |
+
#loc65 = loc("r0_index"(#loc17))
|
| 167 |
+
#loc66 = loc("r0_mask"(#loc18))
|
| 168 |
+
#loc67 = loc("tmp0"(#loc19))
|
| 169 |
+
#loc68 = loc("mask"(#loc20))
|
| 170 |
+
#loc69 = loc("equal"(#loc22))
|
| 171 |
+
#loc70 = loc("a_isnan"(#loc23))
|
| 172 |
+
#loc71 = loc("b_isnan"(#loc24))
|
| 173 |
+
#loc72 = loc("mask"(#loc25))
|
| 174 |
+
#loc73 = loc("mask"(#loc26))
|
| 175 |
+
#loc74 = loc("mask"(#loc27))
|
| 176 |
+
#loc75 = loc("equal"(#loc28))
|
| 177 |
+
#loc76 = loc("equal"(#loc29))
|
| 178 |
+
#loc77 = loc("mask"(#loc30))
|
| 179 |
+
#loc78 = loc("mask"(#loc31))
|
| 180 |
+
#loc79 = loc("mask"(#loc32))
|
| 181 |
+
#loc80 = loc(callsite(#loc33 at #loc21))
|
| 182 |
+
#loc81 = loc(callsite(#loc34 at #loc21))
|
| 183 |
+
#loc82 = loc("_tmp2"(#loc35))
|
| 184 |
+
#loc83 = loc("_tmp2_index"(#loc36))
|
| 185 |
+
#loc84 = loc(callsite(#loc38 at #loc39))
|
| 186 |
+
#loc86 = loc("tmp2"(#loc40))
|
| 187 |
+
#loc87 = loc("_tmp2_index"(#loc64))
|
| 188 |
+
#loc88 = loc("mask"(#loc68))
|
| 189 |
+
#loc89 = loc("equal"(#loc69))
|
| 190 |
+
#loc90 = loc(callsite(#loc70 at #loc21))
|
| 191 |
+
#loc91 = loc(callsite(#loc71 at #loc21))
|
| 192 |
+
#loc92 = loc(callsite(#loc72 at #loc21))
|
| 193 |
+
#loc93 = loc(callsite(#loc73 at #loc21))
|
| 194 |
+
#loc94 = loc("mask"(#loc74))
|
| 195 |
+
#loc95 = loc(callsite(#loc75 at #loc21))
|
| 196 |
+
#loc96 = loc("equal"(#loc76))
|
| 197 |
+
#loc97 = loc(callsite(#loc77 at #loc21))
|
| 198 |
+
#loc98 = loc(callsite(#loc78 at #loc21))
|
| 199 |
+
#loc99 = loc(callsite(#loc79 at #loc21))
|
| 200 |
+
#loc100 = loc(callsite(#loc70 at #loc84))
|
| 201 |
+
#loc101 = loc(callsite(#loc71 at #loc84))
|
| 202 |
+
#loc102 = loc(callsite(#loc72 at #loc84))
|
| 203 |
+
#loc103 = loc(callsite(#loc73 at #loc84))
|
| 204 |
+
#loc104 = loc(callsite(#loc75 at #loc84))
|
| 205 |
+
#loc105 = loc(callsite(#loc77 at #loc84))
|
| 206 |
+
#loc106 = loc(callsite(#loc78 at #loc84))
|
| 207 |
+
#loc107 = loc(callsite(#loc79 at #loc84))
|
| 208 |
+
#loc108 = loc(callsite(#loc33 at #loc84))
|
| 209 |
+
#loc109 = loc(callsite(#loc34 at #loc84))
|
| 210 |
+
#loc110 = loc(callsite(#loc88 at #loc21))
|
| 211 |
+
#loc111 = loc(callsite(#loc89 at #loc21))
|
| 212 |
+
#loc112 = loc(callsite(#loc94 at #loc21))
|
| 213 |
+
#loc113 = loc(callsite(#loc96 at #loc21))
|
| 214 |
+
#loc114 = loc(callsite(#loc88 at #loc84))
|
| 215 |
+
#loc115 = loc(callsite(#loc89 at #loc84))
|
| 216 |
+
#loc116 = loc(callsite(#loc94 at #loc84))
|
| 217 |
+
#loc117 = loc(callsite(#loc96 at #loc84))
|
SpecForge-ext/cache/compiled_kernels/triton/7/BZGMO2ROUNNVUMEUFCQSSF6FGNZS2MEIPAEECDHYOJVVX3KB75AA/triton_red_fused_argmax_1.ttir
ADDED
|
@@ -0,0 +1,213 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":18:0)
|
| 2 |
+
#loc1 = loc(unknown)
|
| 3 |
+
#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":45:75)
|
| 4 |
+
#loc47 = loc("in_ptr0"(#loc))
|
| 5 |
+
#loc48 = loc("out_ptr0"(#loc))
|
| 6 |
+
#loc49 = loc("ks0"(#loc))
|
| 7 |
+
#loc50 = loc("ks1"(#loc))
|
| 8 |
+
#loc51 = loc("xnumel"(#loc))
|
| 9 |
+
#loc52 = loc("r0_numel"(#loc))
|
| 10 |
+
#loc53 = loc(callsite(#loc1 at #loc2))
|
| 11 |
+
module {
|
| 12 |
+
tt.func public @triton_red_fused_argmax_1(%in_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<i64> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
|
| 13 |
+
%true = arith.constant true loc(#loc53)
|
| 14 |
+
%cst = arith.constant dense<true> : tensor<64x64xi1> loc(#loc1)
|
| 15 |
+
%c32000_i32 = arith.constant 32000 : i32 loc(#loc3)
|
| 16 |
+
%c0_i32 = arith.constant 0 : i32 loc(#loc3)
|
| 17 |
+
%cst_0 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc1)
|
| 18 |
+
%cst_1 = arith.constant dense<32000> : tensor<64x1xi64> loc(#loc1)
|
| 19 |
+
%cst_2 = arith.constant dense<32000> : tensor<1x64xi32> loc(#loc1)
|
| 20 |
+
%_tmp2_index = arith.constant dense<2147483647> : tensor<64x64xi32> loc(#loc54)
|
| 21 |
+
%_tmp2 = arith.constant dense<0xFF800000> : tensor<64x64xf32> loc(#loc55)
|
| 22 |
+
%c64_i32 = arith.constant 64 : i32 loc(#loc1)
|
| 23 |
+
%xoffset = tt.get_program_id x : i32 loc(#loc56)
|
| 24 |
+
%xoffset_3 = arith.muli %xoffset, %c64_i32 : i32 loc(#loc57)
|
| 25 |
+
%xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc58)
|
| 26 |
+
%xindex_4 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc59)
|
| 27 |
+
%xindex_5 = tt.splat %xoffset_3 : i32 -> tensor<64x1xi32> loc(#loc60)
|
| 28 |
+
%xindex_6 = arith.addi %xindex_5, %xindex_4 : tensor<64x1xi32> loc(#loc60)
|
| 29 |
+
%xmask = tt.splat %xnumel : i32 -> tensor<64x1xi32> loc(#loc61)
|
| 30 |
+
%xmask_7 = arith.cmpi slt, %xindex_6, %xmask : tensor<64x1xi32> loc(#loc61)
|
| 31 |
+
%r0_base = tt.expand_dims %xindex {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc62)
|
| 32 |
+
%x0 = arith.extsi %xindex_6 : tensor<64x1xi32> to tensor<64x1xi64> loc(#loc63)
|
| 33 |
+
%x0_8 = tt.splat %ks0 : i64 -> tensor<64x1xi64> loc(#loc63)
|
| 34 |
+
%x0_9 = arith.remsi %x0, %x0_8 : tensor<64x1xi64> loc(#loc63)
|
| 35 |
+
%x1 = arith.divsi %x0, %x0_8 : tensor<64x1xi64> loc(#loc64)
|
| 36 |
+
%_tmp2_index_10:2 = scf.for %r0_offset = %c0_i32 to %c32000_i32 step %c64_i32 iter_args(%_tmp2_11 = %_tmp2, %_tmp2_index_12 = %_tmp2_index) -> (tensor<64x64xf32>, tensor<64x64xi32>) : i32 {
|
| 37 |
+
%r0_index = tt.splat %r0_offset : i32 -> tensor<1x64xi32> loc(#loc66)
|
| 38 |
+
%r0_index_13 = arith.addi %r0_index, %r0_base : tensor<1x64xi32> loc(#loc66)
|
| 39 |
+
%r0_mask = arith.cmpi slt, %r0_index_13, %cst_2 : tensor<1x64xi32> loc(#loc67)
|
| 40 |
+
%tmp0 = arith.muli %x0_9, %cst_1 : tensor<64x1xi64> loc(#loc68)
|
| 41 |
+
%tmp0_14 = arith.extsi %r0_index_13 : tensor<1x64xi32> to tensor<1x64xi64> loc(#loc69)
|
| 42 |
+
%tmp0_15 = tt.broadcast %tmp0_14 : tensor<1x64xi64> -> tensor<64x64xi64> loc(#loc69)
|
| 43 |
+
%tmp0_16 = tt.broadcast %tmp0 : tensor<64x1xi64> -> tensor<64x64xi64> loc(#loc69)
|
| 44 |
+
%tmp0_17 = arith.addi %tmp0_15, %tmp0_16 : tensor<64x64xi64> loc(#loc69)
|
| 45 |
+
%tmp0_18 = tt.splat %ks1 : i64 -> tensor<64x1xi64> loc(#loc70)
|
| 46 |
+
%tmp0_19 = arith.muli %tmp0_18, %x1 : tensor<64x1xi64> loc(#loc70)
|
| 47 |
+
%tmp0_20 = tt.broadcast %tmp0_19 : tensor<64x1xi64> -> tensor<64x64xi64> loc(#loc71)
|
| 48 |
+
%tmp0_21 = arith.addi %tmp0_17, %tmp0_20 : tensor<64x64xi64> loc(#loc71)
|
| 49 |
+
%tmp0_22 = tt.splat %in_ptr0 : !tt.ptr<f32> -> tensor<64x64x!tt.ptr<f32>> loc(#loc72)
|
| 50 |
+
%tmp0_23 = tt.addptr %tmp0_22, %tmp0_21 : tensor<64x64x!tt.ptr<f32>>, tensor<64x64xi64> loc(#loc72)
|
| 51 |
+
%tmp0_24 = tt.broadcast %r0_mask : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc73)
|
| 52 |
+
%tmp0_25 = tt.broadcast %xmask_7 : tensor<64x1xi1> -> tensor<64x64xi1> loc(#loc73)
|
| 53 |
+
%tmp0_26 = arith.andi %tmp0_24, %tmp0_25 : tensor<64x64xi1> loc(#loc73)
|
| 54 |
+
%tmp0_27 = tt.load %tmp0_23, %tmp0_26, %cst_0 evictionPolicy = evict_first : tensor<64x64x!tt.ptr<f32>> loc(#loc74)
|
| 55 |
+
%mask = arith.cmpf ogt, %_tmp2_11, %tmp0_27 : tensor<64x64xf32> loc(#loc116)
|
| 56 |
+
%equal = arith.cmpf oeq, %_tmp2_11, %tmp0_27 : tensor<64x64xf32> loc(#loc117)
|
| 57 |
+
%a_isnan = arith.cmpf une, %_tmp2_11, %_tmp2_11 : tensor<64x64xf32> loc(#loc96)
|
| 58 |
+
%b_isnan = arith.cmpf une, %tmp0_27, %tmp0_27 : tensor<64x64xf32> loc(#loc97)
|
| 59 |
+
%mask_28 = arith.xori %b_isnan, %cst : tensor<64x64xi1> loc(#loc98)
|
| 60 |
+
%mask_29 = arith.andi %a_isnan, %mask_28 : tensor<64x64xi1> loc(#loc99)
|
| 61 |
+
%mask_30 = arith.ori %mask, %mask_29 : tensor<64x64xi1> loc(#loc118)
|
| 62 |
+
%equal_31 = arith.andi %a_isnan, %b_isnan : tensor<64x64xi1> loc(#loc101)
|
| 63 |
+
%equal_32 = arith.ori %equal, %equal_31 : tensor<64x64xi1> loc(#loc119)
|
| 64 |
+
%mask_33 = tt.broadcast %r0_index_13 : tensor<1x64xi32> -> tensor<64x64xi32> loc(#loc103)
|
| 65 |
+
%mask_34 = arith.cmpi slt, %_tmp2_index_12, %mask_33 : tensor<64x64xi32> loc(#loc103)
|
| 66 |
+
%mask_35 = arith.andi %equal_32, %mask_34 : tensor<64x64xi1> loc(#loc104)
|
| 67 |
+
%mask_36 = arith.ori %mask_30, %mask_35 : tensor<64x64xi1> loc(#loc105)
|
| 68 |
+
%4 = arith.select %mask_36, %_tmp2_11, %tmp0_27 : tensor<64x64xi1>, tensor<64x64xf32> loc(#loc87)
|
| 69 |
+
%5 = arith.select %mask_36, %_tmp2_index_12, %mask_33 : tensor<64x64xi1>, tensor<64x64xi32> loc(#loc88)
|
| 70 |
+
%_tmp2_37 = arith.select %tmp0_26, %4, %_tmp2_11 : tensor<64x64xi1>, tensor<64x64xf32> loc(#loc89)
|
| 71 |
+
%_tmp2_index_38 = arith.select %tmp0_26, %5, %_tmp2_index_12 : tensor<64x64xi1>, tensor<64x64xi32> loc(#loc90)
|
| 72 |
+
scf.yield %_tmp2_37, %_tmp2_index_38 : tensor<64x64xf32>, tensor<64x64xi32> loc(#loc41)
|
| 73 |
+
} loc(#loc93)
|
| 74 |
+
%0:2 = "tt.reduce"(%_tmp2_index_10#0, %_tmp2_index_10#1) <{axis = 1 : i32}> ({
|
| 75 |
+
^bb0(%arg6: f32 loc(callsite(#loc1 at #loc2)), %arg7: i32 loc(callsite(#loc1 at #loc2)), %arg8: f32 loc(callsite(#loc1 at #loc2)), %arg9: i32 loc(callsite(#loc1 at #loc2))):
|
| 76 |
+
%mask = arith.cmpf ogt, %arg6, %arg8 : f32 loc(#loc120)
|
| 77 |
+
%equal = arith.cmpf oeq, %arg6, %arg8 : f32 loc(#loc121)
|
| 78 |
+
%a_isnan = arith.cmpf une, %arg6, %arg6 : f32 loc(#loc106)
|
| 79 |
+
%b_isnan = arith.cmpf une, %arg8, %arg8 : f32 loc(#loc107)
|
| 80 |
+
%mask_11 = arith.xori %b_isnan, %true : i1 loc(#loc108)
|
| 81 |
+
%mask_12 = arith.andi %a_isnan, %mask_11 : i1 loc(#loc109)
|
| 82 |
+
%mask_13 = arith.ori %mask, %mask_12 : i1 loc(#loc122)
|
| 83 |
+
%equal_14 = arith.andi %a_isnan, %b_isnan : i1 loc(#loc110)
|
| 84 |
+
%equal_15 = arith.ori %equal, %equal_14 : i1 loc(#loc123)
|
| 85 |
+
%mask_16 = arith.cmpi slt, %arg7, %arg9 : i32 loc(#loc111)
|
| 86 |
+
%mask_17 = arith.andi %equal_15, %mask_16 : i1 loc(#loc112)
|
| 87 |
+
%mask_18 = arith.ori %mask_13, %mask_17 : i1 loc(#loc113)
|
| 88 |
+
%4 = arith.select %mask_18, %arg6, %arg8 : f32 loc(#loc114)
|
| 89 |
+
%5 = arith.select %mask_18, %arg7, %arg9 : i32 loc(#loc115)
|
| 90 |
+
tt.reduce.return %4, %5 : f32, i32 loc(#loc91)
|
| 91 |
+
}) : (tensor<64x64xf32>, tensor<64x64xi32>) -> (tensor<64xf32>, tensor<64xi32>) loc(#loc91)
|
| 92 |
+
%tmp2 = tt.expand_dims %0#1 {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc92)
|
| 93 |
+
%1 = tt.splat %out_ptr0 : !tt.ptr<i64> -> tensor<64x1x!tt.ptr<i64>> loc(#loc44)
|
| 94 |
+
%2 = tt.addptr %1, %xindex_6 : tensor<64x1x!tt.ptr<i64>>, tensor<64x1xi32> loc(#loc44)
|
| 95 |
+
%3 = arith.extsi %tmp2 : tensor<64x1xi32> to tensor<64x1xi64> loc(#loc45)
|
| 96 |
+
tt.store %2, %3, %xmask_7 : tensor<64x1x!tt.ptr<i64>> loc(#loc45)
|
| 97 |
+
tt.return loc(#loc46)
|
| 98 |
+
} loc(#loc)
|
| 99 |
+
} loc(#loc)
|
| 100 |
+
#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":32:40)
|
| 101 |
+
#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":30:58)
|
| 102 |
+
#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":29:55)
|
| 103 |
+
#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":22:28)
|
| 104 |
+
#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":22:33)
|
| 105 |
+
#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":23:36)
|
| 106 |
+
#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":23:44)
|
| 107 |
+
#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":23:23)
|
| 108 |
+
#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":24:21)
|
| 109 |
+
#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":25:37)
|
| 110 |
+
#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":27:19)
|
| 111 |
+
#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":28:19)
|
| 112 |
+
#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":33:31)
|
| 113 |
+
#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":34:29)
|
| 114 |
+
#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:47)
|
| 115 |
+
#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:41)
|
| 116 |
+
#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:56)
|
| 117 |
+
#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:52)
|
| 118 |
+
#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:34)
|
| 119 |
+
#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:71)
|
| 120 |
+
#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":38:61)
|
| 121 |
+
#loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":144:21)
|
| 122 |
+
#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":41:38)
|
| 123 |
+
#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":145:23)
|
| 124 |
+
#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":147:29)
|
| 125 |
+
#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":148:29)
|
| 126 |
+
#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:31)
|
| 127 |
+
#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:27)
|
| 128 |
+
#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:16)
|
| 129 |
+
#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:27)
|
| 130 |
+
#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:17)
|
| 131 |
+
#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:31)
|
| 132 |
+
#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:21)
|
| 133 |
+
#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:12)
|
| 134 |
+
#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:35)
|
| 135 |
+
#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:69)
|
| 136 |
+
#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":43:54)
|
| 137 |
+
#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":44:66)
|
| 138 |
+
#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":44:8)
|
| 139 |
+
#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:42)
|
| 140 |
+
#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":46:20)
|
| 141 |
+
#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":47:25)
|
| 142 |
+
#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":47:36)
|
| 143 |
+
#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4wdhwlu6yb3wcwazdnzmgzewiemvznxvrr3525eojupqjldo5pt.py":47:4)
|
| 144 |
+
#loc54 = loc("_tmp2_index"(#loc4))
|
| 145 |
+
#loc55 = loc("_tmp2"(#loc5))
|
| 146 |
+
#loc56 = loc("xoffset"(#loc6))
|
| 147 |
+
#loc57 = loc("xoffset"(#loc7))
|
| 148 |
+
#loc58 = loc("xindex"(#loc8))
|
| 149 |
+
#loc59 = loc("xindex"(#loc9))
|
| 150 |
+
#loc60 = loc("xindex"(#loc10))
|
| 151 |
+
#loc61 = loc("xmask"(#loc11))
|
| 152 |
+
#loc62 = loc("r0_base"(#loc12))
|
| 153 |
+
#loc63 = loc("x0"(#loc13))
|
| 154 |
+
#loc64 = loc("x1"(#loc14))
|
| 155 |
+
#loc65 = loc("_tmp2"(#loc3))
|
| 156 |
+
#loc66 = loc("r0_index"(#loc15))
|
| 157 |
+
#loc67 = loc("r0_mask"(#loc16))
|
| 158 |
+
#loc68 = loc("tmp0"(#loc17))
|
| 159 |
+
#loc69 = loc("tmp0"(#loc18))
|
| 160 |
+
#loc70 = loc("tmp0"(#loc19))
|
| 161 |
+
#loc71 = loc("tmp0"(#loc20))
|
| 162 |
+
#loc72 = loc("tmp0"(#loc21))
|
| 163 |
+
#loc73 = loc("tmp0"(#loc22))
|
| 164 |
+
#loc74 = loc("tmp0"(#loc23))
|
| 165 |
+
#loc75 = loc("mask"(#loc24))
|
| 166 |
+
#loc76 = loc("equal"(#loc26))
|
| 167 |
+
#loc77 = loc("a_isnan"(#loc27))
|
| 168 |
+
#loc78 = loc("b_isnan"(#loc28))
|
| 169 |
+
#loc79 = loc("mask"(#loc29))
|
| 170 |
+
#loc80 = loc("mask"(#loc30))
|
| 171 |
+
#loc81 = loc("mask"(#loc31))
|
| 172 |
+
#loc82 = loc("equal"(#loc32))
|
| 173 |
+
#loc83 = loc("equal"(#loc33))
|
| 174 |
+
#loc84 = loc("mask"(#loc34))
|
| 175 |
+
#loc85 = loc("mask"(#loc35))
|
| 176 |
+
#loc86 = loc("mask"(#loc36))
|
| 177 |
+
#loc87 = loc(callsite(#loc37 at #loc25))
|
| 178 |
+
#loc88 = loc(callsite(#loc38 at #loc25))
|
| 179 |
+
#loc89 = loc("_tmp2"(#loc39))
|
| 180 |
+
#loc90 = loc("_tmp2_index"(#loc40))
|
| 181 |
+
#loc91 = loc(callsite(#loc42 at #loc2))
|
| 182 |
+
#loc92 = loc("tmp2"(#loc43))
|
| 183 |
+
#loc93 = loc("_tmp2_index"(#loc65))
|
| 184 |
+
#loc94 = loc("mask"(#loc75))
|
| 185 |
+
#loc95 = loc("equal"(#loc76))
|
| 186 |
+
#loc96 = loc(callsite(#loc77 at #loc25))
|
| 187 |
+
#loc97 = loc(callsite(#loc78 at #loc25))
|
| 188 |
+
#loc98 = loc(callsite(#loc79 at #loc25))
|
| 189 |
+
#loc99 = loc(callsite(#loc80 at #loc25))
|
| 190 |
+
#loc100 = loc("mask"(#loc81))
|
| 191 |
+
#loc101 = loc(callsite(#loc82 at #loc25))
|
| 192 |
+
#loc102 = loc("equal"(#loc83))
|
| 193 |
+
#loc103 = loc(callsite(#loc84 at #loc25))
|
| 194 |
+
#loc104 = loc(callsite(#loc85 at #loc25))
|
| 195 |
+
#loc105 = loc(callsite(#loc86 at #loc25))
|
| 196 |
+
#loc106 = loc(callsite(#loc77 at #loc91))
|
| 197 |
+
#loc107 = loc(callsite(#loc78 at #loc91))
|
| 198 |
+
#loc108 = loc(callsite(#loc79 at #loc91))
|
| 199 |
+
#loc109 = loc(callsite(#loc80 at #loc91))
|
| 200 |
+
#loc110 = loc(callsite(#loc82 at #loc91))
|
| 201 |
+
#loc111 = loc(callsite(#loc84 at #loc91))
|
| 202 |
+
#loc112 = loc(callsite(#loc85 at #loc91))
|
| 203 |
+
#loc113 = loc(callsite(#loc86 at #loc91))
|
| 204 |
+
#loc114 = loc(callsite(#loc37 at #loc91))
|
| 205 |
+
#loc115 = loc(callsite(#loc38 at #loc91))
|
| 206 |
+
#loc116 = loc(callsite(#loc94 at #loc25))
|
| 207 |
+
#loc117 = loc(callsite(#loc95 at #loc25))
|
| 208 |
+
#loc118 = loc(callsite(#loc100 at #loc25))
|
| 209 |
+
#loc119 = loc(callsite(#loc102 at #loc25))
|
| 210 |
+
#loc120 = loc(callsite(#loc94 at #loc91))
|
| 211 |
+
#loc121 = loc(callsite(#loc95 at #loc91))
|
| 212 |
+
#loc122 = loc(callsite(#loc100 at #loc91))
|
| 213 |
+
#loc123 = loc(callsite(#loc102 at #loc91))
|
SpecForge-ext/cache/compiled_kernels/triton/7/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/__grp__triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"child_paths": {"triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.source", "triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ttir", "triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ttgir", "triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.llir", "triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ptx", 
"triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.cubin", "triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.json"}}
|
SpecForge-ext/cache/compiled_kernels/triton/7/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.cubin
ADDED
|
Binary file (28.9 kB). View file
|
|
|
SpecForge-ext/cache/compiled_kernels/triton/7/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.llir
ADDED
|
@@ -0,0 +1,318 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
; ModuleID = 'LLVMDialectModule'
|
| 2 |
+
source_filename = "LLVMDialectModule"
|
| 3 |
+
target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64"
|
| 4 |
+
|
| 5 |
+
@global_smem = external addrspace(3) global [0 x i8], align 16
|
| 6 |
+
|
| 7 |
+
; Function Attrs: nounwind
|
| 8 |
+
define ptx_kernel void @triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i64 %3, i64 %4, i64 %5, i64 %6, i64 %7, i64 %8, i32 %9, i32 %10, ptr addrspace(1) readnone captures(none) %11, ptr addrspace(1) readnone captures(none) %12) local_unnamed_addr #0 !dbg !4 {
|
| 9 |
+
%14 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7
|
| 10 |
+
%15 = icmp slt i32 %14, %9, !dbg !8
|
| 11 |
+
%16 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9
|
| 12 |
+
%17 = and i32 %16, 384, !dbg !9
|
| 13 |
+
%18 = zext nneg i32 %14 to i64, !dbg !10
|
| 14 |
+
%.frozen = freeze i64 %3, !dbg !10
|
| 15 |
+
%19 = sdiv i64 %18, %.frozen, !dbg !10
|
| 16 |
+
%20 = srem i64 %19, %4, !dbg !11
|
| 17 |
+
%21 = mul i64 %19, %.frozen, !dbg !12
|
| 18 |
+
%.decomposed = sub i64 %18, %21, !dbg !12
|
| 19 |
+
%22 = sdiv i64 %18, %7, !dbg !13
|
| 20 |
+
%23 = shl nsw i64 %20, 7, !dbg !14
|
| 21 |
+
%24 = shl nuw nsw i64 %.decomposed, 7, !dbg !15
|
| 22 |
+
%25 = getelementptr i64, ptr addrspace(1) %0, i64 %22, !dbg !16
|
| 23 |
+
%26 = and i32 %16, 127
|
| 24 |
+
%27 = zext nneg i32 %26 to i64
|
| 25 |
+
%28 = or disjoint i64 %24, %27
|
| 26 |
+
%29 = icmp slt i64 %28, %6
|
| 27 |
+
%30 = icmp sge i64 %28, %8
|
| 28 |
+
%31 = tail call i64 @llvm.smin.i64(i64 %8, i64 0)
|
| 29 |
+
%32 = sub nsw i64 %.decomposed, %20
|
| 30 |
+
%33 = shl nsw i64 %32, 7
|
| 31 |
+
%34 = zext nneg i32 %17 to i64, !dbg !17
|
| 32 |
+
%35 = zext nneg i32 %26 to i64, !dbg !17
|
| 33 |
+
%36 = zext nneg i32 %16 to i64, !dbg !17
|
| 34 |
+
%37 = insertelement <2 x i1> poison, i1 %15, i64 0, !dbg !18
|
| 35 |
+
%38 = shufflevector <2 x i1> %37, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !18
|
| 36 |
+
%39 = insertelement <2 x i1> poison, i1 %29, i64 0, !dbg !19
|
| 37 |
+
%40 = shufflevector <2 x i1> %39, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !19
|
| 38 |
+
%41 = insertelement <2 x i64> poison, i64 %23, i64 0, !dbg !20
|
| 39 |
+
%42 = shufflevector <2 x i64> %41, <2 x i64> poison, <2 x i32> zeroinitializer, !dbg !20
|
| 40 |
+
%43 = insertelement <2 x i64> poison, i64 %5, i64 0, !dbg !21
|
| 41 |
+
%44 = shufflevector <2 x i64> %43, <2 x i64> poison, <2 x i32> zeroinitializer, !dbg !21
|
| 42 |
+
%45 = insertelement <2 x i64> poison, i64 %28, i64 0, !dbg !22
|
| 43 |
+
%46 = shufflevector <2 x i64> %45, <2 x i64> poison, <2 x i32> zeroinitializer, !dbg !22
|
| 44 |
+
%47 = insertelement <2 x i1> poison, i1 %30, i64 0, !dbg !23
|
| 45 |
+
%48 = shufflevector <2 x i1> %47, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !23
|
| 46 |
+
%49 = insertelement <2 x i64> poison, i64 %33, i64 0, !dbg !24
|
| 47 |
+
%50 = shufflevector <2 x i64> %49, <2 x i64> poison, <2 x i32> zeroinitializer, !dbg !24
|
| 48 |
+
%51 = insertelement <2 x i64> poison, i64 %8, i64 0, !dbg !25
|
| 49 |
+
%52 = shufflevector <2 x i64> %51, <2 x i64> poison, <2 x i32> zeroinitializer, !dbg !25
|
| 50 |
+
br label %53, !dbg !17
|
| 51 |
+
|
| 52 |
+
53: ; preds = %13, %53
|
| 53 |
+
%indvars.iv = phi i64 [ 0, %13 ], [ %indvars.iv.next, %53 ]
|
| 54 |
+
%54 = phi <2 x i64> [ zeroinitializer, %13 ], [ %113, %53 ]
|
| 55 |
+
%55 = or disjoint i64 %indvars.iv, %34, !dbg !26
|
| 56 |
+
%56 = or disjoint i64 %indvars.iv, %36, !dbg !26
|
| 57 |
+
%57 = lshr exact i64 %55, 7, !dbg !27
|
| 58 |
+
%58 = lshr i64 %56, 7, !dbg !27
|
| 59 |
+
%59 = trunc nuw nsw i64 %58 to i32, !dbg !27
|
| 60 |
+
%60 = or i32 %59, 4, !dbg !27
|
| 61 |
+
%61 = zext nneg i32 %60 to i64, !dbg !20
|
| 62 |
+
%62 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !28
|
| 63 |
+
%63 = sub nsw i64 %35, %57, !dbg !29
|
| 64 |
+
%64 = sub nsw i32 %26, %60, !dbg !29
|
| 65 |
+
%65 = sext i32 %64 to i64, !dbg !30
|
| 66 |
+
%66 = insertelement <2 x i64> poison, i64 %57, i64 0, !dbg !20
|
| 67 |
+
%67 = insertelement <2 x i64> %66, i64 %61, i64 1, !dbg !20
|
| 68 |
+
%68 = or disjoint <2 x i64> %42, %67, !dbg !20
|
| 69 |
+
%69 = icmp slt <2 x i64> %68, %44, !dbg !21
|
| 70 |
+
%70 = and <2 x i1> %40, %69, !dbg !19
|
| 71 |
+
%71 = icmp sge <2 x i64> %68, %46, !dbg !22
|
| 72 |
+
%72 = extractelement <2 x i1> %70, i64 0, !dbg !31
|
| 73 |
+
%73 = and i1 %15, %72, !dbg !31
|
| 74 |
+
%74 = extractelement <2 x i1> %70, i64 1, !dbg !31
|
| 75 |
+
%75 = and i1 %15, %74, !dbg !31
|
| 76 |
+
%76 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %25, i64 %62, i1 %73) #5, !dbg !28
|
| 77 |
+
%77 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !28
|
| 78 |
+
%78 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %25, i64 %77, i1 %75) #5, !dbg !28
|
| 79 |
+
%79 = insertelement <2 x i64> poison, i64 %76, i64 0, !dbg !32
|
| 80 |
+
%80 = insertelement <2 x i64> %79, i64 %78, i64 1, !dbg !32
|
| 81 |
+
%81 = icmp slt <2 x i64> %46, %80, !dbg !32
|
| 82 |
+
%82 = icmp slt <2 x i64> %68, %80, !dbg !33
|
| 83 |
+
%83 = and <2 x i1> %81, %82, !dbg !34
|
| 84 |
+
%84 = and <2 x i1> %71, %83, !dbg !35
|
| 85 |
+
%85 = srem i64 %28, %8, !dbg !36
|
| 86 |
+
%.not = icmp eq i64 %85, 0, !dbg !37
|
| 87 |
+
%86 = select i1 %.not, i64 0, i64 %31, !dbg !38
|
| 88 |
+
%87 = add nsw i64 %86, %85, !dbg !38
|
| 89 |
+
%88 = insertelement <2 x i64> poison, i64 %87, i64 0, !dbg !39
|
| 90 |
+
%89 = shufflevector <2 x i64> %88, <2 x i64> poison, <2 x i32> zeroinitializer, !dbg !39
|
| 91 |
+
%90 = icmp slt <2 x i64> %89, %80, !dbg !39
|
| 92 |
+
%91 = insertelement <2 x i64> poison, i64 %63, i64 0, !dbg !24
|
| 93 |
+
%92 = insertelement <2 x i64> %91, i64 %65, i64 1, !dbg !24
|
| 94 |
+
%93 = add nsw <2 x i64> %50, %92, !dbg !24
|
| 95 |
+
%94 = srem <2 x i64> %93, %52, !dbg !25
|
| 96 |
+
%95 = icmp ne <2 x i64> %94, zeroinitializer, !dbg !40
|
| 97 |
+
%96 = extractelement <2 x i64> %94, i64 0, !dbg !41
|
| 98 |
+
%97 = xor i64 %96, %8, !dbg !41
|
| 99 |
+
%98 = extractelement <2 x i64> %94, i64 1, !dbg !41
|
| 100 |
+
%99 = xor i64 %98, %8, !dbg !41
|
| 101 |
+
%100 = insertelement <2 x i64> poison, i64 %97, i64 0, !dbg !41
|
| 102 |
+
%101 = insertelement <2 x i64> %100, i64 %99, i64 1, !dbg !41
|
| 103 |
+
%102 = icmp slt <2 x i64> %101, zeroinitializer, !dbg !41
|
| 104 |
+
%103 = and <2 x i1> %95, %102, !dbg !42
|
| 105 |
+
%104 = select <2 x i1> %103, <2 x i64> %52, <2 x i64> zeroinitializer, !dbg !43
|
| 106 |
+
%105 = sub <2 x i64> zeroinitializer, %104, !dbg !44
|
| 107 |
+
%106 = icmp eq <2 x i64> %94, %105, !dbg !44
|
| 108 |
+
%107 = and <2 x i1> %90, %106, !dbg !23
|
| 109 |
+
%108 = and <2 x i1> %48, %107, !dbg !23
|
| 110 |
+
%109 = or <2 x i1> %84, %108, !dbg !45
|
| 111 |
+
%110 = select <2 x i1> %38, <2 x i1> %70, <2 x i1> zeroinitializer, !dbg !18
|
| 112 |
+
%111 = select <2 x i1> %110, <2 x i1> %109, <2 x i1> zeroinitializer, !dbg !18
|
| 113 |
+
%112 = zext <2 x i1> %111 to <2 x i64>, !dbg !18
|
| 114 |
+
%113 = add <2 x i64> %54, %112, !dbg !18
|
| 115 |
+
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1024, !dbg !17
|
| 116 |
+
%114 = icmp samesign ult i64 %indvars.iv, 15360, !dbg !17
|
| 117 |
+
br i1 %114, label %53, label %115, !dbg !17
|
| 118 |
+
|
| 119 |
+
115: ; preds = %53
|
| 120 |
+
%116 = and i32 %16, 31, !dbg !9
|
| 121 |
+
%117 = lshr i32 %16, 5, !dbg !9
|
| 122 |
+
%shift = shufflevector <2 x i64> %113, <2 x i64> poison, <2 x i32> <i32 1, i32 poison>, !dbg !46
|
| 123 |
+
%foldExtExtBinop = add <2 x i64> %113, %shift, !dbg !46
|
| 124 |
+
%118 = extractelement <2 x i64> %foldExtExtBinop, i64 0, !dbg !46
|
| 125 |
+
%119 = bitcast <2 x i64> %foldExtExtBinop to <4 x i32>, !dbg !50
|
| 126 |
+
%120 = extractelement <4 x i32> %119, i64 1, !dbg !50
|
| 127 |
+
%121 = trunc i64 %118 to i32, !dbg !50
|
| 128 |
+
%122 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %121, i32 16, i32 31), !dbg !50
|
| 129 |
+
%123 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %120, i32 16, i32 31), !dbg !50
|
| 130 |
+
%124 = insertelement <2 x i32> poison, i32 %122, i64 0, !dbg !50
|
| 131 |
+
%125 = insertelement <2 x i32> %124, i32 %123, i64 1, !dbg !50
|
| 132 |
+
%126 = bitcast <2 x i32> %125 to i64, !dbg !50
|
| 133 |
+
%127 = add i64 %118, %126, !dbg !46
|
| 134 |
+
%extelt.offset1 = lshr i64 %127, 32, !dbg !50
|
| 135 |
+
%128 = trunc nuw i64 %extelt.offset1 to i32, !dbg !50
|
| 136 |
+
%129 = trunc i64 %127 to i32, !dbg !50
|
| 137 |
+
%130 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %129, i32 8, i32 31), !dbg !50
|
| 138 |
+
%131 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %128, i32 8, i32 31), !dbg !50
|
| 139 |
+
%132 = insertelement <2 x i32> poison, i32 %130, i64 0, !dbg !50
|
| 140 |
+
%133 = insertelement <2 x i32> %132, i32 %131, i64 1, !dbg !50
|
| 141 |
+
%134 = bitcast <2 x i32> %133 to i64, !dbg !50
|
| 142 |
+
%135 = add i64 %127, %134, !dbg !46
|
| 143 |
+
%extelt.offset2 = lshr i64 %135, 32, !dbg !50
|
| 144 |
+
%136 = trunc nuw i64 %extelt.offset2 to i32, !dbg !50
|
| 145 |
+
%137 = trunc i64 %135 to i32, !dbg !50
|
| 146 |
+
%138 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %137, i32 4, i32 31), !dbg !50
|
| 147 |
+
%139 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %136, i32 4, i32 31), !dbg !50
|
| 148 |
+
%140 = insertelement <2 x i32> poison, i32 %138, i64 0, !dbg !50
|
| 149 |
+
%141 = insertelement <2 x i32> %140, i32 %139, i64 1, !dbg !50
|
| 150 |
+
%142 = bitcast <2 x i32> %141 to i64, !dbg !50
|
| 151 |
+
%143 = add i64 %135, %142, !dbg !46
|
| 152 |
+
%extelt.offset3 = lshr i64 %143, 32, !dbg !50
|
| 153 |
+
%144 = trunc nuw i64 %extelt.offset3 to i32, !dbg !50
|
| 154 |
+
%145 = trunc i64 %143 to i32, !dbg !50
|
| 155 |
+
%146 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %145, i32 2, i32 31), !dbg !50
|
| 156 |
+
%147 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %144, i32 2, i32 31), !dbg !50
|
| 157 |
+
%148 = insertelement <2 x i32> poison, i32 %146, i64 0, !dbg !50
|
| 158 |
+
%149 = insertelement <2 x i32> %148, i32 %147, i64 1, !dbg !50
|
| 159 |
+
%150 = bitcast <2 x i32> %149 to i64, !dbg !50
|
| 160 |
+
%151 = add i64 %143, %150, !dbg !46
|
| 161 |
+
%extelt.offset4 = lshr i64 %151, 32, !dbg !50
|
| 162 |
+
%152 = trunc nuw i64 %extelt.offset4 to i32, !dbg !50
|
| 163 |
+
%153 = trunc i64 %151 to i32, !dbg !50
|
| 164 |
+
%154 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %153, i32 1, i32 31), !dbg !50
|
| 165 |
+
%155 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %152, i32 1, i32 31), !dbg !50
|
| 166 |
+
%156 = insertelement <2 x i32> poison, i32 %154, i64 0, !dbg !50
|
| 167 |
+
%157 = insertelement <2 x i32> %156, i32 %155, i64 1, !dbg !50
|
| 168 |
+
%158 = bitcast <2 x i32> %157 to i64, !dbg !50
|
| 169 |
+
%159 = add i64 %151, %158, !dbg !46
|
| 170 |
+
%160 = and i32 %117, 15, !dbg !50
|
| 171 |
+
%161 = icmp eq i32 %116, 0, !dbg !50
|
| 172 |
+
%162 = getelementptr i64, ptr addrspace(3) @global_smem, i32 %160, !dbg !50
|
| 173 |
+
%163 = insertelement <1 x i64> poison, i64 %159, i64 0, !dbg !50
|
| 174 |
+
tail call void asm sideeffect "@$2 st.shared.b64 [ $0 + 0 ], $1;", "r,l,b"(ptr addrspace(3) %162, <1 x i64> %163, i1 %161) #5, !dbg !50
|
| 175 |
+
tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !50
|
| 176 |
+
%164 = icmp samesign ult i32 %16, 16, !dbg !50
|
| 177 |
+
%165 = getelementptr i64, ptr addrspace(3) @global_smem, i32 %16, !dbg !50
|
| 178 |
+
%166 = tail call i64 asm sideeffect "@$2 ld.shared.b64 $0, [ $1 + 0 ];", "=l,r,b"(ptr addrspace(3) %165, i1 %164) #5, !dbg !50
|
| 179 |
+
%extelt.offset5 = lshr i64 %166, 32, !dbg !50
|
| 180 |
+
%167 = trunc nuw i64 %extelt.offset5 to i32, !dbg !50
|
| 181 |
+
%168 = trunc i64 %166 to i32, !dbg !50
|
| 182 |
+
%169 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %168, i32 8, i32 31), !dbg !50
|
| 183 |
+
%170 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %167, i32 8, i32 31), !dbg !50
|
| 184 |
+
%171 = insertelement <2 x i32> poison, i32 %169, i64 0, !dbg !50
|
| 185 |
+
%172 = insertelement <2 x i32> %171, i32 %170, i64 1, !dbg !50
|
| 186 |
+
%173 = bitcast <2 x i32> %172 to i64, !dbg !50
|
| 187 |
+
%174 = add i64 %166, %173, !dbg !46
|
| 188 |
+
%extelt.offset6 = lshr i64 %174, 32, !dbg !50
|
| 189 |
+
%175 = trunc nuw i64 %extelt.offset6 to i32, !dbg !50
|
| 190 |
+
%176 = trunc i64 %174 to i32, !dbg !50
|
| 191 |
+
%177 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %176, i32 4, i32 31), !dbg !50
|
| 192 |
+
%178 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %175, i32 4, i32 31), !dbg !50
|
| 193 |
+
%179 = insertelement <2 x i32> poison, i32 %177, i64 0, !dbg !50
|
| 194 |
+
%180 = insertelement <2 x i32> %179, i32 %178, i64 1, !dbg !50
|
| 195 |
+
%181 = bitcast <2 x i32> %180 to i64, !dbg !50
|
| 196 |
+
%182 = add i64 %174, %181, !dbg !46
|
| 197 |
+
%extelt.offset7 = lshr i64 %182, 32, !dbg !50
|
| 198 |
+
%183 = trunc nuw i64 %extelt.offset7 to i32, !dbg !50
|
| 199 |
+
%184 = trunc i64 %182 to i32, !dbg !50
|
| 200 |
+
%185 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %184, i32 2, i32 31), !dbg !50
|
| 201 |
+
%186 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %183, i32 2, i32 31), !dbg !50
|
| 202 |
+
%187 = insertelement <2 x i32> poison, i32 %185, i64 0, !dbg !50
|
| 203 |
+
%188 = insertelement <2 x i32> %187, i32 %186, i64 1, !dbg !50
|
| 204 |
+
%189 = bitcast <2 x i32> %188 to i64, !dbg !50
|
| 205 |
+
%190 = add i64 %182, %189, !dbg !46
|
| 206 |
+
%extelt.offset8 = lshr i64 %190, 32, !dbg !50
|
| 207 |
+
%191 = trunc nuw i64 %extelt.offset8 to i32, !dbg !50
|
| 208 |
+
%192 = trunc i64 %190 to i32, !dbg !50
|
| 209 |
+
%193 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %192, i32 1, i32 31), !dbg !50
|
| 210 |
+
%194 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %191, i32 1, i32 31), !dbg !50
|
| 211 |
+
%195 = insertelement <2 x i32> poison, i32 %193, i64 0, !dbg !50
|
| 212 |
+
%196 = insertelement <2 x i32> %195, i32 %194, i64 1, !dbg !50
|
| 213 |
+
%197 = bitcast <2 x i32> %196 to i64, !dbg !50
|
| 214 |
+
%198 = add i64 %190, %197, !dbg !46
|
| 215 |
+
%199 = icmp eq i32 %16, 0, !dbg !50
|
| 216 |
+
%200 = insertelement <1 x i64> poison, i64 %198, i64 0, !dbg !50
|
| 217 |
+
tail call void asm sideeffect "@$2 st.shared.b64 [ $0 + 0 ], $1;", "r,l,b"(ptr addrspace(3) %165, <1 x i64> %200, i1 %199) #5, !dbg !50
|
| 218 |
+
tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !50
|
| 219 |
+
%201 = load i64, ptr addrspace(3) @global_smem, align 16, !dbg !50
|
| 220 |
+
%202 = add i64 %201, -1, !dbg !51
|
| 221 |
+
%203 = icmp ult i64 %202, 16383, !dbg !51
|
| 222 |
+
%204 = zext i1 %203 to i32, !dbg !52
|
| 223 |
+
%205 = icmp eq i64 %201, 16384, !dbg !53
|
| 224 |
+
%206 = zext i1 %205 to i32, !dbg !52
|
| 225 |
+
%207 = getelementptr i32, ptr addrspace(1) %1, i64 %18, !dbg !54
|
| 226 |
+
%208 = and i32 %16, 511, !dbg !55
|
| 227 |
+
%209 = icmp eq i32 %208, 0, !dbg !55
|
| 228 |
+
%210 = and i1 %209, %15, !dbg !55
|
| 229 |
+
tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %204, ptr addrspace(1) %207, i1 %210) #5, !dbg !55
|
| 230 |
+
%211 = getelementptr i32, ptr addrspace(1) %2, i64 %18, !dbg !56
|
| 231 |
+
tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %206, ptr addrspace(1) %211, i1 %210) #5, !dbg !57
|
| 232 |
+
ret void, !dbg !58
|
| 233 |
+
}
|
| 234 |
+
|
| 235 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
| 236 |
+
declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
|
| 237 |
+
|
| 238 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
| 239 |
+
declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
|
| 240 |
+
|
| 241 |
+
; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
|
| 242 |
+
declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2
|
| 243 |
+
|
| 244 |
+
; Function Attrs: convergent nocallback nounwind
|
| 245 |
+
declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3
|
| 246 |
+
|
| 247 |
+
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
| 248 |
+
declare i64 @llvm.smin.i64(i64, i64) #4
|
| 249 |
+
|
| 250 |
+
attributes #0 = { nounwind "nvvm.reqntid"="512" }
|
| 251 |
+
attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
| 252 |
+
attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
|
| 253 |
+
attributes #3 = { convergent nocallback nounwind }
|
| 254 |
+
attributes #4 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
| 255 |
+
attributes #5 = { nounwind }
|
| 256 |
+
|
| 257 |
+
!llvm.dbg.cu = !{!0}
|
| 258 |
+
!llvm.module.flags = !{!2, !3}
|
| 259 |
+
|
| 260 |
+
!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
|
| 261 |
+
!1 = !DIFile(filename: "cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx")
|
| 262 |
+
!2 = !{i32 2, !"Debug Info Version", i32 3}
|
| 263 |
+
!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
|
| 264 |
+
!4 = distinct !DISubprogram(name: "triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1", linkageName: "triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
|
| 265 |
+
!5 = !DISubroutineType(cc: DW_CC_normal, types: !6)
|
| 266 |
+
!6 = !{}
|
| 267 |
+
!7 = !DILocation(line: 22, column: 28, scope: !4)
|
| 268 |
+
!8 = !DILocation(line: 24, column: 21, scope: !4)
|
| 269 |
+
!9 = !DILocation(line: 25, column: 37, scope: !4)
|
| 270 |
+
!10 = !DILocation(line: 27, column: 21, scope: !4)
|
| 271 |
+
!11 = !DILocation(line: 27, column: 28, scope: !4)
|
| 272 |
+
!12 = !DILocation(line: 28, column: 19, scope: !4)
|
| 273 |
+
!13 = !DILocation(line: 29, column: 19, scope: !4)
|
| 274 |
+
!14 = !DILocation(line: 39, column: 26, scope: !4)
|
| 275 |
+
!15 = !DILocation(line: 42, column: 26, scope: !4)
|
| 276 |
+
!16 = !DILocation(line: 49, column: 35, scope: !4)
|
| 277 |
+
!17 = !DILocation(line: 32, column: 40, scope: !4)
|
| 278 |
+
!18 = !DILocation(line: 86, column: 50, scope: !4)
|
| 279 |
+
!19 = !DILocation(line: 45, column: 22, scope: !4)
|
| 280 |
+
!20 = !DILocation(line: 39, column: 22, scope: !4)
|
| 281 |
+
!21 = !DILocation(line: 41, column: 22, scope: !4)
|
| 282 |
+
!22 = !DILocation(line: 48, column: 23, scope: !4)
|
| 283 |
+
!23 = !DILocation(line: 79, column: 24, scope: !4)
|
| 284 |
+
!24 = !DILocation(line: 69, column: 51, scope: !4)
|
| 285 |
+
!25 = !DILocation(line: 70, column: 25, scope: !4)
|
| 286 |
+
!26 = !DILocation(line: 33, column: 31, scope: !4)
|
| 287 |
+
!27 = !DILocation(line: 37, column: 27, scope: !4)
|
| 288 |
+
!28 = !DILocation(line: 49, column: 77, scope: !4)
|
| 289 |
+
!29 = !DILocation(line: 69, column: 24, scope: !4)
|
| 290 |
+
!30 = !DILocation(line: 69, column: 38, scope: !4)
|
| 291 |
+
!31 = !DILocation(line: 49, column: 94, scope: !4)
|
| 292 |
+
!32 = !DILocation(line: 50, column: 23, scope: !4)
|
| 293 |
+
!33 = !DILocation(line: 51, column: 23, scope: !4)
|
| 294 |
+
!34 = !DILocation(line: 52, column: 24, scope: !4)
|
| 295 |
+
!35 = !DILocation(line: 53, column: 23, scope: !4)
|
| 296 |
+
!36 = !DILocation(line: 58, column: 24, scope: !4)
|
| 297 |
+
!37 = !DILocation(line: 60, column: 25, scope: !4)
|
| 298 |
+
!38 = !DILocation(line: 66, column: 39, scope: !4)
|
| 299 |
+
!39 = !DILocation(line: 67, column: 24, scope: !4)
|
| 300 |
+
!40 = !DILocation(line: 71, column: 25, scope: !4)
|
| 301 |
+
!41 = !DILocation(line: 73, column: 25, scope: !4)
|
| 302 |
+
!42 = !DILocation(line: 74, column: 24, scope: !4)
|
| 303 |
+
!43 = !DILocation(line: 76, column: 39, scope: !4)
|
| 304 |
+
!44 = !DILocation(line: 78, column: 25, scope: !4)
|
| 305 |
+
!45 = !DILocation(line: 80, column: 24, scope: !4)
|
| 306 |
+
!46 = !DILocation(line: 261, column: 15, scope: !47, inlinedAt: !49)
|
| 307 |
+
!47 = distinct !DILexicalBlockFile(scope: !4, file: !48, discriminator: 0)
|
| 308 |
+
!48 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language")
|
| 309 |
+
!49 = !DILocation(line: 87, column: 27, scope: !4)
|
| 310 |
+
!50 = !DILocation(line: 291, column: 36, scope: !47, inlinedAt: !49)
|
| 311 |
+
!51 = !DILocation(line: 92, column: 20, scope: !4)
|
| 312 |
+
!52 = !DILocation(line: 0, scope: !4)
|
| 313 |
+
!53 = !DILocation(line: 95, column: 21, scope: !4)
|
| 314 |
+
!54 = !DILocation(line: 98, column: 25, scope: !4)
|
| 315 |
+
!55 = !DILocation(line: 98, column: 37, scope: !4)
|
| 316 |
+
!56 = !DILocation(line: 99, column: 25, scope: !4)
|
| 317 |
+
!57 = !DILocation(line: 99, column: 37, scope: !4)
|
| 318 |
+
!58 = !DILocation(line: 99, column: 4, scope: !4)
|
SpecForge-ext/cache/compiled_kernels/triton/7/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ptx
ADDED
|
@@ -0,0 +1,736 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
//
|
| 2 |
+
// Generated by LLVM NVPTX Back-End
|
| 3 |
+
//
|
| 4 |
+
|
| 5 |
+
.version 8.7
|
| 6 |
+
.target sm_90a
|
| 7 |
+
.address_size 64
|
| 8 |
+
|
| 9 |
+
// .globl triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1 // -- Begin function triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1
|
| 10 |
+
.extern .shared .align 16 .b8 global_smem[];
|
| 11 |
+
// @triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1
|
| 12 |
+
.visible .entry triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1(
|
| 13 |
+
.param .u64 .ptr .global .align 1 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_0,
|
| 14 |
+
.param .u64 .ptr .global .align 1 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_1,
|
| 15 |
+
.param .u64 .ptr .global .align 1 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_2,
|
| 16 |
+
.param .u64 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_3,
|
| 17 |
+
.param .u64 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_4,
|
| 18 |
+
.param .u64 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_5,
|
| 19 |
+
.param .u64 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_6,
|
| 20 |
+
.param .u64 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_7,
|
| 21 |
+
.param .u64 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_8,
|
| 22 |
+
.param .u32 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_9,
|
| 23 |
+
.param .u32 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_10,
|
| 24 |
+
.param .u64 .ptr .global .align 1 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_11,
|
| 25 |
+
.param .u64 .ptr .global .align 1 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_12
|
| 26 |
+
)
|
| 27 |
+
.reqntid 512
|
| 28 |
+
{
|
| 29 |
+
.reg .pred %p<53>;
|
| 30 |
+
.reg .b32 %r<76>;
|
| 31 |
+
.reg .b64 %rd<162>;
|
| 32 |
+
.loc 1 18 0 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:18:0
|
| 33 |
+
$L__func_begin0:
|
| 34 |
+
.loc 1 18 0 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:18:0
|
| 35 |
+
|
| 36 |
+
// %bb.0:
|
| 37 |
+
ld.param.b64 %rd47, [triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_4];
|
| 38 |
+
$L__tmp0:
|
| 39 |
+
.loc 1 22 28 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:22:28
|
| 40 |
+
mov.u32 %r7, %ctaid.x;
|
| 41 |
+
.loc 1 27 21 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:27:21
|
| 42 |
+
cvt.u64.u32 %rd1, %r7;
|
| 43 |
+
ld.param.b64 %rd52, [triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_3];
|
| 44 |
+
and.b64 %rd53, %rd52, -4294967296;
|
| 45 |
+
setp.ne.b64 %p11, %rd53, 0;
|
| 46 |
+
cvt.u32.u64 %r74, %rd1;
|
| 47 |
+
@%p11 bra $L__BB0_2;
|
| 48 |
+
bra.uni $L__BB0_1;
|
| 49 |
+
$L__BB0_2:
|
| 50 |
+
div.s64 %rd153, %rd1, %rd52;
|
| 51 |
+
bra.uni $L__BB0_3;
|
| 52 |
+
$L__BB0_1:
|
| 53 |
+
cvt.u32.u64 %r8, %rd52;
|
| 54 |
+
div.u32 %r10, %r74, %r8;
|
| 55 |
+
cvt.u64.u32 %rd153, %r10;
|
| 56 |
+
$L__BB0_3:
|
| 57 |
+
.loc 1 0 21 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:0:21
|
| 58 |
+
ld.param.b64 %rd50, [triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_7];
|
| 59 |
+
.loc 1 27 28 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:27:28
|
| 60 |
+
or.b64 %rd54, %rd153, %rd47;
|
| 61 |
+
and.b64 %rd55, %rd54, -4294967296;
|
| 62 |
+
setp.ne.b64 %p12, %rd55, 0;
|
| 63 |
+
@%p12 bra $L__BB0_5;
|
| 64 |
+
bra.uni $L__BB0_4;
|
| 65 |
+
$L__BB0_5:
|
| 66 |
+
rem.s64 %rd154, %rd153, %rd47;
|
| 67 |
+
bra.uni $L__BB0_6;
|
| 68 |
+
$L__BB0_4:
|
| 69 |
+
cvt.u32.u64 %r11, %rd47;
|
| 70 |
+
cvt.u32.u64 %r12, %rd153;
|
| 71 |
+
rem.u32 %r13, %r12, %r11;
|
| 72 |
+
cvt.u64.u32 %rd154, %r13;
|
| 73 |
+
$L__BB0_6:
|
| 74 |
+
.loc 1 0 28 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:0:28
|
| 75 |
+
ld.param.b32 %r6, [triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_9];
|
| 76 |
+
ld.param.b64 %rd51, [triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_8];
|
| 77 |
+
ld.param.b64 %rd49, [triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_6];
|
| 78 |
+
ld.param.b64 %rd44, [triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_0];
|
| 79 |
+
mov.u32 %r1, %tid.x;
|
| 80 |
+
.loc 1 28 19 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:28:19
|
| 81 |
+
mul.lo.s64 %rd56, %rd153, %rd52;
|
| 82 |
+
sub.s64 %rd9, %rd1, %rd56;
|
| 83 |
+
.loc 1 29 19 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:29:19
|
| 84 |
+
and.b64 %rd57, %rd50, -4294967296;
|
| 85 |
+
setp.ne.b64 %p13, %rd57, 0;
|
| 86 |
+
@%p13 bra $L__BB0_8;
|
| 87 |
+
bra.uni $L__BB0_7;
|
| 88 |
+
$L__BB0_8:
|
| 89 |
+
div.s64 %rd155, %rd1, %rd50;
|
| 90 |
+
bra.uni $L__BB0_9;
|
| 91 |
+
$L__BB0_7:
|
| 92 |
+
cvt.u32.u64 %r14, %rd50;
|
| 93 |
+
div.u32 %r16, %r74, %r14;
|
| 94 |
+
cvt.u64.u32 %rd155, %r16;
|
| 95 |
+
$L__BB0_9:
|
| 96 |
+
.loc 1 0 19 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:0:19
|
| 97 |
+
ld.param.b64 %rd48, [triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_5];
|
| 98 |
+
ld.param.b64 %rd46, [triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_2];
|
| 99 |
+
ld.param.b64 %rd45, [triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_1];
|
| 100 |
+
.loc 1 24 21 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:24:21
|
| 101 |
+
setp.lt.s32 %p1, %r74, %r6;
|
| 102 |
+
.loc 1 39 26 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:39:26
|
| 103 |
+
shl.b64 %rd16, %rd154, 7;
|
| 104 |
+
.loc 1 42 26 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:42:26
|
| 105 |
+
shl.b64 %rd61, %rd9, 7;
|
| 106 |
+
.loc 1 49 35 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:49:35
|
| 107 |
+
shl.b64 %rd62, %rd155, 3;
|
| 108 |
+
add.s64 %rd70, %rd44, %rd62;
|
| 109 |
+
and.b32 %r2, %r1, 127;
|
| 110 |
+
cvt.u64.u32 %rd63, %r2;
|
| 111 |
+
or.b64 %rd20, %rd61, %rd63;
|
| 112 |
+
setp.lt.s64 %p3, %rd20, %rd49;
|
| 113 |
+
setp.ge.s64 %p5, %rd20, %rd51;
|
| 114 |
+
min.s64 %rd15, %rd51, 0;
|
| 115 |
+
sub.s64 %rd64, %rd9, %rd154;
|
| 116 |
+
shl.b64 %rd22, %rd64, 7;
|
| 117 |
+
.loc 1 32 40 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:32:40
|
| 118 |
+
cvt.u64.u32 %rd65, %r1;
|
| 119 |
+
shr.u64 %rd66, %rd65, 7;
|
| 120 |
+
cvt.u32.u64 %r75, %rd66;
|
| 121 |
+
shr.u32 %r18, %r1, 7;
|
| 122 |
+
cvt.u64.u32 %rd67, %r18;
|
| 123 |
+
and.b64 %rd157, %rd67, 3;
|
| 124 |
+
sub.s64 %rd156, %rd63, %rd157;
|
| 125 |
+
mov.b64 %rd159, 0;
|
| 126 |
+
mov.b64 %rd158, -1024;
|
| 127 |
+
mov.b64 %rd160, %rd159;
|
| 128 |
+
bra.uni $L__BB0_10;
|
| 129 |
+
$L__BB0_12: // in Loop: Header=BB0_10 Depth=1
|
| 130 |
+
.loc 1 58 24 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:58:24
|
| 131 |
+
rem.s64 %rd161, %rd20, %rd51;
|
| 132 |
+
$L__BB0_13: // in Loop: Header=BB0_10 Depth=1
|
| 133 |
+
.loc 1 0 0 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:0
|
| 134 |
+
sub.s32 %r21, %r2, %r20;
|
| 135 |
+
cvt.s64.s32 %rd33, %r21;
|
| 136 |
+
.loc 1 60 25 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:60:25
|
| 137 |
+
setp.eq.b64 %p24, %rd161, 0;
|
| 138 |
+
.loc 1 66 39 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:66:39
|
| 139 |
+
selp.b64 %rd83, 0, %rd15, %p24;
|
| 140 |
+
add.s64 %rd84, %rd83, %rd161;
|
| 141 |
+
.loc 1 67 24 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:67:24
|
| 142 |
+
setp.lt.s64 %p25, %rd84, %rd69;
|
| 143 |
+
setp.lt.s64 %p26, %rd84, %rd73;
|
| 144 |
+
.loc 1 69 51 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:69:51
|
| 145 |
+
add.s64 %rd85, %rd22, %rd33;
|
| 146 |
+
add.s64 %rd86, %rd22, %rd156;
|
| 147 |
+
.loc 1 70 25 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:70:25
|
| 148 |
+
rem.s64 %rd87, %rd86, %rd51;
|
| 149 |
+
rem.s64 %rd88, %rd85, %rd51;
|
| 150 |
+
.loc 1 71 25 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:71:25
|
| 151 |
+
setp.ne.b64 %p27, %rd88, 0;
|
| 152 |
+
setp.ne.b64 %p28, %rd87, 0;
|
| 153 |
+
.loc 1 73 25 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:73:25
|
| 154 |
+
xor.b64 %rd89, %rd87, %rd51;
|
| 155 |
+
xor.b64 %rd90, %rd88, %rd51;
|
| 156 |
+
.loc 1 76 39 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:76:39
|
| 157 |
+
shr.s64 %rd91, %rd89, 63;
|
| 158 |
+
and.b64 %rd92, %rd91, %rd51;
|
| 159 |
+
selp.b64 %rd93, %rd92, 0, %p28;
|
| 160 |
+
shr.s64 %rd94, %rd90, 63;
|
| 161 |
+
and.b64 %rd95, %rd94, %rd51;
|
| 162 |
+
selp.b64 %rd96, %rd95, 0, %p27;
|
| 163 |
+
.loc 1 78 25 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:78:25
|
| 164 |
+
neg.s64 %rd97, %rd96;
|
| 165 |
+
neg.s64 %rd98, %rd93;
|
| 166 |
+
setp.eq.b64 %p29, %rd87, %rd98;
|
| 167 |
+
setp.eq.b64 %p30, %rd88, %rd97;
|
| 168 |
+
.loc 1 79 24 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:79:24
|
| 169 |
+
and.pred %p31, %p26, %p30;
|
| 170 |
+
and.pred %p33, %p25, %p29;
|
| 171 |
+
and.pred %p35, %p5, %p33;
|
| 172 |
+
and.pred %p36, %p5, %p31;
|
| 173 |
+
.loc 1 80 24 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:80:24
|
| 174 |
+
or.pred %p37, %p10, %p36;
|
| 175 |
+
or.pred %p38, %p9, %p35;
|
| 176 |
+
.loc 1 86 50 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:86:50
|
| 177 |
+
and.pred %p41, %p14, %p38;
|
| 178 |
+
and.pred %p42, %p15, %p37;
|
| 179 |
+
selp.b64 %rd99, 1, 0, %p42;
|
| 180 |
+
selp.b64 %rd100, 1, 0, %p41;
|
| 181 |
+
add.s64 %rd159, %rd159, %rd100;
|
| 182 |
+
add.s64 %rd160, %rd160, %rd99;
|
| 183 |
+
.loc 1 32 40 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:32:40
|
| 184 |
+
add.s64 %rd158, %rd158, 1024;
|
| 185 |
+
add.s32 %r75, %r75, 8;
|
| 186 |
+
add.s64 %rd157, %rd157, 8;
|
| 187 |
+
add.s64 %rd156, %rd156, -8;
|
| 188 |
+
setp.lt.u64 %p43, %rd158, 15360;
|
| 189 |
+
@%p43 bra $L__BB0_10;
|
| 190 |
+
bra.uni $L__BB0_14;
|
| 191 |
+
$L__BB0_10: // =>This Inner Loop Header: Depth=1
|
| 192 |
+
.loc 1 37 27 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:37:27
|
| 193 |
+
or.b32 %r20, %r75, 4;
|
| 194 |
+
.loc 1 39 22 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:39:22
|
| 195 |
+
cvt.u64.u32 %rd76, %r20;
|
| 196 |
+
.loc 1 49 77 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:49:77
|
| 197 |
+
// begin inline asm
|
| 198 |
+
mov.u64 %rd68, 0x0;
|
| 199 |
+
createpolicy.fractional.L2::evict_last.b64 %rd68, 1.0;
|
| 200 |
+
// end inline asm
|
| 201 |
+
.loc 1 39 22 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:39:22
|
| 202 |
+
or.b64 %rd77, %rd16, %rd76;
|
| 203 |
+
or.b64 %rd78, %rd16, %rd157;
|
| 204 |
+
.loc 1 41 22 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:41:22
|
| 205 |
+
setp.lt.s64 %p17, %rd78, %rd48;
|
| 206 |
+
setp.lt.s64 %p18, %rd77, %rd48;
|
| 207 |
+
.loc 1 45 22 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:45:22
|
| 208 |
+
and.pred %p8, %p3, %p18;
|
| 209 |
+
and.pred %p7, %p3, %p17;
|
| 210 |
+
.loc 1 48 23 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:48:23
|
| 211 |
+
setp.ge.s64 %p19, %rd77, %rd20;
|
| 212 |
+
setp.ge.s64 %p20, %rd78, %rd20;
|
| 213 |
+
.loc 1 49 94 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:49:94
|
| 214 |
+
and.pred %p14, %p1, %p7;
|
| 215 |
+
and.pred %p15, %p1, %p8;
|
| 216 |
+
.loc 1 49 77 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:49:77
|
| 217 |
+
// begin inline asm
|
| 218 |
+
mov.u64 %rd69, 0x0;
|
| 219 |
+
@%p14 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd69 }, [ %rd70 + 0 ], %rd68;
|
| 220 |
+
// end inline asm
|
| 221 |
+
// begin inline asm
|
| 222 |
+
mov.u64 %rd72, 0x0;
|
| 223 |
+
createpolicy.fractional.L2::evict_last.b64 %rd72, 1.0;
|
| 224 |
+
// end inline asm
|
| 225 |
+
// begin inline asm
|
| 226 |
+
mov.u64 %rd73, 0x0;
|
| 227 |
+
@%p15 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd73 }, [ %rd70 + 0 ], %rd72;
|
| 228 |
+
// end inline asm
|
| 229 |
+
.loc 1 52 24 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:52:24
|
| 230 |
+
max.s64 %rd79, %rd20, %rd77;
|
| 231 |
+
setp.lt.s64 %p21, %rd79, %rd73;
|
| 232 |
+
max.s64 %rd80, %rd20, %rd78;
|
| 233 |
+
setp.lt.s64 %p22, %rd80, %rd69;
|
| 234 |
+
.loc 1 53 23 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:53:23
|
| 235 |
+
and.pred %p9, %p20, %p22;
|
| 236 |
+
and.pred %p10, %p19, %p21;
|
| 237 |
+
.loc 1 58 24 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:58:24
|
| 238 |
+
or.b64 %rd81, %rd20, %rd51;
|
| 239 |
+
and.b64 %rd82, %rd81, -4294967296;
|
| 240 |
+
setp.ne.b64 %p23, %rd82, 0;
|
| 241 |
+
@%p23 bra $L__BB0_12;
|
| 242 |
+
// %bb.11: // in Loop: Header=BB0_10 Depth=1
|
| 243 |
+
cvt.u32.u64 %r22, %rd51;
|
| 244 |
+
cvt.u32.u64 %r23, %rd20;
|
| 245 |
+
rem.u32 %r24, %r23, %r22;
|
| 246 |
+
cvt.u64.u32 %rd161, %r24;
|
| 247 |
+
bra.uni $L__BB0_13;
|
| 248 |
+
$L__BB0_14:
|
| 249 |
+
.loc 1 25 37 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:25:37
|
| 250 |
+
and.b32 %r31, %r1, 31;
|
| 251 |
+
$L__tmp1:
|
| 252 |
+
.loc 2 261 15 // standard.py:261:15 @[ cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:87:27 ]
|
| 253 |
+
add.s64 %rd106, %rd159, %rd160;
|
| 254 |
+
mov.b64 {_, %r32}, %rd106;
|
| 255 |
+
.loc 2 291 36 // standard.py:291:36 @[ cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:87:27 ]
|
| 256 |
+
cvt.u32.u64 %r33, %rd106;
|
| 257 |
+
shfl.sync.bfly.b32 %r34, %r33, 16, 31, -1;
|
| 258 |
+
shfl.sync.bfly.b32 %r35, %r32, 16, 31, -1;
|
| 259 |
+
cvt.u64.u32 %rd107, %r34;
|
| 260 |
+
cvt.u64.u32 %rd108, %r35;
|
| 261 |
+
shl.b64 %rd109, %rd108, 32;
|
| 262 |
+
or.b64 %rd110, %rd107, %rd109;
|
| 263 |
+
.loc 2 261 15 // standard.py:261:15 @[ cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:87:27 ]
|
| 264 |
+
add.s64 %rd111, %rd106, %rd110;
|
| 265 |
+
.loc 2 291 36 // standard.py:291:36 @[ cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:87:27 ]
|
| 266 |
+
mov.b64 {_, %r36}, %rd111;
|
| 267 |
+
cvt.u32.u64 %r37, %rd111;
|
| 268 |
+
shfl.sync.bfly.b32 %r38, %r37, 8, 31, -1;
|
| 269 |
+
shfl.sync.bfly.b32 %r39, %r36, 8, 31, -1;
|
| 270 |
+
cvt.u64.u32 %rd112, %r38;
|
| 271 |
+
cvt.u64.u32 %rd113, %r39;
|
| 272 |
+
shl.b64 %rd114, %rd113, 32;
|
| 273 |
+
or.b64 %rd115, %rd112, %rd114;
|
| 274 |
+
.loc 2 261 15 // standard.py:261:15 @[ cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:87:27 ]
|
| 275 |
+
add.s64 %rd116, %rd111, %rd115;
|
| 276 |
+
.loc 2 291 36 // standard.py:291:36 @[ cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:87:27 ]
|
| 277 |
+
mov.b64 {_, %r40}, %rd116;
|
| 278 |
+
cvt.u32.u64 %r41, %rd116;
|
| 279 |
+
shfl.sync.bfly.b32 %r42, %r41, 4, 31, -1;
|
| 280 |
+
shfl.sync.bfly.b32 %r43, %r40, 4, 31, -1;
|
| 281 |
+
cvt.u64.u32 %rd117, %r42;
|
| 282 |
+
cvt.u64.u32 %rd118, %r43;
|
| 283 |
+
shl.b64 %rd119, %rd118, 32;
|
| 284 |
+
or.b64 %rd120, %rd117, %rd119;
|
| 285 |
+
.loc 2 261 15 // standard.py:261:15 @[ cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:87:27 ]
|
| 286 |
+
add.s64 %rd121, %rd116, %rd120;
|
| 287 |
+
.loc 2 291 36 // standard.py:291:36 @[ cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:87:27 ]
|
| 288 |
+
mov.b64 {_, %r44}, %rd121;
|
| 289 |
+
cvt.u32.u64 %r45, %rd121;
|
| 290 |
+
shfl.sync.bfly.b32 %r46, %r45, 2, 31, -1;
|
| 291 |
+
shfl.sync.bfly.b32 %r47, %r44, 2, 31, -1;
|
| 292 |
+
cvt.u64.u32 %rd122, %r46;
|
| 293 |
+
cvt.u64.u32 %rd123, %r47;
|
| 294 |
+
shl.b64 %rd124, %rd123, 32;
|
| 295 |
+
or.b64 %rd125, %rd122, %rd124;
|
| 296 |
+
.loc 2 261 15 // standard.py:261:15 @[ cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:87:27 ]
|
| 297 |
+
add.s64 %rd126, %rd121, %rd125;
|
| 298 |
+
.loc 2 291 36 // standard.py:291:36 @[ cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:87:27 ]
|
| 299 |
+
mov.b64 {_, %r48}, %rd126;
|
| 300 |
+
cvt.u32.u64 %r49, %rd126;
|
| 301 |
+
shfl.sync.bfly.b32 %r50, %r49, 1, 31, -1;
|
| 302 |
+
shfl.sync.bfly.b32 %r51, %r48, 1, 31, -1;
|
| 303 |
+
cvt.u64.u32 %rd127, %r50;
|
| 304 |
+
cvt.u64.u32 %rd128, %r51;
|
| 305 |
+
shl.b64 %rd129, %rd128, 32;
|
| 306 |
+
or.b64 %rd130, %rd127, %rd129;
|
| 307 |
+
.loc 2 261 15 // standard.py:261:15 @[ cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:87:27 ]
|
| 308 |
+
add.s64 %rd101, %rd126, %rd130;
|
| 309 |
+
.loc 2 291 36 // standard.py:291:36 @[ cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:87:27 ]
|
| 310 |
+
setp.eq.b32 %p44, %r31, 0;
|
| 311 |
+
shr.u32 %r52, %r1, 2;
|
| 312 |
+
and.b32 %r53, %r52, 120;
|
| 313 |
+
mov.b32 %r54, global_smem;
|
| 314 |
+
add.s32 %r25, %r54, %r53;
|
| 315 |
+
// begin inline asm
|
| 316 |
+
@%p44 st.shared.b64 [ %r25 + 0 ], %rd101;
|
| 317 |
+
// end inline asm
|
| 318 |
+
bar.sync 0;
|
| 319 |
+
setp.lt.u32 %p45, %r1, 16;
|
| 320 |
+
shl.b32 %r55, %r1, 3;
|
| 321 |
+
add.s32 %r26, %r54, %r55;
|
| 322 |
+
// begin inline asm
|
| 323 |
+
@%p45 ld.shared.b64 %rd102, [ %r26 + 0 ];
|
| 324 |
+
// end inline asm
|
| 325 |
+
mov.b64 {_, %r56}, %rd102;
|
| 326 |
+
cvt.u32.u64 %r57, %rd102;
|
| 327 |
+
shfl.sync.bfly.b32 %r58, %r57, 8, 31, -1;
|
| 328 |
+
shfl.sync.bfly.b32 %r59, %r56, 8, 31, -1;
|
| 329 |
+
cvt.u64.u32 %rd131, %r58;
|
| 330 |
+
cvt.u64.u32 %rd132, %r59;
|
| 331 |
+
shl.b64 %rd133, %rd132, 32;
|
| 332 |
+
or.b64 %rd134, %rd131, %rd133;
|
| 333 |
+
.loc 2 261 15 // standard.py:261:15 @[ cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:87:27 ]
|
| 334 |
+
add.s64 %rd135, %rd102, %rd134;
|
| 335 |
+
.loc 2 291 36 // standard.py:291:36 @[ cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:87:27 ]
|
| 336 |
+
mov.b64 {_, %r60}, %rd135;
|
| 337 |
+
cvt.u32.u64 %r61, %rd135;
|
| 338 |
+
shfl.sync.bfly.b32 %r62, %r61, 4, 31, -1;
|
| 339 |
+
shfl.sync.bfly.b32 %r63, %r60, 4, 31, -1;
|
| 340 |
+
cvt.u64.u32 %rd136, %r62;
|
| 341 |
+
cvt.u64.u32 %rd137, %r63;
|
| 342 |
+
shl.b64 %rd138, %rd137, 32;
|
| 343 |
+
or.b64 %rd139, %rd136, %rd138;
|
| 344 |
+
.loc 2 261 15 // standard.py:261:15 @[ cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:87:27 ]
|
| 345 |
+
add.s64 %rd140, %rd135, %rd139;
|
| 346 |
+
.loc 2 291 36 // standard.py:291:36 @[ cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:87:27 ]
|
| 347 |
+
mov.b64 {_, %r64}, %rd140;
|
| 348 |
+
cvt.u32.u64 %r65, %rd140;
|
| 349 |
+
shfl.sync.bfly.b32 %r66, %r65, 2, 31, -1;
|
| 350 |
+
shfl.sync.bfly.b32 %r67, %r64, 2, 31, -1;
|
| 351 |
+
cvt.u64.u32 %rd141, %r66;
|
| 352 |
+
cvt.u64.u32 %rd142, %r67;
|
| 353 |
+
shl.b64 %rd143, %rd142, 32;
|
| 354 |
+
or.b64 %rd144, %rd141, %rd143;
|
| 355 |
+
.loc 2 261 15 // standard.py:261:15 @[ cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:87:27 ]
|
| 356 |
+
add.s64 %rd145, %rd140, %rd144;
|
| 357 |
+
.loc 2 291 36 // standard.py:291:36 @[ cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:87:27 ]
|
| 358 |
+
mov.b64 {_, %r68}, %rd145;
|
| 359 |
+
cvt.u32.u64 %r69, %rd145;
|
| 360 |
+
shfl.sync.bfly.b32 %r70, %r69, 1, 31, -1;
|
| 361 |
+
shfl.sync.bfly.b32 %r71, %r68, 1, 31, -1;
|
| 362 |
+
cvt.u64.u32 %rd146, %r70;
|
| 363 |
+
cvt.u64.u32 %rd147, %r71;
|
| 364 |
+
shl.b64 %rd148, %rd147, 32;
|
| 365 |
+
or.b64 %rd149, %rd146, %rd148;
|
| 366 |
+
.loc 2 261 15 // standard.py:261:15 @[ cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:87:27 ]
|
| 367 |
+
add.s64 %rd103, %rd145, %rd149;
|
| 368 |
+
.loc 2 291 36 // standard.py:291:36 @[ cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:87:27 ]
|
| 369 |
+
setp.eq.b32 %p46, %r1, 0;
|
| 370 |
+
// begin inline asm
|
| 371 |
+
@%p46 st.shared.b64 [ %r26 + 0 ], %rd103;
|
| 372 |
+
// end inline asm
|
| 373 |
+
bar.sync 0;
|
| 374 |
+
ld.shared.b64 %rd150, [global_smem];
|
| 375 |
+
$L__tmp2:
|
| 376 |
+
.loc 1 92 20 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:92:20
|
| 377 |
+
add.s64 %rd151, %rd150, -1;
|
| 378 |
+
setp.lt.u64 %p50, %rd151, 16383;
|
| 379 |
+
.loc 1 0 0 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:0
|
| 380 |
+
selp.b32 %r28, 1, 0, %p50;
|
| 381 |
+
.loc 1 95 21 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:95:21
|
| 382 |
+
setp.eq.b64 %p51, %rd150, 16384;
|
| 383 |
+
.loc 1 0 0 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:0
|
| 384 |
+
selp.b32 %r29, 1, 0, %p51;
|
| 385 |
+
.loc 1 98 25 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:98:25
|
| 386 |
+
shl.b64 %rd152, %rd1, 2;
|
| 387 |
+
add.s64 %rd104, %rd45, %rd152;
|
| 388 |
+
.loc 1 98 37 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:98:37
|
| 389 |
+
and.b32 %r72, %r1, 511;
|
| 390 |
+
setp.eq.b32 %p52, %r72, 0;
|
| 391 |
+
and.pred %p47, %p52, %p1;
|
| 392 |
+
// begin inline asm
|
| 393 |
+
@%p47 st.global.b32 [ %rd104 + 0 ], { %r28 };
|
| 394 |
+
// end inline asm
|
| 395 |
+
.loc 1 99 25 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:99:25
|
| 396 |
+
add.s64 %rd105, %rd46, %rd152;
|
| 397 |
+
.loc 1 99 37 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:99:37
|
| 398 |
+
// begin inline asm
|
| 399 |
+
@%p47 st.global.b32 [ %rd105 + 0 ], { %r29 };
|
| 400 |
+
// end inline asm
|
| 401 |
+
.loc 1 99 4 // cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py:99:4
|
| 402 |
+
ret;
|
| 403 |
+
$L__tmp3:
|
| 404 |
+
$L__func_end0:
|
| 405 |
+
// -- End function
|
| 406 |
+
}
|
| 407 |
+
.file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py"
|
| 408 |
+
.file 2 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py"
|
| 409 |
+
.section .debug_abbrev
|
| 410 |
+
{
|
| 411 |
+
.b8 1 // Abbreviation Code
|
| 412 |
+
.b8 17 // DW_TAG_compile_unit
|
| 413 |
+
.b8 1 // DW_CHILDREN_yes
|
| 414 |
+
.b8 37 // DW_AT_producer
|
| 415 |
+
.b8 8 // DW_FORM_string
|
| 416 |
+
.b8 19 // DW_AT_language
|
| 417 |
+
.b8 5 // DW_FORM_data2
|
| 418 |
+
.b8 3 // DW_AT_name
|
| 419 |
+
.b8 8 // DW_FORM_string
|
| 420 |
+
.b8 16 // DW_AT_stmt_list
|
| 421 |
+
.b8 6 // DW_FORM_data4
|
| 422 |
+
.b8 27 // DW_AT_comp_dir
|
| 423 |
+
.b8 8 // DW_FORM_string
|
| 424 |
+
.b8 0 // EOM(1)
|
| 425 |
+
.b8 0 // EOM(2)
|
| 426 |
+
.b8 2 // Abbreviation Code
|
| 427 |
+
.b8 46 // DW_TAG_subprogram
|
| 428 |
+
.b8 0 // DW_CHILDREN_no
|
| 429 |
+
.b8 3 // DW_AT_name
|
| 430 |
+
.b8 8 // DW_FORM_string
|
| 431 |
+
.b8 32 // DW_AT_inline
|
| 432 |
+
.b8 11 // DW_FORM_data1
|
| 433 |
+
.b8 0 // EOM(1)
|
| 434 |
+
.b8 0 // EOM(2)
|
| 435 |
+
.b8 3 // Abbreviation Code
|
| 436 |
+
.b8 46 // DW_TAG_subprogram
|
| 437 |
+
.b8 1 // DW_CHILDREN_yes
|
| 438 |
+
.b8 17 // DW_AT_low_pc
|
| 439 |
+
.b8 1 // DW_FORM_addr
|
| 440 |
+
.b8 18 // DW_AT_high_pc
|
| 441 |
+
.b8 1 // DW_FORM_addr
|
| 442 |
+
.b8 49 // DW_AT_abstract_origin
|
| 443 |
+
.b8 19 // DW_FORM_ref4
|
| 444 |
+
.b8 0 // EOM(1)
|
| 445 |
+
.b8 0 // EOM(2)
|
| 446 |
+
.b8 4 // Abbreviation Code
|
| 447 |
+
.b8 29 // DW_TAG_inlined_subroutine
|
| 448 |
+
.b8 0 // DW_CHILDREN_no
|
| 449 |
+
.b8 49 // DW_AT_abstract_origin
|
| 450 |
+
.b8 19 // DW_FORM_ref4
|
| 451 |
+
.b8 17 // DW_AT_low_pc
|
| 452 |
+
.b8 1 // DW_FORM_addr
|
| 453 |
+
.b8 18 // DW_AT_high_pc
|
| 454 |
+
.b8 1 // DW_FORM_addr
|
| 455 |
+
.b8 88 // DW_AT_call_file
|
| 456 |
+
.b8 11 // DW_FORM_data1
|
| 457 |
+
.b8 89 // DW_AT_call_line
|
| 458 |
+
.b8 11 // DW_FORM_data1
|
| 459 |
+
.b8 87 // DW_AT_call_column
|
| 460 |
+
.b8 11 // DW_FORM_data1
|
| 461 |
+
.b8 0 // EOM(1)
|
| 462 |
+
.b8 0 // EOM(2)
|
| 463 |
+
.b8 0 // EOM(3)
|
| 464 |
+
}
|
| 465 |
+
.section .debug_info
|
| 466 |
+
{
|
| 467 |
+
.b32 307 // Length of Unit
|
| 468 |
+
.b8 2 // DWARF version number
|
| 469 |
+
.b8 0
|
| 470 |
+
.b32 .debug_abbrev // Offset Into Abbrev. Section
|
| 471 |
+
.b8 8 // Address Size (in bytes)
|
| 472 |
+
.b8 1 // Abbrev [1] 0xb:0x12c DW_TAG_compile_unit
|
| 473 |
+
.b8 116 // DW_AT_producer
|
| 474 |
+
.b8 114
|
| 475 |
+
.b8 105
|
| 476 |
+
.b8 116
|
| 477 |
+
.b8 111
|
| 478 |
+
.b8 110
|
| 479 |
+
.b8 0
|
| 480 |
+
.b8 2 // DW_AT_language
|
| 481 |
+
.b8 0
|
| 482 |
+
.b8 99 // DW_AT_name
|
| 483 |
+
.b8 106
|
| 484 |
+
.b8 120
|
| 485 |
+
.b8 116
|
| 486 |
+
.b8 101
|
| 487 |
+
.b8 122
|
| 488 |
+
.b8 120
|
| 489 |
+
.b8 52
|
| 490 |
+
.b8 52
|
| 491 |
+
.b8 118
|
| 492 |
+
.b8 109
|
| 493 |
+
.b8 113
|
| 494 |
+
.b8 104
|
| 495 |
+
.b8 54
|
| 496 |
+
.b8 50
|
| 497 |
+
.b8 50
|
| 498 |
+
.b8 102
|
| 499 |
+
.b8 51
|
| 500 |
+
.b8 116
|
| 501 |
+
.b8 112
|
| 502 |
+
.b8 109
|
| 503 |
+
.b8 97
|
| 504 |
+
.b8 107
|
| 505 |
+
.b8 108
|
| 506 |
+
.b8 111
|
| 507 |
+
.b8 102
|
| 508 |
+
.b8 53
|
| 509 |
+
.b8 54
|
| 510 |
+
.b8 98
|
| 511 |
+
.b8 114
|
| 512 |
+
.b8 52
|
| 513 |
+
.b8 101
|
| 514 |
+
.b8 121
|
| 515 |
+
.b8 108
|
| 516 |
+
.b8 116
|
| 517 |
+
.b8 51
|
| 518 |
+
.b8 110
|
| 519 |
+
.b8 122
|
| 520 |
+
.b8 52
|
| 521 |
+
.b8 97
|
| 522 |
+
.b8 52
|
| 523 |
+
.b8 54
|
| 524 |
+
.b8 107
|
| 525 |
+
.b8 97
|
| 526 |
+
.b8 118
|
| 527 |
+
.b8 118
|
| 528 |
+
.b8 122
|
| 529 |
+
.b8 50
|
| 530 |
+
.b8 103
|
| 531 |
+
.b8 119
|
| 532 |
+
.b8 113
|
| 533 |
+
.b8 119
|
| 534 |
+
.b8 46
|
| 535 |
+
.b8 112
|
| 536 |
+
.b8 121
|
| 537 |
+
.b8 0
|
| 538 |
+
.b32 .debug_line // DW_AT_stmt_list
|
| 539 |
+
.b8 47 // DW_AT_comp_dir
|
| 540 |
+
.b8 119
|
| 541 |
+
.b8 111
|
| 542 |
+
.b8 114
|
| 543 |
+
.b8 107
|
| 544 |
+
.b8 115
|
| 545 |
+
.b8 112
|
| 546 |
+
.b8 97
|
| 547 |
+
.b8 99
|
| 548 |
+
.b8 101
|
| 549 |
+
.b8 47
|
| 550 |
+
.b8 104
|
| 551 |
+
.b8 97
|
| 552 |
+
.b8 110
|
| 553 |
+
.b8 114
|
| 554 |
+
.b8 117
|
| 555 |
+
.b8 105
|
| 556 |
+
.b8 47
|
| 557 |
+
.b8 83
|
| 558 |
+
.b8 112
|
| 559 |
+
.b8 101
|
| 560 |
+
.b8 99
|
| 561 |
+
.b8 70
|
| 562 |
+
.b8 111
|
| 563 |
+
.b8 114
|
| 564 |
+
.b8 103
|
| 565 |
+
.b8 101
|
| 566 |
+
.b8 45
|
| 567 |
+
.b8 101
|
| 568 |
+
.b8 120
|
| 569 |
+
.b8 116
|
| 570 |
+
.b8 47
|
| 571 |
+
.b8 99
|
| 572 |
+
.b8 97
|
| 573 |
+
.b8 99
|
| 574 |
+
.b8 104
|
| 575 |
+
.b8 101
|
| 576 |
+
.b8 47
|
| 577 |
+
.b8 99
|
| 578 |
+
.b8 111
|
| 579 |
+
.b8 109
|
| 580 |
+
.b8 112
|
| 581 |
+
.b8 105
|
| 582 |
+
.b8 108
|
| 583 |
+
.b8 101
|
| 584 |
+
.b8 100
|
| 585 |
+
.b8 95
|
| 586 |
+
.b8 107
|
| 587 |
+
.b8 101
|
| 588 |
+
.b8 114
|
| 589 |
+
.b8 110
|
| 590 |
+
.b8 101
|
| 591 |
+
.b8 108
|
| 592 |
+
.b8 115
|
| 593 |
+
.b8 47
|
| 594 |
+
.b8 106
|
| 595 |
+
.b8 120
|
| 596 |
+
.b8 0
|
| 597 |
+
.b8 2 // Abbrev [2] 0x8b:0x7d DW_TAG_subprogram
|
| 598 |
+
.b8 116 // DW_AT_name
|
| 599 |
+
.b8 114
|
| 600 |
+
.b8 105
|
| 601 |
+
.b8 116
|
| 602 |
+
.b8 111
|
| 603 |
+
.b8 110
|
| 604 |
+
.b8 95
|
| 605 |
+
.b8 114
|
| 606 |
+
.b8 101
|
| 607 |
+
.b8 100
|
| 608 |
+
.b8 95
|
| 609 |
+
.b8 102
|
| 610 |
+
.b8 117
|
| 611 |
+
.b8 115
|
| 612 |
+
.b8 101
|
| 613 |
+
.b8 100
|
| 614 |
+
.b8 95
|
| 615 |
+
.b8 95
|
| 616 |
+
.b8 116
|
| 617 |
+
.b8 111
|
| 618 |
+
.b8 95
|
| 619 |
+
.b8 99
|
| 620 |
+
.b8 111
|
| 621 |
+
.b8 112
|
| 622 |
+
.b8 121
|
| 623 |
+
.b8 95
|
| 624 |
+
.b8 97
|
| 625 |
+
.b8 114
|
| 626 |
+
.b8 97
|
| 627 |
+
.b8 110
|
| 628 |
+
.b8 103
|
| 629 |
+
.b8 101
|
| 630 |
+
.b8 95
|
| 631 |
+
.b8 98
|
| 632 |
+
.b8 105
|
| 633 |
+
.b8 116
|
| 634 |
+
.b8 119
|
| 635 |
+
.b8 105
|
| 636 |
+
.b8 115
|
| 637 |
+
.b8 101
|
| 638 |
+
.b8 95
|
| 639 |
+
.b8 97
|
| 640 |
+
.b8 110
|
| 641 |
+
.b8 100
|
| 642 |
+
.b8 95
|
| 643 |
+
.b8 98
|
| 644 |
+
.b8 105
|
| 645 |
+
.b8 116
|
| 646 |
+
.b8 119
|
| 647 |
+
.b8 105
|
| 648 |
+
.b8 115
|
| 649 |
+
.b8 101
|
| 650 |
+
.b8 95
|
| 651 |
+
.b8 111
|
| 652 |
+
.b8 114
|
| 653 |
+
.b8 95
|
| 654 |
+
.b8 99
|
| 655 |
+
.b8 111
|
| 656 |
+
.b8 110
|
| 657 |
+
.b8 115
|
| 658 |
+
.b8 116
|
| 659 |
+
.b8 97
|
| 660 |
+
.b8 110
|
| 661 |
+
.b8 116
|
| 662 |
+
.b8 95
|
| 663 |
+
.b8 112
|
| 664 |
+
.b8 97
|
| 665 |
+
.b8 100
|
| 666 |
+
.b8 95
|
| 667 |
+
.b8 110
|
| 668 |
+
.b8 100
|
| 669 |
+
.b8 95
|
| 670 |
+
.b8 101
|
| 671 |
+
.b8 113
|
| 672 |
+
.b8 95
|
| 673 |
+
.b8 103
|
| 674 |
+
.b8 101
|
| 675 |
+
.b8 95
|
| 676 |
+
.b8 103
|
| 677 |
+
.b8 116
|
| 678 |
+
.b8 95
|
| 679 |
+
.b8 105
|
| 680 |
+
.b8 110
|
| 681 |
+
.b8 100
|
| 682 |
+
.b8 101
|
| 683 |
+
.b8 120
|
| 684 |
+
.b8 95
|
| 685 |
+
.b8 108
|
| 686 |
+
.b8 116
|
| 687 |
+
.b8 95
|
| 688 |
+
.b8 112
|
| 689 |
+
.b8 101
|
| 690 |
+
.b8 114
|
| 691 |
+
.b8 109
|
| 692 |
+
.b8 117
|
| 693 |
+
.b8 116
|
| 694 |
+
.b8 101
|
| 695 |
+
.b8 95
|
| 696 |
+
.b8 114
|
| 697 |
+
.b8 101
|
| 698 |
+
.b8 109
|
| 699 |
+
.b8 97
|
| 700 |
+
.b8 105
|
| 701 |
+
.b8 110
|
| 702 |
+
.b8 100
|
| 703 |
+
.b8 101
|
| 704 |
+
.b8 114
|
| 705 |
+
.b8 95
|
| 706 |
+
.b8 115
|
| 707 |
+
.b8 117
|
| 708 |
+
.b8 98
|
| 709 |
+
.b8 95
|
| 710 |
+
.b8 115
|
| 711 |
+
.b8 117
|
| 712 |
+
.b8 109
|
| 713 |
+
.b8 95
|
| 714 |
+
.b8 118
|
| 715 |
+
.b8 105
|
| 716 |
+
.b8 101
|
| 717 |
+
.b8 119
|
| 718 |
+
.b8 95
|
| 719 |
+
.b8 49
|
| 720 |
+
.b8 0
|
| 721 |
+
.b8 1 // DW_AT_inline
|
| 722 |
+
.b8 3 // Abbrev [3] 0x108:0x2e DW_TAG_subprogram
|
| 723 |
+
.b64 $L__func_begin0 // DW_AT_low_pc
|
| 724 |
+
.b64 $L__func_end0 // DW_AT_high_pc
|
| 725 |
+
.b32 139 // DW_AT_abstract_origin
|
| 726 |
+
.b8 4 // Abbrev [4] 0x11d:0x18 DW_TAG_inlined_subroutine
|
| 727 |
+
.b32 139 // DW_AT_abstract_origin
|
| 728 |
+
.b64 $L__tmp1 // DW_AT_low_pc
|
| 729 |
+
.b64 $L__tmp2 // DW_AT_high_pc
|
| 730 |
+
.b8 1 // DW_AT_call_file
|
| 731 |
+
.b8 87 // DW_AT_call_line
|
| 732 |
+
.b8 27 // DW_AT_call_column
|
| 733 |
+
.b8 0 // End Of Children Mark
|
| 734 |
+
.b8 0 // End Of Children Mark
|
| 735 |
+
}
|
| 736 |
+
.section .debug_macinfo { }
|
SpecForge-ext/cache/compiled_kernels/triton/7/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.source
ADDED
|
@@ -0,0 +1,418 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":18:0)
|
| 2 |
+
#loc97 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0)
|
| 3 |
+
#loc99 = loc(unknown)
|
| 4 |
+
#loc102 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0)
|
| 5 |
+
#loc106 = loc("in_ptr0"(#loc))
|
| 6 |
+
#loc107 = loc("out_ptr1"(#loc))
|
| 7 |
+
#loc108 = loc("out_ptr2"(#loc))
|
| 8 |
+
#loc109 = loc("ks0"(#loc))
|
| 9 |
+
#loc110 = loc("ks1"(#loc))
|
| 10 |
+
#loc111 = loc("ks2"(#loc))
|
| 11 |
+
#loc112 = loc("ks3"(#loc))
|
| 12 |
+
#loc113 = loc("ks4"(#loc))
|
| 13 |
+
#loc114 = loc("ks5"(#loc))
|
| 14 |
+
#loc115 = loc("xnumel"(#loc))
|
| 15 |
+
#loc116 = loc("r0_numel"(#loc))
|
| 16 |
+
#loc207 = loc("input"(#loc97))
|
| 17 |
+
#loc208 = loc("a"(#loc102))
|
| 18 |
+
#loc209 = loc("b"(#loc102))
|
| 19 |
+
module {
|
| 20 |
+
tt.func public @triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1(%in_ptr0: !tt.ptr<i64> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr1: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %out_ptr2: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %ks2: i64 loc("ks2"(#loc)), %ks3: i64 loc("ks3"(#loc)), %ks4: i64 loc("ks4"(#loc)), %ks5: i64 loc("ks5"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
|
| 21 |
+
%r0_numel_0 = arith.constant 16384 : i32 loc(#loc117)
|
| 22 |
+
%xoffset = tt.get_program_id x : i32 loc(#loc118)
|
| 23 |
+
%xoffset_1 = arith.constant 1 : i32 loc(#loc119)
|
| 24 |
+
%xoffset_2 = arith.constant 1 : i32 loc(#loc119)
|
| 25 |
+
%xoffset_3 = arith.muli %xoffset, %xoffset_2 : i32 loc(#loc119)
|
| 26 |
+
%xindex = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc120)
|
| 27 |
+
%xindex_4 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc121)
|
| 28 |
+
%xindex_5 = tt.splat %xoffset_3 : i32 -> tensor<1x1xi32> loc(#loc122)
|
| 29 |
+
%xindex_6 = arith.addi %xindex_5, %xindex_4 : tensor<1x1xi32> loc(#loc122)
|
| 30 |
+
%xmask = tt.splat %xnumel : i32 -> tensor<1x1xi32> loc(#loc123)
|
| 31 |
+
%xmask_7 = arith.cmpi slt, %xindex_6, %xmask : tensor<1x1xi32> loc(#loc123)
|
| 32 |
+
%r0_base = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32> loc(#loc124)
|
| 33 |
+
%r0_base_8 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<1024xi32> -> tensor<1x1024xi32> loc(#loc125)
|
| 34 |
+
%x1 = arith.extsi %xindex_6 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc126)
|
| 35 |
+
%x1_9 = tt.splat %ks0 : i64 -> tensor<1x1xi64> loc(#loc126)
|
| 36 |
+
%x1_10 = arith.divsi %x1, %x1_9 : tensor<1x1xi64> loc(#loc126)
|
| 37 |
+
%x1_11 = tt.splat %ks1 : i64 -> tensor<1x1xi64> loc(#loc127)
|
| 38 |
+
%x1_12 = arith.remsi %x1_10, %x1_11 : tensor<1x1xi64> loc(#loc127)
|
| 39 |
+
%x0 = arith.extsi %xindex_6 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc128)
|
| 40 |
+
%x0_13 = tt.splat %ks0 : i64 -> tensor<1x1xi64> loc(#loc128)
|
| 41 |
+
%x0_14 = arith.remsi %x0, %x0_13 : tensor<1x1xi64> loc(#loc128)
|
| 42 |
+
%x2 = arith.extsi %xindex_6 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc129)
|
| 43 |
+
%x2_15 = tt.splat %ks4 : i64 -> tensor<1x1xi64> loc(#loc129)
|
| 44 |
+
%x2_16 = arith.divsi %x2, %x2_15 : tensor<1x1xi64> loc(#loc129)
|
| 45 |
+
%_tmp46 = arith.constant 0 : i64 loc(#loc130)
|
| 46 |
+
%_tmp46_17 = arith.constant dense<0> : tensor<1x1024xi64> loc(#loc130)
|
| 47 |
+
%c0_i32 = arith.constant 0 : i32 loc(#loc15)
|
| 48 |
+
%c1024_i32 = arith.constant 1024 : i32 loc(#loc15)
|
| 49 |
+
%0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc15)
|
| 50 |
+
%1 = arith.bitcast %r0_numel_0 : i32 to i32 loc(#loc15)
|
| 51 |
+
%2 = arith.bitcast %c1024_i32 : i32 to i32 loc(#loc15)
|
| 52 |
+
%3 = ub.poison : i32 loc(#loc15)
|
| 53 |
+
%_tmp46_18 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp46_22 = %_tmp46_17) -> (tensor<1x1024xi64>) : i32 {
|
| 54 |
+
%r0_index = tt.splat %r0_offset : i32 -> tensor<1x1024xi32> loc(#loc132)
|
| 55 |
+
%r0_index_23 = arith.addi %r0_index, %r0_base_8 : tensor<1x1024xi32> loc(#loc132)
|
| 56 |
+
%r0_mask = arith.constant dense<16384> : tensor<1x1024xi32> loc(#loc133)
|
| 57 |
+
%r0_mask_24 = arith.cmpi slt, %r0_index_23, %r0_mask : tensor<1x1024xi32> loc(#loc133)
|
| 58 |
+
%r0_4 = arith.constant 128 : i32 loc(#loc134)
|
| 59 |
+
%r0_4_25 = arith.constant 128 : i32 loc(#loc134)
|
| 60 |
+
%r0_4_26 = arith.constant dense<128> : tensor<1x1024xi32> loc(#loc134)
|
| 61 |
+
%r0_4_27 = arith.divsi %r0_index_23, %r0_4_26 : tensor<1x1024xi32> loc(#loc134)
|
| 62 |
+
%r0_3 = arith.constant 128 : i32 loc(#loc135)
|
| 63 |
+
%r0_3_28 = arith.constant 128 : i32 loc(#loc135)
|
| 64 |
+
%r0_3_29 = arith.constant dense<128> : tensor<1x1024xi32> loc(#loc135)
|
| 65 |
+
%r0_3_30 = arith.remsi %r0_index_23, %r0_3_29 : tensor<1x1024xi32> loc(#loc135)
|
| 66 |
+
%tmp0 = arith.constant 128 : i32 loc(#loc136)
|
| 67 |
+
%tmp0_31 = arith.constant 128 : i64 loc(#loc136)
|
| 68 |
+
%tmp0_32 = arith.constant dense<128> : tensor<1x1xi64> loc(#loc136)
|
| 69 |
+
%tmp0_33 = arith.muli %tmp0_32, %x1_12 : tensor<1x1xi64> loc(#loc136)
|
| 70 |
+
%tmp0_34 = arith.extsi %r0_4_27 : tensor<1x1024xi32> to tensor<1x1024xi64> loc(#loc137)
|
| 71 |
+
%tmp0_35 = tt.broadcast %tmp0_33 : tensor<1x1xi64> -> tensor<1x1024xi64> loc(#loc137)
|
| 72 |
+
%tmp0_36 = arith.addi %tmp0_34, %tmp0_35 : tensor<1x1024xi64> loc(#loc137)
|
| 73 |
+
%tmp2 = tt.splat %ks2 : i64 -> tensor<1x1024xi64> loc(#loc138)
|
| 74 |
+
%tmp2_37 = arith.cmpi slt, %tmp0_36, %tmp2 : tensor<1x1024xi64> loc(#loc138)
|
| 75 |
+
%tmp3 = arith.constant 128 : i32 loc(#loc139)
|
| 76 |
+
%tmp3_38 = arith.constant 128 : i64 loc(#loc139)
|
| 77 |
+
%tmp3_39 = arith.constant dense<128> : tensor<1x1xi64> loc(#loc139)
|
| 78 |
+
%tmp3_40 = arith.muli %tmp3_39, %x0_14 : tensor<1x1xi64> loc(#loc139)
|
| 79 |
+
%tmp3_41 = arith.extsi %r0_3_30 : tensor<1x1024xi32> to tensor<1x1024xi64> loc(#loc140)
|
| 80 |
+
%tmp3_42 = tt.broadcast %tmp3_40 : tensor<1x1xi64> -> tensor<1x1024xi64> loc(#loc140)
|
| 81 |
+
%tmp3_43 = arith.addi %tmp3_41, %tmp3_42 : tensor<1x1024xi64> loc(#loc140)
|
| 82 |
+
%tmp5 = tt.splat %ks3 : i64 -> tensor<1x1024xi64> loc(#loc141)
|
| 83 |
+
%tmp5_44 = arith.cmpi slt, %tmp3_43, %tmp5 : tensor<1x1024xi64> loc(#loc141)
|
| 84 |
+
%tmp6 = arith.andi %tmp2_37, %tmp5_44 : tensor<1x1024xi1> loc(#loc142)
|
| 85 |
+
%tmp7 = arith.constant 128 : i32 loc(#loc143)
|
| 86 |
+
%tmp7_45 = arith.constant 128 : i64 loc(#loc143)
|
| 87 |
+
%tmp7_46 = arith.constant dense<128> : tensor<1x1xi64> loc(#loc143)
|
| 88 |
+
%tmp7_47 = arith.muli %tmp7_46, %x1_12 : tensor<1x1xi64> loc(#loc143)
|
| 89 |
+
%tmp7_48 = arith.extsi %r0_4_27 : tensor<1x1024xi32> to tensor<1x1024xi64> loc(#loc144)
|
| 90 |
+
%tmp7_49 = tt.broadcast %tmp7_47 : tensor<1x1xi64> -> tensor<1x1024xi64> loc(#loc144)
|
| 91 |
+
%tmp7_50 = arith.addi %tmp7_48, %tmp7_49 : tensor<1x1024xi64> loc(#loc144)
|
| 92 |
+
%tmp8 = arith.constant 128 : i32 loc(#loc145)
|
| 93 |
+
%tmp8_51 = arith.constant 128 : i64 loc(#loc145)
|
| 94 |
+
%tmp8_52 = arith.constant dense<128> : tensor<1x1xi64> loc(#loc145)
|
| 95 |
+
%tmp8_53 = arith.muli %tmp8_52, %x0_14 : tensor<1x1xi64> loc(#loc145)
|
| 96 |
+
%tmp8_54 = arith.extsi %r0_3_30 : tensor<1x1024xi32> to tensor<1x1024xi64> loc(#loc146)
|
| 97 |
+
%tmp8_55 = tt.broadcast %tmp8_53 : tensor<1x1xi64> -> tensor<1x1024xi64> loc(#loc146)
|
| 98 |
+
%tmp8_56 = arith.addi %tmp8_54, %tmp8_55 : tensor<1x1024xi64> loc(#loc146)
|
| 99 |
+
%tmp9 = arith.cmpi sge, %tmp7_50, %tmp8_56 : tensor<1x1024xi64> loc(#loc147)
|
| 100 |
+
%tmp10 = tt.broadcast %x2_16 : tensor<1x1xi64> -> tensor<1x1024xi64> loc(#loc148)
|
| 101 |
+
%tmp10_57 = tt.splat %in_ptr0 : !tt.ptr<i64> -> tensor<1x1024x!tt.ptr<i64>> loc(#loc149)
|
| 102 |
+
%tmp10_58 = tt.addptr %tmp10_57, %tmp10 : tensor<1x1024x!tt.ptr<i64>>, tensor<1x1024xi64> loc(#loc149)
|
| 103 |
+
%tmp10_59 = arith.andi %r0_mask_24, %tmp6 : tensor<1x1024xi1> loc(#loc150)
|
| 104 |
+
%tmp10_60 = tt.broadcast %xmask_7 : tensor<1x1xi1> -> tensor<1x1024xi1> loc(#loc151)
|
| 105 |
+
%tmp10_61 = arith.andi %tmp10_59, %tmp10_60 : tensor<1x1024xi1> loc(#loc151)
|
| 106 |
+
%tmp10_62 = arith.constant 0.000000e+00 : f32 loc(#loc152)
|
| 107 |
+
%tmp10_63 = arith.constant dense<0.000000e+00> : tensor<1x1024xf32> loc(#loc152)
|
| 108 |
+
%tmp10_64 = arith.fptosi %tmp10_63 : tensor<1x1024xf32> to tensor<1x1024xi64> loc(#loc152)
|
| 109 |
+
%tmp10_65 = tt.load %tmp10_58, %tmp10_61, %tmp10_64 evictionPolicy = evict_last : tensor<1x1024x!tt.ptr<i64>> loc(#loc152)
|
| 110 |
+
%tmp11 = arith.cmpi slt, %tmp8_56, %tmp10_65 : tensor<1x1024xi64> loc(#loc153)
|
| 111 |
+
%tmp12 = arith.cmpi slt, %tmp7_50, %tmp10_65 : tensor<1x1024xi64> loc(#loc154)
|
| 112 |
+
%tmp13 = arith.andi %tmp11, %tmp12 : tensor<1x1024xi1> loc(#loc155)
|
| 113 |
+
%tmp14 = arith.andi %tmp9, %tmp13 : tensor<1x1024xi1> loc(#loc156)
|
| 114 |
+
%tmp15 = arith.constant false loc(#loc157)
|
| 115 |
+
%tmp15_66 = arith.constant dense<false> : tensor<1x1xi1> loc(#loc157)
|
| 116 |
+
%tmp16 = arith.constant dense<false> : tensor<1x1024xi1> loc(#loc158)
|
| 117 |
+
%tmp16_67 = arith.ori %tmp16, %tmp14 : tensor<1x1024xi1> loc(#loc158)
|
| 118 |
+
%tmp17 = tt.splat %ks5 : i64 -> tensor<1x1024xi64> loc(#loc159)
|
| 119 |
+
%tmp18 = arith.cmpi sge, %tmp8_56, %tmp17 : tensor<1x1024xi64> loc(#loc160)
|
| 120 |
+
%tmp19 = arith.remsi %tmp8_56, %tmp17 : tensor<1x1024xi64> loc(#loc161)
|
| 121 |
+
%tmp20 = arith.constant 0 : i32 loc(#loc162)
|
| 122 |
+
%tmp20_68 = arith.constant dense<0> : tensor<1x1xi32> loc(#loc162)
|
| 123 |
+
%tmp21 = arith.extsi %tmp20_68 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc163)
|
| 124 |
+
%tmp21_69 = tt.broadcast %tmp21 : tensor<1x1xi64> -> tensor<1x1024xi64> loc(#loc163)
|
| 125 |
+
%tmp21_70 = arith.cmpi ne, %tmp19, %tmp21_69 : tensor<1x1024xi64> loc(#loc163)
|
| 126 |
+
%tmp22 = arith.constant 0 : i32 loc(#loc164)
|
| 127 |
+
%tmp22_71 = arith.extsi %tmp22 : i32 to i64 loc(#loc164)
|
| 128 |
+
%tmp22_72 = tt.splat %tmp22_71 : i64 -> tensor<1x1024xi64> loc(#loc164)
|
| 129 |
+
%tmp22_73 = arith.cmpi slt, %tmp19, %tmp22_72 : tensor<1x1024xi64> loc(#loc164)
|
| 130 |
+
%tmp23 = arith.constant 0 : i32 loc(#loc165)
|
| 131 |
+
%tmp23_74 = arith.extsi %tmp23 : i32 to i64 loc(#loc165)
|
| 132 |
+
%tmp23_75 = tt.splat %tmp23_74 : i64 -> tensor<1x1024xi64> loc(#loc165)
|
| 133 |
+
%tmp23_76 = arith.cmpi slt, %tmp17, %tmp23_75 : tensor<1x1024xi64> loc(#loc165)
|
| 134 |
+
%tmp24 = arith.cmpi ne, %tmp22_73, %tmp23_76 : tensor<1x1024xi1> loc(#loc166)
|
| 135 |
+
%tmp25 = arith.andi %tmp21_70, %tmp24 : tensor<1x1024xi1> loc(#loc167)
|
| 136 |
+
%tmp26 = arith.addi %tmp19, %tmp17 : tensor<1x1024xi64> loc(#loc168)
|
| 137 |
+
%tmp27 = arith.select %tmp25, %tmp26, %tmp19 : tensor<1x1024xi1>, tensor<1x1024xi64> loc(#loc169)
|
| 138 |
+
%tmp28 = arith.cmpi slt, %tmp27, %tmp10_65 : tensor<1x1024xi64> loc(#loc170)
|
| 139 |
+
%tmp29 = arith.andi %tmp18, %tmp28 : tensor<1x1024xi1> loc(#loc171)
|
| 140 |
+
%tmp30 = arith.constant -1 : i32 loc(#loc172)
|
| 141 |
+
%tmp30_77 = arith.constant -1 : i32 loc(#loc172)
|
| 142 |
+
%tmp30_78 = arith.constant dense<-1> : tensor<1x1024xi32> loc(#loc172)
|
| 143 |
+
%tmp30_79 = arith.muli %tmp30_78, %r0_4_27 : tensor<1x1024xi32> loc(#loc172)
|
| 144 |
+
%tmp30_80 = arith.addi %r0_3_30, %tmp30_79 : tensor<1x1024xi32> loc(#loc173)
|
| 145 |
+
%tmp30_81 = arith.constant -128 : i32 loc(#loc174)
|
| 146 |
+
%tmp30_82 = arith.constant -128 : i64 loc(#loc174)
|
| 147 |
+
%tmp30_83 = arith.constant dense<-128> : tensor<1x1xi64> loc(#loc174)
|
| 148 |
+
%tmp30_84 = arith.muli %tmp30_83, %x1_12 : tensor<1x1xi64> loc(#loc174)
|
| 149 |
+
%tmp30_85 = arith.extsi %tmp30_80 : tensor<1x1024xi32> to tensor<1x1024xi64> loc(#loc175)
|
| 150 |
+
%tmp30_86 = tt.broadcast %tmp30_84 : tensor<1x1xi64> -> tensor<1x1024xi64> loc(#loc175)
|
| 151 |
+
%tmp30_87 = arith.addi %tmp30_85, %tmp30_86 : tensor<1x1024xi64> loc(#loc175)
|
| 152 |
+
%tmp30_88 = arith.constant 128 : i32 loc(#loc176)
|
| 153 |
+
%tmp30_89 = arith.constant 128 : i64 loc(#loc176)
|
| 154 |
+
%tmp30_90 = arith.constant dense<128> : tensor<1x1xi64> loc(#loc176)
|
| 155 |
+
%tmp30_91 = arith.muli %tmp30_90, %x0_14 : tensor<1x1xi64> loc(#loc176)
|
| 156 |
+
%tmp30_92 = tt.broadcast %tmp30_91 : tensor<1x1xi64> -> tensor<1x1024xi64> loc(#loc177)
|
| 157 |
+
%tmp30_93 = arith.addi %tmp30_87, %tmp30_92 : tensor<1x1024xi64> loc(#loc177)
|
| 158 |
+
%tmp31 = arith.remsi %tmp30_93, %tmp17 : tensor<1x1024xi64> loc(#loc178)
|
| 159 |
+
%tmp32 = arith.extsi %tmp20_68 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc179)
|
| 160 |
+
%tmp32_94 = tt.broadcast %tmp32 : tensor<1x1xi64> -> tensor<1x1024xi64> loc(#loc179)
|
| 161 |
+
%tmp32_95 = arith.cmpi ne, %tmp31, %tmp32_94 : tensor<1x1024xi64> loc(#loc179)
|
| 162 |
+
%tmp33 = arith.constant 0 : i32 loc(#loc180)
|
| 163 |
+
%tmp33_96 = arith.extsi %tmp33 : i32 to i64 loc(#loc180)
|
| 164 |
+
%tmp33_97 = tt.splat %tmp33_96 : i64 -> tensor<1x1024xi64> loc(#loc180)
|
| 165 |
+
%tmp33_98 = arith.cmpi slt, %tmp31, %tmp33_97 : tensor<1x1024xi64> loc(#loc180)
|
| 166 |
+
%tmp34 = arith.cmpi ne, %tmp33_98, %tmp23_76 : tensor<1x1024xi1> loc(#loc181)
|
| 167 |
+
%tmp35 = arith.andi %tmp32_95, %tmp34 : tensor<1x1024xi1> loc(#loc182)
|
| 168 |
+
%tmp36 = arith.addi %tmp31, %tmp17 : tensor<1x1024xi64> loc(#loc183)
|
| 169 |
+
%tmp37 = arith.select %tmp35, %tmp36, %tmp31 : tensor<1x1024xi1>, tensor<1x1024xi64> loc(#loc184)
|
| 170 |
+
%tmp38 = arith.constant 0 : i64 loc(#loc185)
|
| 171 |
+
%tmp38_99 = arith.constant dense<0> : tensor<1x1xi64> loc(#loc185)
|
| 172 |
+
%tmp39 = arith.constant dense<0> : tensor<1x1024xi64> loc(#loc186)
|
| 173 |
+
%tmp39_100 = arith.cmpi eq, %tmp37, %tmp39 : tensor<1x1024xi64> loc(#loc186)
|
| 174 |
+
%tmp40 = arith.andi %tmp29, %tmp39_100 : tensor<1x1024xi1> loc(#loc187)
|
| 175 |
+
%tmp41 = arith.ori %tmp16_67, %tmp40 : tensor<1x1024xi1> loc(#loc188)
|
| 176 |
+
%tmp42 = arith.constant false loc(#loc189)
|
| 177 |
+
%tmp42_101 = arith.constant dense<false> : tensor<1x1024xi1> loc(#loc189)
|
| 178 |
+
%tmp43 = arith.select %tmp6, %tmp41, %tmp42_101 : tensor<1x1024xi1>, tensor<1x1024xi1> loc(#loc190)
|
| 179 |
+
%tmp44 = arith.extui %tmp43 : tensor<1x1024xi1> to tensor<1x1024xi64> loc(#loc191)
|
| 180 |
+
%tmp47 = arith.addi %_tmp46_22, %tmp44 : tensor<1x1024xi64> loc(#loc192)
|
| 181 |
+
%_tmp46_102 = tt.broadcast %xmask_7 : tensor<1x1xi1> -> tensor<1x1024xi1> loc(#loc193)
|
| 182 |
+
%_tmp46_103 = arith.andi %r0_mask_24, %_tmp46_102 : tensor<1x1024xi1> loc(#loc193)
|
| 183 |
+
%_tmp46_104 = arith.select %_tmp46_103, %tmp47, %_tmp46_22 : tensor<1x1024xi1>, tensor<1x1024xi64> loc(#loc194)
|
| 184 |
+
scf.yield %_tmp46_104 : tensor<1x1024xi64> loc(#loc79)
|
| 185 |
+
} loc(#loc131)
|
| 186 |
+
%tmp46 = tt.call @"triton.language.standard.sum__i64S1_1024S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp46_18) : (tensor<1x1024xi64>) -> tensor<1xi64> loc(#loc195)
|
| 187 |
+
%tmp46_19 = tt.expand_dims %tmp46 {axis = 1 : i32} : tensor<1xi64> -> tensor<1x1xi64> loc(#loc196)
|
| 188 |
+
%tmp48 = arith.constant 0 : i64 loc(#loc197)
|
| 189 |
+
%tmp48_20 = arith.constant dense<0> : tensor<1x1xi64> loc(#loc197)
|
| 190 |
+
%tmp49 = arith.cmpi sgt, %tmp46_19, %tmp48_20 : tensor<1x1xi64> loc(#loc198)
|
| 191 |
+
%tmp50 = arith.constant 16384 : i64 loc(#loc199)
|
| 192 |
+
%tmp50_21 = arith.constant dense<16384> : tensor<1x1xi64> loc(#loc199)
|
| 193 |
+
%tmp51 = arith.cmpi slt, %tmp46_19, %tmp50_21 : tensor<1x1xi64> loc(#loc200)
|
| 194 |
+
%tmp52 = arith.andi %tmp49, %tmp51 : tensor<1x1xi1> loc(#loc201)
|
| 195 |
+
%tmp53 = arith.extui %tmp52 : tensor<1x1xi1> to tensor<1x1xi8> loc(#loc202)
|
| 196 |
+
%tmp54 = arith.extsi %tmp53 : tensor<1x1xi8> to tensor<1x1xi32> loc(#loc203)
|
| 197 |
+
%tmp55 = arith.cmpi eq, %tmp46_19, %tmp50_21 : tensor<1x1xi64> loc(#loc204)
|
| 198 |
+
%tmp56 = arith.extui %tmp55 : tensor<1x1xi1> to tensor<1x1xi8> loc(#loc205)
|
| 199 |
+
%tmp57 = arith.extsi %tmp56 : tensor<1x1xi8> to tensor<1x1xi32> loc(#loc206)
|
| 200 |
+
%4 = tt.splat %out_ptr1 : !tt.ptr<i32> -> tensor<1x1x!tt.ptr<i32>> loc(#loc92)
|
| 201 |
+
%5 = tt.addptr %4, %xindex_6 : tensor<1x1x!tt.ptr<i32>>, tensor<1x1xi32> loc(#loc92)
|
| 202 |
+
tt.store %5, %tmp54, %xmask_7 : tensor<1x1x!tt.ptr<i32>> loc(#loc93)
|
| 203 |
+
%6 = tt.splat %out_ptr2 : !tt.ptr<i32> -> tensor<1x1x!tt.ptr<i32>> loc(#loc94)
|
| 204 |
+
%7 = tt.addptr %6, %xindex_6 : tensor<1x1x!tt.ptr<i32>>, tensor<1x1xi32> loc(#loc94)
|
| 205 |
+
tt.store %7, %tmp57, %xmask_7 : tensor<1x1x!tt.ptr<i32>> loc(#loc95)
|
| 206 |
+
tt.return loc(#loc96)
|
| 207 |
+
} loc(#loc)
|
| 208 |
+
tt.func private @"triton.language.standard.sum__i64S1_1024S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<1x1024xi64> loc("input"(#loc97))) -> tensor<1xi64> attributes {noinline = false} {
|
| 209 |
+
%0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({
|
| 210 |
+
^bb0(%arg1: i64 loc(unknown), %arg2: i64 loc(unknown)):
|
| 211 |
+
%2 = tt.call @triton.language.standard._sum_combine__i64_i64__(%arg1, %arg2) : (i64, i64) -> i64 loc(#loc98)
|
| 212 |
+
tt.reduce.return %2 : i64 loc(#loc98)
|
| 213 |
+
}) : (tensor<1x1024xi64>) -> tensor<1xi64> loc(#loc98)
|
| 214 |
+
tt.return %0 : tensor<1xi64> loc(#loc100)
|
| 215 |
+
^bb1: // no predecessors
|
| 216 |
+
%1 = ub.poison : tensor<1xi64> loc(#loc101)
|
| 217 |
+
tt.return %1 : tensor<1xi64> loc(#loc101)
|
| 218 |
+
} loc(#loc97)
|
| 219 |
+
tt.func private @triton.language.standard._sum_combine__i64_i64__(%a: i64 loc("a"(#loc102)), %b: i64 loc("b"(#loc102))) -> i64 attributes {noinline = false} {
|
| 220 |
+
%0 = arith.addi %a, %b : i64 loc(#loc103)
|
| 221 |
+
tt.return %0 : i64 loc(#loc104)
|
| 222 |
+
^bb1: // no predecessors
|
| 223 |
+
%1 = ub.poison : i64 loc(#loc105)
|
| 224 |
+
tt.return %1 : i64 loc(#loc105)
|
| 225 |
+
} loc(#loc102)
|
| 226 |
+
} loc(#loc)
|
| 227 |
+
#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":19:15)
|
| 228 |
+
#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":22:28)
|
| 229 |
+
#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":22:33)
|
| 230 |
+
#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":23:36)
|
| 231 |
+
#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":23:44)
|
| 232 |
+
#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":23:23)
|
| 233 |
+
#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":24:21)
|
| 234 |
+
#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":25:27)
|
| 235 |
+
#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":25:37)
|
| 236 |
+
#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":27:21)
|
| 237 |
+
#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":27:28)
|
| 238 |
+
#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":28:19)
|
| 239 |
+
#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":29:19)
|
| 240 |
+
#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":30:44)
|
| 241 |
+
#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":32:40)
|
| 242 |
+
#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":33:31)
|
| 243 |
+
#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":34:29)
|
| 244 |
+
#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":37:27)
|
| 245 |
+
#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":38:27)
|
| 246 |
+
#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":39:26)
|
| 247 |
+
#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":39:22)
|
| 248 |
+
#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":41:22)
|
| 249 |
+
#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":42:26)
|
| 250 |
+
#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":42:22)
|
| 251 |
+
#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":44:22)
|
| 252 |
+
#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":45:22)
|
| 253 |
+
#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":46:26)
|
| 254 |
+
#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":46:22)
|
| 255 |
+
#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":47:26)
|
| 256 |
+
#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":47:22)
|
| 257 |
+
#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":48:23)
|
| 258 |
+
#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":49:55)
|
| 259 |
+
#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":49:35)
|
| 260 |
+
#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":49:87)
|
| 261 |
+
#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":49:94)
|
| 262 |
+
#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":49:77)
|
| 263 |
+
#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":50:23)
|
| 264 |
+
#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":51:23)
|
| 265 |
+
#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":52:24)
|
| 266 |
+
#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":53:23)
|
| 267 |
+
#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":54:39)
|
| 268 |
+
#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":55:24)
|
| 269 |
+
#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":56:37)
|
| 270 |
+
#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":57:24)
|
| 271 |
+
#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":58:24)
|
| 272 |
+
#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":59:35)
|
| 273 |
+
#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":60:25)
|
| 274 |
+
#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":61:92)
|
| 275 |
+
#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":62:92)
|
| 276 |
+
#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":63:25)
|
| 277 |
+
#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":64:24)
|
| 278 |
+
#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":65:24)
|
| 279 |
+
#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":66:39)
|
| 280 |
+
#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":67:24)
|
| 281 |
+
#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":68:24)
|
| 282 |
+
#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":69:29)
|
| 283 |
+
#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":69:24)
|
| 284 |
+
#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":69:45)
|
| 285 |
+
#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":69:38)
|
| 286 |
+
#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":69:55)
|
| 287 |
+
#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":69:51)
|
| 288 |
+
#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":70:25)
|
| 289 |
+
#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":71:25)
|
| 290 |
+
#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":72:92)
|
| 291 |
+
#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":73:25)
|
| 292 |
+
#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":74:24)
|
| 293 |
+
#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":75:24)
|
| 294 |
+
#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":76:39)
|
| 295 |
+
#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":77:35)
|
| 296 |
+
#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":78:25)
|
| 297 |
+
#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":79:24)
|
| 298 |
+
#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":80:24)
|
| 299 |
+
#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":81:44)
|
| 300 |
+
#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":82:38)
|
| 301 |
+
#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":83:25)
|
| 302 |
+
#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":85:25)
|
| 303 |
+
#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":86:36)
|
| 304 |
+
#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":86:50)
|
| 305 |
+
#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":86:8)
|
| 306 |
+
#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":87:27)
|
| 307 |
+
#loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":87:30)
|
| 308 |
+
#loc82 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":88:31)
|
| 309 |
+
#loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":89:20)
|
| 310 |
+
#loc84 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":90:35)
|
| 311 |
+
#loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":91:20)
|
| 312 |
+
#loc86 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":92:20)
|
| 313 |
+
#loc87 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":93:21)
|
| 314 |
+
#loc88 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":94:21)
|
| 315 |
+
#loc89 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":95:21)
|
| 316 |
+
#loc90 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":96:21)
|
| 317 |
+
#loc91 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":97:21)
|
| 318 |
+
#loc92 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":98:25)
|
| 319 |
+
#loc93 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":98:37)
|
| 320 |
+
#loc94 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":99:25)
|
| 321 |
+
#loc95 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":99:37)
|
| 322 |
+
#loc96 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":99:4)
|
| 323 |
+
#loc98 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36)
|
| 324 |
+
#loc100 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11)
|
| 325 |
+
#loc101 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4)
|
| 326 |
+
#loc103 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15)
|
| 327 |
+
#loc104 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11)
|
| 328 |
+
#loc105 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4)
|
| 329 |
+
#loc117 = loc("r0_numel"(#loc1))
|
| 330 |
+
#loc118 = loc("xoffset"(#loc2))
|
| 331 |
+
#loc119 = loc("xoffset"(#loc3))
|
| 332 |
+
#loc120 = loc("xindex"(#loc4))
|
| 333 |
+
#loc121 = loc("xindex"(#loc5))
|
| 334 |
+
#loc122 = loc("xindex"(#loc6))
|
| 335 |
+
#loc123 = loc("xmask"(#loc7))
|
| 336 |
+
#loc124 = loc("r0_base"(#loc8))
|
| 337 |
+
#loc125 = loc("r0_base"(#loc9))
|
| 338 |
+
#loc126 = loc("x1"(#loc10))
|
| 339 |
+
#loc127 = loc("x1"(#loc11))
|
| 340 |
+
#loc128 = loc("x0"(#loc12))
|
| 341 |
+
#loc129 = loc("x2"(#loc13))
|
| 342 |
+
#loc130 = loc("_tmp46"(#loc14))
|
| 343 |
+
#loc131 = loc("_tmp46"(#loc15))
|
| 344 |
+
#loc132 = loc("r0_index"(#loc16))
|
| 345 |
+
#loc133 = loc("r0_mask"(#loc17))
|
| 346 |
+
#loc134 = loc("r0_4"(#loc18))
|
| 347 |
+
#loc135 = loc("r0_3"(#loc19))
|
| 348 |
+
#loc136 = loc("tmp0"(#loc20))
|
| 349 |
+
#loc137 = loc("tmp0"(#loc21))
|
| 350 |
+
#loc138 = loc("tmp2"(#loc22))
|
| 351 |
+
#loc139 = loc("tmp3"(#loc23))
|
| 352 |
+
#loc140 = loc("tmp3"(#loc24))
|
| 353 |
+
#loc141 = loc("tmp5"(#loc25))
|
| 354 |
+
#loc142 = loc("tmp6"(#loc26))
|
| 355 |
+
#loc143 = loc("tmp7"(#loc27))
|
| 356 |
+
#loc144 = loc("tmp7"(#loc28))
|
| 357 |
+
#loc145 = loc("tmp8"(#loc29))
|
| 358 |
+
#loc146 = loc("tmp8"(#loc30))
|
| 359 |
+
#loc147 = loc("tmp9"(#loc31))
|
| 360 |
+
#loc148 = loc("tmp10"(#loc32))
|
| 361 |
+
#loc149 = loc("tmp10"(#loc33))
|
| 362 |
+
#loc150 = loc("tmp10"(#loc34))
|
| 363 |
+
#loc151 = loc("tmp10"(#loc35))
|
| 364 |
+
#loc152 = loc("tmp10"(#loc36))
|
| 365 |
+
#loc153 = loc("tmp11"(#loc37))
|
| 366 |
+
#loc154 = loc("tmp12"(#loc38))
|
| 367 |
+
#loc155 = loc("tmp13"(#loc39))
|
| 368 |
+
#loc156 = loc("tmp14"(#loc40))
|
| 369 |
+
#loc157 = loc("tmp15"(#loc41))
|
| 370 |
+
#loc158 = loc("tmp16"(#loc42))
|
| 371 |
+
#loc159 = loc("tmp17"(#loc43))
|
| 372 |
+
#loc160 = loc("tmp18"(#loc44))
|
| 373 |
+
#loc161 = loc("tmp19"(#loc45))
|
| 374 |
+
#loc162 = loc("tmp20"(#loc46))
|
| 375 |
+
#loc163 = loc("tmp21"(#loc47))
|
| 376 |
+
#loc164 = loc("tmp22"(#loc48))
|
| 377 |
+
#loc165 = loc("tmp23"(#loc49))
|
| 378 |
+
#loc166 = loc("tmp24"(#loc50))
|
| 379 |
+
#loc167 = loc("tmp25"(#loc51))
|
| 380 |
+
#loc168 = loc("tmp26"(#loc52))
|
| 381 |
+
#loc169 = loc("tmp27"(#loc53))
|
| 382 |
+
#loc170 = loc("tmp28"(#loc54))
|
| 383 |
+
#loc171 = loc("tmp29"(#loc55))
|
| 384 |
+
#loc172 = loc("tmp30"(#loc56))
|
| 385 |
+
#loc173 = loc("tmp30"(#loc57))
|
| 386 |
+
#loc174 = loc("tmp30"(#loc58))
|
| 387 |
+
#loc175 = loc("tmp30"(#loc59))
|
| 388 |
+
#loc176 = loc("tmp30"(#loc60))
|
| 389 |
+
#loc177 = loc("tmp30"(#loc61))
|
| 390 |
+
#loc178 = loc("tmp31"(#loc62))
|
| 391 |
+
#loc179 = loc("tmp32"(#loc63))
|
| 392 |
+
#loc180 = loc("tmp33"(#loc64))
|
| 393 |
+
#loc181 = loc("tmp34"(#loc65))
|
| 394 |
+
#loc182 = loc("tmp35"(#loc66))
|
| 395 |
+
#loc183 = loc("tmp36"(#loc67))
|
| 396 |
+
#loc184 = loc("tmp37"(#loc68))
|
| 397 |
+
#loc185 = loc("tmp38"(#loc69))
|
| 398 |
+
#loc186 = loc("tmp39"(#loc70))
|
| 399 |
+
#loc187 = loc("tmp40"(#loc71))
|
| 400 |
+
#loc188 = loc("tmp41"(#loc72))
|
| 401 |
+
#loc189 = loc("tmp42"(#loc73))
|
| 402 |
+
#loc190 = loc("tmp43"(#loc74))
|
| 403 |
+
#loc191 = loc("tmp44"(#loc75))
|
| 404 |
+
#loc192 = loc("tmp47"(#loc76))
|
| 405 |
+
#loc193 = loc("_tmp46"(#loc77))
|
| 406 |
+
#loc194 = loc("_tmp46"(#loc78))
|
| 407 |
+
#loc195 = loc("tmp46"(#loc80))
|
| 408 |
+
#loc196 = loc("tmp46"(#loc81))
|
| 409 |
+
#loc197 = loc("tmp48"(#loc82))
|
| 410 |
+
#loc198 = loc("tmp49"(#loc83))
|
| 411 |
+
#loc199 = loc("tmp50"(#loc84))
|
| 412 |
+
#loc200 = loc("tmp51"(#loc85))
|
| 413 |
+
#loc201 = loc("tmp52"(#loc86))
|
| 414 |
+
#loc202 = loc("tmp53"(#loc87))
|
| 415 |
+
#loc203 = loc("tmp54"(#loc88))
|
| 416 |
+
#loc204 = loc("tmp55"(#loc89))
|
| 417 |
+
#loc205 = loc("tmp56"(#loc90))
|
| 418 |
+
#loc206 = loc("tmp57"(#loc91))
|
SpecForge-ext/cache/compiled_kernels/triton/7/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ttgir
ADDED
|
@@ -0,0 +1,280 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 16], order = [0, 1]}>
|
| 2 |
+
#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":18:0)
|
| 3 |
+
#loc1 = loc(unknown)
|
| 4 |
+
#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":87:27)
|
| 5 |
+
#loc79 = loc("in_ptr0"(#loc))
|
| 6 |
+
#loc80 = loc("out_ptr1"(#loc))
|
| 7 |
+
#loc81 = loc("out_ptr2"(#loc))
|
| 8 |
+
#loc82 = loc("ks0"(#loc))
|
| 9 |
+
#loc83 = loc("ks1"(#loc))
|
| 10 |
+
#loc84 = loc("ks2"(#loc))
|
| 11 |
+
#loc85 = loc("ks3"(#loc))
|
| 12 |
+
#loc86 = loc("ks4"(#loc))
|
| 13 |
+
#loc87 = loc("ks5"(#loc))
|
| 14 |
+
#loc88 = loc("xnumel"(#loc))
|
| 15 |
+
#loc89 = loc("r0_numel"(#loc))
|
| 16 |
+
#loc149 = loc("tmp46"(#loc63))
|
| 17 |
+
#loc164 = loc(callsite(#loc1 at #loc149))
|
| 18 |
+
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 16 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} {
|
| 19 |
+
tt.func public @triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1(%in_ptr0: !tt.ptr<i64> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr1: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %out_ptr2: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %ks2: i64 loc("ks2"(#loc)), %ks3: i64 loc("ks3"(#loc)), %ks4: i64 loc("ks4"(#loc)), %ks5: i64 loc("ks5"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
|
| 20 |
+
%cst = arith.constant dense<128> : tensor<1x1024xi32, #blocked> loc(#loc1)
|
| 21 |
+
%cst_0 = arith.constant dense<16384> : tensor<1x1024xi32, #blocked> loc(#loc1)
|
| 22 |
+
%c-128_i64 = arith.constant -128 : i64 loc(#loc1)
|
| 23 |
+
%c0_i64 = arith.constant 0 : i64 loc(#loc1)
|
| 24 |
+
%c128_i64 = arith.constant 128 : i64 loc(#loc1)
|
| 25 |
+
%c1024_i32 = arith.constant 1024 : i32 loc(#loc1)
|
| 26 |
+
%c16384_i32 = arith.constant 16384 : i32 loc(#loc1)
|
| 27 |
+
%c0_i32 = arith.constant 0 : i32 loc(#loc1)
|
| 28 |
+
%cst_1 = arith.constant dense<16384> : tensor<1x1xi64, #blocked> loc(#loc1)
|
| 29 |
+
%cst_2 = arith.constant dense<0> : tensor<1x1xi64, #blocked> loc(#loc1)
|
| 30 |
+
%cst_3 = arith.constant dense<false> : tensor<1x1024xi1, #blocked> loc(#loc1)
|
| 31 |
+
%cst_4 = arith.constant dense<0> : tensor<1x1024xi64, #blocked> loc(#loc1)
|
| 32 |
+
%xoffset = tt.get_program_id x : i32 loc(#loc90)
|
| 33 |
+
%xmask = arith.cmpi slt, %xoffset, %xnumel : i32 loc(#loc91)
|
| 34 |
+
%r0_base = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc92)
|
| 35 |
+
%r0_base_5 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<1024xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x1024xi32, #blocked> loc(#loc92)
|
| 36 |
+
%x1 = arith.extsi %xoffset : i32 to i64 loc(#loc93)
|
| 37 |
+
%x1_6 = arith.divsi %x1, %ks0 : i64 loc(#loc93)
|
| 38 |
+
%x1_7 = arith.remsi %x1_6, %ks1 : i64 loc(#loc94)
|
| 39 |
+
%x0 = arith.remsi %x1, %ks0 : i64 loc(#loc95)
|
| 40 |
+
%x2 = arith.divsi %x1, %ks4 : i64 loc(#loc96)
|
| 41 |
+
%tmp0 = arith.muli %x1_7, %c128_i64 : i64 loc(#loc97)
|
| 42 |
+
%tmp0_8 = tt.splat %tmp0 : i64 -> tensor<1x1024xi64, #blocked> loc(#loc159)
|
| 43 |
+
%tmp2 = tt.splat %ks2 : i64 -> tensor<1x1024xi64, #blocked> loc(#loc99)
|
| 44 |
+
%tmp3 = arith.muli %x0, %c128_i64 : i64 loc(#loc100)
|
| 45 |
+
%tmp3_9 = tt.splat %tmp3 : i64 -> tensor<1x1024xi64, #blocked> loc(#loc160)
|
| 46 |
+
%tmp5 = tt.splat %ks3 : i64 -> tensor<1x1024xi64, #blocked> loc(#loc102)
|
| 47 |
+
%tmp10 = tt.addptr %in_ptr0, %x2 : !tt.ptr<i64>, i64 loc(#loc103)
|
| 48 |
+
%tmp10_10 = tt.splat %xmask : i1 -> tensor<1x1024xi1, #blocked> loc(#loc161)
|
| 49 |
+
%tmp10_11 = tt.splat %tmp10 : !tt.ptr<i64> -> tensor<1x1024x!tt.ptr<i64>, #blocked> loc(#loc105)
|
| 50 |
+
%tmp17 = tt.splat %ks5 : i64 -> tensor<1x1024xi64, #blocked> loc(#loc106)
|
| 51 |
+
%tmp23 = arith.cmpi slt, %ks5, %c0_i64 : i64 loc(#loc107)
|
| 52 |
+
%tmp23_12 = tt.splat %tmp23 : i1 -> tensor<1x1024xi1, #blocked> loc(#loc107)
|
| 53 |
+
%tmp30 = arith.muli %x1_7, %c-128_i64 : i64 loc(#loc108)
|
| 54 |
+
%tmp30_13 = tt.splat %tmp30 : i64 -> tensor<1x1024xi64, #blocked> loc(#loc162)
|
| 55 |
+
%_tmp46 = scf.for %_tmp46_15 = %c0_i32 to %c16384_i32 step %c1024_i32 iter_args(%arg12 = %cst_4) -> (tensor<1x1024xi64, #blocked>) : i32 {
|
| 56 |
+
%r0_index = tt.splat %_tmp46_15 : i32 -> tensor<1x1024xi32, #blocked> loc(#loc111)
|
| 57 |
+
%r0_index_16 = arith.addi %r0_index, %r0_base_5 : tensor<1x1024xi32, #blocked> loc(#loc111)
|
| 58 |
+
%r0_mask = arith.cmpi slt, %r0_index_16, %cst_0 : tensor<1x1024xi32, #blocked> loc(#loc112)
|
| 59 |
+
%r0_4 = arith.divsi %r0_index_16, %cst : tensor<1x1024xi32, #blocked> loc(#loc113)
|
| 60 |
+
%r0_3 = arith.remsi %r0_index_16, %cst : tensor<1x1024xi32, #blocked> loc(#loc114)
|
| 61 |
+
%tmp0_17 = arith.extsi %r0_4 : tensor<1x1024xi32, #blocked> to tensor<1x1024xi64, #blocked> loc(#loc98)
|
| 62 |
+
%tmp0_18 = arith.addi %tmp0_17, %tmp0_8 : tensor<1x1024xi64, #blocked> loc(#loc98)
|
| 63 |
+
%tmp2_19 = arith.cmpi slt, %tmp0_18, %tmp2 : tensor<1x1024xi64, #blocked> loc(#loc99)
|
| 64 |
+
%tmp3_20 = arith.extsi %r0_3 : tensor<1x1024xi32, #blocked> to tensor<1x1024xi64, #blocked> loc(#loc101)
|
| 65 |
+
%tmp3_21 = arith.addi %tmp3_20, %tmp3_9 : tensor<1x1024xi64, #blocked> loc(#loc101)
|
| 66 |
+
%tmp5_22 = arith.cmpi slt, %tmp3_21, %tmp5 : tensor<1x1024xi64, #blocked> loc(#loc102)
|
| 67 |
+
%tmp6 = arith.andi %tmp2_19, %tmp5_22 : tensor<1x1024xi1, #blocked> loc(#loc115)
|
| 68 |
+
%tmp9 = arith.cmpi sge, %tmp0_18, %tmp3_21 : tensor<1x1024xi64, #blocked> loc(#loc116)
|
| 69 |
+
%tmp10_23 = arith.andi %r0_mask, %tmp6 : tensor<1x1024xi1, #blocked> loc(#loc117)
|
| 70 |
+
%tmp10_24 = arith.andi %tmp10_23, %tmp10_10 : tensor<1x1024xi1, #blocked> loc(#loc104)
|
| 71 |
+
%tmp10_25 = tt.load %tmp10_11, %tmp10_24, %cst_4 evictionPolicy = evict_last : tensor<1x1024x!tt.ptr<i64>, #blocked> loc(#loc105)
|
| 72 |
+
%tmp11 = arith.cmpi slt, %tmp3_21, %tmp10_25 : tensor<1x1024xi64, #blocked> loc(#loc118)
|
| 73 |
+
%tmp12 = arith.cmpi slt, %tmp0_18, %tmp10_25 : tensor<1x1024xi64, #blocked> loc(#loc119)
|
| 74 |
+
%tmp13 = arith.andi %tmp11, %tmp12 : tensor<1x1024xi1, #blocked> loc(#loc120)
|
| 75 |
+
%tmp14 = arith.andi %tmp9, %tmp13 : tensor<1x1024xi1, #blocked> loc(#loc121)
|
| 76 |
+
%tmp18 = arith.cmpi sge, %tmp3_21, %tmp17 : tensor<1x1024xi64, #blocked> loc(#loc122)
|
| 77 |
+
%tmp19 = arith.remsi %tmp3_21, %tmp17 : tensor<1x1024xi64, #blocked> loc(#loc123)
|
| 78 |
+
%tmp21 = arith.cmpi ne, %tmp19, %cst_4 : tensor<1x1024xi64, #blocked> loc(#loc124)
|
| 79 |
+
%tmp22 = arith.cmpi slt, %tmp19, %cst_4 : tensor<1x1024xi64, #blocked> loc(#loc125)
|
| 80 |
+
%tmp24 = arith.cmpi ne, %tmp22, %tmp23_12 : tensor<1x1024xi1, #blocked> loc(#loc126)
|
| 81 |
+
%tmp25 = arith.andi %tmp21, %tmp24 : tensor<1x1024xi1, #blocked> loc(#loc127)
|
| 82 |
+
%tmp26 = arith.addi %tmp19, %tmp17 : tensor<1x1024xi64, #blocked> loc(#loc128)
|
| 83 |
+
%tmp27 = arith.select %tmp25, %tmp26, %tmp19 : tensor<1x1024xi1, #blocked>, tensor<1x1024xi64, #blocked> loc(#loc129)
|
| 84 |
+
%tmp28 = arith.cmpi slt, %tmp27, %tmp10_25 : tensor<1x1024xi64, #blocked> loc(#loc130)
|
| 85 |
+
%tmp29 = arith.andi %tmp18, %tmp28 : tensor<1x1024xi1, #blocked> loc(#loc131)
|
| 86 |
+
%tmp30_26 = arith.subi %r0_3, %r0_4 : tensor<1x1024xi32, #blocked> loc(#loc132)
|
| 87 |
+
%tmp30_27 = arith.extsi %tmp30_26 : tensor<1x1024xi32, #blocked> to tensor<1x1024xi64, #blocked> loc(#loc109)
|
| 88 |
+
%tmp30_28 = arith.addi %tmp30_27, %tmp30_13 : tensor<1x1024xi64, #blocked> loc(#loc109)
|
| 89 |
+
%tmp30_29 = arith.addi %tmp30_28, %tmp3_9 : tensor<1x1024xi64, #blocked> loc(#loc133)
|
| 90 |
+
%tmp31 = arith.remsi %tmp30_29, %tmp17 : tensor<1x1024xi64, #blocked> loc(#loc134)
|
| 91 |
+
%tmp32 = arith.cmpi ne, %tmp31, %cst_4 : tensor<1x1024xi64, #blocked> loc(#loc135)
|
| 92 |
+
%tmp33 = arith.cmpi slt, %tmp31, %cst_4 : tensor<1x1024xi64, #blocked> loc(#loc136)
|
| 93 |
+
%tmp34 = arith.cmpi ne, %tmp33, %tmp23_12 : tensor<1x1024xi1, #blocked> loc(#loc137)
|
| 94 |
+
%tmp35 = arith.andi %tmp32, %tmp34 : tensor<1x1024xi1, #blocked> loc(#loc138)
|
| 95 |
+
%tmp36 = arith.addi %tmp31, %tmp17 : tensor<1x1024xi64, #blocked> loc(#loc139)
|
| 96 |
+
%tmp37 = arith.select %tmp35, %tmp36, %tmp31 : tensor<1x1024xi1, #blocked>, tensor<1x1024xi64, #blocked> loc(#loc140)
|
| 97 |
+
%tmp39 = arith.cmpi eq, %tmp37, %cst_4 : tensor<1x1024xi64, #blocked> loc(#loc141)
|
| 98 |
+
%tmp40 = arith.andi %tmp29, %tmp39 : tensor<1x1024xi1, #blocked> loc(#loc142)
|
| 99 |
+
%tmp41 = arith.ori %tmp14, %tmp40 : tensor<1x1024xi1, #blocked> loc(#loc143)
|
| 100 |
+
%tmp43 = arith.select %tmp6, %tmp41, %cst_3 : tensor<1x1024xi1, #blocked>, tensor<1x1024xi1, #blocked> loc(#loc144)
|
| 101 |
+
%tmp44 = arith.extui %tmp43 : tensor<1x1024xi1, #blocked> to tensor<1x1024xi64, #blocked> loc(#loc145)
|
| 102 |
+
%tmp47 = arith.addi %arg12, %tmp44 : tensor<1x1024xi64, #blocked> loc(#loc146)
|
| 103 |
+
%_tmp46_30 = arith.andi %r0_mask, %tmp10_10 : tensor<1x1024xi1, #blocked> loc(#loc147)
|
| 104 |
+
%_tmp46_31 = arith.select %_tmp46_30, %tmp47, %arg12 : tensor<1x1024xi1, #blocked>, tensor<1x1024xi64, #blocked> loc(#loc148)
|
| 105 |
+
scf.yield %_tmp46_31 : tensor<1x1024xi64, #blocked> loc(#loc61)
|
| 106 |
+
} loc(#loc110)
|
| 107 |
+
%tmp46 = "tt.reduce"(%_tmp46) <{axis = 1 : i32}> ({
|
| 108 |
+
^bb0(%tmp46_15: i64 loc(callsite(#loc1 at #loc149)), %tmp46_16: i64 loc(callsite(#loc1 at #loc149))):
|
| 109 |
+
%tmp46_17 = arith.addi %tmp46_15, %tmp46_16 : i64 loc(#loc167)
|
| 110 |
+
tt.reduce.return %tmp46_17 : i64 loc(#loc163)
|
| 111 |
+
}) : (tensor<1x1024xi64, #blocked>) -> tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc163)
|
| 112 |
+
%tmp46_14 = tt.expand_dims %tmp46 {axis = 1 : i32} : tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1x1xi64, #blocked> loc(#loc150)
|
| 113 |
+
%tmp49 = arith.cmpi sgt, %tmp46_14, %cst_2 : tensor<1x1xi64, #blocked> loc(#loc151)
|
| 114 |
+
%tmp51 = arith.cmpi slt, %tmp46_14, %cst_1 : tensor<1x1xi64, #blocked> loc(#loc152)
|
| 115 |
+
%tmp52 = arith.andi %tmp49, %tmp51 : tensor<1x1xi1, #blocked> loc(#loc153)
|
| 116 |
+
%tmp54 = arith.extui %tmp52 : tensor<1x1xi1, #blocked> to tensor<1x1xi32, #blocked> loc(#loc165)
|
| 117 |
+
%tmp55 = arith.cmpi eq, %tmp46_14, %cst_1 : tensor<1x1xi64, #blocked> loc(#loc156)
|
| 118 |
+
%tmp57 = arith.extui %tmp55 : tensor<1x1xi1, #blocked> to tensor<1x1xi32, #blocked> loc(#loc166)
|
| 119 |
+
%0 = tt.addptr %out_ptr1, %xoffset : !tt.ptr<i32>, i32 loc(#loc74)
|
| 120 |
+
%1 = tt.splat %0 : !tt.ptr<i32> -> tensor<1x1x!tt.ptr<i32>, #blocked> loc(#loc75)
|
| 121 |
+
%2 = tt.splat %xmask : i1 -> tensor<1x1xi1, #blocked> loc(#loc75)
|
| 122 |
+
tt.store %1, %tmp54, %2 : tensor<1x1x!tt.ptr<i32>, #blocked> loc(#loc75)
|
| 123 |
+
%3 = tt.addptr %out_ptr2, %xoffset : !tt.ptr<i32>, i32 loc(#loc76)
|
| 124 |
+
%4 = tt.splat %3 : !tt.ptr<i32> -> tensor<1x1x!tt.ptr<i32>, #blocked> loc(#loc77)
|
| 125 |
+
tt.store %4, %tmp57, %2 : tensor<1x1x!tt.ptr<i32>, #blocked> loc(#loc77)
|
| 126 |
+
tt.return loc(#loc78)
|
| 127 |
+
} loc(#loc)
|
| 128 |
+
} loc(#loc)
|
| 129 |
+
#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":22:28)
|
| 130 |
+
#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":24:21)
|
| 131 |
+
#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":25:37)
|
| 132 |
+
#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":27:21)
|
| 133 |
+
#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":27:28)
|
| 134 |
+
#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":28:19)
|
| 135 |
+
#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":29:19)
|
| 136 |
+
#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":39:26)
|
| 137 |
+
#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":39:22)
|
| 138 |
+
#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":41:22)
|
| 139 |
+
#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":42:26)
|
| 140 |
+
#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":42:22)
|
| 141 |
+
#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":44:22)
|
| 142 |
+
#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":49:35)
|
| 143 |
+
#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":49:94)
|
| 144 |
+
#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":49:77)
|
| 145 |
+
#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":56:37)
|
| 146 |
+
#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":62:92)
|
| 147 |
+
#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":69:45)
|
| 148 |
+
#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":69:38)
|
| 149 |
+
#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":32:40)
|
| 150 |
+
#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":33:31)
|
| 151 |
+
#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":34:29)
|
| 152 |
+
#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":37:27)
|
| 153 |
+
#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":38:27)
|
| 154 |
+
#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":45:22)
|
| 155 |
+
#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":48:23)
|
| 156 |
+
#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":49:87)
|
| 157 |
+
#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":50:23)
|
| 158 |
+
#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":51:23)
|
| 159 |
+
#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":52:24)
|
| 160 |
+
#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":53:23)
|
| 161 |
+
#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":57:24)
|
| 162 |
+
#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":58:24)
|
| 163 |
+
#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":60:25)
|
| 164 |
+
#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":61:92)
|
| 165 |
+
#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":63:25)
|
| 166 |
+
#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":64:24)
|
| 167 |
+
#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":65:24)
|
| 168 |
+
#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":66:39)
|
| 169 |
+
#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":67:24)
|
| 170 |
+
#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":68:24)
|
| 171 |
+
#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":69:24)
|
| 172 |
+
#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":69:51)
|
| 173 |
+
#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":70:25)
|
| 174 |
+
#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":71:25)
|
| 175 |
+
#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":72:92)
|
| 176 |
+
#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":73:25)
|
| 177 |
+
#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":74:24)
|
| 178 |
+
#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":75:24)
|
| 179 |
+
#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":76:39)
|
| 180 |
+
#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":78:25)
|
| 181 |
+
#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":79:24)
|
| 182 |
+
#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":80:24)
|
| 183 |
+
#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":82:38)
|
| 184 |
+
#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":83:25)
|
| 185 |
+
#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":85:25)
|
| 186 |
+
#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":86:36)
|
| 187 |
+
#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":86:50)
|
| 188 |
+
#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":86:8)
|
| 189 |
+
#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36)
|
| 190 |
+
#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15)
|
| 191 |
+
#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":87:30)
|
| 192 |
+
#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":89:20)
|
| 193 |
+
#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":91:20)
|
| 194 |
+
#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":92:20)
|
| 195 |
+
#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":94:21)
|
| 196 |
+
#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":93:21)
|
| 197 |
+
#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":95:21)
|
| 198 |
+
#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":97:21)
|
| 199 |
+
#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":96:21)
|
| 200 |
+
#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":98:25)
|
| 201 |
+
#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":98:37)
|
| 202 |
+
#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":99:25)
|
| 203 |
+
#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":99:37)
|
| 204 |
+
#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":99:4)
|
| 205 |
+
#loc90 = loc("xoffset"(#loc2))
|
| 206 |
+
#loc91 = loc("xmask"(#loc3))
|
| 207 |
+
#loc92 = loc("r0_base"(#loc4))
|
| 208 |
+
#loc93 = loc("x1"(#loc5))
|
| 209 |
+
#loc94 = loc("x1"(#loc6))
|
| 210 |
+
#loc95 = loc("x0"(#loc7))
|
| 211 |
+
#loc96 = loc("x2"(#loc8))
|
| 212 |
+
#loc97 = loc("tmp0"(#loc9))
|
| 213 |
+
#loc98 = loc("tmp0"(#loc10))
|
| 214 |
+
#loc99 = loc("tmp2"(#loc11))
|
| 215 |
+
#loc100 = loc("tmp3"(#loc12))
|
| 216 |
+
#loc101 = loc("tmp3"(#loc13))
|
| 217 |
+
#loc102 = loc("tmp5"(#loc14))
|
| 218 |
+
#loc103 = loc("tmp10"(#loc15))
|
| 219 |
+
#loc104 = loc("tmp10"(#loc16))
|
| 220 |
+
#loc105 = loc("tmp10"(#loc17))
|
| 221 |
+
#loc106 = loc("tmp17"(#loc18))
|
| 222 |
+
#loc107 = loc("tmp23"(#loc19))
|
| 223 |
+
#loc108 = loc("tmp30"(#loc20))
|
| 224 |
+
#loc109 = loc("tmp30"(#loc21))
|
| 225 |
+
#loc110 = loc("_tmp46"(#loc22))
|
| 226 |
+
#loc111 = loc("r0_index"(#loc23))
|
| 227 |
+
#loc112 = loc("r0_mask"(#loc24))
|
| 228 |
+
#loc113 = loc("r0_4"(#loc25))
|
| 229 |
+
#loc114 = loc("r0_3"(#loc26))
|
| 230 |
+
#loc115 = loc("tmp6"(#loc27))
|
| 231 |
+
#loc116 = loc("tmp9"(#loc28))
|
| 232 |
+
#loc117 = loc("tmp10"(#loc29))
|
| 233 |
+
#loc118 = loc("tmp11"(#loc30))
|
| 234 |
+
#loc119 = loc("tmp12"(#loc31))
|
| 235 |
+
#loc120 = loc("tmp13"(#loc32))
|
| 236 |
+
#loc121 = loc("tmp14"(#loc33))
|
| 237 |
+
#loc122 = loc("tmp18"(#loc34))
|
| 238 |
+
#loc123 = loc("tmp19"(#loc35))
|
| 239 |
+
#loc124 = loc("tmp21"(#loc36))
|
| 240 |
+
#loc125 = loc("tmp22"(#loc37))
|
| 241 |
+
#loc126 = loc("tmp24"(#loc38))
|
| 242 |
+
#loc127 = loc("tmp25"(#loc39))
|
| 243 |
+
#loc128 = loc("tmp26"(#loc40))
|
| 244 |
+
#loc129 = loc("tmp27"(#loc41))
|
| 245 |
+
#loc130 = loc("tmp28"(#loc42))
|
| 246 |
+
#loc131 = loc("tmp29"(#loc43))
|
| 247 |
+
#loc132 = loc("tmp30"(#loc44))
|
| 248 |
+
#loc133 = loc("tmp30"(#loc45))
|
| 249 |
+
#loc134 = loc("tmp31"(#loc46))
|
| 250 |
+
#loc135 = loc("tmp32"(#loc47))
|
| 251 |
+
#loc136 = loc("tmp33"(#loc48))
|
| 252 |
+
#loc137 = loc("tmp34"(#loc49))
|
| 253 |
+
#loc138 = loc("tmp35"(#loc50))
|
| 254 |
+
#loc139 = loc("tmp36"(#loc51))
|
| 255 |
+
#loc140 = loc("tmp37"(#loc52))
|
| 256 |
+
#loc141 = loc("tmp39"(#loc53))
|
| 257 |
+
#loc142 = loc("tmp40"(#loc54))
|
| 258 |
+
#loc143 = loc("tmp41"(#loc55))
|
| 259 |
+
#loc144 = loc("tmp43"(#loc56))
|
| 260 |
+
#loc145 = loc("tmp44"(#loc57))
|
| 261 |
+
#loc146 = loc("tmp47"(#loc58))
|
| 262 |
+
#loc147 = loc("_tmp46"(#loc59))
|
| 263 |
+
#loc148 = loc("_tmp46"(#loc60))
|
| 264 |
+
#loc150 = loc("tmp46"(#loc65))
|
| 265 |
+
#loc151 = loc("tmp49"(#loc66))
|
| 266 |
+
#loc152 = loc("tmp51"(#loc67))
|
| 267 |
+
#loc153 = loc("tmp52"(#loc68))
|
| 268 |
+
#loc154 = loc("tmp54"(#loc69))
|
| 269 |
+
#loc155 = loc("tmp53"(#loc70))
|
| 270 |
+
#loc156 = loc("tmp55"(#loc71))
|
| 271 |
+
#loc157 = loc("tmp57"(#loc72))
|
| 272 |
+
#loc158 = loc("tmp56"(#loc73))
|
| 273 |
+
#loc159 = loc(fused[#loc98, #loc97])
|
| 274 |
+
#loc160 = loc(fused[#loc101, #loc100])
|
| 275 |
+
#loc161 = loc(fused[#loc104, #loc91])
|
| 276 |
+
#loc162 = loc(fused[#loc109, #loc108])
|
| 277 |
+
#loc163 = loc(callsite(#loc62 at #loc149))
|
| 278 |
+
#loc165 = loc(fused[#loc154, #loc155])
|
| 279 |
+
#loc166 = loc(fused[#loc157, #loc158])
|
| 280 |
+
#loc167 = loc(callsite(#loc64 at #loc163))
|
SpecForge-ext/cache/compiled_kernels/triton/7/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ttir
ADDED
|
@@ -0,0 +1,283 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":18:0)
|
| 2 |
+
#loc1 = loc(unknown)
|
| 3 |
+
#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":87:27)
|
| 4 |
+
#loc81 = loc("in_ptr0"(#loc))
|
| 5 |
+
#loc82 = loc("out_ptr1"(#loc))
|
| 6 |
+
#loc83 = loc("out_ptr2"(#loc))
|
| 7 |
+
#loc84 = loc("ks0"(#loc))
|
| 8 |
+
#loc85 = loc("ks1"(#loc))
|
| 9 |
+
#loc86 = loc("ks2"(#loc))
|
| 10 |
+
#loc87 = loc("ks3"(#loc))
|
| 11 |
+
#loc88 = loc("ks4"(#loc))
|
| 12 |
+
#loc89 = loc("ks5"(#loc))
|
| 13 |
+
#loc90 = loc("xnumel"(#loc))
|
| 14 |
+
#loc91 = loc("r0_numel"(#loc))
|
| 15 |
+
#loc153 = loc("tmp46"(#loc65))
|
| 16 |
+
#loc168 = loc(callsite(#loc1 at #loc153))
|
| 17 |
+
module {
|
| 18 |
+
tt.func public @triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1(%in_ptr0: !tt.ptr<i64> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr1: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %out_ptr2: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %ks2: i64 loc("ks2"(#loc)), %ks3: i64 loc("ks3"(#loc)), %ks4: i64 loc("ks4"(#loc)), %ks5: i64 loc("ks5"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
|
| 19 |
+
%c-128_i64 = arith.constant -128 : i64 loc(#loc1)
|
| 20 |
+
%c0_i64 = arith.constant 0 : i64 loc(#loc1)
|
| 21 |
+
%c128_i64 = arith.constant 128 : i64 loc(#loc1)
|
| 22 |
+
%c1024_i32 = arith.constant 1024 : i32 loc(#loc2)
|
| 23 |
+
%c16384_i32 = arith.constant 16384 : i32 loc(#loc2)
|
| 24 |
+
%c0_i32 = arith.constant 0 : i32 loc(#loc2)
|
| 25 |
+
%tmp50 = arith.constant dense<16384> : tensor<1x1xi64> loc(#loc92)
|
| 26 |
+
%cst = arith.constant dense<0> : tensor<1x1xi64> loc(#loc1)
|
| 27 |
+
%cst_0 = arith.constant dense<false> : tensor<1x1024xi1> loc(#loc1)
|
| 28 |
+
%cst_1 = arith.constant dense<128> : tensor<1x1024xi32> loc(#loc1)
|
| 29 |
+
%cst_2 = arith.constant dense<16384> : tensor<1x1024xi32> loc(#loc1)
|
| 30 |
+
%cst_3 = arith.constant dense<0> : tensor<1x1024xi64> loc(#loc1)
|
| 31 |
+
%xoffset = tt.get_program_id x : i32 loc(#loc93)
|
| 32 |
+
%xmask = arith.cmpi slt, %xoffset, %xnumel : i32 loc(#loc94)
|
| 33 |
+
%xmask_4 = tt.splat %xmask : i1 -> tensor<1x1xi1> loc(#loc94)
|
| 34 |
+
%r0_base = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32> loc(#loc95)
|
| 35 |
+
%r0_base_5 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<1024xi32> -> tensor<1x1024xi32> loc(#loc96)
|
| 36 |
+
%x1 = arith.extsi %xoffset : i32 to i64 loc(#loc97)
|
| 37 |
+
%x1_6 = arith.divsi %x1, %ks0 : i64 loc(#loc97)
|
| 38 |
+
%x1_7 = arith.remsi %x1_6, %ks1 : i64 loc(#loc98)
|
| 39 |
+
%x0 = arith.remsi %x1, %ks0 : i64 loc(#loc99)
|
| 40 |
+
%x2 = arith.divsi %x1, %ks4 : i64 loc(#loc100)
|
| 41 |
+
%_tmp46 = scf.for %r0_offset = %c0_i32 to %c16384_i32 step %c1024_i32 iter_args(%_tmp46_9 = %cst_3) -> (tensor<1x1024xi64>) : i32 {
|
| 42 |
+
%r0_index = tt.splat %r0_offset : i32 -> tensor<1x1024xi32> loc(#loc102)
|
| 43 |
+
%r0_index_10 = arith.addi %r0_index, %r0_base_5 : tensor<1x1024xi32> loc(#loc102)
|
| 44 |
+
%r0_mask = arith.cmpi slt, %r0_index_10, %cst_2 : tensor<1x1024xi32> loc(#loc103)
|
| 45 |
+
%r0_4 = arith.divsi %r0_index_10, %cst_1 : tensor<1x1024xi32> loc(#loc104)
|
| 46 |
+
%r0_3 = arith.remsi %r0_index_10, %cst_1 : tensor<1x1024xi32> loc(#loc105)
|
| 47 |
+
%tmp0 = arith.muli %x1_7, %c128_i64 : i64 loc(#loc106)
|
| 48 |
+
%tmp0_11 = arith.extsi %r0_4 : tensor<1x1024xi32> to tensor<1x1024xi64> loc(#loc107)
|
| 49 |
+
%tmp0_12 = tt.splat %tmp0 : i64 -> tensor<1x1024xi64> loc(#loc163)
|
| 50 |
+
%tmp0_13 = arith.addi %tmp0_11, %tmp0_12 : tensor<1x1024xi64> loc(#loc107)
|
| 51 |
+
%tmp2 = tt.splat %ks2 : i64 -> tensor<1x1024xi64> loc(#loc108)
|
| 52 |
+
%tmp2_14 = arith.cmpi slt, %tmp0_13, %tmp2 : tensor<1x1024xi64> loc(#loc108)
|
| 53 |
+
%tmp3 = arith.muli %x0, %c128_i64 : i64 loc(#loc109)
|
| 54 |
+
%tmp3_15 = arith.extsi %r0_3 : tensor<1x1024xi32> to tensor<1x1024xi64> loc(#loc110)
|
| 55 |
+
%tmp3_16 = tt.splat %tmp3 : i64 -> tensor<1x1024xi64> loc(#loc164)
|
| 56 |
+
%tmp3_17 = arith.addi %tmp3_15, %tmp3_16 : tensor<1x1024xi64> loc(#loc110)
|
| 57 |
+
%tmp5 = tt.splat %ks3 : i64 -> tensor<1x1024xi64> loc(#loc111)
|
| 58 |
+
%tmp5_18 = arith.cmpi slt, %tmp3_17, %tmp5 : tensor<1x1024xi64> loc(#loc111)
|
| 59 |
+
%tmp6 = arith.andi %tmp2_14, %tmp5_18 : tensor<1x1024xi1> loc(#loc112)
|
| 60 |
+
%tmp9 = arith.cmpi sge, %tmp0_13, %tmp3_17 : tensor<1x1024xi64> loc(#loc113)
|
| 61 |
+
%tmp10 = tt.addptr %in_ptr0, %x2 : !tt.ptr<i64>, i64 loc(#loc114)
|
| 62 |
+
%tmp10_19 = tt.splat %tmp10 : !tt.ptr<i64> -> tensor<1x1024x!tt.ptr<i64>> loc(#loc114)
|
| 63 |
+
%tmp10_20 = arith.andi %r0_mask, %tmp6 : tensor<1x1024xi1> loc(#loc115)
|
| 64 |
+
%tmp10_21 = tt.splat %xmask : i1 -> tensor<1x1024xi1> loc(#loc165)
|
| 65 |
+
%tmp10_22 = arith.andi %tmp10_20, %tmp10_21 : tensor<1x1024xi1> loc(#loc116)
|
| 66 |
+
%tmp10_23 = tt.load %tmp10_19, %tmp10_22, %cst_3 evictionPolicy = evict_last : tensor<1x1024x!tt.ptr<i64>> loc(#loc117)
|
| 67 |
+
%tmp11 = arith.cmpi slt, %tmp3_17, %tmp10_23 : tensor<1x1024xi64> loc(#loc118)
|
| 68 |
+
%tmp12 = arith.cmpi slt, %tmp0_13, %tmp10_23 : tensor<1x1024xi64> loc(#loc119)
|
| 69 |
+
%tmp13 = arith.andi %tmp11, %tmp12 : tensor<1x1024xi1> loc(#loc120)
|
| 70 |
+
%tmp14 = arith.andi %tmp9, %tmp13 : tensor<1x1024xi1> loc(#loc121)
|
| 71 |
+
%tmp17 = tt.splat %ks5 : i64 -> tensor<1x1024xi64> loc(#loc122)
|
| 72 |
+
%tmp18 = arith.cmpi sge, %tmp3_17, %tmp17 : tensor<1x1024xi64> loc(#loc123)
|
| 73 |
+
%tmp19 = arith.remsi %tmp3_17, %tmp17 : tensor<1x1024xi64> loc(#loc124)
|
| 74 |
+
%tmp21 = arith.cmpi ne, %tmp19, %cst_3 : tensor<1x1024xi64> loc(#loc125)
|
| 75 |
+
%tmp22 = arith.cmpi slt, %tmp19, %cst_3 : tensor<1x1024xi64> loc(#loc126)
|
| 76 |
+
%tmp23 = arith.cmpi slt, %ks5, %c0_i64 : i64 loc(#loc127)
|
| 77 |
+
%tmp23_24 = tt.splat %tmp23 : i1 -> tensor<1x1024xi1> loc(#loc127)
|
| 78 |
+
%tmp24 = arith.cmpi ne, %tmp22, %tmp23_24 : tensor<1x1024xi1> loc(#loc128)
|
| 79 |
+
%tmp25 = arith.andi %tmp21, %tmp24 : tensor<1x1024xi1> loc(#loc129)
|
| 80 |
+
%tmp26 = arith.addi %tmp19, %tmp17 : tensor<1x1024xi64> loc(#loc130)
|
| 81 |
+
%tmp27 = arith.select %tmp25, %tmp26, %tmp19 : tensor<1x1024xi1>, tensor<1x1024xi64> loc(#loc131)
|
| 82 |
+
%tmp28 = arith.cmpi slt, %tmp27, %tmp10_23 : tensor<1x1024xi64> loc(#loc132)
|
| 83 |
+
%tmp29 = arith.andi %tmp18, %tmp28 : tensor<1x1024xi1> loc(#loc133)
|
| 84 |
+
%tmp30 = arith.subi %r0_3, %r0_4 : tensor<1x1024xi32> loc(#loc134)
|
| 85 |
+
%tmp30_25 = arith.muli %x1_7, %c-128_i64 : i64 loc(#loc135)
|
| 86 |
+
%tmp30_26 = arith.extsi %tmp30 : tensor<1x1024xi32> to tensor<1x1024xi64> loc(#loc136)
|
| 87 |
+
%tmp30_27 = tt.splat %tmp30_25 : i64 -> tensor<1x1024xi64> loc(#loc166)
|
| 88 |
+
%tmp30_28 = arith.addi %tmp30_26, %tmp30_27 : tensor<1x1024xi64> loc(#loc136)
|
| 89 |
+
%tmp30_29 = arith.addi %tmp30_28, %tmp3_16 : tensor<1x1024xi64> loc(#loc137)
|
| 90 |
+
%tmp31 = arith.remsi %tmp30_29, %tmp17 : tensor<1x1024xi64> loc(#loc138)
|
| 91 |
+
%tmp32 = arith.cmpi ne, %tmp31, %cst_3 : tensor<1x1024xi64> loc(#loc139)
|
| 92 |
+
%tmp33 = arith.cmpi slt, %tmp31, %cst_3 : tensor<1x1024xi64> loc(#loc140)
|
| 93 |
+
%tmp34 = arith.cmpi ne, %tmp33, %tmp23_24 : tensor<1x1024xi1> loc(#loc141)
|
| 94 |
+
%tmp35 = arith.andi %tmp32, %tmp34 : tensor<1x1024xi1> loc(#loc142)
|
| 95 |
+
%tmp36 = arith.addi %tmp31, %tmp17 : tensor<1x1024xi64> loc(#loc143)
|
| 96 |
+
%tmp37 = arith.select %tmp35, %tmp36, %tmp31 : tensor<1x1024xi1>, tensor<1x1024xi64> loc(#loc144)
|
| 97 |
+
%tmp39 = arith.cmpi eq, %tmp37, %cst_3 : tensor<1x1024xi64> loc(#loc145)
|
| 98 |
+
%tmp40 = arith.andi %tmp29, %tmp39 : tensor<1x1024xi1> loc(#loc146)
|
| 99 |
+
%tmp41 = arith.ori %tmp14, %tmp40 : tensor<1x1024xi1> loc(#loc147)
|
| 100 |
+
%tmp43 = arith.select %tmp6, %tmp41, %cst_0 : tensor<1x1024xi1>, tensor<1x1024xi1> loc(#loc148)
|
| 101 |
+
%tmp44 = arith.extui %tmp43 : tensor<1x1024xi1> to tensor<1x1024xi64> loc(#loc149)
|
| 102 |
+
%tmp47 = arith.addi %_tmp46_9, %tmp44 : tensor<1x1024xi64> loc(#loc150)
|
| 103 |
+
%_tmp46_30 = arith.andi %r0_mask, %tmp10_21 : tensor<1x1024xi1> loc(#loc151)
|
| 104 |
+
%_tmp46_31 = arith.select %_tmp46_30, %tmp47, %_tmp46_9 : tensor<1x1024xi1>, tensor<1x1024xi64> loc(#loc152)
|
| 105 |
+
scf.yield %_tmp46_31 : tensor<1x1024xi64> loc(#loc63)
|
| 106 |
+
} loc(#loc101)
|
| 107 |
+
%tmp46 = "tt.reduce"(%_tmp46) <{axis = 1 : i32}> ({
|
| 108 |
+
^bb0(%tmp46_9: i64 loc(callsite(#loc1 at #loc153)), %tmp46_10: i64 loc(callsite(#loc1 at #loc153))):
|
| 109 |
+
%tmp46_11 = arith.addi %tmp46_9, %tmp46_10 : i64 loc(#loc171)
|
| 110 |
+
tt.reduce.return %tmp46_11 : i64 loc(#loc167)
|
| 111 |
+
}) : (tensor<1x1024xi64>) -> tensor<1xi64> loc(#loc167)
|
| 112 |
+
%tmp46_8 = tt.expand_dims %tmp46 {axis = 1 : i32} : tensor<1xi64> -> tensor<1x1xi64> loc(#loc154)
|
| 113 |
+
%tmp49 = arith.cmpi sgt, %tmp46_8, %cst : tensor<1x1xi64> loc(#loc155)
|
| 114 |
+
%tmp51 = arith.cmpi slt, %tmp46_8, %tmp50 : tensor<1x1xi64> loc(#loc156)
|
| 115 |
+
%tmp52 = arith.andi %tmp49, %tmp51 : tensor<1x1xi1> loc(#loc157)
|
| 116 |
+
%tmp54 = arith.extui %tmp52 : tensor<1x1xi1> to tensor<1x1xi32> loc(#loc169)
|
| 117 |
+
%tmp55 = arith.cmpi eq, %tmp46_8, %tmp50 : tensor<1x1xi64> loc(#loc160)
|
| 118 |
+
%tmp57 = arith.extui %tmp55 : tensor<1x1xi1> to tensor<1x1xi32> loc(#loc170)
|
| 119 |
+
%0 = tt.addptr %out_ptr1, %xoffset : !tt.ptr<i32>, i32 loc(#loc76)
|
| 120 |
+
%1 = tt.splat %0 : !tt.ptr<i32> -> tensor<1x1x!tt.ptr<i32>> loc(#loc76)
|
| 121 |
+
tt.store %1, %tmp54, %xmask_4 : tensor<1x1x!tt.ptr<i32>> loc(#loc77)
|
| 122 |
+
%2 = tt.addptr %out_ptr2, %xoffset : !tt.ptr<i32>, i32 loc(#loc78)
|
| 123 |
+
%3 = tt.splat %2 : !tt.ptr<i32> -> tensor<1x1x!tt.ptr<i32>> loc(#loc78)
|
| 124 |
+
tt.store %3, %tmp57, %xmask_4 : tensor<1x1x!tt.ptr<i32>> loc(#loc79)
|
| 125 |
+
tt.return loc(#loc80)
|
| 126 |
+
} loc(#loc)
|
| 127 |
+
} loc(#loc)
|
| 128 |
+
#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":32:40)
|
| 129 |
+
#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":90:35)
|
| 130 |
+
#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":22:28)
|
| 131 |
+
#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":24:21)
|
| 132 |
+
#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":25:27)
|
| 133 |
+
#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":25:37)
|
| 134 |
+
#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":27:21)
|
| 135 |
+
#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":27:28)
|
| 136 |
+
#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":28:19)
|
| 137 |
+
#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":29:19)
|
| 138 |
+
#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":33:31)
|
| 139 |
+
#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":34:29)
|
| 140 |
+
#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":37:27)
|
| 141 |
+
#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":38:27)
|
| 142 |
+
#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":39:26)
|
| 143 |
+
#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":39:22)
|
| 144 |
+
#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":41:22)
|
| 145 |
+
#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":42:26)
|
| 146 |
+
#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":42:22)
|
| 147 |
+
#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":44:22)
|
| 148 |
+
#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":45:22)
|
| 149 |
+
#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":48:23)
|
| 150 |
+
#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":49:35)
|
| 151 |
+
#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":49:87)
|
| 152 |
+
#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":49:94)
|
| 153 |
+
#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":49:77)
|
| 154 |
+
#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":50:23)
|
| 155 |
+
#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":51:23)
|
| 156 |
+
#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":52:24)
|
| 157 |
+
#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":53:23)
|
| 158 |
+
#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":56:37)
|
| 159 |
+
#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":57:24)
|
| 160 |
+
#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":58:24)
|
| 161 |
+
#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":60:25)
|
| 162 |
+
#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":61:92)
|
| 163 |
+
#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":62:92)
|
| 164 |
+
#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":63:25)
|
| 165 |
+
#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":64:24)
|
| 166 |
+
#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":65:24)
|
| 167 |
+
#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":66:39)
|
| 168 |
+
#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":67:24)
|
| 169 |
+
#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":68:24)
|
| 170 |
+
#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":69:24)
|
| 171 |
+
#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":69:45)
|
| 172 |
+
#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":69:38)
|
| 173 |
+
#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":69:51)
|
| 174 |
+
#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":70:25)
|
| 175 |
+
#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":71:25)
|
| 176 |
+
#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":72:92)
|
| 177 |
+
#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":73:25)
|
| 178 |
+
#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":74:24)
|
| 179 |
+
#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":75:24)
|
| 180 |
+
#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":76:39)
|
| 181 |
+
#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":78:25)
|
| 182 |
+
#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":79:24)
|
| 183 |
+
#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":80:24)
|
| 184 |
+
#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":82:38)
|
| 185 |
+
#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":83:25)
|
| 186 |
+
#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":85:25)
|
| 187 |
+
#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":86:36)
|
| 188 |
+
#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":86:50)
|
| 189 |
+
#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":86:8)
|
| 190 |
+
#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36)
|
| 191 |
+
#loc66 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15)
|
| 192 |
+
#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":87:30)
|
| 193 |
+
#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":89:20)
|
| 194 |
+
#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":91:20)
|
| 195 |
+
#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":92:20)
|
| 196 |
+
#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":94:21)
|
| 197 |
+
#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":93:21)
|
| 198 |
+
#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":95:21)
|
| 199 |
+
#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":97:21)
|
| 200 |
+
#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":96:21)
|
| 201 |
+
#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":98:25)
|
| 202 |
+
#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":98:37)
|
| 203 |
+
#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":99:25)
|
| 204 |
+
#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":99:37)
|
| 205 |
+
#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/jx/cjxtezx44vmqh622f3tpmaklof56br4eylt3nz4a46kavvz2gwqw.py":99:4)
|
| 206 |
+
#loc92 = loc("tmp50"(#loc3))
|
| 207 |
+
#loc93 = loc("xoffset"(#loc4))
|
| 208 |
+
#loc94 = loc("xmask"(#loc5))
|
| 209 |
+
#loc95 = loc("r0_base"(#loc6))
|
| 210 |
+
#loc96 = loc("r0_base"(#loc7))
|
| 211 |
+
#loc97 = loc("x1"(#loc8))
|
| 212 |
+
#loc98 = loc("x1"(#loc9))
|
| 213 |
+
#loc99 = loc("x0"(#loc10))
|
| 214 |
+
#loc100 = loc("x2"(#loc11))
|
| 215 |
+
#loc101 = loc("_tmp46"(#loc2))
|
| 216 |
+
#loc102 = loc("r0_index"(#loc12))
|
| 217 |
+
#loc103 = loc("r0_mask"(#loc13))
|
| 218 |
+
#loc104 = loc("r0_4"(#loc14))
|
| 219 |
+
#loc105 = loc("r0_3"(#loc15))
|
| 220 |
+
#loc106 = loc("tmp0"(#loc16))
|
| 221 |
+
#loc107 = loc("tmp0"(#loc17))
|
| 222 |
+
#loc108 = loc("tmp2"(#loc18))
|
| 223 |
+
#loc109 = loc("tmp3"(#loc19))
|
| 224 |
+
#loc110 = loc("tmp3"(#loc20))
|
| 225 |
+
#loc111 = loc("tmp5"(#loc21))
|
| 226 |
+
#loc112 = loc("tmp6"(#loc22))
|
| 227 |
+
#loc113 = loc("tmp9"(#loc23))
|
| 228 |
+
#loc114 = loc("tmp10"(#loc24))
|
| 229 |
+
#loc115 = loc("tmp10"(#loc25))
|
| 230 |
+
#loc116 = loc("tmp10"(#loc26))
|
| 231 |
+
#loc117 = loc("tmp10"(#loc27))
|
| 232 |
+
#loc118 = loc("tmp11"(#loc28))
|
| 233 |
+
#loc119 = loc("tmp12"(#loc29))
|
| 234 |
+
#loc120 = loc("tmp13"(#loc30))
|
| 235 |
+
#loc121 = loc("tmp14"(#loc31))
|
| 236 |
+
#loc122 = loc("tmp17"(#loc32))
|
| 237 |
+
#loc123 = loc("tmp18"(#loc33))
|
| 238 |
+
#loc124 = loc("tmp19"(#loc34))
|
| 239 |
+
#loc125 = loc("tmp21"(#loc35))
|
| 240 |
+
#loc126 = loc("tmp22"(#loc36))
|
| 241 |
+
#loc127 = loc("tmp23"(#loc37))
|
| 242 |
+
#loc128 = loc("tmp24"(#loc38))
|
| 243 |
+
#loc129 = loc("tmp25"(#loc39))
|
| 244 |
+
#loc130 = loc("tmp26"(#loc40))
|
| 245 |
+
#loc131 = loc("tmp27"(#loc41))
|
| 246 |
+
#loc132 = loc("tmp28"(#loc42))
|
| 247 |
+
#loc133 = loc("tmp29"(#loc43))
|
| 248 |
+
#loc134 = loc("tmp30"(#loc44))
|
| 249 |
+
#loc135 = loc("tmp30"(#loc45))
|
| 250 |
+
#loc136 = loc("tmp30"(#loc46))
|
| 251 |
+
#loc137 = loc("tmp30"(#loc47))
|
| 252 |
+
#loc138 = loc("tmp31"(#loc48))
|
| 253 |
+
#loc139 = loc("tmp32"(#loc49))
|
| 254 |
+
#loc140 = loc("tmp33"(#loc50))
|
| 255 |
+
#loc141 = loc("tmp34"(#loc51))
|
| 256 |
+
#loc142 = loc("tmp35"(#loc52))
|
| 257 |
+
#loc143 = loc("tmp36"(#loc53))
|
| 258 |
+
#loc144 = loc("tmp37"(#loc54))
|
| 259 |
+
#loc145 = loc("tmp39"(#loc55))
|
| 260 |
+
#loc146 = loc("tmp40"(#loc56))
|
| 261 |
+
#loc147 = loc("tmp41"(#loc57))
|
| 262 |
+
#loc148 = loc("tmp43"(#loc58))
|
| 263 |
+
#loc149 = loc("tmp44"(#loc59))
|
| 264 |
+
#loc150 = loc("tmp47"(#loc60))
|
| 265 |
+
#loc151 = loc("_tmp46"(#loc61))
|
| 266 |
+
#loc152 = loc("_tmp46"(#loc62))
|
| 267 |
+
#loc154 = loc("tmp46"(#loc67))
|
| 268 |
+
#loc155 = loc("tmp49"(#loc68))
|
| 269 |
+
#loc156 = loc("tmp51"(#loc69))
|
| 270 |
+
#loc157 = loc("tmp52"(#loc70))
|
| 271 |
+
#loc158 = loc("tmp54"(#loc71))
|
| 272 |
+
#loc159 = loc("tmp53"(#loc72))
|
| 273 |
+
#loc160 = loc("tmp55"(#loc73))
|
| 274 |
+
#loc161 = loc("tmp57"(#loc74))
|
| 275 |
+
#loc162 = loc("tmp56"(#loc75))
|
| 276 |
+
#loc163 = loc(fused[#loc107, #loc106])
|
| 277 |
+
#loc164 = loc(fused[#loc110, #loc109])
|
| 278 |
+
#loc165 = loc(fused[#loc116, #loc94])
|
| 279 |
+
#loc166 = loc(fused[#loc136, #loc135])
|
| 280 |
+
#loc167 = loc(callsite(#loc64 at #loc153))
|
| 281 |
+
#loc169 = loc(fused[#loc158, #loc159])
|
| 282 |
+
#loc170 = loc(fused[#loc161, #loc162])
|
| 283 |
+
#loc171 = loc(callsite(#loc66 at #loc167))
|
SpecForge-ext/cache/compiled_kernels/triton/7/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/__grp__triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"child_paths": {"triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.source", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttir", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttgir", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.llir", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ptx", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.cubin", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json"}}
|