Lekr0 commited on
Commit
62dca4c
·
verified ·
1 Parent(s): 0b9402c

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. progress/SpecForge/.devcontainer/Dockerfile +32 -0
  2. progress/SpecForge/.devcontainer/devcontainer.json +30 -0
  3. progress/SpecForge/.github/CODEOWNERS +11 -0
  4. progress/SpecForge/.github/pull_request_template.md +30 -0
  5. progress/SpecForge/assets/logo.svg +0 -0
  6. progress/SpecForge/benchmarks/README.md +67 -0
  7. progress/SpecForge/benchmarks/__init__.py +3 -0
  8. progress/SpecForge/benchmarks/bench_eagle3.py +268 -0
  9. progress/SpecForge/benchmarks/benchmarker/__init__.py +29 -0
  10. progress/SpecForge/benchmarks/benchmarker/aime.py +133 -0
  11. progress/SpecForge/benchmarks/benchmarker/base.py +218 -0
  12. progress/SpecForge/benchmarks/benchmarker/ceval.py +267 -0
  13. progress/SpecForge/benchmarks/benchmarker/financeqa.py +59 -0
  14. progress/SpecForge/benchmarks/benchmarker/gpqa.py +85 -0
  15. progress/SpecForge/benchmarks/benchmarker/gsm8k.py +99 -0
  16. progress/SpecForge/benchmarks/benchmarker/humaneval.py +188 -0
  17. progress/SpecForge/benchmarks/benchmarker/livecodebench.py +46 -0
  18. progress/SpecForge/benchmarks/benchmarker/math500.py +122 -0
  19. progress/SpecForge/benchmarks/benchmarker/mmlu.py +82 -0
  20. progress/SpecForge/benchmarks/benchmarker/mmstar.py +185 -0
  21. progress/SpecForge/benchmarks/benchmarker/mtbench.py +59 -0
  22. progress/SpecForge/benchmarks/benchmarker/registry.py +31 -0
  23. progress/SpecForge/benchmarks/benchmarker/simpleqa.py +42 -0
  24. progress/SpecForge/benchmarks/benchmarker/utils.py +273 -0
  25. progress/SpecForge/cache/compiled_kernels/26/c26l7dxpqbfol7d62sqakxdv4rgyh27yhm4hrctevbkw5t6kekia.py +799 -0
  26. progress/SpecForge/cache/compiled_kernels/2d/c2d4e47kqxxnp6455gvkteqq3r336462zkbitosyeko6znxktn2b.py +879 -0
  27. progress/SpecForge/cache/compiled_kernels/2g/c2gswut4q57fp2ueybipg5qfqiy5coitofujwdnvqdwhr7nbvnyq.py +534 -0
  28. progress/SpecForge/cache/compiled_kernels/2j/4b74fa21eaaf86b6290185f6fe50aec9b905d858a087238ceddb52477f3f6acb.best_config +1 -0
  29. progress/SpecForge/cache/compiled_kernels/2j/c2j3mtk3thi6sn2hxiuhuigjw43spiu74mxdervpgpfrtos7u2qh.py +28 -0
  30. progress/SpecForge/cache/compiled_kernels/2n/c2ngvuchx6agpdr6v7awl3qgblaehfzaauoxn6camwvtk7syoxsk.py +715 -0
  31. progress/SpecForge/cache/compiled_kernels/2n/c2nooi7ekpz4qvmvghggbegd5cyfspb27jmq2snbi26zbrpoibnx.py +48 -0
  32. progress/SpecForge/cache/compiled_kernels/2n/d17ff4e7bb44e5ae89a267ef332bb7c074804ce0942fc0694c3ef15b05f7854a.best_config +1 -0
  33. progress/SpecForge/cache/compiled_kernels/2o/c2oashzxz74kzyuwo67tuhk32cike37ysabriftachdv7lf2qxgs.py +799 -0
  34. progress/SpecForge/cache/compiled_kernels/2v/c2vob47d7sxpitzmofyr55f5hvxsitxjhpyv5hdiqcdjgbwmxk76.py +799 -0
  35. progress/SpecForge/cache/compiled_kernels/2y/c2yhndikcsebqfmbw7l44gmcdoyw7ogaqt7quyeygz3mp5w6u6ke.py +715 -0
  36. progress/SpecForge/cache/compiled_kernels/2z/c2zdv5arszdl6ednyphqfnib6jwgzomr6zt6536b7gq75kp67uvh.py +1046 -0
  37. progress/SpecForge/cache/compiled_kernels/2z/c2zqq6qyjomc7iflknbqr7yjdhjux47hzv4nnsi5qfbeqglaip2h.py +707 -0
  38. progress/SpecForge/cache/compiled_kernels/32/8d96bbe05a966b7e7756831f09a79e31bf46fad0952af86f36d75557fc1735e8.best_config +1 -0
  39. progress/SpecForge/cache/compiled_kernels/32/c32pbcuz72bjfnkzvckfbbzlzuupc5yxl7t47b3qf74mmk5g2d2z.py +27 -0
  40. progress/SpecForge/cache/compiled_kernels/3b/a0a6b043ab548fdf71e72bbdf5daab7f72e9ed11a9ad9f8824a6263bb6bc5081.best_config +1 -0
  41. progress/SpecForge/cache/compiled_kernels/3b/c3bqw7dk7k6dcdrp3ycrthotye7y6zb26752jl4lwmfgaybpvr6y.py +27 -0
  42. progress/SpecForge/cache/compiled_kernels/3f/3f6057605b157d44fd56f748226a63975b79198f94871188e73e46cd6c7f8792.best_config +1 -0
  43. progress/SpecForge/cache/compiled_kernels/3f/c3fttv7enp2yvnla3r6jkk4galt2qdpxw577ghvkmmx6zqaqla74.py +54 -0
  44. progress/SpecForge/cache/compiled_kernels/3n/c3nlaqknekmjv2zuxzow4rf42v3gorxnfp6uod3dg3ic5ibp6yp3.py +715 -0
  45. progress/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py +51 -0
  46. progress/SpecForge/cache/compiled_kernels/3q/fc5920467dd1501963c976e2b895fc37747fdebfa098fff912209055f3a31828.best_config +1 -0
  47. progress/SpecForge/cache/compiled_kernels/3r/c3rkwwyedldrjz6sidtx5huqcsdgpdpu4xndmm6h4e4boo6cbg2w.py +702 -0
  48. progress/SpecForge/cache/compiled_kernels/3z/c3zi2pt6zmbthc6ythgt5p4ednhp6m24gpscb2pt6adf6xojetua.py +799 -0
  49. progress/SpecForge/cache/compiled_kernels/3z/c3zilfzjywngbdehwphwkhzpt6qcv6jecvzdajl2d5hb73xe6yzw.py +582 -0
  50. progress/SpecForge/cache/compiled_kernels/4a/7887d45b1aa6124e232769adbe995f9cc2af0dd187cb9928540172d82c7b8631.best_config +1 -0
progress/SpecForge/.devcontainer/Dockerfile ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Dev-container image: SGLang dev base plus a non-root "devuser" account so
# files created inside the container are owned by the host user.
FROM lmsysorg/sglang:dev

# Create non-root user with specified UID and GID
# NOTE: Replace with your own UID and GID. This is a workaround from https://github.com/microsoft/vscode-remote-release/issues/49#issuecomment-489060908.
ARG HOST_UID=1003
ARG HOST_GID=1003
RUN groupadd -g $HOST_GID devuser && \
    useradd -m -u $HOST_UID -g $HOST_GID -s /bin/zsh devuser

# Give devuser passwordless sudo access (dev convenience; not for production images)
RUN apt-get update && apt-get install -y sudo && \
    echo "devuser ALL=(ALL) NOPASSWD:ALL" > /etc/sudoers.d/devuser && \
    rm -rf /var/lib/apt/lists/* && \
    apt-get clean

# Set up oh-my-zsh for devuser by copying root's shell/editor configs,
# then rewrite paths in .zshrc so they point at devuser's home.
RUN cp -r /root/.oh-my-zsh /home/devuser/.oh-my-zsh && \
    cp /root/.zshrc /home/devuser/.zshrc && \
    cp /root/.vimrc /home/devuser/.vimrc && \
    cp /root/.tmux.conf /home/devuser/.tmux.conf && \
    sed -i 's|/root/.oh-my-zsh|/home/devuser/.oh-my-zsh|g' /home/devuser/.zshrc && \
    chown -R devuser:devuser /home/devuser/

# Set workspace directory and ownership (devcontainer.json bind-mounts the repo here)
WORKDIR /sgl-workspace/sglang
RUN chown -R devuser:devuser /sgl-workspace

# Switch to devuser for all subsequent layers and the container runtime
USER devuser

# Install rust (needed for some SGLang tooling; runs as devuser so it lands in ~/.cargo)
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
progress/SpecForge/.devcontainer/devcontainer.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "sglang",
3
+ "build": {
4
+ "dockerfile": "Dockerfile"
5
+ },
6
+ "remoteUser": "devuser",
7
+ "customizations": {
8
+ "vscode": {
9
+ "extensions": [
10
+ // Python development
11
+ "ms-python.python",
12
+ "charliermarsh.ruff",
13
+ // Rust development
14
+ "rust-lang.rust-analyzer",
15
+ "tamasfe.even-better-toml"
16
+ ]
17
+ }
18
+ },
19
+ "forwardPorts": [],
20
+ "runArgs": [
21
+ "--gpus",
22
+ "all"
23
+ ],
24
+ // The two lines below ensures that your local changes in the sglang
25
+ // repo is automatically synced to the sglang pip package installed
26
+ // in the dev docker container. You can remove / comment out these
27
+ // two lines if you prefer to sync code changes manually.
28
+ "workspaceMount": "source=${localWorkspaceFolder},target=/sgl-workspace/specforge,type=bind",
29
+ "workspaceFolder": "/sgl-workspace/specforge"
30
+ }
progress/SpecForge/.github/CODEOWNERS ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .github @FrankLeeeee
2
+ /specforge/core @FrankLeeeee
3
+ /specforge/data @zyksir @sleepcoo @shuaills
4
+ /specforge/layers @FrankLeeeee @FlamingoPg @sleepcoo @shuaills
5
+ /specforge/modeling @FlamingoPg @sleepcoo @shuaills @FrankLeeeee
6
+ /tests @FrankLeeeee
7
+ /assets @FrankLeeeee @zhyncs
8
+ /examples @shuaills @sleepcoo @FlamingoPg
9
+ /configs @FrankLeeeee @FlamingoPg
10
+ /benchmarks @FrankLeeeee
11
+ /scripts @shuaills @sleepcoo @FlamingoPg
progress/SpecForge/.github/pull_request_template.md ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!-- Thank you for your contribution! We appreciate it. The following guidelines will help improve your pull request and facilitate feedback. If anything is unclear, don't hesitate to submit your pull request and ask the maintainers for assistance. -->
2
+
3
+ ## Motivation
4
+
5
+ <!-- Explain the purpose of this PR and the goals it aims to achieve. -->
6
+
7
+ ## Modifications
8
+
9
+ <!-- Describe the changes made in this PR. -->
10
+
11
+ ## Related Issues
12
+
13
+ <!-- Link to any related issues here. e.g. "Fixes #123" or "Closes #456" -->
14
+
15
+ ## Accuracy Test
16
+
17
+ <!-- If this PR affects model-side code (e.g., kernels, model architecture), please provide accuracy test results. Ref: https://docs.sglang.ai/references/accuracy_evaluation.html -->
18
+
19
+ ## Benchmark & Profiling
20
+
21
+ <!-- If this PR is expected to impact performance, please provide benchmark and profiling results. Ref: https://docs.sglang.ai/references/benchmark_and_profiling.html -->
22
+
23
+ ## Checklist
24
+
25
+ - [ ] Format your code according to the [Code Formatting with Pre-Commit](https://docs.sglang.ai/references/contribution_guide.html#code-formatting-with-pre-commit).
26
+ - [ ] Add unit tests as outlined in the [Running Unit Tests](https://docs.sglang.ai/references/contribution_guide.html#running-unit-tests-adding-to-ci).
27
+ - [ ] Update documentation / docstrings / example tutorials as needed, according to [Writing Documentation](https://docs.sglang.ai/references/contribution_guide.html#writing-documentation-running-docs-ci).
28
+ - [ ] Provide throughput / latency benchmark results and accuracy evaluation results as needed, according to [Benchmark and Profiling](https://docs.sglang.ai/references/benchmark_and_profiling.html) and [Accuracy Results](https://docs.sglang.ai/references/accuracy_evaluation.html).
29
+ - [ ] For reviewers: If you haven't made any contributions to this PR and are only assisting with merging the main branch, please remove yourself as a co-author when merging the PR.
30
+ - [ ] Please feel free to join our Slack channel at https://sgl-fru7574.slack.com/archives/C09784E3EN6 to discuss your PR.
progress/SpecForge/assets/logo.svg ADDED
progress/SpecForge/benchmarks/README.md ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Benchmarking for Speculative Decoding
2
+
3
+ ## Overview
4
+
5
+ We provide a unified script to test the performance of speculative decoding with the EAGLE3 algorithm on multiple datasets. Follow the steps below to run the benchmarks.
6
+
7
+ ## Run Benchmarks
8
+
9
+ ### Launch SGLang and Benchmarker Concurrently
10
+
11
+ `bench_eagle3.py` can launch an SGLang server process and a benchmarking process concurrently. In this way, you don't have to launch the SGLang server manually; the script automatically handles the SGLang launch under different speculative decoding configurations. Some important arguments are:
12
+ - `--model-path`: the path to the target model.
13
+ - `--speculative-draft-model-path`: the path to the draft model.
14
+ - `--port`: the port to launch the SGLang server.
15
+ - `--trust-remote-code`: trust the remote code.
16
+ - `--mem-fraction-static`: the memory fraction for the static memory.
17
+ - `--tp-size`: the tensor parallelism size.
18
+ - `--attention-backend`: the attention backend.
19
+ - `--config-list`: the list of speculative decoding configuration to test, the format is `<batch-size>,<num-steps>,<topk>,<num-draft-tokens>`.
20
+ - `--benchmark-list`: the list of benchmarks to test, the format is `<benchmark-name>:<num-prompts>:<subset>`.
21
+
22
+ ```shell
23
+ python3 bench_eagle3.py \
24
+ --model-path meta-llama/Llama-3.1-8B-Instruct \
25
+ --speculative-draft-model-path lmsys/sglang-EAGLE3-LLaMA3.1-Instruct-8B \
26
+ --port 30000 \
27
+ --trust-remote-code \
28
+ --mem-fraction-static 0.8 \
29
+ --tp-size 1 \
30
+ --attention-backend fa3 \
31
+ --config-list 1,0,0,0 1,3,1,4 \
32
+ --benchmark-list mtbench gsm8k:5 ceval:5:accountant \
33
+ --dtype bfloat16
34
+ ```
35
+
36
+ ### Launch Benchmarker Independently
37
+
38
+ If you want to launch the SGLang server independently, you can use the following command.
39
+
40
+ ```shell
41
+ # you can launch a server
42
+ python3 -m sglang.launch_server \
43
+ --model meta-llama/Llama-3.1-8B-Instruct \
44
+ --speculative-algorithm EAGLE3 \
45
+ --speculative-draft-model-path lmsys/sglang-EAGLE3-LLaMA3.1-Instruct-8B \
46
+ --speculative-num-steps 3 \
47
+ --speculative-eagle-topk 1 \
48
+ --speculative-num-draft-tokens 4 \
49
+ --mem-fraction-static 0.75 \
50
+ --cuda-graph-max-bs 1 \
51
+ --tp 1 \
52
+ --trust-remote-code \
53
+ --host 0.0.0.0 \
54
+ --port 30000 \
55
+ --dtype bfloat16
56
+ ```
57
+
58
+ Then we can start benchmarking. Use the same host and port as the running SGLang server, and pass `--skip-launch-server` so the script skips launching its own SGLang server.
59
+
60
+ ```bash
61
+ python bench_eagle3.py \
62
+ --model-path meta-llama/Llama-3.1-8B-Instruct \
63
+ --port 30000 \
64
+ --config-list 1,3,1,4 \
65
+ --benchmark-list mtbench:5 ceval:5:accountant gsm8k:5 humaneval:5 math500:5 mtbench:5 aime:1 \
66
+ --skip-launch-server
67
+ ```
progress/SpecForge/benchmarks/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ """
2
+ Benchmark scripts for speculative decoding evaluation.
3
+ """
progress/SpecForge/benchmarks/bench_eagle3.py ADDED
@@ -0,0 +1,268 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Usage:
4
+
5
+ # if you want to run benchmarks directly
6
+ # mtbench:20 means only run 20 samples in the dataset
7
+ python bench_eagle3.py \
8
+ --model meta-llama/Llama-3.1-8B-Instruct \
9
+ --speculative-algorithm EAGLE3 \
10
+ --speculative-draft-model-path lmsys/sglang-EAGLE3-LLaMA3.1-Instruct-8B \
11
+ --port 30000 \
12
+ --config-list 1,0,0,0 1,3,1,4 \
13
+ --benchmark-list mtbench:20 \
14
+ --dtype bfloat16
15
+
16
+
17
+ or, if you want to run SGLang on its own:
18
+
19
+ # launch sglang
20
+ python3 -m sglang.launch_server \
21
+ --model meta-llama/Llama-3.1-8B-Instruct \
22
+ --speculative-algorithm EAGLE3 \
23
+ --speculative-draft-model-path lmsys/sglang-EAGLE3-LLaMA3.1-Instruct-8B \
24
+ --speculative-num-steps 3 \
25
+ --speculative-eagle-topk 1 \
26
+ --speculative-num-draft-tokens 4 \
27
+ --mem-fraction-static 0.75 \
28
+ --cuda-graph-max-bs 1 \
29
+ --tp 1 \
30
+ --trust-remote-code \
31
+ --host 0.0.0.0 \
32
+ --port 30000 \
33
+ --dtype bfloat16
34
+
35
+ # then run benchmarks
36
+ python bench_eagle3.py \
37
+ --model-path meta-llama/Llama-3.1-8B-Instruct \
38
+ --port 30000 \
39
+ --config-list 1,0,0,0 \
40
+ --benchmark-list mtbench:80 \
41
+ --dtype bfloat16 \
42
+ --skip-launch-server
43
+ """
44
+ import argparse
45
+ import json
46
+ import os
47
+ import time
48
+ from dataclasses import asdict
49
+ from typing import List
50
+
51
+ import requests
52
+ from benchmarker import BENCHMARKS
53
+ from sglang.srt.server_args import ServerArgs
54
+ from sglang.test.test_utils import kill_process_tree, popen_launch_server
55
+ from sglang.utils import wait_for_server
56
+
57
+
58
def parse_args():
    """Parse CLI arguments for the EAGLE3 benchmark driver.

    Two argument groups are combined on one parser: the full set of SGLang
    ``ServerArgs`` options (added via ``ServerArgs.add_cli_args``) and the
    benchmark-specific options defined below.

    Returns:
        argparse.Namespace with both server and benchmark options.
    """
    parser = argparse.ArgumentParser()
    sglang_group = parser.add_argument_group("sglang")
    ServerArgs.add_cli_args(sglang_group)

    # Keep benchmark-specific flags in their own group, separate from server flags.
    benchmark_group = parser.add_argument_group("benchmark")
    benchmark_group.add_argument(
        "--skip-launch-server", action="store_true", default=False
    )
    benchmark_group.add_argument("--timeout-for-server-launch", type=int, default=600)
    benchmark_group.add_argument("--num-prompts", type=int, default=80)
    benchmark_group.add_argument("--output-dir", type=str, default="./results")
    # Each config is "<batch-size>,<num-steps>,<topk>,<num-draft-tokens>";
    # a config with num-steps 0 runs without speculative decoding.
    benchmark_group.add_argument(
        "--config-list", type=str, nargs="+", default=["1,0,0,0", "1,3,1,4"]
    )
    benchmark_group.add_argument(
        "--name",
        type=str,
        default=None,
        help="name of this benchmark run, if provided, will be added to the output file name",
    )
    benchmark_group.add_argument(
        "--benchmark-list",
        type=str,
        nargs="+",
        default=[
            "mtbench:80",
            "gsm8k:200",
            "humaneval:200",
            "math500:200",
            "ceval:200",
        ],
        help=f"The list of benchmarks to run. The format is <benchmark-name>:<num-prompts>:<subset>,<subset>. We support the following benchmarks: {', '.join(BENCHMARKS.benchmarks.keys())}",
    )
    benchmark_group.add_argument(
        "--enable-multi-turn-conversation",
        action="store_true",
        default=False,
    )
    return parser.parse_args()
99
+
100
+
101
def launch_sglang_server(
    server_args: ServerArgs,
    base_url: str,
    batch_size: int,
    steps: int,
    topk: int,
    num_draft_tokens: int,
    timeout: int,
):
    """
    This function launches the SGLang server with the given server arguments.

    When ``steps > 0`` the server is started with EAGLE3 speculative decoding
    enabled; otherwise it runs as a plain (non-speculative) server.

    Args:
        server_args: Parsed SGLang ``ServerArgs`` (model path, tp size, etc.).
        base_url: URL of the server, used by the launcher for readiness checks.
        batch_size: Value used for both ``--cuda-graph-max-bs`` and
            ``--max-running-requests``.
        steps: ``--speculative-num-steps``; 0 disables speculative decoding.
        topk: ``--speculative-eagle-topk``.
        num_draft_tokens: ``--speculative-num-draft-tokens``.
        timeout: Seconds to wait for the server to come up.

    Returns:
        The process handle returned by ``popen_launch_server``.
    """
    sglang_args: List[str] = []
    # Only add the speculative-decoding flags when drafting is enabled.
    if steps > 0:
        sglang_args.extend(
            [
                "--speculative-algorithm",
                "EAGLE3",
                "--speculative-num-steps",
                str(steps),
                "--speculative-eagle-topk",
                str(topk),
                "--speculative-num-draft-tokens",
                str(num_draft_tokens),
                "--speculative-draft-model-path",
                server_args.speculative_draft_model_path,
            ]
        )

    sglang_args.extend(
        [
            "--cuda-graph-max-bs",
            str(batch_size),
            "--mem-fraction-static",
            str(server_args.mem_fraction_static),
            "--tp-size",
            str(server_args.tp_size),
            "--max-running-requests",
            str(batch_size),
        ]
    )

    # Forward a subset of optional ServerArgs fields only when they were set.
    if server_args.trust_remote_code:
        sglang_args.extend(["--trust-remote-code"])

    if server_args.disable_radix_cache:
        sglang_args.extend(["--disable-radix-cache"])

    if server_args.ep_size:
        sglang_args.extend(["--ep-size", str(server_args.ep_size)])

    if server_args.attention_backend:
        sglang_args.extend(["--attention-backend", server_args.attention_backend])

    if server_args.quantization:
        sglang_args.extend(["--quantization", server_args.quantization])

    if server_args.dtype:
        sglang_args.extend(["--dtype", server_args.dtype])

    # NOTE(review): the extra env vars appear to enable per-step timing and a
    # relaxed context-length check in the server — confirm against SGLang docs.
    process = popen_launch_server(
        server_args.model_path,
        base_url,
        timeout=timeout,
        other_args=sglang_args,
        env={
            "SGLANG_RECORD_STEP_TIME": "1",
            "SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN": "1",
            **os.environ,
        },
    )
    return process
173
+
174
+
175
def send_flush_cache_request(base_url: str):
    """POST to the server's /flush_cache endpoint to clear its cache between benchmarks."""
    flush_url = base_url + "/flush_cache"
    requests.post(flush_url)
177
+
178
+
179
def main():
    """Entry point: parse args, launch SGLang per speculative config (unless
    skipped), run every requested benchmark, and write the results to a JSON
    file in ``--output-dir``."""
    args = parse_args()
    server_args: ServerArgs = ServerArgs.from_cli_args(args)
    # Each config is "<batch-size>,<num-steps>,<topk>,<num-draft-tokens>".
    configs = [tuple(map(int, config.split(","))) for config in args.config_list]

    # Split each benchmark spec into (bench_name, num_prompts, subset).
    # Accepted formats: "name", "name:<num-prompts>", "name:<num-prompts>:<subset>,<subset>".
    benchmark_list = []
    for item in args.benchmark_list:
        splits = item.split(":")
        if len(splits) == 1:
            bench_name = splits[0]
            num_prompts = None
            subset = None
        elif len(splits) == 2:
            bench_name, num_prompts = splits
            subset = None
        elif len(splits) == 3:
            bench_name, num_prompts, subset = splits
            subset = subset.split(",")
        else:
            raise ValueError(f"Invalid benchmark list format: {item}")
        benchmark_list.append((bench_name, num_prompts, subset))
    # Explicit check instead of `assert` so validation survives `python -O`.
    if not benchmark_list:
        raise ValueError("the number of benchmark list is 0")

    base_url = f"http://localhost:{args.port}"

    results = {}
    results["model"] = server_args.speculative_draft_model_path

    def run_benchmarks(batch_size: int, steps: int, topk: int, num_draft_tokens: int):
        """Run every configured benchmark once against the currently running server."""
        for benchmark_name, num_prompts, subset in benchmark_list:
            print(
                f"Running benchmark {benchmark_name} with {num_prompts} prompts, batch size {batch_size}, steps {steps}, topk {topk}, num_draft_tokens {num_draft_tokens}, subset {subset}"
            )
            # (renamed from the original misspelled `benchmarkder_cls`)
            benchmarker_cls = BENCHMARKS.get(benchmark_name)
            num_prompts = int(num_prompts) if num_prompts is not None else None
            if subset is None:
                benchmarker = benchmarker_cls(num_samples=num_prompts)
            else:
                benchmarker = benchmarker_cls(num_samples=num_prompts, subset=subset)
            metrics_list = benchmarker.run(
                host=args.host, port=args.port, batch_size=batch_size
            )
            # Flush the server cache so later benchmarks don't reuse earlier prefixes.
            send_flush_cache_request(base_url)
            if benchmark_name not in results:
                results[benchmark_name] = []
            results[benchmark_name].append(
                dict(
                    batch_size=batch_size,
                    steps=steps,
                    topk=topk,
                    num_draft_tokens=num_draft_tokens,
                    metrics=[asdict(metric) for metric in metrics_list],
                    num_samples=num_prompts,
                )
            )

    if args.skip_launch_server:
        # Server is already running externally; only the batch size matters here.
        batch_size = configs[0][0] if len(configs) > 0 else 8
        run_benchmarks(batch_size, None, None, None)
    else:
        # Iterate over each config, launching a fresh server per configuration.
        for batch_size, steps, topk, num_draft_tokens in configs:
            process = launch_sglang_server(
                server_args,
                base_url,
                batch_size,
                steps,
                topk,
                num_draft_tokens,
                args.timeout_for_server_launch,
            )
            wait_for_server(base_url)
            run_benchmarks(batch_size, steps, topk, num_draft_tokens)
            kill_process_tree(process.pid)
            process.wait()

    os.makedirs(args.output_dir, exist_ok=True)
    timestamp = time.strftime("%Y%m%d_%H%M%S")
    # The run is written as a single indented JSON document, so use a .json
    # extension (the previous .jsonl suffix wrongly implied JSON Lines).
    result_file = os.path.join(
        args.output_dir,
        f"{args.name + '_' if args.name else ''}results_{timestamp}.json",
    )
    with open(result_file, "w") as f:
        json.dump(results, f, indent=4)
    print(f"Results saved to {result_file}")
265
+
266
+
267
+ if __name__ == "__main__":
268
+ main()
progress/SpecForge/benchmarks/benchmarker/__init__.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .aime import AIMEBenchmarker
2
+ from .ceval import CEvalBenchmarker
3
+ from .financeqa import FinanceQABenchmarker
4
+ from .gpqa import GPQABenchmarker
5
+ from .gsm8k import GSM8KBenchmarker
6
+ from .humaneval import HumanEvalBenchmarker
7
+ from .livecodebench import LCBBenchmarker
8
+ from .math500 import Math500Benchmarker
9
+ from .mmlu import MMLUBenchmarker
10
+ from .mmstar import MMStarBenchmarker
11
+ from .mtbench import MTBenchBenchmarker
12
+ from .registry import BENCHMARKS
13
+ from .simpleqa import SimpleQABenchmarker
14
+
15
+ __all__ = [
16
+ "BENCHMARKS",
17
+ "AIMEBenchmarker",
18
+ "CEvalBenchmarker",
19
+ "GSM8KBenchmarker",
20
+ "HumanEvalBenchmarker",
21
+ "Math500Benchmarker",
22
+ "MTBenchBenchmarker",
23
+ "MMStarBenchmarker",
24
+ "GPQABenchmarker",
25
+ "FinanceQABenchmarker",
26
+ "MMLUBenchmarker",
27
+ "LCBBenchmarker",
28
+ "SimpleQABenchmarker",
29
+ ]
progress/SpecForge/benchmarks/benchmarker/aime.py ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ AIME benchmark
3
+ """
4
+
5
+ import re
6
+ from typing import Any, Dict, List, Optional, Tuple
7
+
8
+ from datasets import load_dataset
9
+
10
+ from .base import Benchmarker
11
+ from .registry import BENCHMARKS
12
+ from .utils import create_simple_sgl_function
13
+
14
+
15
def extract_aime_answer(output: str) -> Optional[str]:
    r"""Extract the final answer from an AIME problem solution.

    AIME answers are integers between 0 and 999 and are usually given in
    \boxed{} format. Extraction is attempted in decreasing order of
    reliability: \boxed{...}, bare \boxed, "answer is ..." phrases, and
    finally the last integer in the 0-999 range.

    (Docstring is raw so "\b" in "\boxed" is not interpreted as a backspace
    escape.)

    Args:
        output: Raw model output string.

    Returns:
        The extracted answer as a digit string, or None if nothing
        answer-like is found.
    """
    # Try to find answer in \boxed{} format
    boxed_pattern = r"\\boxed\{([^}]+)\}"
    match = re.search(boxed_pattern, output)
    if match:
        answer = match.group(1).strip()
        # Extract number from the boxed content
        numbers = re.findall(r"\d+", answer)
        if numbers:
            return numbers[-1]  # Take the last number (usually the final answer)
        return answer

    # Try to find answer in \boxed format (without braces)
    boxed_pattern2 = r"\\boxed\s+(\d+)"
    match = re.search(boxed_pattern2, output)
    if match:
        return match.group(1).strip()

    # Look for patterns like "The answer is 42" or "Answer: 123".
    # NOTE: "\s*" now sits outside the alternation so "is 42" / "equals 7"
    # match; the original "(?:is|equals?|=\s*)" only allowed spaces after "=".
    answer_patterns = [
        r"(?:answer|Answer|ANSWER)[\s:]+(\d+)",
        r"(?:final\s+answer|Final\s+Answer)[\s:]+(\d+)",
        r"(?:is|equals?|=)\s*(\d+)\s*$",
    ]
    for pattern in answer_patterns:
        matches = re.findall(pattern, output, re.IGNORECASE)
        if matches:
            return matches[-1].strip()

    # Fallback: extract the last integer in the text
    numbers = re.findall(r"\b(\d+)\b", output)
    if numbers:
        # Filter to reasonable AIME answer range (0-999)
        valid_numbers = [n for n in numbers if 0 <= int(n) <= 999]
        if valid_numbers:
            return valid_numbers[-1]

    return None
58
+
59
+
60
@BENCHMARKS.register("aime")
class AIMEBenchmarker(Benchmarker):
    """AIME benchmark implementation.

    Loads the Maxwell-Jia/AIME_2024 dataset, prompts with a step-by-step
    reasoning instruction, and scores answers by exact/numeric match.
    """

    def __init__(self, num_samples: Optional[int] = None):
        # AIME has no subsets, so the base-class `subset` is always None.
        super().__init__(num_samples, None)

    def load_data(self) -> Tuple[List[Dict[str, Any]], List[Optional[str]]]:
        """Load and preprocess AIME dataset.

        Returns:
            (questions, labels): question dicts keyed by "question", and the
            ground-truth answers as stripped strings (None if missing).
        """
        dataset = load_dataset("Maxwell-Jia/AIME_2024")["train"]
        questions = []
        labels = []
        for idx, q in enumerate(dataset):
            # Truncate to the first `num_samples` rows when a limit is set.
            if self.num_samples is not None and idx >= self.num_samples:
                break

            questions.append({"question": q["Problem"]})
            # Extract answer from Answer field (either capitalization).
            answer = None
            if "Answer" in q:
                answer = str(q["Answer"]).strip()
            elif "answer" in q:
                answer = str(q["answer"]).strip()
            labels.append(answer)
        return questions, labels

    def extract_answer(self, output: str, label: Optional[Any] = None) -> Optional[str]:
        """Extract answer from model output (delegates to extract_aime_answer)."""
        return extract_aime_answer(output)

    def compute_accuracy(
        self, predictions: List[Any], labels: List[Any]
    ) -> Optional[float]:
        """Compute accuracy for AIME by comparing numeric answers.

        Returns None when there are no usable labels; otherwise the fraction
        of labeled samples whose prediction matches (string or numeric).
        """
        if not labels or len(labels) == 0:
            return None
        if all(label is None for label in labels):
            return None

        correct = 0
        valid_count = 0
        for pred, label in zip(predictions, labels):
            if label is not None:
                valid_count += 1
                if pred is not None:
                    # Normalize answers for comparison
                    pred_normalized = str(pred).strip()
                    label_normalized = str(label).strip()
                    # Try exact match first
                    if pred_normalized == label_normalized:
                        correct += 1
                    else:
                        # Try numeric comparison (handles "042" vs "42")
                        try:
                            pred_num = int(pred_normalized)
                            label_num = int(label_normalized)
                            if pred_num == label_num:
                                correct += 1
                        except ValueError:
                            pass

        return correct / valid_count if valid_count > 0 else 0.0

    def create_sgl_function(self):
        """Create SGL function for AIME with reasoning prompt."""
        return create_simple_sgl_function(
            function_name="reasoning_gen",
            answer_key="answer",
            user_prefix="\nPlease reason step by step, and put your final answer within \\boxed{}.",
        )

    def get_max_new_tokens(self) -> int:
        """AIME problems require more tokens (long chain-of-thought)."""
        return 32768
progress/SpecForge/benchmarks/benchmarker/base.py ADDED
@@ -0,0 +1,218 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Base class for benchmark implementations.
3
+ """
4
+
5
+ import time
6
+ from abc import ABC, abstractmethod
7
+ from argparse import Namespace
8
+ from typing import Any, Callable, Dict, List, Optional, Tuple
9
+
10
+ from sglang import set_default_backend
11
+ from sglang.test.test_utils import select_sglang_backend
12
+
13
+ from .utils import compute_metrics
14
+
15
+
16
class Benchmarker(ABC):
    """
    Base class for benchmark implementations.

    Subclasses should implement:
    - load_data(): Load and preprocess dataset
    - create_sgl_function(): Create the SGL function for inference

    Optional overrides:
    - extract_answer(): Extract answer from model output (if needed)
    - compute_accuracy(): Compute accuracy metric (if applicable)
    - get_answer_keys(): Get list of answer keys for multi-turn conversations

    Args:
        num_samples: The number of samples to run the benchmark on. If not provided, all questions will be used.
        subset: The subset of the dataset to run the benchmark on. If not provided, all subsets will be used.
    """

    def __init__(
        self, num_samples: Optional[int] = None, subset: Optional[List[str]] = None
    ):
        self.num_samples = num_samples
        self.subset = subset

    @abstractmethod
    def load_data(self) -> Tuple[List[Dict[str, Any]], List[Any]]:
        """
        Load and preprocess the dataset.

        Returns:
            Tuple of (questions, labels) where:
            - questions: List of question dicts for SGL function
            - labels: List of ground truth labels (can be None if not applicable)
        """
        raise NotImplementedError

    @abstractmethod
    def create_sgl_function(self) -> Callable:
        """
        Create the SGL function for inference.

        Returns:
            SGL function decorated with @sgl.function
        """
        raise NotImplementedError

    def extract_answer(self, output: str, label: Optional[Any] = None) -> Optional[Any]:
        """
        Extract answer from model output.

        Default implementation returns the raw output unchanged.

        Args:
            output: Raw model output string
            label: Optional ground truth label for reference

        Returns:
            Extracted answer, or None if extraction fails
        """
        return output

    def compute_accuracy(
        self, predictions: List[Any], labels: List[Any]
    ) -> Optional[float]:
        """
        Compute accuracy metric. Default: no accuracy (returns None).

        Args:
            predictions: List of predicted answers
            labels: List of ground truth labels

        Returns:
            Accuracy score (0-1), or None if not applicable
        """
        return None

    def get_answer_keys(self) -> Optional[List[str]]:
        """
        Get list of answer keys for multi-turn conversations.

        Returns:
            List of answer keys (e.g., ["answer_1", "answer_2"]), or None for single-turn
        """
        return None

    def get_max_new_tokens(self) -> int:
        """
        Get maximum number of new tokens to generate.

        Returns:
            Maximum tokens (default: 2048)
        """
        return 2048

    def run(
        self,
        host: str,
        port: int,
        batch_size: int,
        max_new_tokens: Optional[int] = None,
        num_runs: int = 1,
    ):
        """
        Run the benchmark evaluation.

        This method handles the common workflow:
        1. Initialize backend
        2. Load data
        3. Create SGL function
        4. Run inference loops
        5. Compute metrics
        6. Print results

        Args:
            host (str): The host of the SGLang server
            port (int): The port of the SGLang server
            batch_size (int): The number of prompts to process in parallel
            max_new_tokens (int): Maximum number of new tokens to generate; defaults to get_max_new_tokens()
            num_runs (int): The number of times to run this benchmark, default is 1. You can set it to a larger number if you want to get more stable results.

        Returns:
            List of per-run metrics objects, or None if no questions loaded.
        """
        if not host.startswith(("http://", "https://")):
            host = f"http://{host}"
        # Initialize backend
        sglang_args = Namespace(host=host, port=port, backend="srt-no-parallel")
        set_default_backend(select_sglang_backend(sglang_args))

        # Load data
        questions, labels = self.load_data()
        if len(questions) == 0:
            print("No valid questions found. Please check the dataset format.")
            return

        # Create SGL function
        sgl_function = self.create_sgl_function()

        # Run evaluation loops
        metrics_list = []
        answer_keys = self.get_answer_keys()
        max_new_tokens = max_new_tokens or self.get_max_new_tokens()

        for _ in range(num_runs):
            # Time the whole batch; greedy decoding (temperature=0).
            tic = time.perf_counter()
            states = sgl_function.run_batch(
                questions,
                temperature=0,
                max_new_tokens=max_new_tokens,
                num_threads=batch_size,
                progress_bar=True,
            )
            latency = time.perf_counter() - tic

            # Extract predictions
            predictions = []
            primary_answer_key = answer_keys[0] if answer_keys else "answer"
            for i in range(len(states)):
                # Access answer from state object (states[i] supports dict-like access)
                output = states[i][primary_answer_key]
                if isinstance(output, str):
                    extracted = self.extract_answer(
                        output,
                        (labels[i] if labels and i < len(labels) else None),
                    )
                else:
                    # Non-string outputs are passed through unmodified.
                    extracted = output
                predictions.append(extracted)

            # Compute accuracy if applicable
            accuracy = None
            # Check if we have a labels list (even if all labels are None)
            has_labels_list = labels and len(labels) > 0

            if has_labels_list:
                # Always call compute_accuracy if we have a labels list
                # This allows it to return None, which will be displayed in print_results
                accuracy = self.compute_accuracy(predictions, labels)
                if accuracy is not None:
                    valid_count = sum(1 for p in predictions if p is not None)
                    if valid_count < len(predictions):
                        print(
                            f"Warning: {len(predictions) - valid_count} predictions could not be extracted."
                        )

            # Compute performance metrics
            metrics = compute_metrics(
                states,
                latency,
                answer_key=primary_answer_key,
                additional_answer_keys=(
                    answer_keys[1:] if answer_keys and len(answer_keys) > 1 else None
                ),
            )
            # Always set accuracy if we have a labels list (even if compute_accuracy returns None)
            # This allows print_results to show None when compute_accuracy returns None
            if has_labels_list:
                metrics.accuracy = (
                    accuracy  # Can be None if compute_accuracy returns None
                )
                if accuracy is not None:
                    metrics.num_valid_predictions = sum(
                        1 for p in predictions if p is not None
                    )

            metrics_list.append(metrics)
        return metrics_list
progress/SpecForge/benchmarks/benchmarker/ceval.py ADDED
@@ -0,0 +1,267 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ C-Eval benchmark evaluation script.
3
+ """
4
+
5
+ import re
6
+ from typing import Any, Dict, List, Optional, Tuple
7
+
8
+ from datasets import concatenate_datasets, load_dataset
9
+
10
+ from .base import Benchmarker
11
+ from .registry import BENCHMARKS
12
+ from .utils import create_simple_sgl_function
13
+
14
+
15
+ def extract_answer(answer_str: str) -> str:
16
+ """Extract the answer choice (A, B, C, D) from the model output."""
17
+ # Try to find the answer in various formats
18
+ answer_str = answer_str.strip().upper()
19
+
20
+ # Direct match for single letter
21
+ match = re.search(r"\b([ABCD])\b", answer_str)
22
+ if match:
23
+ return match.group(1)
24
+
25
+ # Try to find answer in parentheses or brackets
26
+ for pattern in [
27
+ r"\(([ABCD])\)",
28
+ r"\[([ABCD])\]",
29
+ r"答案[::]\s*([ABCD])",
30
+ r"Answer[::]\s*([ABCD])",
31
+ ]:
32
+ match = re.search(pattern, answer_str, re.IGNORECASE)
33
+ if match:
34
+ return match.group(1).upper()
35
+
36
+ # Try to find the first occurrence of A, B, C, or D
37
+ match = re.search(r"([ABCD])", answer_str)
38
+ if match:
39
+ return match.group(1)
40
+
41
+ return None
42
+
43
+
44
def format_question(question: str, options: List[str]) -> str:
    """Render a multiple-choice prompt: question, lettered options, instruction."""
    option_lines = [
        f"{chr(65 + idx)}. {text}\n" for idx, text in enumerate(options)
    ]
    return (
        question
        + "\n\n选项:\n"
        + "".join(option_lines)
        + "\n请从A、B、C、D中选择一个答案。"
    )
51
+
52
+
53
@BENCHMARKS.register("ceval")
class CEvalBenchmarker(Benchmarker):
    """C-Eval benchmark implementation.

    C-Eval is a Chinese multiple-choice exam suite split into per-subject
    configs; either every config or a caller-supplied subset is loaded.
    """

    def __init__(
        self, num_samples: Optional[int] = None, subset: Optional[List[str]] = None
    ):
        # num_samples caps the number of kept questions (None = keep all).
        # NOTE(review): the default sentinel here is the *string* "all",
        # while MMLUBenchmarker uses the list ["all"]; load_data below
        # special-cases the string — confirm this asymmetry is intended.
        if subset is None:
            subset = "all"
        super().__init__(num_samples, subset)

    def load_data(self) -> Tuple[List[Dict[str, Any]], List[str]]:
        """Load and preprocess C-Eval dataset.

        Returns (questions, labels): questions are {"question": prompt}
        dicts, labels are gold letters "A"-"D".  Returns ([], []) when
        nothing could be loaded or no item survived validation.
        """
        # Full list of C-Eval subject configs on the Hugging Face hub;
        # used both as the "all" expansion and to validate user subsets.
        all_configs = [
            "accountant",
            "advanced_mathematics",
            "art_studies",
            "basic_medicine",
            "business_administration",
            "chinese_language_and_literature",
            "civil_servant",
            "clinical_medicine",
            "college_chemistry",
            "college_economics",
            "college_physics",
            "college_programming",
            "computer_architecture",
            "computer_network",
            "discrete_mathematics",
            "education_science",
            "electrical_engineer",
            "environmental_impact_assessment_engineer",
            "fire_engineer",
            "high_school_biology",
            "high_school_chemistry",
            "high_school_chinese",
            "high_school_geography",
            "high_school_history",
            "high_school_mathematics",
            "high_school_physics",
            "high_school_politics",
            "ideological_and_moral_cultivation",
            "law",
            "legal_professional",
            "logic",
            "mao_zedong_thought",
            "marxism",
            "metrology_engineer",
            "middle_school_biology",
            "middle_school_chemistry",
            "middle_school_geography",
            "middle_school_history",
            "middle_school_mathematics",
            "middle_school_physics",
            "middle_school_politics",
            "modern_chinese_history",
            "operating_system",
            "physician",
            "plant_protection",
            "probability_and_statistics",
            "professional_tour_guide",
            "sports_science",
            "tax_accountant",
            "teacher_qualification",
            "urban_and_rural_planner",
            "veterinary_medicine",
        ]

        # Select configs to load
        if self.subset == "all":
            configs_to_load = all_configs
        else:
            for subset in self.subset:
                assert (
                    subset in all_configs
                ), f"Subset {subset} not found in C-Eval dataset"
            configs_to_load = self.subset

        # Load datasets; a single config failing is tolerated as long as at
        # least one config loads — total failure returns empty lists.
        try:
            datasets = []
            for config in configs_to_load:
                try:
                    ds = load_dataset("ceval/ceval-exam", name=config, split="test")
                    datasets.append(ds)
                    print(f"Loaded config '{config}' with {len(ds)} samples")
                except Exception as e:
                    print(f"Warning: Failed to load config '{config}': {e}")
            if len(datasets) == 0:
                raise ValueError("No configs could be loaded")
            dataset = concatenate_datasets(datasets)
            print(
                f"Successfully loaded C-Eval dataset with all configs (total: {len(dataset)} samples)"
            )
        except Exception as e:
            print(e)
            print(f"Failed to load C-Eval dataset from 'ceval/ceval-exam': {e}")
            print("Please ensure the dataset is available or install it manually.")
            print("You can try: pip install datasets")
            print("Or download from: https://huggingface.co/datasets/ceval/ceval-exam")
            return [], []

        # Process questions
        questions = []
        labels = []
        for idx, item in enumerate(dataset):
            if self.num_samples is not None and idx >= self.num_samples:
                break

            # Handle different dataset formats: several possible field names
            # are probed because mirror datasets disagree on the schema.
            question_text = None
            if "question" in item:
                question_text = item["question"]
            elif "inputs" in item:
                question_text = item["inputs"]
            elif "problem" in item:
                question_text = item["problem"]
            elif "content" in item:
                question_text = item["content"]

            if not question_text:
                continue

            # Get options - C-Eval typically has options as a list or dict
            options = None
            if "options" in item:
                options = item["options"]
                if isinstance(options, dict):
                    # Convert dict to list in order A, B, C, D
                    options = [
                        options.get("A", ""),
                        options.get("B", ""),
                        options.get("C", ""),
                        options.get("D", ""),
                    ]
                elif isinstance(options, list):
                    # Ensure we have 4 options
                    while len(options) < 4:
                        options.append("")
            elif "choices" in item:
                options = item["choices"]
                if isinstance(options, dict):
                    options = [
                        options.get("A", ""),
                        options.get("B", ""),
                        options.get("C", ""),
                        options.get("D", ""),
                    ]
            else:
                # Try to construct options from A, B, C, D fields
                options = [
                    item.get("A", item.get("option_A", "")),
                    item.get("B", item.get("option_B", "")),
                    item.get("C", item.get("option_C", "")),
                    item.get("D", item.get("option_D", "")),
                ]

            # Filter out empty options
            if options:
                options = [str(opt).strip() for opt in options if opt]
                if len(options) < 2:  # Need at least 2 options
                    continue
            else:
                continue

            # Get answer — again probe several possible gold-label fields.
            answer = None
            if "answer" in item:
                answer = str(item["answer"]).upper().strip()
            elif "target" in item:
                answer = str(item["target"]).upper().strip()
            elif "label" in item:
                answer = str(item["label"]).upper().strip()
            elif "correct" in item:
                answer = str(item["correct"]).upper().strip()

            # Validate answer: only items with a clean A-D gold label are kept.
            if answer and answer in ["A", "B", "C", "D"]:
                # Format question
                formatted_question = format_question(question_text, options)
                questions.append({"question": formatted_question})
                labels.append(answer)

        if len(questions) == 0:
            print("No valid questions found. Please check the dataset format.")
            print(
                "Sample item keys:",
                list(dataset[0].keys()) if len(dataset) > 0 else "No items",
            )
            return [], []

        return questions, labels

    def create_sgl_function(self):
        """Create SGL function for C-Eval."""
        return create_simple_sgl_function(
            function_name="get_ceval_answer",
            answer_key="answer",
            max_tokens=self.get_max_new_tokens(),
        )

    def extract_answer(self, output: str, label: Any = None) -> Optional[str]:
        """Extract answer choice from model output.

        Delegates to the module-level extract_answer (same name as this
        method); returns None when no A-D letter is found.
        """
        return extract_answer(output)

    def compute_accuracy(self, predictions: List[str], labels: List[str]) -> float:
        """Compute accuracy over *extractable* predictions only.

        None predictions are excluded from the denominator, so this is the
        accuracy among answers the parser could read, not over all items.
        """
        correct = 0
        valid_count = 0
        for i in range(len(predictions)):
            if predictions[i] is not None:  # Only count valid predictions
                valid_count += 1
                if predictions[i] == labels[i]:
                    correct += 1
        return correct / valid_count if valid_count > 0 else 0.0
progress/SpecForge/benchmarks/benchmarker/financeqa.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Any, Dict, List, Optional, Tuple
2
+
3
+ from datasets import load_dataset
4
+
5
+ from .base import Benchmarker
6
+ from .registry import BENCHMARKS
7
+ from .utils import create_simple_sgl_function
8
+
9
QUESTION_PROMPT = """
Given the following context:

{context}

Can you answer the following question?

{question}
""".strip()


def generate_question(row: Dict[str, Any]) -> str:
    """Build the prompt for one FinanceQA row.

    Rows without a context get the bare question; otherwise the context is
    prepended via QUESTION_PROMPT.
    """
    question = row["question"].strip()
    context = row["context"]
    if context is None:
        return question
    return QUESTION_PROMPT.format(context=context.strip(), question=question)
29
+
30
+
31
@BENCHMARKS.register("financeqa")
class FinanceQABenchmarker(Benchmarker):
    """FinanceQA benchmark implementation (open-ended answers, no labels)."""

    def __init__(self, num_samples: Optional[int] = None):
        super().__init__(num_samples, None)

    def load_data(self) -> Tuple[List[Dict[str, Any]], List[int]]:
        """Load the FinanceQA test split and build one prompt per row."""
        ds = load_dataset("AfterQuery/FinanceQA")["test"]

        questions = []
        labels = []
        for idx in range(len(ds)):
            if self.num_samples is not None and idx >= self.num_samples:
                break
            questions.append({"question": generate_question(ds[idx])})
            # FinanceQA carries no gold labels here, so accuracy is skipped.
            labels.append(None)
        return questions, labels

    def create_sgl_function(self):
        """Create the SGL function used to query the model."""
        return create_simple_sgl_function(
            function_name="get_financeqa_answer",
            answer_key="answer",
            max_tokens=self.get_max_new_tokens(),
        )
progress/SpecForge/benchmarks/benchmarker/gpqa.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import random
2
+ from typing import Any, Dict, List, Optional, Tuple
3
+
4
+ from datasets import load_dataset
5
+
6
+ from .base import Benchmarker
7
+ from .registry import BENCHMARKS
8
+ from .utils import create_simple_sgl_function
9
+
10
GPQA_QUERY_TEMPLATE = """
Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering.

{Question}

A) {A}
B) {B}
C) {C}
D) {D}
""".strip()


def generate_question(row: Dict[str, Any]) -> Tuple[str, str]:
    """Build a GPQA multiple-choice prompt with the gold answer at a random slot.

    Returns (prompt, gold_letter) where gold_letter is one of "A".."D".
    (Fix: the previous annotation claimed ``-> str`` although a tuple is
    returned.)  The gold position is drawn from the module-level ``random``
    state, so seed ``random`` for a reproducible option order.
    """
    gold_index = random.randint(0, 3)
    choices = [
        row["Incorrect Answer 1"],
        row["Incorrect Answer 2"],
        row["Incorrect Answer 3"],
    ]
    choices.insert(gold_index, row["Correct Answer"])

    question = GPQA_QUERY_TEMPLATE.format(
        Question=row["Question"].strip(),
        A=choices[0].strip(),
        B=choices[1].strip(),
        C=choices[2].strip(),
        D=choices[3].strip(),
    )

    # Index 0..3 maps to letters A..D.
    answer = ["A", "B", "C", "D"][gold_index]
    return question, answer
42
+
43
+
44
@BENCHMARKS.register("gpqa")
class GPQABenchmarker(Benchmarker):
    """GPQA benchmark implementation (multiple-choice, A-D)."""

    def __init__(self, num_samples: Optional[int] = None):
        super().__init__(num_samples, None)

    def load_data(self) -> Tuple[List[Dict[str, Any]], List[str]]:
        """Load GPQA main split and build shuffled multiple-choice prompts.

        Returns (questions, labels); labels are gold letters "A"-"D".
        (Fix: the previous annotation claimed ``List[int]`` labels.)
        """
        ds = load_dataset("Idavidrein/gpqa", "gpqa_main")["train"]

        questions = []
        labels = []
        for i in range(len(ds)):
            if self.num_samples is not None and i >= self.num_samples:
                break

            question_text, answer = generate_question(ds[i])
            questions.append({"question": question_text})
            labels.append(answer)
        return questions, labels

    def extract_answer(self, output: str, label: Optional[Any] = None) -> Optional[str]:
        """Return the text after the first 'Answer: ' marker, or None.

        (Fix: the previous annotation claimed ``Optional[int]`` although a
        string is returned.)
        """
        if "Answer: " not in output:
            return None
        # split()[1] stops at a second "Answer: " occurrence, if any.
        return output.split("Answer: ")[1].strip()

    def compute_accuracy(
        self, predictions: List[Any], labels: List[Any]
    ) -> Optional[float]:
        """Exact-match accuracy; None when there are no labels."""
        if not labels:
            return None
        correct = sum(1 for pred, label in zip(predictions, labels) if pred == label)
        return correct / len(labels)

    def create_sgl_function(self):
        """Create the SGL function used to query the model."""
        return create_simple_sgl_function(
            function_name="get_gpqa_mcq_answer",
            answer_key="answer",
            max_tokens=self.get_max_new_tokens(),
        )
progress/SpecForge/benchmarks/benchmarker/gsm8k.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ GSM8K benchmark evaluation script.
3
+ """
4
+
5
+ import ast
6
+ import re
7
+ from typing import Any, Dict, List, Optional, Tuple
8
+
9
+ from sglang.utils import download_and_cache_file, read_jsonl
10
+
11
+ from .base import Benchmarker
12
+ from .registry import BENCHMARKS
13
+ from .utils import create_few_shot_sgl_function
14
+
15
+ INVALID = -9999999
16
+
17
+
18
def get_one_example(lines: List[Dict], i: int, include_answer: bool) -> str:
    """Format example i as a 'Question: ... Answer:' prompt (few-shot style)."""
    text = f"Question: {lines[i]['question']}\nAnswer:"
    if include_answer:
        text = f"{text} {lines[i]['answer']}"
    return text


def get_few_shot_examples(lines: List[Dict], k: int) -> str:
    """Concatenate the first k answered examples, each followed by a blank line."""
    return "".join(get_one_example(lines, idx, True) + "\n\n" for idx in range(k))
32
+
33
+
34
def get_answer_value(answer_str: str) -> int:
    """Pull the last integer out of a GSM8K answer string; INVALID if none."""
    digit_runs = re.findall(r"\d+", answer_str.replace(",", ""))
    if not digit_runs:
        return INVALID
    try:
        return ast.literal_eval(digit_runs[-1])
    except SyntaxError:
        # literal_eval rejects ints written with leading zeros (e.g. "007").
        return INVALID
44
+
45
+
46
@BENCHMARKS.register("gsm8k")
class GSM8KBenchmarker(Benchmarker):
    """GSM8K benchmark implementation (5-shot grade-school math)."""

    def __init__(self, num_samples: Optional[int] = None):
        super().__init__(num_samples, None)

    def load_data(self) -> Tuple[List[Dict[str, Any]], List[int]]:
        """Download the GSM8K test split; return prompts and integer labels."""
        url = "https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl"
        rows = list(read_jsonl(download_and_cache_file(url)))

        # The 5-shot prefix is shared by every prompt; kept for
        # create_sgl_function to use later.
        self.few_shot_examples = get_few_shot_examples(rows, 5)

        limit = len(rows)
        if self.num_samples is not None:
            limit = min(limit, self.num_samples)
        questions = [
            {"question": get_one_example(rows, idx, False)} for idx in range(limit)
        ]
        labels = [get_answer_value(rows[idx]["answer"]) for idx in range(limit)]

        assert all(l != INVALID for l in labels), "Some labels are invalid"
        return questions, labels

    def extract_answer(self, output: str, label: Optional[Any] = None) -> Optional[int]:
        """Parse the final numeric value out of the model's answer text."""
        return get_answer_value(output)

    def compute_accuracy(
        self, predictions: List[Any], labels: List[Any]
    ) -> Optional[float]:
        """Exact-match accuracy over numeric answers; None without labels."""
        if not labels:
            return None
        matches = sum(1 for guess, gold in zip(predictions, labels) if guess == gold)
        return matches / len(labels)

    def create_sgl_function(self):
        """Create SGL function for GSM8K with the stored few-shot prefix."""
        return create_few_shot_sgl_function(
            few_shot_examples=self.few_shot_examples,
            function_name="few_shot_gsm8k",
            answer_key="answer",
            stop=["Question", "Assistant:", "<|separator|>"],
        )
progress/SpecForge/benchmarks/benchmarker/humaneval.py ADDED
@@ -0,0 +1,188 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ HumanEval benchmark evaluation script.
3
+ """
4
+
5
+ import re
6
+ from typing import Any, Dict, List, Optional, Tuple
7
+
8
+ from datasets import load_dataset
9
+
10
+ from .base import Benchmarker
11
+ from .registry import BENCHMARKS
12
+ from .utils import create_simple_sgl_function
13
+
14
+
15
def extract_code_from_output(output: str) -> Optional[str]:
    """Pull Python code out of a model response.

    Preference order: a fenced ```python block, then a top-level function
    definition, then the raw stripped text; None when the text is empty.
    """
    fence = re.search(r"```(?:python)?\n(.*?)```", output, re.DOTALL)
    if fence:
        return fence.group(1).strip()

    # A "def ..." body running until the next blank-line-separated def
    # (or end of string) — the common HumanEval completion shape.
    func = re.search(r"(def\s+\w+\([^)]*\):.*?)(?=\n\ndef\s+|\Z)", output, re.DOTALL)
    if func:
        return func.group(1).strip()

    stripped = output.strip()
    return stripped if stripped else None
35
+
36
+
37
def check_code_passes_tests(code: str, test_code: str, entry_point: str) -> bool:
    """Run generated code plus its HumanEval test harness; True iff no exception.

    Simplified stand-in for the official HumanEval evaluator.  HumanEval test
    code is assertion-based, so completing both exec() calls without raising
    means the tests passed.  `entry_point` is unused here but kept for
    interface compatibility.

    SECURITY NOTE: exec() runs untrusted model output in-process — only use
    this in a sandboxed environment.
    """
    env = {}
    try:
        exec(code, env)       # define the candidate function
        exec(test_code, env)  # assertions raise on failure
    except Exception:
        # AssertionError => wrong answer; any other exception (syntax error,
        # runtime error, ...) => broken code. Either way the sample fails.
        return False
    return True
62
+
63
+
64
@BENCHMARKS.register("humaneval")
class HumanEvalBenchmarker(Benchmarker):
    """HumanEval benchmark implementation.

    Labels carry each task's test harness and entry point so that
    compute_accuracy can execute the generated code against the tests.
    """

    def __init__(self, num_samples: Optional[int] = None):
        """Initialize benchmark and store test cases."""
        super().__init__(num_samples, None)
        # Parallel per-task metadata, filled by load_data.
        self.test_cases = []
        self.entry_points = []

    def load_data(self) -> Tuple[List[Dict[str, Any]], List[Optional[Dict[str, str]]]]:
        """Load and preprocess HumanEval dataset.

        Returns (questions, labels): questions hold the function prompt
        (signature + docstring); each label is a dict with the task's
        "test", "entry_point" and "canonical_solution".
        """
        dataset = load_dataset("openai/openai_humaneval")["test"]
        questions = []
        labels = []
        # Reset so repeated load_data calls do not accumulate stale entries.
        self.test_cases = []
        self.entry_points = []

        for idx, q in enumerate(dataset):
            if self.num_samples is not None and idx >= self.num_samples:
                break

            questions.append({"question": q["prompt"]})

            # Store test case and entry point for evaluation
            test_code = q.get("test", "")
            entry_point = q.get("entry_point", "")
            self.test_cases.append(test_code)
            self.entry_points.append(entry_point)

            # Store canonical solution as reference (optional, for comparison)
            canonical_solution = q.get("canonical_solution", "")
            labels.append(
                {
                    "test": test_code,
                    "entry_point": entry_point,
                    "canonical_solution": canonical_solution,
                }
            )

        return questions, labels

    def extract_answer(self, output: str, label: Optional[Any] = None) -> Optional[str]:
        """Extract code from model output."""
        return extract_code_from_output(output)

    def compute_accuracy(
        self, predictions: List[Any], labels: List[Any]
    ) -> Optional[float]:
        """Compute accuracy for HumanEval by checking if code passes tests.

        Note: This is a simplified evaluation. For official pass@k metrics,
        use the HumanEval evaluation framework.
        """
        if not labels or len(labels) == 0:
            return None
        if all(label is None for label in labels):
            return None

        correct = 0
        valid_count = 0

        for i, (pred, label) in enumerate(zip(predictions, labels)):
            if label is not None and isinstance(label, dict):
                valid_count += 1
                if pred is not None:
                    try:
                        # Get the prompt (function signature and docstring)
                        # NOTE(review): assumes self.questions was populated
                        # (presumably by the base class from load_data) and
                        # is index-aligned with predictions — confirm.
                        prompt = self.questions[i]["question"]
                        entry_point = label.get("entry_point", "")

                        # The prompt contains the function signature (e.g. "def function_name(...):")
                        # The generated code might be:
                        # 1. Just the function body (what we want) - need to combine with prompt
                        # 2. The complete function including signature - use as-is
                        # 3. Code in markdown blocks - already extracted by extract_code_from_output

                        pred_str = str(pred).strip()

                        # Check if pred already contains a complete function definition
                        # (starts with "def " and contains the entry_point function name)
                        if pred_str.startswith("def ") and entry_point:
                            # Check if this is the same function (by name)
                            func_name_match = re.match(r"def\s+(\w+)\s*\(", pred_str)
                            if (
                                func_name_match
                                and func_name_match.group(1) == entry_point
                            ):
                                # Generated code includes complete function, use it as-is
                                full_code = pred_str
                            else:
                                # Different function or no match, combine with prompt
                                full_code = prompt + "\n" + pred_str
                        elif pred_str.startswith("def "):
                            # Has function definition but we can't verify entry_point, use as-is
                            full_code = pred_str
                        else:
                            # Generated code is just the body, combine with prompt
                            full_code = prompt + "\n" + pred_str

                        # Check if code passes tests
                        test_code = label.get("test", "")

                        if test_code and check_code_passes_tests(
                            full_code, test_code, entry_point
                        ):
                            correct += 1
                    except Exception as e:
                        # If evaluation fails, consider it incorrect
                        # Uncomment for debugging: print(f"Error evaluating code {i}: {e}")
                        pass

        return correct / valid_count if valid_count > 0 else 0.0

    def create_sgl_function(self):
        """Create SGL function for HumanEval."""
        return create_simple_sgl_function(
            function_name="get_humaneval_answer",
            answer_key="answer",
            max_tokens=self.get_max_new_tokens(),
        )

    def get_max_new_tokens(self) -> int:
        """HumanEval code generation requires more tokens."""
        return 1024
progress/SpecForge/benchmarks/benchmarker/livecodebench.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ LiveCodeBench benchmark evaluation script.
3
+ """
4
+
5
+ from typing import Any, Dict, List, Optional, Tuple
6
+
7
+ from datasets import load_dataset
8
+
9
+ from .base import Benchmarker
10
+ from .registry import BENCHMARKS
11
+ from .utils import create_simple_sgl_function
12
+
13
+
14
def generate_question(row: Dict[str, Any]) -> str:
    """Return the LiveCodeBench problem statement, stripped of edge whitespace."""
    return row["question_content"].strip()
17
+
18
+
19
@BENCHMARKS.register("livecodebench")
class LCBBenchmarker(Benchmarker):
    """LiveCodeBench benchmark implementation (code generation, no labels)."""

    def __init__(self, num_samples: Optional[int] = None):
        super().__init__(num_samples, None)

    def load_data(self) -> Tuple[List[Dict[str, Any]], List[int]]:
        """Load the code_generation test split; labels are all None."""
        ds = load_dataset("livecodebench/code_generation")["test"]

        questions = []
        labels = []
        for idx in range(len(ds)):
            if self.num_samples is not None and idx >= self.num_samples:
                break
            questions.append({"question": generate_question(ds[idx])})
            # No gold answers are available, so accuracy is never computed.
            labels.append(None)
        return questions, labels

    def create_sgl_function(self):
        """Create the SGL function used to query the model."""
        return create_simple_sgl_function(
            function_name="get_livecodebench_answer",
            answer_key="answer",
            max_tokens=self.get_max_new_tokens(),
        )
progress/SpecForge/benchmarks/benchmarker/math500.py ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ MATH-500 benchmark evaluation script.
3
+ """
4
+
5
+ import re
6
+ from typing import Any, Dict, List, Optional, Tuple
7
+
8
+ from datasets import load_dataset
9
+
10
+ from .base import Benchmarker
11
+ from .registry import BENCHMARKS
12
+ from .utils import create_simple_sgl_function
13
+
14
+
15
def extract_math_answer(output: str) -> Optional[str]:
    """Extract the final answer from a math problem solution.

    Tries, in order: a \\boxed{...} expression (scanned with a brace counter
    so nested braces such as \\boxed{\\frac{1}{2}} are captured whole — the
    previous regex ``[^}]+`` truncated them at the first '}'), a brace-less
    "\\boxed value" form, an explicit "answer ..." phrase, and finally the
    last number appearing anywhere in the text.  Returns None otherwise.
    """
    # \boxed{...}: walk the string tracking brace depth instead of using a
    # regex, so nested braces survive.
    marker = "\\boxed{"
    start = output.find(marker)
    if start != -1:
        depth = 1
        body_start = start + len(marker)
        for pos in range(body_start, len(output)):
            ch = output[pos]
            if ch == "{":
                depth += 1
            elif ch == "}":
                depth -= 1
                if depth == 0:
                    content = output[body_start:pos].strip()
                    if content:
                        return content
                    break  # empty \boxed{} — fall through to other patterns

    # \boxed value (without braces)
    match = re.search(r"\\boxed\s+([^\s]+)", output)
    if match:
        return match.group(1).strip()

    # Phrases like "The answer is 42" or "Answer: 3.14"
    answer_patterns = [
        r"(?:answer|Answer|ANSWER)[\s:]+([-+]?\d*\.?\d+)",
        r"(?:is|equals?|=\s*)([-+]?\d*\.?\d+)\s*$",
    ]
    for pattern in answer_patterns:
        matches = re.findall(pattern, output, re.IGNORECASE)
        if matches:
            return matches[-1].strip()

    # Fallback: the last number anywhere in the text
    numbers = re.findall(r"[-+]?\d*\.?\d+", output)
    if numbers:
        return numbers[-1]

    return None
50
+
51
+
52
@BENCHMARKS.register("math500")
class Math500Benchmarker(Benchmarker):
    """MATH-500 benchmark implementation."""

    def __init__(self, num_samples: Optional[int] = None):
        super().__init__(num_samples, None)

    def load_data(self) -> Tuple[List[Dict[str, Any]], List[Optional[str]]]:
        """Load the MATH-500 test split; labels come from 'answer' or 'solution'."""
        dataset = load_dataset("HuggingFaceH4/MATH-500")["test"]
        questions = []
        labels = []
        for idx, row in enumerate(dataset):
            if self.num_samples is not None and idx >= self.num_samples:
                break

            questions.append({"question": row["problem"]})
            if "answer" in row:
                label = str(row["answer"]).strip()
            elif "solution" in row:
                # No explicit answer field: mine one out of the worked solution.
                label = extract_math_answer(row["solution"])
            else:
                label = None
            labels.append(label)
        return questions, labels

    def extract_answer(self, output: str, label: Optional[Any] = None) -> Optional[str]:
        """Extract the final answer from the model output."""
        return extract_math_answer(output)

    def compute_accuracy(
        self, predictions: List[Any], labels: List[Any]
    ) -> Optional[float]:
        """Accuracy over labeled items; exact string match, then numeric match."""
        if not labels:
            return None
        if all(label is None for label in labels):
            return None

        hits = 0
        labeled = 0
        for pred, gold in zip(predictions, labels):
            if gold is None:
                continue
            labeled += 1
            if pred is None:
                continue
            # Normalize both sides before comparing.
            norm_pred = str(pred).strip().lower()
            norm_gold = str(gold).strip().lower()
            if norm_pred == norm_gold:
                hits += 1
                continue
            # Fall back to a tolerant numeric comparison.
            try:
                if abs(float(norm_pred) - float(norm_gold)) < 1e-6:
                    hits += 1
            except ValueError:
                pass

        return hits / labeled if labeled > 0 else 0.0

    def create_sgl_function(self):
        """Create the SGL function used to query the model."""
        return create_simple_sgl_function(
            function_name="get_math500_answer",
            answer_key="answer",
            max_tokens=self.get_max_new_tokens(),
        )
progress/SpecForge/benchmarks/benchmarker/mmlu.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Any, Dict, List, Optional, Tuple
2
+
3
+ from datasets import load_dataset
4
+
5
+ from .base import Benchmarker
6
+ from .registry import BENCHMARKS
7
+ from .utils import create_simple_sgl_function
8
+
9
GPQA_QUERY_TEMPLATE = """
Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering.

{Question}

A) {A}
B) {B}
C) {C}
D) {D}
""".strip()


def generate_question(row: Dict[str, Any]) -> Tuple[str, str]:
    """Render one MMLU row as a 4-option prompt and return (prompt, gold letter).

    ``row["answer"]`` is the gold index (0 -> A, ..., 3 -> D).
    Fixes: removed a stray debug ``print(answer)`` that spammed stdout once
    per sample, and corrected the return annotation (was ``-> str`` although
    a tuple is returned).
    """
    choices = row["choices"]
    question = GPQA_QUERY_TEMPLATE.format(
        Question=row["question"].strip(),
        A=choices[0].strip(),
        B=choices[1].strip(),
        C=choices[2].strip(),
        D=choices[3].strip(),
    )

    # 0 means A, 1 means B, 2 means C, 3 means D
    answer = ["A", "B", "C", "D"][row["answer"]]
    return question, answer
35
+
36
+
37
@BENCHMARKS.register("mmlu")
class MMLUBenchmarker(Benchmarker):
    """MMLU benchmark implementation (multiple-choice, A-D)."""

    def __init__(
        self, num_samples: Optional[int] = None, subset: Optional[List[str]] = None
    ):
        # Default to the aggregate "all" config of cais/mmlu.
        if subset is None:
            subset = ["all"]
        super().__init__(num_samples, subset)

    def load_data(self) -> Tuple[List[Dict[str, Any]], List[str]]:
        """Load each requested MMLU config's test split and build prompts.

        Returns (questions, labels); labels are gold letters "A"-"D".
        (Fix: the previous annotation claimed ``List[int]`` labels.)
        Note: num_samples caps each subset individually, not the total.
        """
        questions = []
        labels = []

        for subset in self.subset:
            ds = load_dataset("cais/mmlu", subset)["test"]
            for i in range(len(ds)):
                if self.num_samples is not None and i >= self.num_samples:
                    break

                question_text, answer = generate_question(ds[i])
                questions.append({"question": question_text})
                labels.append(answer)
        return questions, labels

    def extract_answer(self, output: str, label: Optional[Any] = None) -> Optional[str]:
        """Return the text after the first 'Answer: ' marker, or None.

        (Fix: the previous annotation claimed ``Optional[int]`` although a
        string is returned.)
        """
        if "Answer: " not in output:
            return None
        return output.split("Answer: ")[1].strip()

    def compute_accuracy(
        self, predictions: List[Any], labels: List[Any]
    ) -> Optional[float]:
        """Exact-match accuracy; None when there are no labels."""
        if not labels:
            return None
        correct = sum(1 for pred, label in zip(predictions, labels) if pred == label)
        return correct / len(labels)

    def create_sgl_function(self):
        """Create the SGL function used to query the model."""
        return create_simple_sgl_function(
            function_name="get_mmlu_answer",
            answer_key="answer",
            max_tokens=self.get_max_new_tokens(),
        )
progress/SpecForge/benchmarks/benchmarker/mmstar.py ADDED
@@ -0,0 +1,185 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ MMStar benchmark evaluation script.
3
+ """
4
+
5
+ import os
6
+ import re
7
+ import shutil
8
+ from typing import Any, Dict, List, Optional, Tuple
9
+
10
+ from datasets import load_dataset
11
+
12
+ from .base import Benchmarker
13
+ from .registry import BENCHMARKS
14
+ from .utils import create_image_sgl_function
15
+
16
+
17
def extract_mmstar_answer(
    output: str, options: Optional[List[str]] = None
) -> Optional[str]:
    """Extract a multiple-choice answer letter from MMStar model output.

    MMStar questions typically have multiple choice options (A, B, C, D,
    etc.).  The search runs on the uppercased output: first a standalone
    capital letter, then parenthesized letters and "answer:"-style markers.

    Args:
        output: Raw model output text.
        options: Option texts for this question; when provided, the letter
            must fall within ``"A"`` .. ``chr(ord("A") + len(options) - 1)``.
            Without options, only "A"-"D" are accepted.

    Returns:
        The answer letter, or None when no valid letter is found.
    """
    output_upper = output.strip().upper()

    def _validated(letter: str) -> Optional[str]:
        # Accept only letters that index an existing option (default: A-D).
        if options:
            max_option = chr(64 + len(options))  # 'A' + (len - 1)
            if "A" <= letter <= max_option:
                return letter
        elif "A" <= letter <= "D":
            return letter
        return None

    # Direct match for a standalone single letter.
    match = re.search(r"\b([A-Z])\b", output_upper)
    if match:
        letter = _validated(match.group(1))
        if letter is not None:
            return letter

    # Fallback: letter in parentheses/brackets or after an "answer" marker.
    # BUG FIX: the text was uppercased above, so the patterns must be
    # uppercase too — the previous mixed-case "Answer[::]" pattern could
    # never match "ANSWER:".
    for pattern in [
        r"\(([A-Z])\)",
        r"\[([A-Z])\]",
        r"答案[::]\s*([A-Z])",
        r"ANSWER[::]\s*([A-Z])",
        r"选择[::]\s*([A-Z])",
    ]:
        match = re.search(pattern, output_upper)
        if match:
            letter = _validated(match.group(1))
            if letter is not None:
                return letter

    return None
60
+
61
+
62
@BENCHMARKS.register("mmstar")
class MMStarBenchmarker(Benchmarker):
    """MMStar benchmark implementation (image-based multiple choice)."""

    def __init__(self, num_samples: Optional[int] = None):
        """Initialize benchmark and set up cache-directory bookkeeping.

        FIX: this docstring previously sat *after* the ``super().__init__()``
        call, where it was a dead string statement rather than documentation.
        """
        super().__init__(num_samples, None)
        self.cache_dir = None  # Temporary image cache; created in load_data().
        self.options_list = []  # Parsed option texts for each question, in order.

    def load_data(self) -> Tuple[List[Dict[str, Any]], List[Optional[str]]]:
        """Load and preprocess the MMStar dataset.

        Saves each question's image under a temporary cache directory and
        splits the option block out of the prompt text.

        Returns:
            (questions, labels): each question dict carries "image_path" and
            "question"; labels are validated answer letters or None.
        """
        self.cache_dir = os.path.join(".cache", "mmstar_specforge")
        image_dir = os.path.join(self.cache_dir, "images")
        os.makedirs(self.cache_dir, exist_ok=True)
        os.makedirs(image_dir, exist_ok=True)
        print(f"Created temporary image directory: {self.cache_dir}")

        dataset = load_dataset("Lin-Chen/MMStar")["val"]
        questions = []
        labels = []
        self.options_list = []

        for idx, q in enumerate(dataset):
            if self.num_samples is not None and idx >= self.num_samples:
                break

            image = q["image"]
            # NOTE(review): images are written under cache_dir using the
            # dataset-provided relative path; assumes that path resolves into
            # the pre-created directories ("images/") — verify against the
            # dataset layout.
            image_path = os.path.join(self.cache_dir, q["meta_info"]["image_path"])
            image.convert("RGB").save(image_path, "JPEG")

            # Split the "Options:" block off the prompt and parse "A. ..." lines.
            question_full = q["question"]
            if "Options:" in question_full:
                question_text, options_text = question_full.split("Options:", 1)
                question_text = question_text.strip()
                options = []
                for line in options_text.strip().split("\n"):
                    line = line.strip()
                    if line and re.match(r"^[A-Z]\.", line):
                        option_text = re.sub(r"^[A-Z]\.\s*", "", line).strip()
                        options.append(option_text)
                self.options_list.append(options)
            else:
                question_text = question_full.strip()
                self.options_list.append([])

            item = {
                "image_path": image_path,
                "question": question_text,
            }
            questions.append(item)

            # The ground-truth answer may live under different keys depending
            # on the dataset version.
            answer = None
            if "answer" in q:
                answer = str(q["answer"]).strip().upper()
            elif "correct_answer" in q:
                answer = str(q["correct_answer"]).strip().upper()
            elif "ground_truth" in q:
                answer = str(q["ground_truth"]).strip().upper()

            # Keep only single letters that index an existing option.
            if answer and len(answer) == 1 and "A" <= answer <= "Z":
                if self.options_list[-1]:
                    max_option = chr(64 + len(self.options_list[-1]))
                    if answer <= max_option:
                        labels.append(answer)
                    else:
                        labels.append(None)
                else:
                    labels.append(answer)
            else:
                labels.append(None)

        return questions, labels

    def extract_answer(self, output: str, label: Optional[Any] = None) -> Optional[str]:
        """Extract the answer letter from model output.

        Note: the per-question option list is not accessible here, so the
        extractor falls back to its default A-D range.
        """
        return extract_mmstar_answer(output)

    def compute_accuracy(
        self, predictions: List[Any], labels: List[Any]
    ) -> Optional[float]:
        """Compute accuracy over questions that have a valid gold label."""
        if not labels or len(labels) == 0:
            return None
        if all(label is None for label in labels):
            return None

        correct = 0
        valid_count = 0
        for pred, label in zip(predictions, labels):
            if label is not None:
                valid_count += 1
                if pred is not None:
                    # Normalize both sides to uppercase before comparing.
                    pred_normalized = str(pred).strip().upper()
                    label_normalized = str(label).strip().upper()
                    if pred_normalized == label_normalized:
                        correct += 1

        return correct / valid_count if valid_count > 0 else 0.0

    def create_sgl_function(self):
        """Create SGL function for MMStar (image-based Q&A)."""
        return create_image_sgl_function(
            function_name="get_mmstar_answer",
            answer_key="answer",
            max_tokens=self.get_max_new_tokens(),
        )

    def run(self, *args, **kwargs):
        """Run the benchmark, always cleaning up the image cache afterwards."""
        try:
            return super().run(*args, **kwargs)
        finally:
            if self.cache_dir and os.path.exists(self.cache_dir):
                shutil.rmtree(self.cache_dir)
                print(f"Deleted temporary directory: {self.cache_dir}")
progress/SpecForge/benchmarks/benchmarker/mtbench.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ MT-Bench benchmark evaluation script.
3
+ Adapted from https://github.com/chromecast56/sglang/blob/6f145d2eadb93a116134f703358ce76f15381045/benchmark/mtbench/bench_sglang.py
4
+ """
5
+
6
+ from typing import Any, Dict, List, Optional, Tuple
7
+
8
+ from sglang.utils import download_and_cache_file, read_jsonl
9
+
10
+ from .base import Benchmarker
11
+ from .registry import BENCHMARKS
12
+ from .utils import create_multi_turn_sgl_function
13
+
14
+ SYSTEM_PROMPT = "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\n\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."
15
+
16
+
17
@BENCHMARKS.register("mtbench")
class MTBenchBenchmarker(Benchmarker):
    """MT-Bench benchmark implementation (two-turn open-ended questions)."""

    def __init__(
        self, num_samples: Optional[int] = None, subset: Optional[List[str]] = None
    ):
        # support categorical data for mtbench
        if subset is None:
            subset = ["all"]
        super().__init__(num_samples, subset)

    def load_data(self) -> Tuple[List[Dict[str, Any]], List[None]]:
        """Load and preprocess the MT-Bench question set.

        Returns:
            (questions, labels): each question dict carries both turns of the
            conversation; labels are all None — MT-Bench has no gold answers
            for automatic accuracy computation.
        """
        url = "https://raw.githubusercontent.com/lm-sys/FastChat/main/fastchat/llm_judge/data/mt_bench/question.jsonl"
        download_and_cache_file(url, filename="mtbench.jsonl")
        # (Removed a no-op self-assignment that followed this line.)
        questions_data = list(read_jsonl("mtbench.jsonl"))

        questions = [
            {"question_1": q["turns"][0], "question_2": q["turns"][1]}
            for q in questions_data
        ]
        labels = [None] * len(questions)

        if self.num_samples is not None:
            questions = questions[: self.num_samples]
            labels = labels[: self.num_samples]
        return questions, labels

    def create_sgl_function(self):
        """Create SGL function for MT-Bench (2-turn conversation)."""
        return create_multi_turn_sgl_function(
            function_name="answer_mt_bench",
            system_prompt=SYSTEM_PROMPT,
            num_turns=2,
            max_tokens=self.get_max_new_tokens(),
        )

    def get_answer_keys(self) -> List[str]:
        """Return the answer keys for the two conversation turns."""
        return ["answer_1", "answer_2"]
progress/SpecForge/benchmarks/benchmarker/registry.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
class BenchmarkRegistry:
    """A name -> class registry for benchmark implementations."""

    def __init__(self):
        # Maps a benchmark name to its Benchmarker subclass.
        self.benchmarks = {}

    def register(self, name: str):
        """Return a class decorator that registers the class under ``name``.

        Usage:
        ```python
        BENCHMARKS = BenchmarkRegistry()

        @BENCHMARKS.register("aime")
        class AIMEBenchmarker(Benchmarker):
            ...
        ```
        (The example previously omitted the ``@`` — without it nothing is
        registered.)
        """

        def wrapper(cls):
            self.benchmarks[name] = cls
            return cls

        return wrapper

    def get(self, name: str) -> type:
        """Get the benchmark class registered under ``name``.

        Raises:
            KeyError: if no benchmark was registered under ``name``.
        """
        return self.benchmarks[name]


# Module-level singleton used by all benchmark modules.
BENCHMARKS = BenchmarkRegistry()
progress/SpecForge/benchmarks/benchmarker/simpleqa.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Any, Dict, List, Optional, Tuple
2
+
3
+ from datasets import load_dataset
4
+
5
+ from .base import Benchmarker
6
+ from .registry import BENCHMARKS
7
+ from .utils import create_simple_sgl_function
8
+
9
+
10
def generate_question(row: Dict[str, Any]) -> str:
    """Return the SimpleQA prompt: the row's "problem" text, stripped."""
    return row["problem"].strip()
13
+
14
+
15
@BENCHMARKS.register("simpleqa")
class SimpleQABenchmarker(Benchmarker):
    """SimpleQA benchmark implementation (free-form factual QA)."""

    def __init__(self, num_samples: Optional[int] = None):
        super().__init__(num_samples, None)

    def load_data(self) -> Tuple[List[Dict[str, Any]], List[None]]:
        """Load SimpleQA test questions.

        Returns:
            (questions, labels): labels are all None — accuracy is not
            computed automatically for this free-form benchmark.
            (Previous annotation claimed ``List[int]`` labels.)
        """
        ds = load_dataset("basicv8vc/SimpleQA")["test"]

        # Cap the number of samples when requested.
        if self.num_samples is None:
            limit = len(ds)
        else:
            limit = min(self.num_samples, len(ds))

        questions = [{"question": generate_question(ds[i])} for i in range(limit)]
        labels = [None] * limit
        return questions, labels

    def create_sgl_function(self):
        """Build the single-turn SGL answering function for SimpleQA."""
        return create_simple_sgl_function(
            function_name="get_simpleqa_answer",
            answer_key="answer",
            max_tokens=self.get_max_new_tokens(),
        )
progress/SpecForge/benchmarks/benchmarker/utils.py ADDED
@@ -0,0 +1,273 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Utility functions for benchmark scripts.
3
+ """
4
+
5
+ from dataclasses import dataclass
6
+ from typing import Any, Callable, Dict, List, Optional
7
+
8
+ import numpy as np
9
+ import sglang as sgl
10
+
11
+
12
@dataclass
class BenchmarkMetrics:
    """Container for benchmark performance metrics."""

    # Total wall-clock latency of the run, in seconds.
    latency: float
    # Generated tokens per second (completion tokens / latency).
    output_throughput: float
    # Average accepted length for speculative decoding
    # (1.0 when speculative decoding did not run).
    accept_length: float
    # Task accuracy in [0, 1]; None for benchmarks without gold labels.
    accuracy: Optional[float] = None
    # Number of questions evaluated in this run.
    num_questions: int = 0
    # Number of predictions that could be parsed from model output.
    num_valid_predictions: int = 0
    # Optional per-category breakdown, keyed by category name.
    categorical_performance: Optional[Dict[str, "BenchmarkMetrics"]] = None
23
+
24
+
25
def compute_metrics(
    states: List[Any],
    latency: float,
    answer_key: str = "answer",
    additional_answer_keys: Optional[List[str]] = None,
) -> BenchmarkMetrics:
    """
    Compute performance metrics from SGLang states.

    Args:
        states: List of SGLang state objects from run_batch
        latency: Total latency in seconds
        answer_key: Primary key for answer in state meta info
        additional_answer_keys: Additional keys to include in token count
            (e.g., ["answer_1", "answer_2"])

    Returns:
        BenchmarkMetrics object with computed metrics
    """
    # Robustness: an empty batch has no tokens and no meaningful ratios
    # (the previous version raised IndexError on states[0]).
    if not states:
        return BenchmarkMetrics(
            latency=latency,
            output_throughput=0.0,
            accept_length=1.0,
            num_questions=0,
        )

    # All meta-info keys that contributed generated tokens; unifying the
    # single-key and multi-key cases removes the duplicated branches.
    keys = [answer_key] + (additional_answer_keys or [])

    num_output_tokens = sum(
        s.get_meta_info(key)["completion_tokens"] for s in states for key in keys
    )

    output_throughput = num_output_tokens / latency if latency > 0 else 0.0

    # Accept length (speculative decoding metric): generated tokens per
    # verification step.  "spec_verify_ct" is only present in the meta info
    # when speculative decoding actually ran.
    if "spec_verify_ct" in states[0].get_meta_info(answer_key):
        num_verify_tokens = sum(
            s.get_meta_info(key).get("spec_verify_ct", 0)
            for s in states
            for key in keys
        )
        if num_verify_tokens == 0:
            accept_length = 1.0
        else:
            accept_length = num_output_tokens / num_verify_tokens
    else:
        accept_length = 1.0

    return BenchmarkMetrics(
        latency=latency,
        output_throughput=output_throughput,
        accept_length=accept_length,
        num_questions=len(states),
    )
84
+
85
+
86
def print_results(
    metrics_list: List[BenchmarkMetrics],
    benchmark_name: str,
    show_accuracy: bool = False,
):
    """Pretty-print averaged results from one or more benchmark runs.

    Args:
        metrics_list: BenchmarkMetrics collected from repeated runs.
        benchmark_name: Display name of the benchmark.
        show_accuracy: Whether to include the accuracy line.
    """
    separator = "=" * 50
    mean_latency = np.mean([item.latency for item in metrics_list])
    mean_throughput = np.mean([item.output_throughput for item in metrics_list])
    mean_accept = np.mean([item.accept_length for item in metrics_list])

    print(f"\n{separator}")
    print(f"{benchmark_name} Evaluation Results")
    print(separator)
    print(f"Number of questions: {metrics_list[0].num_questions}")
    if show_accuracy:
        if metrics_list[0].accuracy is None:
            print("Average Accuracy: None")
        else:
            # Average only over runs that actually produced an accuracy.
            accuracies = [
                item.accuracy for item in metrics_list if item.accuracy is not None
            ]
            mean_accuracy = np.mean(accuracies)
            print(f"Average Accuracy: {mean_accuracy:.4f} ({mean_accuracy*100:.2f}%)")
    print(f"Average Latency: {mean_latency:.3f} s")
    print(f"Average Output throughput: {mean_throughput:.3f} token/s")
    print(f"Average Accept length: {mean_accept:.3f}")
    print(f"{separator}\n")
119
+
120
+
121
def create_simple_sgl_function(
    function_name: str = "get_answer",
    answer_key: str = "answer",
    system_prompt: Optional[str] = None,
    max_tokens: int = 2048,
    stop: Optional[List[str]] = None,
    user_prefix: Optional[str] = None,
) -> Callable:
    """
    Create a simple SGL function for single-turn Q&A.

    Args:
        function_name: Name assigned to the returned function.
        answer_key: Key under which the generated answer is stored.
        system_prompt: Optional system prompt prepended to the conversation.
        max_tokens: Maximum tokens to generate.
        stop: Optional stop sequences.
        user_prefix: Optional text appended after the question in the user
            message (despite the name, it is appended, not prepended).

    Returns:
        SGL function decorated with @sgl.function
    """

    @sgl.function
    def sgl_func(s, question):
        if system_prompt:
            s += sgl.system(system_prompt)
        content = question + user_prefix if user_prefix else question
        s += sgl.user(content)
        params = {"max_tokens": max_tokens}
        if stop:
            params["stop"] = stop
        s += sgl.assistant(sgl.gen(answer_key, **params))

    sgl_func.__name__ = function_name
    return sgl_func
159
+
160
+
161
def create_few_shot_sgl_function(
    few_shot_examples: str,
    function_name: str = "few_shot_answer",
    answer_key: str = "answer",
    max_tokens: int = 512,
    stop: Optional[List[str]] = None,
) -> Callable:
    """
    Create an SGL function for few-shot learning.

    Args:
        few_shot_examples: Few-shot examples prepended verbatim to the question.
        function_name: Name assigned to the returned function.
        answer_key: Key under which the generated answer is stored.
        max_tokens: Maximum tokens to generate.
        stop: Optional stop sequences.

    Returns:
        SGL function decorated with @sgl.function
    """

    @sgl.function
    def sgl_func(s, question):
        # Raw completion style: examples followed by the question, no chat roles.
        s += few_shot_examples + question
        params = {"max_tokens": max_tokens}
        if stop:
            params["stop"] = stop
        s += sgl.gen(answer_key, **params)

    sgl_func.__name__ = function_name
    return sgl_func
192
+
193
+
194
def create_multi_turn_sgl_function(
    function_name: str = "multi_turn_answer",
    system_prompt: Optional[str] = None,
    num_turns: int = 2,
    max_tokens: int = 2048,
) -> Callable:
    """
    Create an SGL function for multi-turn conversations (e.g., MT-Bench with 2 turns).

    Args:
        function_name: Name assigned to the returned function.
        system_prompt: Optional system prompt.
        num_turns: Number of conversation turns (default: 2).
        max_tokens: Maximum tokens to generate per turn.

    Returns:
        SGL function decorated with @sgl.function
    """
    if num_turns == 2:
        # Common case: two explicit question parameters.
        @sgl.function
        def sgl_func(s, question_1, question_2):
            if system_prompt:
                s += sgl.system(system_prompt)
            for turn_question, turn_key in (
                (question_1, "answer_1"),
                (question_2, "answer_2"),
            ):
                s += sgl.user(turn_question)
                s += sgl.assistant(sgl.gen(turn_key, max_tokens=max_tokens))

    else:
        # Generic case: questions arrive as question_1..question_N kwargs.
        @sgl.function
        def sgl_func(s, **kwargs):
            if system_prompt:
                s += sgl.system(system_prompt)
            for turn in range(1, num_turns + 1):
                question_key = f"question_{turn}"
                if question_key in kwargs:
                    s += sgl.user(kwargs[question_key])
                    s += sgl.assistant(
                        sgl.gen(f"answer_{turn}", max_tokens=max_tokens)
                    )

    sgl_func.__name__ = function_name
    return sgl_func
239
+
240
+
241
def create_image_sgl_function(
    function_name: str = "get_image_answer",
    answer_key: str = "answer",
    max_tokens: int = 2048,
) -> Callable:
    """
    Create an SGL function for image-based Q&A.

    Args:
        function_name: Name assigned to the returned function.
        answer_key: Key under which the generated answer is stored.
        max_tokens: Maximum tokens to generate.

    Returns:
        SGL function decorated with @sgl.function
    """

    @sgl.function
    def sgl_func(s, image_path, question, **kwargs):
        """Multimodal flow: the user turn carries image + text, then the
        assistant generates the answer bound to ``answer_key``.

        Note: sgl.image() encodes the image into a model-supported
        multimodal input format.
        """
        s += sgl.user(sgl.image(image_path) + question)
        s += sgl.assistant(sgl.gen(answer_key, max_tokens=max_tokens))

    sgl_func.__name__ = function_name
    return sgl_func
progress/SpecForge/cache/compiled_kernels/26/c26l7dxpqbfol7d62sqakxdv4rgyh27yhm4hrctevbkw5t6kekia.py ADDED
@@ -0,0 +1,799 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import triton
3
+ import triton.language as tl
4
+
5
+ from torch._inductor.runtime import triton_helpers, triton_heuristics
6
+ from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
7
+ from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
8
+
9
+ @triton_heuristics.template(
10
+
11
+ num_stages=3,
12
+ num_warps=8,
13
+ triton_meta={'signature': {'arg_Q': '*bf16', 'arg_K': '*bf16', 'arg_V': '*bf16', 'arg_LSE': '*fp32', 'arg_DELTA': '*fp32', 'arg_DO': '*bf16', 'arg_DQ': '*bf16', 'arg_DV': '*bf16', 'arg_KV_NUM_BLKS': '*i32', 'arg_KV_IDX': '*i32', 'arg_Q_NUM_BLKS': '*i32', 'arg_Q_IDX': '*i32', 'arg_FULL_KV_NUM_BLKS': '*i32', 'arg_FULL_KV_IDX': '*i32', 'arg_FULL_Q_NUM_BLKS': '*i32', 'arg_FULL_Q_IDX': '*i32', 'out_ptr0': '*bf16', 'ks0': 'i32', 'ks1': 'i32', 'ks2': 'i32', 'ks3': 'i32', 'ks4': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]], (7,): [['tt.divisibility', 16]], (8,): [['tt.divisibility', 16]], (9,): [['tt.divisibility', 16]], (10,): [['tt.divisibility', 16]], (11,): [['tt.divisibility', 16]], (12,): [['tt.divisibility', 16]], (13,): [['tt.divisibility', 16]], (14,): [['tt.divisibility', 16]], (15,): [['tt.divisibility', 16]], (16,): [['tt.divisibility', 16]]}]},
14
+ inductor_meta={'kernel_name': 'Placeholder.DESCRIPTIVE_NAME', 'backend_hash': 'B0E5936CA26D1BCD1B577D0B65F13FE7553DC941EB93358973E9F1902BC212C5', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'grid_type': 'FixedGrid', 'fixed_grid': ['_grid_0', '_grid_1', '_grid_2'], 'extra_launcher_args': ['_grid_0', '_grid_1', '_grid_2'], 'config_args': {'PRESCALE_QK': False, 'ROWS_GUARANTEED_SAFE': False, 'BLOCKS_ARE_CONTIGUOUS': False, 'WRITE_DQ': True, 'OUTPUT_LOGSUMEXP': True, 'OUTPUT_MAX': False, 'FLOAT32_PRECISION': "'tf32'", 'IS_DIVISIBLE': False, 'SM_SCALE': 0.08838834764831845, 'GQA_SHARED_HEADS': 4, 'HAS_FULL_BLOCKS': True, 'QK_HEAD_DIM': 128, 'QK_HEAD_DIM_ROUNDED': 128, 'V_HEAD_DIM': 128, 'V_HEAD_DIM_ROUNDED': 128, 'SAFE_HEAD_DIM': True, 'BLOCK_M1': 64, 'BLOCK_N1': 128, 'BLOCK_M2': 128, 'BLOCK_N2': 64, 'SPARSE_Q_BLOCK_SIZE': 128, 'SPARSE_KV_BLOCK_SIZE': 128}},
15
+
16
+ )
17
+ @triton.jit
18
+ def triton_flex_attention_backward(arg_Q, arg_K, arg_V, arg_LSE, arg_DELTA, arg_DO, arg_DQ, arg_DV, arg_KV_NUM_BLKS, arg_KV_IDX, arg_Q_NUM_BLKS, arg_Q_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, arg_FULL_Q_NUM_BLKS, arg_FULL_Q_IDX, out_ptr0, ks0, ks1, ks2, ks3, ks4):
19
+ PRESCALE_QK : tl.constexpr = False
20
+ ROWS_GUARANTEED_SAFE : tl.constexpr = False
21
+ BLOCKS_ARE_CONTIGUOUS : tl.constexpr = False
22
+ WRITE_DQ : tl.constexpr = True
23
+ OUTPUT_LOGSUMEXP : tl.constexpr = True
24
+ OUTPUT_MAX : tl.constexpr = False
25
+ FLOAT32_PRECISION : tl.constexpr = 'tf32'
26
+ IS_DIVISIBLE : tl.constexpr = False
27
+ SM_SCALE : tl.constexpr = 0.08838834764831845
28
+ GQA_SHARED_HEADS : tl.constexpr = 4
29
+ HAS_FULL_BLOCKS : tl.constexpr = True
30
+ QK_HEAD_DIM : tl.constexpr = 128
31
+ QK_HEAD_DIM_ROUNDED : tl.constexpr = 128
32
+ V_HEAD_DIM : tl.constexpr = 128
33
+ V_HEAD_DIM_ROUNDED : tl.constexpr = 128
34
+ SAFE_HEAD_DIM : tl.constexpr = True
35
+ BLOCK_M1 : tl.constexpr = 64
36
+ BLOCK_N1 : tl.constexpr = 128
37
+ BLOCK_M2 : tl.constexpr = 128
38
+ BLOCK_N2 : tl.constexpr = 64
39
+ SPARSE_Q_BLOCK_SIZE : tl.constexpr = 128
40
+ SPARSE_KV_BLOCK_SIZE : tl.constexpr = 128
41
+ INDEX_DTYPE : tl.constexpr = tl.int32
42
+ Q = arg_Q
43
+ K = arg_K
44
+ V = arg_V
45
+ LSE = arg_LSE
46
+ DELTA = arg_DELTA
47
+ DO = arg_DO
48
+ DQ = arg_DQ
49
+ DV = arg_DV
50
+ KV_NUM_BLKS = arg_KV_NUM_BLKS
51
+ KV_IDX = arg_KV_IDX
52
+ Q_NUM_BLKS = arg_Q_NUM_BLKS
53
+ Q_IDX = arg_Q_IDX
54
+ FULL_KV_NUM_BLKS = arg_FULL_KV_NUM_BLKS
55
+ FULL_KV_IDX = arg_FULL_KV_IDX
56
+ FULL_Q_NUM_BLKS = arg_FULL_Q_NUM_BLKS
57
+ FULL_Q_IDX = arg_FULL_Q_IDX
58
+
59
+ # Sub notation for this kernel:
60
+ #
61
+ # Q: Query, K: Key, V: Value
62
+ # LSE: logsumexp (logsumexp is always stored in fp32 regardless of the input dtype)
63
+ # DELTA: Precomputed sum(OUT*DO, axis=-1)
64
+ # DO: Derivative of Output, DQ: Derivative of Query, DV: Derivative of Value
65
+ # DK: Derivative of Key, is the written to via the store_output call due to some limitations with
66
+ # inductor codegen
67
+ # M: Number of queries, N: Number of keys/values
68
+ # QK_HEAD_DIM: The dimension of the query and key embeddings
69
+ # V_HEAD_DIM: The dimension of the value embeddings
70
+ # z: Batch size, h: Number of heads, m: Number of queries or keys/values, d: Head dim
71
+ # GQA_SHARED_HEADS: number of query heads sharing one kv head in GQA setups.
72
+ # (Modifiable) Performance tuning options
73
+ # BLOCK_M1: when calculating DK & DV, iterate over BLOCK_M1 across the seqlen dim of Q in each thread block.
74
+ # BLOCK_N1: when calculating DK & DV, the thread block size across the seqlen dim of K/V.
75
+ # BLOCK_M2: when calculating DQ, the thread block size across the seqlen dim of Q.
76
+ # BLOCK_N2: when calculating DQ, iterate over BLOCK_N2 across the seqlen dim of K/V in each thread block.
77
+ #
78
+ # The following FULL_* and PARTIAL_* is defined in the block sparse mask grid, rather than the thread block grid.
79
+ # KV_NUM_BLKS: The number of KV blocks (that may or may not require masking) for each query.
80
+ # KV_IDX: The indices of KV blocks (that may or may not require masking) for each query.
81
+ # Q_NUM_BLKS: The number of Q blocks (that may or may not require masking) for each query.
82
+ # Q_IDX: The indices of Q blocks (that may or may not require masking) for each query.
83
+ # FULL_KV_NUM_BLKS: The number of fully unmasked KV blocks (so we don't need masking) for each query.
84
+ # FULL_KV_IDX: The indices of fully unmasked KV blocks (so we don't need masking) for each query.
85
+ # FULL_Q_NUM_BLKS: The number of fully unmasked Q blocks (so we don't need masking) for each query.
86
+ # FULL_Q_IDX: The indices of fully unmasked Q blocks (so we don't need masking) for each query.
87
+
88
+ # The below are kernel options that can be applied for certain score_mods,
89
+ # or involve a numerics vs. perf tradeoff
90
+ # PRESCALE_QK: Whether to pre-scale QK by 1/sqrt(d) and change of base. Has
91
+ # about 20% more numerical error, but slightly faster.
92
+
93
+ # Define strides of inputs
94
+ stride_qz, stride_qh, stride_qm, stride_qd = 4096*ks0, 128, 4096, 1
95
+ stride_kz, stride_kh, stride_kn, stride_kd = 1024*ks1, 128, 1024, 1
96
+ stride_vz, stride_vh, stride_vn, stride_vd = 1024*ks1, 128, 1024, 1
97
+ stride_doz, stride_doh, stride_dom, stride_dod = 4096*((1) * ((1) >= (ks0)) + (ks0) * ((ks0) > (1))), 128*((1) * ((1) >= (ks0)) + (ks0) * ((ks0) > (1))), 128, 1
98
+
99
+ stride_dqz, stride_dqh, stride_dqm, stride_dqd = 4096*ks0, 128, 4096, 1
100
+ stride_dvz, stride_dvh, stride_dvm, stride_dvd = 1024*ks1, 128, 1024, 1
101
+
102
+ ZQ = 1
103
+ HQ = 32
104
+ HKV = 8
105
+ Q_LEN = ks0
106
+ ZKV = 1
107
+ KV_LEN = ks1
108
+
109
+ MATMUL_PRECISION = Q.dtype.element_ty
110
+
111
+ pid = tl.program_id(0).to(INDEX_DTYPE)
112
+ NUM_KV_BLOCKS = tl.cdiv(KV_LEN, BLOCK_N1)
113
+ NUM_Q_BLOCKS = tl.cdiv(Q_LEN, BLOCK_M2)
114
+
115
+ off_zq = tl.program_id(1).to(INDEX_DTYPE) # q batch idx
116
+ off_hkv = tl.program_id(2).to(INDEX_DTYPE) # kv head idx
117
+ off_zkv = off_zq % ZKV # kv batch idx
118
+
119
+ SPARSE_Z = 1
120
+ SPARSE_HQ = 1
121
+
122
+ sparse_idx_z = off_zq % SPARSE_Z
123
+
124
+ k_adj = (stride_kh * off_hkv + stride_kz * off_zkv).to(tl.int64)
125
+ v_adj = (stride_vh * off_hkv + stride_vz * off_zkv).to(tl.int64)
126
+ # first compute broadcasted dv of shape [Bq, Hkv, KV_LEN, V_HEAD_DIM]
127
+ # then reduce to dv of shape [Bkv, Hkv, KV_LEN, V_HEAD_DIM]
128
+ dv_adj = (stride_dvh * off_hkv + stride_dvz * off_zq).to(tl.int64)
129
+
130
+ # offset K, V, DV pointers for batch/kv-head
131
+ K += k_adj
132
+ V += v_adj
133
+ DV += dv_adj
134
+
135
+ RCP_LN2 = 1.44269504
136
+ offs_k = tl.arange(0, QK_HEAD_DIM_ROUNDED)
137
+ offs_v = tl.arange(0, V_HEAD_DIM_ROUNDED)
138
+
139
+ if pid >= NUM_KV_BLOCKS:
140
+ off_pid = pid - NUM_KV_BLOCKS
141
+ # THIS BLOCK DOES DQ
142
+ SPARSE_Q_MULTIPLE = (SPARSE_Q_BLOCK_SIZE // BLOCK_M2)
143
+ SPARSE_KV_MULTIPLE = (SPARSE_KV_BLOCK_SIZE // BLOCK_N2)
144
+ off_hq2 = off_pid // NUM_Q_BLOCKS + off_hkv * GQA_SHARED_HEADS
145
+ start_m2_block = off_pid % NUM_Q_BLOCKS
146
+ off_pid_mask = start_m2_block // SPARSE_Q_MULTIPLE
147
+ stride_kv_num_blks_h = ks2
148
+ stride_kv_idx_h = ks3*ks4
149
+ stride_kv_idx_m = ks4
150
+
151
+ sparse_idx_hq2 = off_hq2 % SPARSE_HQ
152
+ sparse_hz_offset = sparse_idx_z * SPARSE_HQ + sparse_idx_hq2
153
+
154
+ sparse_kv_num_blks_offset = sparse_hz_offset * stride_kv_num_blks_h + off_pid_mask
155
+ sparse_kv_idx_offset = sparse_hz_offset * stride_kv_idx_h + off_pid_mask * stride_kv_idx_m # noqa: B950
156
+
157
+ # Offset Q, DQ, DO, DELTA & LSE. These inputs are offsetted by query heads.
158
+ q_adj2 = (stride_qh * off_hq2 + stride_qz * off_zq).to(tl.int64)
159
+ do_adj2 = (stride_doh * off_hq2 + stride_doz * off_zq).to(tl.int64)
160
+ dq_adj2 = (stride_dqh * off_hq2 + stride_dqz * off_zq).to(tl.int64)
161
+ off_chz2 = ((off_zq * HQ + off_hq2) * Q_LEN).to(tl.int64)
162
+
163
+ Q2 = Q + q_adj2
164
+ DO2 = DO + do_adj2
165
+ # TODO: This does not work if DQ is not the same layout as Q (for example,
166
+ # if Q is broadcasted)
167
+ DQ2 = DQ + dq_adj2
168
+ LSE2 = LSE + off_chz2
169
+ DELTA2 = DELTA + off_chz2
170
+
171
+ # dq = tl.zeros([BLOCK_M2, QK_HEAD_DIM], dtype=tl.float32)
172
+ dq = tl.zeros([BLOCK_M2, QK_HEAD_DIM_ROUNDED], dtype=tl.float32)
173
+
174
+ start_m2 = start_m2_block * BLOCK_M2
175
+ offs_m2 = start_m2 + tl.arange(0, BLOCK_M2)
176
+
177
+ # load Q and do: they stay in SRAM throughout the inner loop.
178
+ q = load_checked_2d(Q2, offs_m2, offs_k, stride_qm, stride_qd, IS_DIVISIBLE, SAFE_HEAD_DIM, Q_LEN, QK_HEAD_DIM)
179
+ do = load_checked_2d(DO2, offs_m2, offs_v, stride_dom, stride_dod, IS_DIVISIBLE, SAFE_HEAD_DIM, Q_LEN, V_HEAD_DIM)
180
+
181
+ if PRESCALE_QK:
182
+ q = (q * SM_SCALE * RCP_LN2).to(MATMUL_PRECISION)
183
+
184
+ if IS_DIVISIBLE:
185
+ Di = tl.load(DELTA2 + offs_m2)
186
+ lse = tl.load(LSE2 + offs_m2)
187
+ else:
188
+ Di = tl.load(DELTA2 + offs_m2, mask=offs_m2 < Q_LEN)
189
+ lse = tl.load(LSE2 + offs_m2, mask=offs_m2 < Q_LEN)
190
+ lse = tl.where(lse == -float("inf"), 0.0, lse)
191
+ lse = lse[:, None]
192
+
193
+ # ~~~~~~~~~~~ fully unmasked blocks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
194
+ # KV_IDX and KV_NUM_BLKS are always contiguous.
195
+ kv_indices = KV_IDX + sparse_kv_idx_offset
196
+ kv_start = tl.load(kv_indices) * SPARSE_KV_BLOCK_SIZE # first kv block we're loading
197
+ sparse_kv_num_blocks = tl.load(KV_NUM_BLKS + sparse_kv_num_blks_offset)
198
+
199
+ offs_n2 = kv_start + tl.arange(0, BLOCK_N2)
200
+ dq = bwd_dq_inner(
201
+ arg_Q, arg_K, arg_V, arg_LSE, arg_DELTA, arg_DO, arg_DQ, arg_DV, arg_KV_NUM_BLKS, arg_KV_IDX, arg_Q_NUM_BLKS, arg_Q_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, arg_FULL_Q_NUM_BLKS, arg_FULL_Q_IDX, out_ptr0, ks0, ks1, ks2, ks3, ks4,
202
+ K, V,
203
+ dq, q, do, Di, lse,
204
+ off_zq, off_hq2, offs_m2, offs_n2,
205
+ stride_kn, stride_kd, stride_vn, stride_vd,
206
+ kv_indices, sparse_kv_num_blocks,
207
+ MATMUL_PRECISION,
208
+ IS_FULL_BLOCKS=False,
209
+ )
210
+
211
+ if HAS_FULL_BLOCKS:
212
+ # ~~~~~~~~~~~ partial unmasked blocks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
213
+ # FULL_KV_IDX and FULL_KV_NUM_BLKS are always contiguous.
214
+ kv_indices = FULL_KV_IDX + sparse_kv_idx_offset
215
+ kv_start = tl.load(kv_indices) * SPARSE_KV_BLOCK_SIZE # first kv block we're loading
216
+ sparse_kv_num_blocks = tl.load(FULL_KV_NUM_BLKS + sparse_kv_num_blks_offset)
217
+
218
+ offs_n2 = kv_start + tl.arange(0, BLOCK_N2)
219
+ dq = bwd_dq_inner(
220
+ arg_Q, arg_K, arg_V, arg_LSE, arg_DELTA, arg_DO, arg_DQ, arg_DV, arg_KV_NUM_BLKS, arg_KV_IDX, arg_Q_NUM_BLKS, arg_Q_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, arg_FULL_Q_NUM_BLKS, arg_FULL_Q_IDX, out_ptr0, ks0, ks1, ks2, ks3, ks4,
221
+ K, V,
222
+ dq, q, do, Di, lse,
223
+ off_zq, off_hq2, offs_m2, offs_n2,
224
+ stride_kn, stride_kd, stride_vn, stride_vd,
225
+ kv_indices, sparse_kv_num_blocks,
226
+ MATMUL_PRECISION,
227
+ IS_FULL_BLOCKS=True,
228
+ )
229
+
230
+ # Write back dQ.
231
+ dq_ptrs = DQ2 + offs_m2[:, None] * stride_dqm + offs_k[None, :] * stride_dqd
232
+ dq *= SM_SCALE
233
+ if IS_DIVISIBLE and SAFE_HEAD_DIM:
234
+ tl.store(dq_ptrs, dq)
235
+ else:
236
+ tl.store(dq_ptrs, dq, mask=(offs_m2[:, None] < Q_LEN) & (offs_k[None, :] < QK_HEAD_DIM))
237
+ else:
238
+ # THIS BLOCK DOES DK & DV
239
+ SPARSE_Q_MULTIPLE = (SPARSE_Q_BLOCK_SIZE // BLOCK_M1)
240
+ SPARSE_KV_MULTIPLE = (SPARSE_KV_BLOCK_SIZE // BLOCK_N1)
241
+
242
+ pid_mask = pid // SPARSE_KV_MULTIPLE
243
+
244
+ stride_q_num_blks_h = 6
245
+ stride_q_idx_h = 36
246
+ stride_q_idx_n = 6
247
+
248
+
249
+ dv = tl.zeros([BLOCK_N1, V_HEAD_DIM_ROUNDED], dtype=tl.float32)
250
+ dk = tl.zeros([BLOCK_N1, QK_HEAD_DIM_ROUNDED], dtype=tl.float32)
251
+
252
+ start_n1 = pid * BLOCK_N1
253
+ offs_n1 = start_n1 + tl.arange(0, BLOCK_N1)
254
+
255
+ # load K and V: they stay in SRAM throughout the inner loop.
256
+ k = load_checked_2d(K, offs_n1, offs_k, stride_kn, stride_kd, IS_DIVISIBLE, SAFE_HEAD_DIM, KV_LEN, QK_HEAD_DIM)
257
+ v = load_checked_2d(V, offs_n1, offs_v, stride_vn, stride_vd, IS_DIVISIBLE, SAFE_HEAD_DIM, KV_LEN, V_HEAD_DIM)
258
+
259
+ if PRESCALE_QK:
260
+ k = (k * SM_SCALE * RCP_LN2).to(MATMUL_PRECISION)
261
+
262
+ for off_g in range(0, GQA_SHARED_HEADS):
263
+ off_hq1 = off_hkv * GQA_SHARED_HEADS + off_g
264
+
265
+ # Offset Q, DQ, DO, DELTA & LSE. These inputs are offsetted by query heads.
266
+ q_adj1 = (stride_qh * off_hq1 + stride_qz * off_zq).to(tl.int64)
267
+ do_adj1 = (stride_doh * off_hq1 + stride_doz * off_zq).to(tl.int64)
268
+ dq_adj1 = (stride_dqh * off_hq1 + stride_dqz * off_zq).to(tl.int64)
269
+ off_chz1 = ((off_zq * HQ + off_hq1) * Q_LEN).to(tl.int64)
270
+
271
+ Q1 = Q + q_adj1
272
+ DO1 = DO + do_adj1
273
+ # TODO: This does not work if DQ is not the same layout as Q (for example,
274
+ # if Q is broadcasted)
275
+ LSE1 = LSE + off_chz1
276
+ DELTA1 = DELTA + off_chz1
277
+
278
+ sparse_idx_hq1 = off_hq1 % SPARSE_HQ
279
+ sparse_hz_offset = sparse_idx_z * SPARSE_HQ + sparse_idx_hq1
280
+
281
+ sparse_q_num_blks_offset = sparse_hz_offset * stride_q_num_blks_h + pid_mask
282
+ sparse_q_idx_offset = sparse_hz_offset * stride_q_idx_h + pid_mask * stride_q_idx_n # noqa: B950
283
+
284
+ # ~~~~~~~~~~~~~~~ fully unmasked blocks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
285
+ # Q_IDX and Q_NUM_BLKS are always contiguous.
286
+ q_indices = Q_IDX + sparse_q_idx_offset
287
+ q_start = tl.load(q_indices) * SPARSE_Q_BLOCK_SIZE # first q block we're loading
288
+ sparse_q_num_blocks = tl.load(Q_NUM_BLKS + sparse_q_num_blks_offset)
289
+
290
+ offs_m1 = q_start + tl.arange(0, BLOCK_M1)
291
+ dk, dv = bwd_dkdv_inner(
292
+ arg_Q, arg_K, arg_V, arg_LSE, arg_DELTA, arg_DO, arg_DQ, arg_DV, arg_KV_NUM_BLKS, arg_KV_IDX, arg_Q_NUM_BLKS, arg_Q_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, arg_FULL_Q_NUM_BLKS, arg_FULL_Q_IDX, out_ptr0, ks0, ks1, ks2, ks3, ks4,
293
+ Q1, DO1, DELTA1, LSE1,
294
+ dk, dv, k, v,
295
+ off_zq, off_hq1, offs_n1, offs_m1,
296
+ stride_qm, stride_qd, stride_dom, stride_dod,
297
+ q_indices, sparse_q_num_blocks,
298
+ MATMUL_PRECISION,
299
+ IS_FULL_BLOCKS=False,
300
+ )
301
+
302
+
303
+ if HAS_FULL_BLOCKS:
304
+ # ~~~~~~~~~~~~~~~ fully unmasked blocks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
305
+ # FULL_Q_IDX and FULL_Q_NUM_BLKS are always contiguous.
306
+ q_indices = FULL_Q_IDX + sparse_q_idx_offset
307
+ q_start = tl.load(q_indices) * SPARSE_Q_BLOCK_SIZE # first q block we're loading
308
+ sparse_q_num_blocks = tl.load(FULL_Q_NUM_BLKS + sparse_q_num_blks_offset)
309
+
310
+ offs_m1 = q_start + tl.arange(0, BLOCK_M1)
311
+ dk, dv = bwd_dkdv_inner(
312
+ arg_Q, arg_K, arg_V, arg_LSE, arg_DELTA, arg_DO, arg_DQ, arg_DV, arg_KV_NUM_BLKS, arg_KV_IDX, arg_Q_NUM_BLKS, arg_Q_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, arg_FULL_Q_NUM_BLKS, arg_FULL_Q_IDX, out_ptr0, ks0, ks1, ks2, ks3, ks4,
313
+ Q1, DO1, DELTA1, LSE1,
314
+ dk, dv, k, v,
315
+ off_zq, off_hq1, offs_n1, offs_m1,
316
+ stride_qm, stride_qd, stride_dom, stride_dod,
317
+ q_indices, sparse_q_num_blocks,
318
+ MATMUL_PRECISION,
319
+ IS_FULL_BLOCKS=True,
320
+ )
321
+
322
+ # Write back dV and dK.
323
+ dv_ptrs = DV + offs_n1[:, None] * stride_dvm + offs_v[None, :] * stride_dvd
324
+
325
+ index_n = offs_n1[:, None]
326
+ index_k = offs_k[None, :]
327
+ index_v = offs_v[None, :]
328
+
329
+ if IS_DIVISIBLE and SAFE_HEAD_DIM:
330
+ tl.store(dv_ptrs, dv)
331
+ else:
332
+ tl.store(dv_ptrs, dv, mask=(index_n < KV_LEN) & (index_v < V_HEAD_DIM))
333
+
334
+ dk *= SM_SCALE
335
+
336
+ if SAFE_HEAD_DIM:
337
+ mask = index_n < KV_LEN
338
+ else:
339
+ mask = (index_n < KV_LEN) & (index_k < QK_HEAD_DIM)
340
+
341
+ # first compute broadcasted dk of shape [Bq, Hkv, KV_LEN, V_HEAD_DIM]
342
+ # then reduce to dk of shape [Bkv, Hkv, KV_LEN, V_HEAD_DIM]
343
+ tl.static_assert(dk.shape == [BLOCK_N1, QK_HEAD_DIM_ROUNDED])
344
+ xindex = index_k + 128*index_n + 128*off_hkv*ks1 + 1024*off_zq*ks1
345
+ tl.store(out_ptr0 + (tl.broadcast_to(index_k + 128*off_hkv + 1024*index_n, dk.shape)), dk, mask)
346
+
347
+ @triton.jit
348
+ def bwd_dq_inner(
349
+ arg_Q, arg_K, arg_V, arg_LSE, arg_DELTA, arg_DO, arg_DQ, arg_DV, arg_KV_NUM_BLKS, arg_KV_IDX, arg_Q_NUM_BLKS, arg_Q_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, arg_FULL_Q_NUM_BLKS, arg_FULL_Q_IDX, out_ptr0, ks0, ks1, ks2, ks3, ks4,
350
+ K, V, # pointers
351
+ dq, q, do, Di, lse,
352
+ off_z, off_hq, offs_m2, offs_n2,
353
+ stride_kn, stride_kd, stride_vn, stride_vd,
354
+ kv_indices, sparse_kv_num_blocks,
355
+ MATMUL_PRECISION,
356
+ IS_FULL_BLOCKS,
357
+ ):
358
+ PRESCALE_QK : tl.constexpr = False
359
+ ROWS_GUARANTEED_SAFE : tl.constexpr = False
360
+ BLOCKS_ARE_CONTIGUOUS : tl.constexpr = False
361
+ WRITE_DQ : tl.constexpr = True
362
+ OUTPUT_LOGSUMEXP : tl.constexpr = True
363
+ OUTPUT_MAX : tl.constexpr = False
364
+ FLOAT32_PRECISION : tl.constexpr = 'tf32'
365
+ IS_DIVISIBLE : tl.constexpr = False
366
+ SM_SCALE : tl.constexpr = 0.08838834764831845
367
+ GQA_SHARED_HEADS : tl.constexpr = 4
368
+ HAS_FULL_BLOCKS : tl.constexpr = True
369
+ QK_HEAD_DIM : tl.constexpr = 128
370
+ QK_HEAD_DIM_ROUNDED : tl.constexpr = 128
371
+ V_HEAD_DIM : tl.constexpr = 128
372
+ V_HEAD_DIM_ROUNDED : tl.constexpr = 128
373
+ SAFE_HEAD_DIM : tl.constexpr = True
374
+ BLOCK_M1 : tl.constexpr = 64
375
+ BLOCK_N1 : tl.constexpr = 128
376
+ BLOCK_M2 : tl.constexpr = 128
377
+ BLOCK_N2 : tl.constexpr = 64
378
+ SPARSE_Q_BLOCK_SIZE : tl.constexpr = 128
379
+ SPARSE_KV_BLOCK_SIZE : tl.constexpr = 128
380
+ INDEX_DTYPE : tl.constexpr = tl.int32
381
+
382
+ SPARSE_KV_MULTIPLE: tl.constexpr = (SPARSE_KV_BLOCK_SIZE // BLOCK_N2)
383
+ RCP_LN2: tl.constexpr = 1.44269504
384
+ Q_LEN = ks0
385
+ KV_LEN = ks1
386
+
387
+ offs_k = tl.arange(0, QK_HEAD_DIM_ROUNDED)
388
+ offs_v = tl.arange(0, V_HEAD_DIM_ROUNDED)
389
+
390
+ kT_ptrs = K + offs_n2[None, :] * stride_kn + offs_k[:, None] * stride_kd
391
+ vT_ptrs = V + offs_n2[None, :] * stride_vn + offs_v[:, None] * stride_vd
392
+ # BLOCK_M2 must be a multiple of BLOCK_N2, otherwise the code wouldn't work.
393
+ tl.static_assert(BLOCK_M2 % BLOCK_N2 == 0)
394
+
395
+ hi = tl.minimum(sparse_kv_num_blocks * SPARSE_KV_MULTIPLE, tl.maximum(tl.cdiv(KV_LEN, BLOCK_N2), 1))
396
+
397
+ for start_n in range(0, hi):
398
+ dq = bwd_dq_block_mn(
399
+ arg_Q, arg_K, arg_V, arg_LSE, arg_DELTA, arg_DO, arg_DQ, arg_DV, arg_KV_NUM_BLKS, arg_KV_IDX, arg_Q_NUM_BLKS, arg_Q_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, arg_FULL_Q_NUM_BLKS, arg_FULL_Q_IDX, out_ptr0, ks0, ks1, ks2, ks3, ks4,
400
+ dq, q, kT_ptrs, vT_ptrs, do, Di, lse, Q_LEN, KV_LEN,
401
+ off_z, off_hq, offs_m2, offs_n2, offs_k, offs_v,
402
+ stride_kn, stride_kd, stride_vn, stride_vd,
403
+ kv_indices, sparse_kv_num_blocks,
404
+ MATMUL_PRECISION, RCP_LN2,
405
+ IS_FULL_BLOCKS,
406
+ )
407
+
408
+ # Increment pointers.
409
+ offset = get_offset_for_next_block(
410
+ start_n, kv_indices, sparse_kv_num_blocks,
411
+ SPARSE_KV_BLOCK_SIZE, SPARSE_KV_MULTIPLE, BLOCK_N2, BLOCKS_ARE_CONTIGUOUS
412
+ )
413
+
414
+ kT_ptrs += offset * stride_kn
415
+ vT_ptrs += offset * stride_vn
416
+
417
+ offs_n2 += offset
418
+
419
+ return dq
420
+
421
+
422
+ @triton.jit
423
+ def bwd_dq_block_mn(
424
+ arg_Q, arg_K, arg_V, arg_LSE, arg_DELTA, arg_DO, arg_DQ, arg_DV, arg_KV_NUM_BLKS, arg_KV_IDX, arg_Q_NUM_BLKS, arg_Q_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, arg_FULL_Q_NUM_BLKS, arg_FULL_Q_IDX, out_ptr0, ks0, ks1, ks2, ks3, ks4,
425
+ dq, q, kT_ptrs, vT_ptrs, do, Di, lse, Q_LEN, KV_LEN,
426
+ off_z, off_hq, offs_m2, offs_n2, offs_k, offs_v,
427
+ stride_kn, stride_kd, stride_vn, stride_vd,
428
+ kv_indices, sparse_kv_num_blocks,
429
+ MATMUL_PRECISION, RCP_LN2,
430
+ IS_FULL_BLOCKS,
431
+ ):
432
+ PRESCALE_QK : tl.constexpr = False
433
+ ROWS_GUARANTEED_SAFE : tl.constexpr = False
434
+ BLOCKS_ARE_CONTIGUOUS : tl.constexpr = False
435
+ WRITE_DQ : tl.constexpr = True
436
+ OUTPUT_LOGSUMEXP : tl.constexpr = True
437
+ OUTPUT_MAX : tl.constexpr = False
438
+ FLOAT32_PRECISION : tl.constexpr = 'tf32'
439
+ IS_DIVISIBLE : tl.constexpr = False
440
+ SM_SCALE : tl.constexpr = 0.08838834764831845
441
+ GQA_SHARED_HEADS : tl.constexpr = 4
442
+ HAS_FULL_BLOCKS : tl.constexpr = True
443
+ QK_HEAD_DIM : tl.constexpr = 128
444
+ QK_HEAD_DIM_ROUNDED : tl.constexpr = 128
445
+ V_HEAD_DIM : tl.constexpr = 128
446
+ V_HEAD_DIM_ROUNDED : tl.constexpr = 128
447
+ SAFE_HEAD_DIM : tl.constexpr = True
448
+ BLOCK_M1 : tl.constexpr = 64
449
+ BLOCK_N1 : tl.constexpr = 128
450
+ BLOCK_M2 : tl.constexpr = 128
451
+ BLOCK_N2 : tl.constexpr = 64
452
+ SPARSE_Q_BLOCK_SIZE : tl.constexpr = 128
453
+ SPARSE_KV_BLOCK_SIZE : tl.constexpr = 128
454
+ INDEX_DTYPE : tl.constexpr = tl.int32
455
+
456
+
457
+ # NB reversed order to since K is transposed
458
+ kT = load_checked_2d(kT_ptrs, offs_k, offs_n2, None, None, SAFE_HEAD_DIM, IS_DIVISIBLE, QK_HEAD_DIM, KV_LEN)
459
+ qk = tl.dot(q, kT, input_precision=FLOAT32_PRECISION)
460
+ if not PRESCALE_QK:
461
+ qk *= SM_SCALE
462
+ # ~~~~~~~~~~~~~~~~~~~ Apply score modification ~~~~~~~~~~~~~~~~~~~
463
+ pre_mod_scores = qk
464
+ n = get_bounded_indices(offs_n2[None, :], KV_LEN if not IS_DIVISIBLE else None)
465
+ # The boundary check is done for the outer loop, but here it's possible since we're iterating across N dim
466
+ # that the M reads out of bounds for the PIDS spanning the Q_LEN boundary
467
+ m = get_bounded_indices(offs_m2[:, None], Q_LEN if not IS_DIVISIBLE else None)
468
+
469
+ tmp0 = (qk)
470
+ post_mod_scores = tmp0
471
+
472
+
473
+
474
+
475
+ if not IS_DIVISIBLE:
476
+ post_mod_scores = tl.where(offs_n2[None, :] < KV_LEN, post_mod_scores, float("-inf"))
477
+
478
+ if not IS_FULL_BLOCKS:
479
+ tmp1 = (m)
480
+ tmp2 = tl.full([1], 0, tl.int32)
481
+ tmp3 = tmp1 < tmp2
482
+ tmp4 = (n)
483
+ tmp5 = tmp4 <= tmp1
484
+ tmp6 = tmp3 & tmp5
485
+ tmp7 = tmp1 >= tmp2
486
+ tmp8 = tmp4 < tmp2
487
+ tmp9 = tmp7 & tmp8
488
+ tmp10 = tmp8 == 0
489
+ tmp11 = tmp7 & tmp10
490
+ tmp12 = tmp1 - tmp2
491
+ tmp13 = tl.full([1], 16, tl.int32)
492
+ tmp14 = tl.where((tmp12 < 0) != (tmp13 < 0), tl.where(tmp12 % tmp13 != 0, tmp12 // tmp13 - 1, tmp12 // tmp13), tmp12 // tmp13)
493
+ tmp15 = tmp4 - tmp2
494
+ tmp16 = tl.where((tmp15 < 0) != (tmp13 < 0), tl.where(tmp15 % tmp13 != 0, tmp15 // tmp13 - 1, tmp15 // tmp13), tmp15 // tmp13)
495
+ tmp17 = tmp14 == tmp16
496
+ tmp18 = tmp11 & tmp17
497
+ tmp19 = tmp9 | tmp18
498
+ tmp20 = tmp6 | tmp19
499
+ mask_mod_output = tmp20
500
+
501
+
502
+ # apply mask for partial masked block
503
+ post_mod_scores = tl.where(mask_mod_output, post_mod_scores, float("-inf"))
504
+ # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
505
+ if not PRESCALE_QK:
506
+ post_mod_scores *= RCP_LN2
507
+ p = tl.math.exp2(post_mod_scores - lse)
508
+ # Compute dP and dS.
509
+ # NB reversed order to since V is transposed
510
+ vT = load_checked_2d(vT_ptrs, offs_v, offs_n2, None, None, SAFE_HEAD_DIM, IS_DIVISIBLE, V_HEAD_DIM, KV_LEN)
511
+
512
+ dp = tl.dot(do, vT, input_precision=FLOAT32_PRECISION)
513
+ ds = p * (dp - Di[:, None])
514
+ # ~~~~~~~~~~~~~~~~~~~ Apply joint modification ~~~~~~~~~~~~~~~~~~~
515
+ tmp21 = (ds)
516
+ grad_scores = tmp21
517
+
518
+
519
+ if not IS_DIVISIBLE:
520
+ grad_scores = tl.where(offs_n2[None, :] < KV_LEN, grad_scores, 0.0)
521
+
522
+ # ~~~~~~~~~~~~~~~~~~~ Apply other buffer grad writes ~~~~~~~~~~~~~
523
+ if WRITE_DQ:
524
+ scatter_mask = (offs_m2[:, None] < Q_LEN ) & (offs_n2[None, :] < KV_LEN)
525
+
526
+ # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
527
+ ds = grad_scores
528
+
529
+ if not IS_FULL_BLOCKS:
530
+ # (grads) apply mask for partially unmasked block
531
+ ds = tl.where(mask_mod_output, ds, 0.0)
532
+ # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
533
+ ds = ds.to(MATMUL_PRECISION)
534
+ # Compute dQ.
535
+ dq += tl.dot(ds, tl.trans(kT), input_precision=FLOAT32_PRECISION)
536
+
537
+ return dq
538
+
539
+
540
+ @triton.jit
541
+ def bwd_dkdv_inner(
542
+ arg_Q, arg_K, arg_V, arg_LSE, arg_DELTA, arg_DO, arg_DQ, arg_DV, arg_KV_NUM_BLKS, arg_KV_IDX, arg_Q_NUM_BLKS, arg_Q_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, arg_FULL_Q_NUM_BLKS, arg_FULL_Q_IDX, out_ptr0, ks0, ks1, ks2, ks3, ks4,
543
+ Q, DO, DELTA, LSE, # pointers
544
+ dk, dv, k, v,
545
+ off_z, off_hq, offs_n1, offs_m1,
546
+ stride_qm, stride_qd, stride_dom, stride_dod,
547
+ q_indices, sparse_q_num_blocks,
548
+ MATMUL_PRECISION,
549
+ IS_FULL_BLOCKS,
550
+ ):
551
+ PRESCALE_QK : tl.constexpr = False
552
+ ROWS_GUARANTEED_SAFE : tl.constexpr = False
553
+ BLOCKS_ARE_CONTIGUOUS : tl.constexpr = False
554
+ WRITE_DQ : tl.constexpr = True
555
+ OUTPUT_LOGSUMEXP : tl.constexpr = True
556
+ OUTPUT_MAX : tl.constexpr = False
557
+ FLOAT32_PRECISION : tl.constexpr = 'tf32'
558
+ IS_DIVISIBLE : tl.constexpr = False
559
+ SM_SCALE : tl.constexpr = 0.08838834764831845
560
+ GQA_SHARED_HEADS : tl.constexpr = 4
561
+ HAS_FULL_BLOCKS : tl.constexpr = True
562
+ QK_HEAD_DIM : tl.constexpr = 128
563
+ QK_HEAD_DIM_ROUNDED : tl.constexpr = 128
564
+ V_HEAD_DIM : tl.constexpr = 128
565
+ V_HEAD_DIM_ROUNDED : tl.constexpr = 128
566
+ SAFE_HEAD_DIM : tl.constexpr = True
567
+ BLOCK_M1 : tl.constexpr = 64
568
+ BLOCK_N1 : tl.constexpr = 128
569
+ BLOCK_M2 : tl.constexpr = 128
570
+ BLOCK_N2 : tl.constexpr = 64
571
+ SPARSE_Q_BLOCK_SIZE : tl.constexpr = 128
572
+ SPARSE_KV_BLOCK_SIZE : tl.constexpr = 128
573
+ INDEX_DTYPE : tl.constexpr = tl.int32
574
+
575
+ SPARSE_Q_MULTIPLE: tl.constexpr = (SPARSE_Q_BLOCK_SIZE // BLOCK_M1)
576
+ RCP_LN2: tl.constexpr = 1.44269504
577
+ Q_LEN = ks0
578
+ KV_LEN = ks1
579
+
580
+ offs_k = tl.arange(0, QK_HEAD_DIM_ROUNDED)
581
+ offs_v = tl.arange(0, V_HEAD_DIM_ROUNDED)
582
+
583
+ qT_ptrs = Q + offs_m1[None, :] * stride_qm + offs_k[:, None] * stride_qd
584
+ do_ptrs = DO + offs_m1[:, None] * stride_dom + offs_v[None, :] * stride_dod
585
+ # BLOCK_N1 must be a multiple of BLOCK_M1, otherwise the code wouldn't work.
586
+ tl.static_assert(BLOCK_N1 % BLOCK_M1 == 0)
587
+
588
+ # The minimum is needed to handle the case where we run with a super large
589
+ # SPARSE_BLOCK_SIZE (i.e. no block-mask!)
590
+ hi = tl.minimum(sparse_q_num_blocks * SPARSE_Q_MULTIPLE, tl.maximum(tl.cdiv(Q_LEN, BLOCK_M1), 1))
591
+
592
+ for start_m in range(0, hi):
593
+ dk, dv = bwd_dkdv_block_mn(
594
+ arg_Q, arg_K, arg_V, arg_LSE, arg_DELTA, arg_DO, arg_DQ, arg_DV, arg_KV_NUM_BLKS, arg_KV_IDX, arg_Q_NUM_BLKS, arg_Q_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, arg_FULL_Q_NUM_BLKS, arg_FULL_Q_IDX, out_ptr0, ks0, ks1, ks2, ks3, ks4,
595
+ dk, dv, qT_ptrs, k, v, do_ptrs, DELTA, LSE, Q_LEN, KV_LEN,
596
+ off_z, off_hq, offs_n1, offs_m1, offs_k, offs_v,
597
+ stride_qm, stride_qd, stride_dom, stride_dod,
598
+ q_indices, sparse_q_num_blocks,
599
+ MATMUL_PRECISION, RCP_LN2,
600
+ IS_FULL_BLOCKS,
601
+ )
602
+ # Increment pointers.
603
+ offset = get_offset_for_next_block(
604
+ start_m, q_indices, sparse_q_num_blocks,
605
+ SPARSE_Q_BLOCK_SIZE, SPARSE_Q_MULTIPLE, BLOCK_M1, BLOCKS_ARE_CONTIGUOUS
606
+ )
607
+
608
+ qT_ptrs += offset * stride_qm
609
+ do_ptrs += offset * stride_dom
610
+ offs_m1 += offset
611
+
612
+ return dk, dv
613
+
614
+
615
+ @triton.jit
616
+ def bwd_dkdv_block_mn(
617
+ arg_Q, arg_K, arg_V, arg_LSE, arg_DELTA, arg_DO, arg_DQ, arg_DV, arg_KV_NUM_BLKS, arg_KV_IDX, arg_Q_NUM_BLKS, arg_Q_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, arg_FULL_Q_NUM_BLKS, arg_FULL_Q_IDX, out_ptr0, ks0, ks1, ks2, ks3, ks4,
618
+ dk, dv, qT_ptrs, k, v, do_ptrs, DELTA, LSE, Q_LEN, KV_LEN,
619
+ off_z, off_hq, offs_n1, offs_m1, offs_k, offs_v,
620
+ stride_qm, stride_qd, stride_dom, stride_dod,
621
+ q_indices, sparse_q_num_blocks,
622
+ MATMUL_PRECISION, RCP_LN2,
623
+ IS_FULL_BLOCKS,
624
+ ):
625
+ PRESCALE_QK : tl.constexpr = False
626
+ ROWS_GUARANTEED_SAFE : tl.constexpr = False
627
+ BLOCKS_ARE_CONTIGUOUS : tl.constexpr = False
628
+ WRITE_DQ : tl.constexpr = True
629
+ OUTPUT_LOGSUMEXP : tl.constexpr = True
630
+ OUTPUT_MAX : tl.constexpr = False
631
+ FLOAT32_PRECISION : tl.constexpr = 'tf32'
632
+ IS_DIVISIBLE : tl.constexpr = False
633
+ SM_SCALE : tl.constexpr = 0.08838834764831845
634
+ GQA_SHARED_HEADS : tl.constexpr = 4
635
+ HAS_FULL_BLOCKS : tl.constexpr = True
636
+ QK_HEAD_DIM : tl.constexpr = 128
637
+ QK_HEAD_DIM_ROUNDED : tl.constexpr = 128
638
+ V_HEAD_DIM : tl.constexpr = 128
639
+ V_HEAD_DIM_ROUNDED : tl.constexpr = 128
640
+ SAFE_HEAD_DIM : tl.constexpr = True
641
+ BLOCK_M1 : tl.constexpr = 64
642
+ BLOCK_N1 : tl.constexpr = 128
643
+ BLOCK_M2 : tl.constexpr = 128
644
+ BLOCK_N2 : tl.constexpr = 64
645
+ SPARSE_Q_BLOCK_SIZE : tl.constexpr = 128
646
+ SPARSE_KV_BLOCK_SIZE : tl.constexpr = 128
647
+ INDEX_DTYPE : tl.constexpr = tl.int32
648
+
649
+
650
+ # NB reversed order since Q is transposed
651
+ qT = load_checked_2d(qT_ptrs, offs_k, offs_m1, None, None, SAFE_HEAD_DIM, IS_DIVISIBLE, QK_HEAD_DIM, Q_LEN)
652
+ # Load LSE before computing qk to reduce pipeline stall.
653
+ if IS_DIVISIBLE:
654
+ lse = tl.load(LSE + offs_m1)
655
+ else:
656
+ lse = tl.load(LSE + offs_m1, mask=offs_m1 < Q_LEN)
657
+ lse = tl.where(lse == -float("inf"), 0.0, lse)
658
+ qkT = tl.dot(k, qT, input_precision=FLOAT32_PRECISION)
659
+ if not PRESCALE_QK:
660
+ qkT *= SM_SCALE
661
+ # ~~~~~~~~~~~~~~~~~~~ Apply score modification ~~~~~~~~~~~~~~~~~~~
662
+ m = get_bounded_indices(offs_m1[None, :], Q_LEN if not IS_DIVISIBLE else None)
663
+ # The boundary check is done for the outer loop, but here it's possible since we're iterating across M dim
664
+ # that the n reads out of bounds for the PIDS spanning the KV_LEN boundary
665
+ n = get_bounded_indices(offs_n1[:, None], KV_LEN if not IS_DIVISIBLE else None)
666
+
667
+ pre_mod_scores = qkT
668
+ tmp22 = (qkT)
669
+ post_mod_scores = tmp22
670
+
671
+
672
+
673
+ if not IS_DIVISIBLE:
674
+ post_mod_scores = tl.where(offs_m1[None, :] < Q_LEN, post_mod_scores, float("-inf"))
675
+
676
+ if not IS_FULL_BLOCKS:
677
+ tmp23 = (m)
678
+ tmp24 = tl.full([1], 0, tl.int32)
679
+ tmp25 = tmp23 < tmp24
680
+ tmp26 = (n)
681
+ tmp27 = tmp26 <= tmp23
682
+ tmp28 = tmp25 & tmp27
683
+ tmp29 = tmp23 >= tmp24
684
+ tmp30 = tmp26 < tmp24
685
+ tmp31 = tmp29 & tmp30
686
+ tmp32 = tmp30 == 0
687
+ tmp33 = tmp29 & tmp32
688
+ tmp34 = tmp23 - tmp24
689
+ tmp35 = tl.full([1], 16, tl.int32)
690
+ tmp36 = tl.where((tmp34 < 0) != (tmp35 < 0), tl.where(tmp34 % tmp35 != 0, tmp34 // tmp35 - 1, tmp34 // tmp35), tmp34 // tmp35)
691
+ tmp37 = tmp26 - tmp24
692
+ tmp38 = tl.where((tmp37 < 0) != (tmp35 < 0), tl.where(tmp37 % tmp35 != 0, tmp37 // tmp35 - 1, tmp37 // tmp35), tmp37 // tmp35)
693
+ tmp39 = tmp36 == tmp38
694
+ tmp40 = tmp33 & tmp39
695
+ tmp41 = tmp31 | tmp40
696
+ tmp42 = tmp28 | tmp41
697
+ mask_mod_output = tmp42
698
+
699
+ # (grads) apply mask for fully masked block
700
+ post_mod_scores = tl.where(mask_mod_output, post_mod_scores, float("-inf"))
701
+ # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
702
+ if not PRESCALE_QK:
703
+ post_mod_scores *= RCP_LN2
704
+ pT = tl.math.exp2(post_mod_scores - lse[None, :])
705
+ do = load_checked_2d(do_ptrs, offs_m1, offs_v, None, None, IS_DIVISIBLE, SAFE_HEAD_DIM, Q_LEN, V_HEAD_DIM)
706
+ # Compute dV.
707
+ ppT = pT
708
+ dv += tl.dot(ppT.to(MATMUL_PRECISION), do, input_precision=FLOAT32_PRECISION)
709
+ if IS_DIVISIBLE:
710
+ Di = tl.load(DELTA + offs_m1)
711
+ else:
712
+ Di = tl.load(DELTA + offs_m1, mask=offs_m1 < Q_LEN)
713
+ # Compute dP and dS.
714
+ dpT = tl.dot(v, tl.trans(do), input_precision=FLOAT32_PRECISION)
715
+ dsT = pT * (dpT - Di[None, :])
716
+ # ~~~~~~~~~~~~~~~~~~~ Apply joint modification ~~~~~~~~~~~~~~~~~~~
717
+ tmp43 = (dsT)
718
+ grad_scores = tmp43
719
+
720
+
721
+
722
+ if not IS_DIVISIBLE:
723
+ grad_scores = tl.where(offs_m1[None, :] < Q_LEN, grad_scores, 0.0)
724
+
725
+ # ~~~~~~~~~~~~~~~~~~~ Apply other buffer grad writes ~~~~~~~~~~~~~
726
+ if not WRITE_DQ:
727
+ idx_b = off_z
728
+ idx_h = off_hq
729
+ idx_m = m
730
+ idx_n = n
731
+ scatter_mask = (offs_m1[None, :] < Q_LEN) & (offs_n1[:, None] < KV_LEN)
732
+
733
+ # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
734
+ dsT = grad_scores
735
+ if not IS_FULL_BLOCKS:
736
+ # (grads) apply mask for partially unmasked block
737
+ dsT = tl.where(mask_mod_output, dsT, 0.0)
738
+ # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
739
+ dk += tl.dot(dsT.to(MATMUL_PRECISION), tl.trans(qT), input_precision=FLOAT32_PRECISION)
740
+
741
+ return dk, dv
742
+
743
+ # Utility triton funcs
744
+ @triton.jit
745
+ def get_offset_for_next_block(
746
+ loop_iter, col_indices, total_blocks,
747
+ SPARSE_BLOCK, SPARSE_BLOCK_MULTIPLE, BLOCK,
748
+ BLOCKS_ARE_CONTIGUOUS: tl.constexpr
749
+ ):
750
+ if BLOCKS_ARE_CONTIGUOUS:
751
+ return BLOCK
752
+ cur_block_idx = loop_iter // SPARSE_BLOCK_MULTIPLE
753
+ cur_block = tl.load(col_indices + cur_block_idx, eviction_policy="evict_last")
754
+ next_block = tl.load(col_indices + cur_block_idx + 1, eviction_policy="evict_last", mask=cur_block_idx + 1 < total_blocks)
755
+ needs_jump = (loop_iter + 1) % SPARSE_BLOCK_MULTIPLE == 0
756
+ jump_to_block = (next_block - cur_block ) * SPARSE_BLOCK - (SPARSE_BLOCK_MULTIPLE - 1) * BLOCK
757
+ offset = jump_to_block * needs_jump + (1 - needs_jump) * BLOCK
758
+ return offset
759
+
760
+ @triton.jit
761
+ def get_bounded_indices(indices, max_len=None):
762
+ return indices % max_len if max_len is not None else indices
763
+
764
+ @triton.jit
765
+ def load_checked_block(block_ptr, IS_DIVISIBLE: tl.constexpr, SAFE_HEAD_DIM: tl.constexpr):
766
+ if IS_DIVISIBLE and SAFE_HEAD_DIM:
767
+ return tl.load(block_ptr)
768
+ elif IS_DIVISIBLE and not SAFE_HEAD_DIM:
769
+ return tl.load(block_ptr, boundary_check=(1,), padding_option="zero")
770
+ elif not IS_DIVISIBLE and SAFE_HEAD_DIM:
771
+ return tl.load(block_ptr, boundary_check=(0,), padding_option="zero")
772
+ else:
773
+ return tl.load(block_ptr, boundary_check=(0, 1), padding_option="zero")
774
+
775
+ @triton.jit
776
+ def load_checked_2d(
777
+ ptr,
778
+ offs_m,
779
+ offs_n,
780
+ stride_m,
781
+ stride_n,
782
+ IS_DIVISIBLE_M: tl.constexpr,
783
+ IS_DIVISIBLE_N: tl.constexpr,
784
+ M_LEN: tl.constexpr,
785
+ N_LEN: tl.constexpr,
786
+ ):
787
+ # Calculate final pointer if strides are provided
788
+ if stride_m is not None and stride_n is not None:
789
+ ptr = ptr + offs_m[:, None] * stride_m + offs_n[None, :] * stride_n
790
+
791
+ # Handle all masking cases
792
+ if not IS_DIVISIBLE_M and not IS_DIVISIBLE_N:
793
+ return tl.load(ptr, mask=(offs_m[:, None] < M_LEN) & (offs_n[None, :] < N_LEN), other=0.0)
794
+ elif IS_DIVISIBLE_M and not IS_DIVISIBLE_N:
795
+ return tl.load(ptr, mask=(offs_n[None, :] < N_LEN), other=0.0)
796
+ elif not IS_DIVISIBLE_M and IS_DIVISIBLE_N:
797
+ return tl.load(ptr, mask=(offs_m[:, None] < M_LEN), other=0.0)
798
+ else: # Both divisible
799
+ return tl.load(ptr)
progress/SpecForge/cache/compiled_kernels/2d/c2d4e47kqxxnp6455gvkteqq3r336462zkbitosyeko6znxktn2b.py ADDED
@@ -0,0 +1,879 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # AOT ID: ['3_inference']
2
+ from ctypes import c_void_p, c_long, c_int
3
+ import torch
4
+ import math
5
+ import random
6
+ import os
7
+ import tempfile
8
+ from math import inf, nan
9
+ from cmath import nanj
10
+ from torch._inductor.hooks import run_intermediate_hooks
11
+ from torch._inductor.utils import maybe_profile
12
+ from torch._inductor.codegen.memory_planning import _align as align
13
+ from torch import device, empty_strided
14
+ from torch._inductor.async_compile import AsyncCompile
15
+ from torch._inductor.select_algorithm import extern_kernels
16
+ import triton
17
+ import triton.language as tl
18
+ from torch._inductor.runtime.triton_heuristics import start_graph, end_graph
19
+ from torch._C import _cuda_getCurrentRawStream as get_raw_stream
20
+
21
+ aten = torch.ops.aten
22
+ inductor_ops = torch.ops.inductor
23
+ _quantized = torch.ops._quantized
24
+ assert_size_stride = torch._C._dynamo.guards.assert_size_stride
25
+ assert_alignment = torch._C._dynamo.guards.assert_alignment
26
+ empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
27
+ empty_strided_cpu_pinned = torch._C._dynamo.guards._empty_strided_cpu_pinned
28
+ empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
29
+ empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu
30
+ empty_strided_mtia = torch._C._dynamo.guards._empty_strided_mtia
31
+ reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor
32
+ alloc_from_pool = torch.ops.inductor._alloc_from_pool
33
+ async_compile = AsyncCompile()
34
+ empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p
35
+
36
+
37
+ # kernel path: /workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7g/c7gxkvfztxetv7w7i4s7mr7dlsdda3dfgq3f3uijvhozq6ggk4o4.py
38
+ # Topologically Sorted Source Nodes: [flex_attention], Original ATen: []
39
+ # Source node to ATen node mapping:
40
+ # flex_attention => flex_attention
41
+ # Graph fragment:
42
+ # %arg1_1 : Tensor "bf16[1, 32, s37, 128][4096*s37, 128, 4096, 1]cuda:3" = PlaceHolder[target=arg1_1]
43
+ # %arg3_1 : Tensor "bf16[1, 8, s0, 128][1024*s0, 128, 1024, 1]cuda:3" = PlaceHolder[target=arg3_1]
44
+ # %arg5_1 : Tensor "bf16[1, 8, s43, 128][1024*s43, 128, 1024, 1]cuda:3" = PlaceHolder[target=arg5_1]
45
+ # %buf0 : Tensor "f32[1, 32, 32, s37][1024*s37, 32*s37, s37, 1]cuda:3" = PlaceHolder[target=buf0]
46
+ # %buf1 : Tensor "f32[1, 32, 32, s37][1024*s37, 32*s37, s37, 1]cuda:3" = PlaceHolder[target=buf1]
47
+ # %arg9_1 : Tensor "i32[1, 1, 1][1, 1, 1]cuda:3" = PlaceHolder[target=arg9_1]
48
+ # %arg6_1 : Tensor "i32[1, 1, 1, 1][1, 1, 1, 1]cuda:3" = PlaceHolder[target=arg6_1]
49
+ # %arg10_1 : Tensor "i32[1, 1, 1][1, 1, 1]cuda:3" = PlaceHolder[target=arg10_1]
50
+ # %arg11_1 : Tensor "i32[1, 1, 1, 1][1, 1, 1, 1]cuda:3" = PlaceHolder[target=arg11_1]
51
+ # %flex_attention : [num_users=2] = call_function[target=torch.ops.higher_order.flex_attention](args = (%arg1_1, %arg3_1, %arg5_1, %sdpa_score0, (%arg7_1, %arg8_1, %arg9_1, %arg6_1, %arg10_1, %arg11_1, %arg12_1, %arg13_1, %arg14_1, %arg15_1, 128, 128, %sdpa_mask0), 0.08838834764831845, {PRESCALE_QK: False, ROWS_GUARANTEED_SAFE: False, BLOCKS_ARE_CONTIGUOUS: False, WRITE_DQ: True, OUTPUT_LOGSUMEXP: True, OUTPUT_MAX: False}, (), ()), kwargs = {})
52
+ # return %buf2
53
+ triton_tem_fused_0 = async_compile.triton('triton_tem_fused_0', '''
54
+ import triton
55
+ import triton.language as tl
56
+
57
+ from torch._inductor.runtime import triton_helpers, triton_heuristics
58
+ from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
59
+ from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
60
+
61
+ @triton_heuristics.template(
62
+
63
+ num_stages=3,
64
+ num_warps=2,
65
+ triton_meta={'signature': {'arg_Q': '*bf16', 'arg_K': '*bf16', 'arg_V': '*bf16', 'arg_M': '*fp32', 'arg_L': '*fp32', 'arg_KV_NUM_BLKS': '*i32', 'arg_KV_IDX': '*i32', 'arg_FULL_KV_NUM_BLKS': '*i32', 'arg_FULL_KV_IDX': '*i32', 'out_ptr0': '*fp32', 'ks0': 'i32', 'ks1': 'i32', 'ks2': 'i32'}, 'device': DeviceProperties(type='cuda', index=3, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]], (7,): [['tt.divisibility', 16]], (8,): [['tt.divisibility', 16]], (9,): [['tt.divisibility', 16]]}]},
66
+ inductor_meta={'kernel_name': 'triton_tem_fused_0', 'backend_hash': 'B0E5936CA26D1BCD1B577D0B65F13FE7553DC941EB93358973E9F1902BC212C5', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'grid_type': 'FixedGrid', 'fixed_grid': ['_grid_0', '_grid_1', '_grid_2'], 'extra_launcher_args': ['_grid_0', '_grid_1', '_grid_2'], 'config_args': {'PRESCALE_QK': False, 'ROWS_GUARANTEED_SAFE': False, 'BLOCKS_ARE_CONTIGUOUS': False, 'WRITE_DQ': True, 'OUTPUT_LOGSUMEXP': True, 'OUTPUT_MAX': False, 'FLOAT32_PRECISION': "'ieee'", 'IS_DIVISIBLE': False, 'GQA_SHARED_HEADS': 4, 'HAS_FULL_BLOCKS': True, 'SM_SCALE': 0.08838834764831845, 'SPLIT_KV': 32, 'QK_HEAD_DIM': 128, 'QK_HEAD_DIM_ROUNDED': 128, 'V_HEAD_DIM': 128, 'V_HEAD_DIM_ROUNDED': 128, 'SAFE_HEAD_DIM': True, 'BLOCK_M': 512, 'SAFE_M_BOUNDARY': False, 'SAFE_N_BOUNDARY': True, 'BLOCK_N': 64, 'SPARSE_KV_BLOCK_SIZE': 128, 'USE_TMA': False}},
67
+
68
+ )
69
+ @triton.jit
70
+ def triton_tem_fused_0(arg_Q, arg_K, arg_V, arg_M, arg_L, arg_KV_NUM_BLKS, arg_KV_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, out_ptr0, ks0, ks1, ks2):
71
+ PRESCALE_QK : tl.constexpr = False
72
+ ROWS_GUARANTEED_SAFE : tl.constexpr = False
73
+ BLOCKS_ARE_CONTIGUOUS : tl.constexpr = False
74
+ WRITE_DQ : tl.constexpr = True
75
+ OUTPUT_LOGSUMEXP : tl.constexpr = True
76
+ OUTPUT_MAX : tl.constexpr = False
77
+ FLOAT32_PRECISION : tl.constexpr = 'ieee'
78
+ IS_DIVISIBLE : tl.constexpr = False
79
+ GQA_SHARED_HEADS : tl.constexpr = 4
80
+ HAS_FULL_BLOCKS : tl.constexpr = True
81
+ SM_SCALE : tl.constexpr = 0.08838834764831845
82
+ SPLIT_KV : tl.constexpr = 32
83
+ QK_HEAD_DIM : tl.constexpr = 128
84
+ QK_HEAD_DIM_ROUNDED : tl.constexpr = 128
85
+ V_HEAD_DIM : tl.constexpr = 128
86
+ V_HEAD_DIM_ROUNDED : tl.constexpr = 128
87
+ SAFE_HEAD_DIM : tl.constexpr = True
88
+ BLOCK_M : tl.constexpr = 512
89
+ SAFE_M_BOUNDARY : tl.constexpr = False
90
+ SAFE_N_BOUNDARY : tl.constexpr = True
91
+ BLOCK_N : tl.constexpr = 64
92
+ SPARSE_KV_BLOCK_SIZE : tl.constexpr = 128
93
+ USE_TMA : tl.constexpr = False
94
+ INDEX_DTYPE : tl.constexpr = tl.int32
95
+ Q = arg_Q
96
+ K = arg_K
97
+ V = arg_V
98
+ M = arg_M
99
+ L = arg_L
100
+ KV_NUM_BLKS = arg_KV_NUM_BLKS
101
+ KV_IDX = arg_KV_IDX
102
+ FULL_KV_NUM_BLKS = arg_FULL_KV_NUM_BLKS
103
+ FULL_KV_IDX = arg_FULL_KV_IDX
104
+
105
+ # Sub notation for this kernel:
106
+ # Q: Query, K: Key, V: Value
107
+ # reduction buffers: M rowmax across local KV split, L local sumexp across local KV split
108
+ # M: Number of queries, N: Number of keys/values
109
+ # QK_HEAD_DIM: The dimension of the query and key embeddings
110
+ # V_HEAD_DIM: The dimension of the value embeddings
111
+ # BLOCK_M, QK_HEAD_DIM: M, and D dimemsion are always assigned to the same block
112
+ # z: Batch size, h: Number of heads, m: Number of queries per head, k: Number of keys per head t: Number of kv splits
113
+ # (Modifiable) Config options:
114
+ # SPLIT_KV: number of blocks K & V are split into
115
+ # TILE_KV: length of each local KV split
116
+ # BLOCK_M: block size that Q is padded along seqlen dim.
117
+ # BLOCK_N: block size of K & V along N dimension.
118
+ # GQA_SHARED_HEADS: number of query heads sharing one kv head in GQA setups.
119
+ #
120
+ # change of base out of the loop
121
+ # ROWS_GUARANTEED_SAFE: Is it guaranteed that at least one value in each row
122
+ # is not masked out? If so, we can skip an extra safety check
123
+ # SAFE_M_BOUNDARY: Is Q seqlen a multiple of BLOCK_M? If so, we can skip an extra boundary check for loading query.
124
+ # SAFE_N_BOUNDARY: Is KV seqlen a multiple of BLOCK_N? If so, we can skip an extra boundary check for loading key/value.
125
+
126
+ # PRESCALE_QK: Whether to pre-scale QK by 1/sqrt(d) and change of base.
127
+ #
128
+ # SPARSE_KV_BLOCK_SIZE: sparse mask block size along KV seqlen dim.
129
+ # KV_NUM_BLKS: The number of KV blocks (that may or may not require masking) for each query.
130
+ # KV_IDX: The indices of KV blocks (that may or may not require masking) for each query.
131
+ #
132
+ #
133
+ # Output: ACC output accumulated across local KV split.
134
+
135
+ tl.static_assert(SPARSE_KV_BLOCK_SIZE >= BLOCK_N and SPARSE_KV_BLOCK_SIZE % BLOCK_N == 0)
136
+
137
+ # Define Q Strides
138
+ stride_qz, stride_qh, stride_qg, stride_qm, stride_qk = 4096*ks0, 512, 128, 4096, 1
139
+ stride_kz, stride_kh, stride_kn, stride_kk = 1024*ks1, 128, 1024, 1
140
+ stride_vz, stride_vh, stride_vn, stride_vk = 1024*ks2, 128, 1024, 1
141
+ stride_mz, stride_mt, stride_mh, stride_mm = 1024*ks0, 32*ks0, ks0, 1
142
+ stride_lz, stride_lt, stride_lh, stride_lm = 1024*ks0, 32*ks0, ks0, 1
143
+
144
+
145
+ Z = 1
146
+ ZKV = 1
147
+ HKV = 8
148
+ G: tl.constexpr = GQA_SHARED_HEADS
149
+ HQ = HKV * G
150
+ Q_LEN = ks0
151
+ KV_LEN = ks1
152
+
153
+ MATMUL_PRECISION = Q.dtype.element_ty
154
+
155
+ # Make sure each split is a multiple of BLOCK_N
156
+ TILE_KV_OG = tl.cdiv(KV_LEN, SPLIT_KV)
157
+ TILE_KV = tl.cdiv(TILE_KV_OG, BLOCK_N) * BLOCK_N
158
+ TILE_KV_MULTIPLE: tl.constexpr = (TILE_KV // BLOCK_N)
159
+
160
+ off_z = tl.program_id(0).to(INDEX_DTYPE) // HKV
161
+ off_zkv = off_z % ZKV
162
+ off_hkv = tl.program_id(0).to(INDEX_DTYPE) % HKV
163
+ off_t = tl.program_id(1).to(INDEX_DTYPE)
164
+
165
+ q_offset = off_z * stride_qz + off_hkv * stride_qh
166
+ k_offset = off_zkv * stride_kz + off_hkv * stride_kh
167
+ v_offset = off_zkv * stride_vz + off_hkv * stride_vh
168
+
169
+ K = K + k_offset
170
+ V = V + v_offset
171
+
172
+ SPARSE_Z = 1
173
+ SPARSE_HQ = 1
174
+
175
+ sparse_idx_z = off_z % SPARSE_Z
176
+ sparse_idx_h = off_hkv % SPARSE_HQ
177
+
178
+ SPARSE_KV_MULTIPLE: tl.constexpr = (SPARSE_KV_BLOCK_SIZE // BLOCK_N)
179
+ SPARSE_KV_BLOCK_CNT = tl.cdiv(KV_LEN, SPARSE_KV_BLOCK_SIZE)
180
+
181
+ # initialize pointer to m and l
182
+ m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf")
183
+ l_i = tl.zeros([BLOCK_M], dtype=tl.float32)
184
+ acc = tl.zeros([BLOCK_M, V_HEAD_DIM_ROUNDED], dtype=tl.float32)
185
+
186
+ # initialize offsets
187
+ tl.device_assert(BLOCK_M % G == 0)
188
+ BLOCK_M_PER_HQ: tl.constexpr = BLOCK_M // G
189
+ off_g = tl.arange(0, G) # [G]
190
+ offs_g = tl.ravel(tl.broadcast_to(off_g[:, None], [G, BLOCK_M_PER_HQ])) # [BLOCK_M]
191
+ offs_hq = offs_g + off_hkv * G
192
+ off_m = tl.arange(0, BLOCK_M_PER_HQ) # [BLOCK_M_PER_HQ]
193
+ offs_m = tl.ravel(tl.broadcast_to(off_m[None, :], [G, BLOCK_M_PER_HQ])) # [BLOCK_M]
194
+ offs_d = tl.arange(0, QK_HEAD_DIM_ROUNDED)
195
+ offs_vd = tl.arange(0, V_HEAD_DIM_ROUNDED)
196
+
197
+ # Get HZ offsets for KV_NUM_BLKS and KV_IDX
198
+ stride_block_z, stride_block_h, stride_block_row = 1, 1, 1
199
+ sparse_block_hz_offset = sparse_idx_z * stride_block_z + sparse_idx_h * stride_block_h
200
+ stride_kv_z, stride_kv_h, stride_kv_row, stride_kv_col = 1, 1, 1, 1
201
+ sparse_idx_hz_offset = sparse_idx_z * stride_kv_z + sparse_idx_h * stride_kv_h
202
+
203
+ # Calculate KV blocks that belong this CTA.
204
+ block_n_start = off_t * TILE_KV_MULTIPLE # n_offset inside sparse block
205
+ block_n_end = block_n_start + TILE_KV_MULTIPLE # end BLOCK_N
206
+
207
+ q_range = stride_qg * off_g[:, None, None] + stride_qm * off_m[None, :, None] + stride_qk * offs_d[None, None, :]
208
+
209
+ if not SAFE_M_BOUNDARY and not SAFE_HEAD_DIM:
210
+ q = tl.load(Q + q_offset + q_range, mask=(offs_d[None, None, :] < QK_HEAD_DIM) & (off_m[None, :, None] < Q_LEN))
211
+ elif SAFE_M_BOUNDARY and not SAFE_HEAD_DIM:
212
+ q = tl.load(Q + q_offset + q_range, mask=offs_d[None, None, :] < QK_HEAD_DIM)
213
+ elif not SAFE_M_BOUNDARY and SAFE_HEAD_DIM:
214
+ q = tl.load(Q + q_offset + q_range, mask=off_m[None, :, None] < Q_LEN)
215
+ else:
216
+ q = tl.load(Q + q_offset + q_range)
217
+
218
+ q = tl.reshape(q, [BLOCK_M, QK_HEAD_DIM_ROUNDED])
219
+
220
+
221
+ # ~~~~~~~~~~~~~~ normal blocks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
222
+ # find first kv block we are loading and the number of blocks we are loading
223
+ # Offset the kv_indices tensor by the correct batch and head
224
+ kv_indices = KV_IDX + sparse_idx_hz_offset
225
+ kv_num_blocks = tl.load(KV_NUM_BLKS + sparse_block_hz_offset)
226
+ MAX_KV_IDX = 1
227
+ indices_idx = (block_n_start // SPARSE_KV_MULTIPLE) % (MAX_KV_IDX)
228
+ off_n_block_in_sparse = block_n_start % SPARSE_KV_MULTIPLE
229
+ off_n = tl.load(kv_indices + indices_idx) * SPARSE_KV_BLOCK_SIZE + off_n_block_in_sparse * BLOCK_N
230
+ # first kv block we're loading
231
+
232
+ # last valid block according to sparse mask
233
+ block_n_last_valid = tl.minimum(kv_num_blocks * SPARSE_KV_MULTIPLE, tl.maximum(tl.cdiv(KV_LEN, BLOCK_N), 1))
234
+
235
+ offs_n = tl.arange(0, BLOCK_N) + off_n
236
+
237
+ desc_k = None
238
+ desc_v = None
239
+
240
+ acc, l_i, m_i = forward_inner(
241
+ arg_Q, arg_K, arg_V, arg_M, arg_L, arg_KV_NUM_BLKS, arg_KV_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, out_ptr0, ks0, ks1, ks2,
242
+ q, K, V, desc_k, desc_v, Q_LEN, KV_LEN,
243
+ # accumulatd values
244
+ acc, l_i, m_i,
245
+ #offsets
246
+ off_z, offs_hq[:, None], offs_m[:, None], offs_n[None, :],
247
+ off_n,
248
+ #block sparse data
249
+ kv_indices, kv_num_blocks,
250
+ block_n_start, block_n_end if block_n_end <= block_n_last_valid else block_n_last_valid,
251
+ MATMUL_PRECISION,
252
+ stride_kk, stride_kn, stride_vn, stride_vk,
253
+ IS_FULL_BLOCKS=False,
254
+ )
255
+
256
+
257
+ # ~~~~~~~~~~~~~~ "full" blocks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
258
+ # We know these blocks are guaranteed to be "full", so we don't need to
259
+ # apply mask_mod to them - only score_mod
260
+ if HAS_FULL_BLOCKS:
261
+ kv_indices = FULL_KV_IDX + sparse_idx_hz_offset
262
+ kv_num_blocks = tl.load(FULL_KV_NUM_BLKS + sparse_block_hz_offset)
263
+ # Assign full block in a reverse order for off_t. Prioritize the last CTA.
264
+ block_n_start = (SPLIT_KV - off_t - 1) * TILE_KV_MULTIPLE
265
+ block_n_end = block_n_start + TILE_KV_MULTIPLE
266
+ indices_idx = (block_n_start // SPARSE_KV_MULTIPLE) % (MAX_KV_IDX)
267
+ off_n_block_in_sparse = block_n_start % SPARSE_KV_MULTIPLE
268
+ off_n = tl.load(kv_indices + indices_idx) * SPARSE_KV_BLOCK_SIZE + off_n_block_in_sparse * BLOCK_N
269
+
270
+ # last valid block according to sparse mask
271
+ block_n_last_valid = tl.minimum(kv_num_blocks * SPARSE_KV_MULTIPLE, tl.maximum(tl.cdiv(KV_LEN, BLOCK_N), 1))
272
+
273
+ offs_n = tl.arange(0, BLOCK_N) + off_n
274
+
275
+ acc, l_i, m_i = forward_inner(
276
+ arg_Q, arg_K, arg_V, arg_M, arg_L, arg_KV_NUM_BLKS, arg_KV_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, out_ptr0, ks0, ks1, ks2,
277
+ q, K, V, desc_k, desc_v, Q_LEN, KV_LEN,
278
+ # accumulatd values
279
+ acc, l_i, m_i,
280
+ #offsets
281
+ off_z, offs_hq[:, None], offs_m[:, None], offs_n[None, :],
282
+ off_n,
283
+ #block sparse data
284
+ kv_indices, kv_num_blocks,
285
+ block_n_start, block_n_end if block_n_end <= block_n_last_valid else block_n_last_valid,
286
+ MATMUL_PRECISION,
287
+ stride_kk, stride_kn, stride_vn, stride_vk,
288
+ IS_FULL_BLOCKS=True,
289
+ )
290
+
291
+ m_offset = off_t * stride_mt + off_z * stride_mz
292
+ l_offset = off_t * stride_lt + off_z * stride_lz
293
+
294
+ M_block_ptr = tl.make_block_ptr(
295
+ base=M + m_offset,
296
+ shape=(G, Q_LEN), # (G, M)
297
+ strides=(stride_mh, stride_mm),
298
+ offsets=(off_hkv*G, 0),
299
+ block_shape=(G, BLOCK_M_PER_HQ),
300
+ order=(1, 0)
301
+ )
302
+ L_block_ptr = tl.make_block_ptr(
303
+ base=L + l_offset,
304
+ shape=(G, Q_LEN), # (G, M)
305
+ strides=(stride_lh, stride_lm),
306
+ offsets=(off_hkv*G, 0),
307
+ block_shape=(G, BLOCK_M_PER_HQ),
308
+ order=(1, 0)
309
+ )
310
+
311
+ # Store output, logsumexp and rowmax for cross CTA reduction. (all in float32, even when input data are in fp16)
312
+ m_i = m_i.reshape(G, BLOCK_M_PER_HQ)
313
+ l_i = l_i.reshape(G, BLOCK_M_PER_HQ)
314
+ if SAFE_M_BOUNDARY:
315
+ tl.store(M_block_ptr, m_i)
316
+ tl.store(L_block_ptr, l_i)
317
+ else:
318
+ tl.store(M_block_ptr, m_i, boundary_check=(1,))
319
+ tl.store(L_block_ptr, l_i, boundary_check=(1,))
320
+
321
+ # -- store output
322
+ idx_z = off_z
323
+ idx_t = off_t
324
+ idx_hq = off_hkv*G + off_g[:, None, None]
325
+ idx_m = off_m[None, :, None]
326
+ idx_d = offs_vd[None, None, :]
327
+
328
+ mask = (idx_m < Q_LEN) & (idx_d < V_HEAD_DIM)
329
+ acc = acc.reshape(G, BLOCK_M_PER_HQ, V_HEAD_DIM)
330
+ xindex = idx_d + 128*idx_m + 128*idx_hq*ks0 + 4096*idx_t*ks0 + 131072*idx_z*ks0
331
+ tl.store(out_ptr0 + (tl.broadcast_to(idx_d + 128*idx_m + 128*idx_hq*ks0 + 4096*idx_t*ks0, acc.shape)), acc, mask)
332
+
333
+
334
+ # Utility triton funcs
335
+ @triton.jit
336
+ def get_offset_for_next_block(
337
+ loop_iter, col_indices, total_blocks,
338
+ SPARSE_BLOCK, SPARSE_BLOCK_MULTIPLE, BLOCK,
339
+ BLOCKS_ARE_CONTIGUOUS: tl.constexpr
340
+ ):
341
+ if BLOCKS_ARE_CONTIGUOUS:
342
+ return BLOCK
343
+ cur_block_idx = loop_iter // SPARSE_BLOCK_MULTIPLE
344
+ cur_block = tl.load(col_indices + cur_block_idx, eviction_policy="evict_last")
345
+ next_block = tl.load(col_indices + cur_block_idx + 1, eviction_policy="evict_last", mask=cur_block_idx + 1 < total_blocks)
346
+ needs_jump = (loop_iter + 1) % SPARSE_BLOCK_MULTIPLE == 0
347
+ jump_to_block = (next_block - cur_block ) * SPARSE_BLOCK - (SPARSE_BLOCK_MULTIPLE - 1) * BLOCK
348
+ offset = jump_to_block * needs_jump + (1 - needs_jump) * BLOCK
349
+ return offset
350
+
351
+ @triton.jit
352
+ def get_bounded_indices(indices, max_len=None):
353
+ return indices % max_len if max_len is not None else indices
354
+
355
+ @triton.jit
356
+ def load_checked_block(block_ptr, IS_DIVISIBLE: tl.constexpr, SAFE_HEAD_DIM: tl.constexpr):
357
+ if IS_DIVISIBLE and SAFE_HEAD_DIM:
358
+ return tl.load(block_ptr)
359
+ elif IS_DIVISIBLE and not SAFE_HEAD_DIM:
360
+ return tl.load(block_ptr, boundary_check=(1,), padding_option="zero")
361
+ elif not IS_DIVISIBLE and SAFE_HEAD_DIM:
362
+ return tl.load(block_ptr, boundary_check=(0,), padding_option="zero")
363
+ else:
364
+ return tl.load(block_ptr, boundary_check=(0, 1), padding_option="zero")
365
+
366
+ @triton.jit
367
+ def load_checked_2d(
368
+ ptr,
369
+ offs_m,
370
+ offs_n,
371
+ stride_m,
372
+ stride_n,
373
+ IS_DIVISIBLE_M: tl.constexpr,
374
+ IS_DIVISIBLE_N: tl.constexpr,
375
+ M_LEN: tl.constexpr,
376
+ N_LEN: tl.constexpr,
377
+ ):
378
+ # Calculate final pointer if strides are provided
379
+ if stride_m is not None and stride_n is not None:
380
+ ptr = ptr + offs_m[:, None] * stride_m + offs_n[None, :] * stride_n
381
+
382
+ # Handle all masking cases
383
+ if not IS_DIVISIBLE_M and not IS_DIVISIBLE_N:
384
+ return tl.load(ptr, mask=(offs_m[:, None] < M_LEN) & (offs_n[None, :] < N_LEN), other=0.0)
385
+ elif IS_DIVISIBLE_M and not IS_DIVISIBLE_N:
386
+ return tl.load(ptr, mask=(offs_n[None, :] < N_LEN), other=0.0)
387
+ elif not IS_DIVISIBLE_M and IS_DIVISIBLE_N:
388
+ return tl.load(ptr, mask=(offs_m[:, None] < M_LEN), other=0.0)
389
+ else: # Both divisible
390
+ return tl.load(ptr)
391
+
392
+
393
+ # Common Imports
394
+ @triton.jit
395
+ def forward_block_mn(
396
+ arg_Q, arg_K, arg_V, arg_M, arg_L, arg_KV_NUM_BLKS, arg_KV_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, out_ptr0, ks0, ks1, ks2,
397
+ q, K, V, desc_k, desc_v, Q_LEN, KV_LEN,
398
+ # accumulated values
399
+ acc, l_i, m_i,
400
+ # Offsets
401
+ off_z, off_h, offs_m, offs_n,
402
+ # Offsets needed for TMA loads
403
+ kv_start,
404
+ kv_offset,
405
+ MATMUL_PRECISION, RCP_LN2,
406
+ # Strides for K and V
407
+ stride_kk, stride_kn, stride_vn, stride_vk,
408
+ IS_FULL_BLOCKS, CHECK_BLOCK_BOUNDARY=False,
409
+
410
+ ):
411
+ # Redefines all kernel parameters (BLOCK_M, etc.) so we don't need to plumb them all through
412
+ PRESCALE_QK : tl.constexpr = False
413
+ ROWS_GUARANTEED_SAFE : tl.constexpr = False
414
+ BLOCKS_ARE_CONTIGUOUS : tl.constexpr = False
415
+ WRITE_DQ : tl.constexpr = True
416
+ OUTPUT_LOGSUMEXP : tl.constexpr = True
417
+ OUTPUT_MAX : tl.constexpr = False
418
+ FLOAT32_PRECISION : tl.constexpr = 'ieee'
419
+ IS_DIVISIBLE : tl.constexpr = False
420
+ GQA_SHARED_HEADS : tl.constexpr = 4
421
+ HAS_FULL_BLOCKS : tl.constexpr = True
422
+ SM_SCALE : tl.constexpr = 0.08838834764831845
423
+ SPLIT_KV : tl.constexpr = 32
424
+ QK_HEAD_DIM : tl.constexpr = 128
425
+ QK_HEAD_DIM_ROUNDED : tl.constexpr = 128
426
+ V_HEAD_DIM : tl.constexpr = 128
427
+ V_HEAD_DIM_ROUNDED : tl.constexpr = 128
428
+ SAFE_HEAD_DIM : tl.constexpr = True
429
+ BLOCK_M : tl.constexpr = 512
430
+ SAFE_M_BOUNDARY : tl.constexpr = False
431
+ SAFE_N_BOUNDARY : tl.constexpr = True
432
+ BLOCK_N : tl.constexpr = 64
433
+ SPARSE_KV_BLOCK_SIZE : tl.constexpr = 128
434
+ USE_TMA : tl.constexpr = False
435
+ INDEX_DTYPE : tl.constexpr = tl.int32
436
+
437
+
438
+ # -- load k --
439
+ # NB reversed order to since K is transposed
440
+ kv_base_offset = kv_start + kv_offset
441
+
442
+ # Load K as [BLOCK_N, QK_HEAD_DIM_ROUNDED] then transpose to [QK_HEAD_DIM_ROUNDED, BLOCK_N]
443
+ offs_k = tl.arange(0, QK_HEAD_DIM_ROUNDED)
444
+ offs_n_load = kv_base_offset + tl.arange(0, BLOCK_N)
445
+ k = load_checked_2d(K, offs_n_load, offs_k, stride_kn, stride_kk, IS_DIVISIBLE, SAFE_HEAD_DIM, KV_LEN, QK_HEAD_DIM)
446
+
447
+ k = tl.trans(k)
448
+ # -- compute qk ---
449
+ qk = tl.dot(q, k, input_precision=FLOAT32_PRECISION) # TODO: use cuda matmul when q_len <= 2.
450
+ if not PRESCALE_QK:
451
+ qk *= SM_SCALE
452
+ # ~~~~~~~~~~~~~~~~~~~ Apply score modification ~~~~~~~~~~~~~~~~~~~
453
+ # If this is the last block of a non divisible seqlen, we still need to load [BLOCK_M, BLOCK_N] elements,
454
+ # which is larger than the actual number of elements. To avoid access memory out of bound,
455
+ # we need to mask out the elements that are out of Q_LEN & KV_LEN.
456
+ m = get_bounded_indices(offs_m, Q_LEN if CHECK_BLOCK_BOUNDARY else None)
457
+ n = get_bounded_indices(offs_n, KV_LEN if CHECK_BLOCK_BOUNDARY else None)
458
+
459
+ tmp0 = (qk)
460
+ post_mod_scores = tmp0
461
+
462
+
463
+ if CHECK_BLOCK_BOUNDARY:
464
+ # Mask out the elements that are out of the KV_LEN for non divisible seqlen.
465
+ post_mod_scores = tl.where(offs_n < KV_LEN, post_mod_scores, float("-inf"))
466
+
467
+ if not IS_FULL_BLOCKS:
468
+ tmp1 = (m)
469
+ tmp2 = tl.full([1], 0, tl.int32)
470
+ tmp3 = tmp1 < tmp2
471
+ tmp4 = (n)
472
+ tmp5 = tmp4 <= tmp1
473
+ tmp6 = tmp3 & tmp5
474
+ tmp7 = tmp1 >= tmp2
475
+ tmp8 = tmp4 < tmp2
476
+ tmp9 = tmp7 & tmp8
477
+ tmp10 = tmp8 == 0
478
+ tmp11 = tmp7 & tmp10
479
+ tmp12 = tmp1 - tmp2
480
+ tmp13 = tl.full([1], 16, tl.int32)
481
+ tmp14 = tl.where((tmp12 < 0) != (tmp13 < 0), tl.where(tmp12 % tmp13 != 0, tmp12 // tmp13 - 1, tmp12 // tmp13), tmp12 // tmp13)
482
+ tmp15 = tmp4 - tmp2
483
+ tmp16 = tl.where((tmp15 < 0) != (tmp13 < 0), tl.where(tmp15 % tmp13 != 0, tmp15 // tmp13 - 1, tmp15 // tmp13), tmp15 // tmp13)
484
+ tmp17 = tmp14 == tmp16
485
+ tmp18 = tmp11 & tmp17
486
+ tmp19 = tmp9 | tmp18
487
+ tmp20 = tmp6 | tmp19
488
+ mask_mod_output = tmp20
489
+
490
+
491
+ if CHECK_BLOCK_BOUNDARY:
492
+ mask_mod_output = tl.where(offs_n < KV_LEN, mask_mod_output, False)
493
+ # apply mask for partially unmasked blocks
494
+ post_mod_scores = tl.where(mask_mod_output, post_mod_scores, float("-inf"))
495
+
496
+ if not PRESCALE_QK:
497
+ post_mod_scores *= RCP_LN2
498
+ # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
499
+
500
+ # -- compute scaling constant ---
501
+ m_ij = tl.maximum(m_i, tl.max(post_mod_scores, 1))
502
+ if not ROWS_GUARANTEED_SAFE:
503
+ masked_out_rows = (m_ij == float("-inf"))
504
+ m_ij_masked = tl.where(masked_out_rows, 0, m_ij)
505
+ else:
506
+ m_ij_masked = m_ij
507
+
508
+ alpha = tl.math.exp2(m_i - m_ij_masked)
509
+ p = tl.math.exp2(post_mod_scores - m_ij_masked[:, None])
510
+
511
+ # NB: l_i update is pulled up here since it's a bit faster
512
+ # NB: For headdim=256, it's faster to move it back down to after m_i =
513
+ # m_ij
514
+ l_i = l_i * alpha + tl.sum(p, 1)
515
+ # # -- scale and update acc --
516
+ acc = acc * alpha[:, None]
517
+ # Calculate offsets for V loading - reuse kv_base_offset from K loading
518
+ offs_v = tl.arange(0, V_HEAD_DIM_ROUNDED)
519
+ v = load_checked_2d(V, offs_n_load, offs_v, stride_vn, stride_vk, IS_DIVISIBLE, SAFE_HEAD_DIM, KV_LEN, V_HEAD_DIM)
520
+ acc = tl.dot(p.to(MATMUL_PRECISION), v, acc, input_precision=FLOAT32_PRECISION)
521
+
522
+ # -- update m_i
523
+ m_i = m_ij
524
+
525
+ return acc, l_i, m_i
526
+
527
+ @triton.jit
528
+ def forward_inner(
529
+ arg_Q, arg_K, arg_V, arg_M, arg_L, arg_KV_NUM_BLKS, arg_KV_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, out_ptr0, ks0, ks1, ks2,
530
+ q, K, V,
531
+ desc_k, desc_v, Q_LEN, KV_LEN,
532
+ # accumulated values
533
+ acc, l_i, m_i,
534
+ # Offsets used as inputs to score_mod & mask_mod
535
+ # of size [BLOCK_M, BLOCK_N] or scalar.
536
+ off_z, off_h, offs_m, offs_n,
537
+ # Offsets needed for TMA loads
538
+ kv_start,
539
+ # blocksparse data
540
+ kv_indices, kv_num_blocks,
541
+ # start kv and end kv block
542
+ block_n_start, block_n_end,
543
+ MATMUL_PRECISION,
544
+ # Strides for K and V
545
+ stride_kk, stride_kn, stride_vn, stride_vk,
546
+ IS_FULL_BLOCKS,
547
+ ):
548
+ # Redefines all kernel parameters (BLOCK_M, etc.) so we don't need to plumb them all through
549
+ PRESCALE_QK : tl.constexpr = False
550
+ ROWS_GUARANTEED_SAFE : tl.constexpr = False
551
+ BLOCKS_ARE_CONTIGUOUS : tl.constexpr = False
552
+ WRITE_DQ : tl.constexpr = True
553
+ OUTPUT_LOGSUMEXP : tl.constexpr = True
554
+ OUTPUT_MAX : tl.constexpr = False
555
+ FLOAT32_PRECISION : tl.constexpr = 'ieee'
556
+ IS_DIVISIBLE : tl.constexpr = False
557
+ GQA_SHARED_HEADS : tl.constexpr = 4
558
+ HAS_FULL_BLOCKS : tl.constexpr = True
559
+ SM_SCALE : tl.constexpr = 0.08838834764831845
560
+ SPLIT_KV : tl.constexpr = 32
561
+ QK_HEAD_DIM : tl.constexpr = 128
562
+ QK_HEAD_DIM_ROUNDED : tl.constexpr = 128
563
+ V_HEAD_DIM : tl.constexpr = 128
564
+ V_HEAD_DIM_ROUNDED : tl.constexpr = 128
565
+ SAFE_HEAD_DIM : tl.constexpr = True
566
+ BLOCK_M : tl.constexpr = 512
567
+ SAFE_M_BOUNDARY : tl.constexpr = False
568
+ SAFE_N_BOUNDARY : tl.constexpr = True
569
+ BLOCK_N : tl.constexpr = 64
570
+ SPARSE_KV_BLOCK_SIZE : tl.constexpr = 128
571
+ USE_TMA : tl.constexpr = False
572
+ INDEX_DTYPE : tl.constexpr = tl.int32
573
+
574
+
575
+ SPARSE_KV_MULTIPLE: tl.constexpr = (SPARSE_KV_BLOCK_SIZE // BLOCK_N)
576
+ RCP_LN2: tl.constexpr = 1.44269504
577
+
578
+ if PRESCALE_QK:
579
+ q = (q * SM_SCALE * RCP_LN2).to(MATMUL_PRECISION)
580
+
581
+ kv_offset = 0
582
+
583
+ # loop over k, v and update accumulator until block_n_end
584
+ for start_n in range(block_n_start, block_n_end):
585
+ # Here IS_DIVISIBLE acts are the start_n = tl.multiple_of(start_n, BLOCK_N) from triton_fused_attention.
586
+ if IS_DIVISIBLE:
587
+ acc, l_i, m_i = forward_block_mn(
588
+ arg_Q, arg_K, arg_V, arg_M, arg_L, arg_KV_NUM_BLKS, arg_KV_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, out_ptr0, ks0, ks1, ks2,
589
+ q, K, V, desc_k, desc_v, Q_LEN, KV_LEN,
590
+ # accumulated values
591
+ acc, l_i, m_i,
592
+ # Offsets
593
+ off_z, off_h, offs_m, offs_n,
594
+ # Offsets needed for TMA loads
595
+ kv_start,
596
+ kv_offset,
597
+ MATMUL_PRECISION, RCP_LN2,
598
+ # Strides for K and V
599
+ stride_kk, stride_kn, stride_vn, stride_vk,
600
+ IS_FULL_BLOCKS,
601
+ )
602
+ else:
603
+ # Benchmark shows even we applied mod & mask to each block for non divisible seqlen,
604
+ # it's on par or slightly faster than only applying to the last block in fwd.
605
+ # However, we choose different strategy for bwd, where we only apply mod & mask
606
+ # to the last block because it's faster a lot.
607
+ acc, l_i, m_i = forward_block_mn(
608
+ arg_Q, arg_K, arg_V, arg_M, arg_L, arg_KV_NUM_BLKS, arg_KV_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, out_ptr0, ks0, ks1, ks2,
609
+ q, K, V, desc_k, desc_v, Q_LEN, KV_LEN,
610
+ # accumulated values
611
+ acc, l_i, m_i,
612
+ # Offsets
613
+ off_z, off_h, offs_m, offs_n,
614
+ # Offsets needed for TMA loads
615
+ kv_start,
616
+ kv_offset,
617
+ MATMUL_PRECISION, RCP_LN2,
618
+ # Strides for K and V
619
+ stride_kk, stride_kn, stride_vn, stride_vk,
620
+ IS_FULL_BLOCKS, CHECK_BLOCK_BOUNDARY=True,
621
+ )
622
+
623
+
624
+
625
+ offset = get_offset_for_next_block(
626
+ start_n, kv_indices, kv_num_blocks,
627
+ SPARSE_KV_BLOCK_SIZE, SPARSE_KV_MULTIPLE, BLOCK_N, BLOCKS_ARE_CONTIGUOUS
628
+ )
629
+
630
+ offs_n = offs_n + offset
631
+ kv_offset += offset
632
+
633
+
634
+ return acc, l_i, m_i
635
+ ''', device_str='cuda')
636
+
637
+
638
+ # kernel path: /workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py
639
+ # Topologically Sorted Source Nodes: [flex_attention, lse_scaled], Original ATen: [aten.mul]
640
+ # Source node to ATen node mapping:
641
+ # flex_attention => flex_attention
642
+ # lse_scaled => mul_9
643
+ # Graph fragment:
644
+ # %buf3 : Tensor = PlaceHolder[target=buf3]
645
+ # %buf4 : Tensor = PlaceHolder[target=buf4]
646
+ # %buf5 : Tensor "f32[1, 1, 32, s37][32*s37, 32*s37, s37, 1]cuda:3" = PlaceHolder[target=buf5]
647
+ # %buf7 : Tensor "f32[1, 32, s37][32*s37, s37, 1]cuda:3" = PlaceHolder[target=buf7]
648
+ # %flex_attention : [num_users=2] = call_function[target=torch.ops.higher_order.flex_attention](args = (%arg1_1, %arg3_1, %arg5_1, %sdpa_score0, (%arg7_1, %arg8_1, %arg9_1, %arg6_1, %arg10_1, %arg11_1, %arg12_1, %arg13_1, %arg14_1, %arg15_1, 128, 128, %sdpa_mask0), 0.08838834764831845, {PRESCALE_QK: False, ROWS_GUARANTEED_SAFE: False, BLOCKS_ARE_CONTIGUOUS: False, WRITE_DQ: True, OUTPUT_LOGSUMEXP: True, OUTPUT_MAX: False}, (), ()), kwargs = {})
649
+ # %mul_9 : Tensor "f32[1, 32, s37][32*Max(1, s37), Max(1, s37), 1]cuda:3"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%getitem_1, 0.6931471805599453), kwargs = {})
650
+ # return %buf5,%buf7,%mul_9
651
+ triton_per_fused_mul_1 = async_compile.triton('triton_per_fused_mul_1', '''
652
+ import triton
653
+ import triton.language as tl
654
+
655
+ from torch._inductor.runtime import triton_helpers, triton_heuristics
656
+ from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
657
+ from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
658
+ triton_helpers.set_driver_to_gpu()
659
+
660
+ @triton_heuristics.persistent_reduction(
661
+ size_hints={'x': 4096, 'r0_': 32},
662
+ reduction_hint=ReductionHint.DEFAULT,
663
+ filename=__file__,
664
+ triton_meta={'signature': {'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'out_ptr0': '*fp32', 'out_ptr1': '*fp32', 'out_ptr2': '*fp32', 'ks0': 'i64', 'xnumel': 'i32', 'r0_numel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=3, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]], (7,): [['tt.divisibility', 16]]}]},
665
+ inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_per_fused_mul_1', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': None, 'num_load': 2, 'num_reduction': 2, 'backend_hash': 'B0E5936CA26D1BCD1B577D0B65F13FE7553DC941EB93358973E9F1902BC212C5', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False}
666
+ )
667
+ @triton.jit
668
+ def triton_per_fused_mul_1(in_ptr0, in_ptr1, out_ptr0, out_ptr1, out_ptr2, ks0, xnumel, r0_numel, XBLOCK : tl.constexpr):
669
+ r0_numel = 32
670
+ R0_BLOCK: tl.constexpr = 32
671
+ rnumel = r0_numel
672
+ RBLOCK: tl.constexpr = R0_BLOCK
673
+ xoffset = tl.program_id(0) * XBLOCK
674
+ xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
675
+ xmask = xindex < xnumel
676
+ r0_index = tl.arange(0, R0_BLOCK)[None, :]
677
+ r0_offset = 0
678
+ r0_mask = tl.full([XBLOCK, R0_BLOCK], True, tl.int1)
679
+ roffset = r0_offset
680
+ rindex = r0_index
681
+ r0_1 = r0_index
682
+ x0 = xindex
683
+ x2 = (xindex % ks0)
684
+ x3 = triton_helpers.div_floor_integer(xindex, ks0)
685
+ tmp0 = tl.load(in_ptr0 + (x0 + 32*ks0*r0_1), xmask, other=0.0)
686
+ tmp5 = tl.load(in_ptr1 + (x0 + 32*ks0*r0_1), xmask, other=0.0)
687
+ tmp1 = tl.broadcast_to(tmp0, [XBLOCK, R0_BLOCK])
688
+ tmp3 = tl.where(xmask, tmp1, float("-inf"))
689
+ tmp4 = triton_helpers.max2(tmp3, 1)[:, None].to(tl.float32)
690
+ tmp6 = float("-inf")
691
+ tmp7 = tmp4 == tmp6
692
+ tmp8 = tmp0 - tmp4
693
+ tmp9 = 0.0
694
+ tmp10 = tl.where(tmp7, tmp9, tmp8)
695
+ tmp11 = libdevice.exp2(tmp10)
696
+ tmp12 = tmp5 * tmp11
697
+ tmp13 = tl.broadcast_to(tmp12, [XBLOCK, R0_BLOCK])
698
+ tmp15 = tl.where(xmask, tmp13, 0)
699
+ tmp16 = tl.sum(tmp15, 1)[:, None].to(tl.float32)
700
+ tmp17 = 1.0
701
+ tmp18 = tl.where(tmp7, tmp17, tmp16)
702
+ tmp19 = libdevice.log2(tmp18)
703
+ tmp20 = tmp19 + tmp4
704
+ tmp21 = 0.6931471805599453
705
+ tmp22 = tmp20 * tmp21
706
+ tl.store(out_ptr2 + (x2 + x3*((1) * ((1) >= (ks0)) + (ks0) * ((ks0) > (1)))), tmp22, xmask)
707
+ tl.store(out_ptr0 + (x0), tmp4, xmask)
708
+ tl.store(out_ptr1 + (x0), tmp16, xmask)
709
+ ''', device_str='cuda')
710
+
711
+
712
+ # kernel path: /workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/jt/cjtngjzio5oudkq4n4xggwz5enmgujrff3ktfnon7oykgb7as5tu.py
713
+ # Topologically Sorted Source Nodes: [flex_attention], Original ATen: []
714
+ # Source node to ATen node mapping:
715
+ # flex_attention => flex_attention, getitem
716
+ # Graph fragment:
717
+ # %buf2 : Tensor "f32[1, 32, 32, s37, 128][131072*s37, 4096*s37, 128*s37, 128, 1]cuda:3" = PlaceHolder[target=buf2]
718
+ # %buf5 : Tensor "f32[1, 1, 32, s37][32*s37, 32*s37, s37, 1]cuda:3" = PlaceHolder[target=buf5]
719
+ # %buf3 : Tensor = PlaceHolder[target=buf3]
720
+ # %buf8 : Tensor "f32[1, 32, s37, 128][4096*s37, 128*s37, 128, 1]cuda:3" = PlaceHolder[target=buf8]
721
+ # %buf7 : Tensor "f32[1, 32, s37][32*s37, s37, 1]cuda:3" = PlaceHolder[target=buf7]
722
+ # %flex_attention : [num_users=2] = call_function[target=torch.ops.higher_order.flex_attention](args = (%arg1_1, %arg3_1, %arg5_1, %sdpa_score0, (%arg7_1, %arg8_1, %arg9_1, %arg6_1, %arg10_1, %arg11_1, %arg12_1, %arg13_1, %arg14_1, %arg15_1, 128, 128, %sdpa_mask0), 0.08838834764831845, {PRESCALE_QK: False, ROWS_GUARANTEED_SAFE: False, BLOCKS_ARE_CONTIGUOUS: False, WRITE_DQ: True, OUTPUT_LOGSUMEXP: True, OUTPUT_MAX: False}, (), ()), kwargs = {})
723
+ # %getitem : Tensor "bf16[1, 32, s37, 128][4096*s37, 128, 4096, 1]cuda:3"[num_users=1] = call_function[target=operator.getitem](args = (%flex_attention, 0), kwargs = {})
724
+ # return %buf8,%getitem
725
+ triton_per_fused_2 = async_compile.triton('triton_per_fused_2', '''
726
+ import triton
727
+ import triton.language as tl
728
+
729
+ from torch._inductor.runtime import triton_helpers, triton_heuristics
730
+ from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
731
+ from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
732
+ triton_helpers.set_driver_to_gpu()
733
+
734
+ @triton_heuristics.persistent_reduction(
735
+ size_hints={'x': 524288, 'r0_': 32},
736
+ reduction_hint=ReductionHint.DEFAULT,
737
+ filename=__file__,
738
+ triton_meta={'signature': {'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'out_ptr1': '*bf16', 'ks0': 'i64', 'ks1': 'i64', 'xnumel': 'i32', 'r0_numel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=3, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]], (7,): [['tt.divisibility', 16]], (8,): [['tt.divisibility', 16]]}]},
739
+ inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_per_fused_2', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': None, 'num_load': 4, 'num_reduction': 1, 'backend_hash': 'B0E5936CA26D1BCD1B577D0B65F13FE7553DC941EB93358973E9F1902BC212C5', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False}
740
+ )
741
+ @triton.jit
742
+ def triton_per_fused_2(in_ptr0, in_ptr1, in_ptr2, in_ptr3, out_ptr1, ks0, ks1, xnumel, r0_numel, XBLOCK : tl.constexpr):
743
+ r0_numel = 32
744
+ R0_BLOCK: tl.constexpr = 32
745
+ rnumel = r0_numel
746
+ RBLOCK: tl.constexpr = R0_BLOCK
747
+ xoffset = tl.program_id(0) * XBLOCK
748
+ xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
749
+ xmask = tl.full([XBLOCK, R0_BLOCK], True, tl.int1)
750
+ r0_index = tl.arange(0, R0_BLOCK)[None, :]
751
+ r0_offset = 0
752
+ r0_mask = tl.full([XBLOCK, R0_BLOCK], True, tl.int1)
753
+ roffset = r0_offset
754
+ rindex = r0_index
755
+ r0_2 = r0_index
756
+ x5 = xindex
757
+ x1 = xindex // 128
758
+ x0 = (xindex % 128)
759
+ x3 = ((xindex // 128) % ks0)
760
+ x4 = xindex // ks1
761
+ tmp0 = tl.load(in_ptr0 + (x5 + 4096*ks0*r0_2), None)
762
+ tmp1 = tl.load(in_ptr1 + (x1), None, eviction_policy='evict_last')
763
+ tmp4 = tl.load(in_ptr2 + (x1 + 32*ks0*r0_2), None, eviction_policy='evict_last')
764
+ tmp13 = tl.load(in_ptr3 + (x1), None, eviction_policy='evict_last')
765
+ tmp2 = float("-inf")
766
+ tmp3 = tmp1 == tmp2
767
+ tmp5 = tmp4 - tmp1
768
+ tmp6 = 0.0
769
+ tmp7 = tl.where(tmp3, tmp6, tmp5)
770
+ tmp8 = libdevice.exp2(tmp7)
771
+ tmp9 = tmp0 * tmp8
772
+ tmp10 = tl.broadcast_to(tmp9, [XBLOCK, R0_BLOCK])
773
+ tmp12 = tl.sum(tmp10, 1)[:, None].to(tl.float32)
774
+ tmp14 = 1.0
775
+ tmp15 = tl.where(tmp3, tmp14, tmp13)
776
+ tmp16 = (tmp12 / tmp15)
777
+ tmp17 = tmp16.to(tl.float32)
778
+ tl.store(out_ptr1 + (x0 + 128*x4 + 4096*x3), tmp17, None)
779
+ ''', device_str='cuda')
780
+
781
+
782
+ async_compile.wait(globals())
783
+ del async_compile
784
+
785
+ class Runner:
786
+ def __init__(self, partitions):
787
+ self.partitions = partitions
788
+
789
+ def recursively_apply_fns(self, fns):
790
+ new_callables = []
791
+ for fn, c in zip(fns, self.partitions):
792
+ new_callables.append(fn(c))
793
+ self.partitions = new_callables
794
+
795
+ def call(self, args):
796
+ arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1, arg11_1, arg12_1, arg13_1, arg14_1, arg15_1 = args
797
+ args.clear()
798
+ s50 = arg0_1
799
+ s0 = arg2_1
800
+ s43 = arg4_1
801
+ s37 = arg7_1
802
+ s71 = arg8_1
803
+ assert_size_stride(arg1_1, (1, 32, s37, 128), (4096*s37, 128, 4096, 1))
804
+ assert_size_stride(arg3_1, (1, 8, s0, 128), (1024*s0, 128, 1024, 1))
805
+ assert_size_stride(arg5_1, (1, 8, s43, 128), (1024*s43, 128, 1024, 1))
806
+ assert_size_stride(arg6_1, (1, 1, 1, 1), (1, 1, 1, 1))
807
+ assert_size_stride(arg9_1, (1, 1, 1), (1, 1, 1))
808
+ assert_size_stride(arg10_1, (1, 1, 1), (1, 1, 1))
809
+ assert_size_stride(arg11_1, (1, 1, 1, 1), (1, 1, 1, 1))
810
+ assert_size_stride(arg12_1, (1, 1, 1), (1, 1, 1))
811
+ assert_size_stride(arg13_1, (1, 1, 1, 1), (1, 1, 1, 1))
812
+ assert_size_stride(arg14_1, (1, 1, 1), (1, 1, 1))
813
+ assert_size_stride(arg15_1, (1, 1, 1, 1), (1, 1, 1, 1))
814
+ with torch.cuda._DeviceGuard(3):
815
+ torch.cuda.set_device(3)
816
+ buf0 = empty_strided_cuda((1, 32, 32, s37), (1024*s37, 32*s37, s37, 1), torch.float32)
817
+ buf1 = empty_strided_cuda((1, 32, 32, s37), (1024*s37, 32*s37, s37, 1), torch.float32)
818
+ buf2 = empty_strided_cuda((1, 32, 32, s37, 128), (131072*s37, 4096*s37, 128*s37, 128, 1), torch.float32)
819
+ # Topologically Sorted Source Nodes: [flex_attention], Original ATen: []
820
+ stream3 = get_raw_stream(3)
821
+ triton_tem_fused_0.run(arg1_1, arg3_1, arg5_1, buf0, buf1, arg9_1, arg6_1, arg10_1, arg11_1, buf2, s37, s0, s43, 8, 32, 1, stream=stream3)
822
+ del arg10_1
823
+ del arg11_1
824
+ del arg1_1
825
+ del arg3_1
826
+ del arg5_1
827
+ del arg6_1
828
+ del arg9_1
829
+ buf5 = empty_strided_cuda((1, 1, 32, s37), (32*s37, 32*s37, s37, 1), torch.float32)
830
+ buf7 = empty_strided_cuda((1, 32, s37), (32*s37, s37, 1), torch.float32)
831
+ buf10 = empty_strided_cuda((1, 32, s37), (32*max(1, s37), max(1, s37), 1), torch.float32)
832
+ # Topologically Sorted Source Nodes: [flex_attention, lse_scaled], Original ATen: [aten.mul]
833
+ triton_per_fused_mul_1_xnumel = 32*s37
834
+ stream3 = get_raw_stream(3)
835
+ triton_per_fused_mul_1.run(buf0, buf1, buf5, buf7, buf10, s37, triton_per_fused_mul_1_xnumel, 32, stream=stream3)
836
+ del buf1
837
+ ps0 = 128*s37
838
+ buf9 = empty_strided_cuda((1, 32, s37, 128), (4096*s37, 128, 4096, 1), torch.bfloat16)
839
+ # Topologically Sorted Source Nodes: [flex_attention], Original ATen: []
840
+ triton_per_fused_2_xnumel = 4096*s37
841
+ stream3 = get_raw_stream(3)
842
+ triton_per_fused_2.run(buf2, buf5, buf0, buf7, buf9, s37, ps0, triton_per_fused_2_xnumel, 32, stream=stream3)
843
+ del buf0
844
+ del buf2
845
+ del buf5
846
+ del buf7
847
+ return (buf9, buf10, )
848
+
849
+ runner = Runner(partitions=[])
850
+ call = runner.call
851
+ recursively_apply_fns = runner.recursively_apply_fns
852
+
853
+
854
+ def benchmark_compiled_module(times=10, repeat=10):
855
+ from torch._dynamo.testing import rand_strided
856
+ from torch._inductor.utils import print_performance
857
+ arg0_1 = 96
858
+ arg1_1 = rand_strided((1, 32, 96, 128), (393216, 128, 4096, 1), device='cuda:3', dtype=torch.bfloat16)
859
+ arg2_1 = 96
860
+ arg3_1 = rand_strided((1, 8, 96, 128), (98304, 128, 1024, 1), device='cuda:3', dtype=torch.bfloat16)
861
+ arg4_1 = 96
862
+ arg5_1 = rand_strided((1, 8, 96, 128), (98304, 128, 1024, 1), device='cuda:3', dtype=torch.bfloat16)
863
+ arg6_1 = rand_strided((1, 1, 1, 1), (1, 1, 1, 1), device='cuda:3', dtype=torch.int32)
864
+ arg7_1 = 96
865
+ arg8_1 = 96
866
+ arg9_1 = rand_strided((1, 1, 1), (1, 1, 1), device='cuda:3', dtype=torch.int32)
867
+ arg10_1 = rand_strided((1, 1, 1), (1, 1, 1), device='cuda:3', dtype=torch.int32)
868
+ arg11_1 = rand_strided((1, 1, 1, 1), (1, 1, 1, 1), device='cuda:3', dtype=torch.int32)
869
+ arg12_1 = rand_strided((1, 1, 1), (1, 1, 1), device='cuda:3', dtype=torch.int32)
870
+ arg13_1 = rand_strided((1, 1, 1, 1), (1, 1, 1, 1), device='cuda:3', dtype=torch.int32)
871
+ arg14_1 = rand_strided((1, 1, 1), (1, 1, 1), device='cuda:3', dtype=torch.int32)
872
+ arg15_1 = rand_strided((1, 1, 1, 1), (1, 1, 1, 1), device='cuda:3', dtype=torch.int32)
873
+ fn = lambda: call([arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1, arg11_1, arg12_1, arg13_1, arg14_1, arg15_1])
874
+ return print_performance(fn, times=times, repeat=repeat)
875
+
876
+
877
+ if __name__ == "__main__":
878
+ from torch._inductor.wrapper_benchmark import compiled_module_main
879
+ compiled_module_main('None', benchmark_compiled_module)
progress/SpecForge/cache/compiled_kernels/2g/c2gswut4q57fp2ueybipg5qfqiy5coitofujwdnvqdwhr7nbvnyq.py ADDED
@@ -0,0 +1,534 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import triton
3
+ import triton.language as tl
4
+
5
+ from torch._inductor.runtime import triton_helpers, triton_heuristics
6
+ from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
7
+ from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
8
+
9
+ @triton_heuristics.template(
10
+
11
+ num_stages=3,
12
+ num_warps=8,
13
+ triton_meta={'signature': {'arg_Q': '*bf16', 'arg_K': '*bf16', 'arg_V': '*bf16', 'arg_LSE': '*fp32', 'arg_MAX': '*fp32', 'arg_KV_NUM_BLKS': '*i32', 'arg_KV_IDX': '*i32', 'arg_FULL_KV_NUM_BLKS': '*i32', 'arg_FULL_KV_IDX': '*i32', 'out_ptr0': '*bf16', 'ks0': 'i32', 'ks1': 'i32'}, 'device': DeviceProperties(type='cuda', index=2, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]], (7,): [['tt.divisibility', 16]], (8,): [['tt.divisibility', 16]], (9,): [['tt.divisibility', 16]]}]},
14
+ inductor_meta={'kernel_name': 'Placeholder.DESCRIPTIVE_NAME', 'backend_hash': 'B0E5936CA26D1BCD1B577D0B65F13FE7553DC941EB93358973E9F1902BC212C5', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'grid_type': 'FixedGrid', 'fixed_grid': ['_grid_0', '_grid_1', '_grid_2'], 'extra_launcher_args': ['_grid_0', '_grid_1', '_grid_2'], 'config_args': {'PRESCALE_QK': False, 'ROWS_GUARANTEED_SAFE': False, 'BLOCKS_ARE_CONTIGUOUS': False, 'WRITE_DQ': True, 'OUTPUT_LOGSUMEXP': True, 'OUTPUT_MAX': False, 'FLOAT32_PRECISION': "'tf32'", 'IS_DIVISIBLE': False, 'SM_SCALE': 0.08838834764831845, 'GQA_SHARED_HEADS': 4, 'HAS_FULL_BLOCKS': True, 'QK_HEAD_DIM': 128, 'QK_HEAD_DIM_ROUNDED': 128, 'V_HEAD_DIM': 128, 'V_HEAD_DIM_ROUNDED': 128, 'SAFE_HEAD_DIM': True, 'USE_TMA': False, 'BLOCK_M': 128, 'BLOCK_N': 64, 'SPARSE_Q_BLOCK_SIZE': 128, 'SPARSE_KV_BLOCK_SIZE': 128}},
15
+
16
+ )
17
+ @triton.jit
18
+ def triton_flex_attention(arg_Q, arg_K, arg_V, arg_LSE, arg_MAX, arg_KV_NUM_BLKS, arg_KV_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, out_ptr0, ks0, ks1):
19
+ PRESCALE_QK : tl.constexpr = False
20
+ ROWS_GUARANTEED_SAFE : tl.constexpr = False
21
+ BLOCKS_ARE_CONTIGUOUS : tl.constexpr = False
22
+ WRITE_DQ : tl.constexpr = True
23
+ OUTPUT_LOGSUMEXP : tl.constexpr = True
24
+ OUTPUT_MAX : tl.constexpr = False
25
+ FLOAT32_PRECISION : tl.constexpr = 'tf32'
26
+ IS_DIVISIBLE : tl.constexpr = False
27
+ SM_SCALE : tl.constexpr = 0.08838834764831845
28
+ GQA_SHARED_HEADS : tl.constexpr = 4
29
+ HAS_FULL_BLOCKS : tl.constexpr = True
30
+ QK_HEAD_DIM : tl.constexpr = 128
31
+ QK_HEAD_DIM_ROUNDED : tl.constexpr = 128
32
+ V_HEAD_DIM : tl.constexpr = 128
33
+ V_HEAD_DIM_ROUNDED : tl.constexpr = 128
34
+ SAFE_HEAD_DIM : tl.constexpr = True
35
+ USE_TMA : tl.constexpr = False
36
+ BLOCK_M : tl.constexpr = 128
37
+ BLOCK_N : tl.constexpr = 64
38
+ SPARSE_Q_BLOCK_SIZE : tl.constexpr = 128
39
+ SPARSE_KV_BLOCK_SIZE : tl.constexpr = 128
40
+ INDEX_DTYPE : tl.constexpr = tl.int32
41
+ Q = arg_Q
42
+ K = arg_K
43
+ V = arg_V
44
+ LSE = arg_LSE
45
+ MAX = arg_MAX
46
+ KV_NUM_BLKS = arg_KV_NUM_BLKS
47
+ KV_IDX = arg_KV_IDX
48
+ FULL_KV_NUM_BLKS = arg_FULL_KV_NUM_BLKS
49
+ FULL_KV_IDX = arg_FULL_KV_IDX
50
+
51
+ # Sub notation for this kernel:
52
+ #
53
+ # Q: Query, K: Key, V: Value
54
+ # M: Number of queries, N: Number of keys/values, D: Model dimension
55
+ # QK_HEAD_DIM: The dimension of the query and key embeddings
56
+ # V_HEAD_DIM: The dimension of the value embeddings
57
+ # z: Batch size, h: Number of heads, m: Number of queries per head, k: Number of keys per head
58
+ # GQA_SHARED_HEADS: number of query heads sharing one kv head in GQA setups.
59
+ #
60
+ # The following FULL_* and PARTIAL_* is defined in the block sparse mask grid, rather than the thread block grid.
61
+ # KV_NUM_BLKS: The number of KV blocks (that may or may not require masking) for each query.
62
+ # KV_IDX: The indices of KV blocks (that may or may not require masking) for each query.
63
+ # FULL_KV_NUM_BLKS: The number of fully unmasked KV blocks (so we don't need masking) for each query.
64
+ # FULL_KV_IDX: The indices of fully unmasked KV blocks (so we don't need masking) for each query.
65
+ #
66
+ # OUTPUT_LOGSUMEXP: We only need to store the logsumexp if we require grad
67
+ #
68
+ # (Modifiable) Performance tuning options
69
+ # BLOCK_M: The thread block size across the seqlen dim of Q.
70
+ # BLOCK_N: Iterate over BLOCK_N across the seqlen dim of K/V in each thread block.
71
+
72
+ # The below are kernel options that can be applied for certain score_mods,
73
+ # or involve a numerics vs. perf tradeoff
74
+ # PRESCALE_QK: Whether to pre-scale QK by 1/sqrt(d) and change of base. Has
75
+ # about 20% more numerical error, but slightly faster.
76
+ # ROWS_GUARANTEED_SAFE: Is it guaranteed that at least one value in each row
77
+ # is not masked out? If so, we can skip an extra safety check
78
+ # BLOCKS_ARE_CONTIGUOUS: Is it guaranteed that all blocks in the mask are
79
+ # contiguous? If so, we don't need to do an indirect jump for every block
80
+
81
+ tl.static_assert(SPARSE_Q_BLOCK_SIZE >= BLOCK_M and SPARSE_Q_BLOCK_SIZE % BLOCK_M == 0)
82
+ tl.static_assert(SPARSE_KV_BLOCK_SIZE >= BLOCK_N and SPARSE_KV_BLOCK_SIZE % BLOCK_N == 0)
83
+
84
+ # Define strides of inputs
85
+ stride_qz, stride_qh, stride_qm, stride_qk = 4096*ks0, 128, 4096, 1
86
+ stride_kz, stride_kh, stride_kn, stride_kk = 1024*ks1, 128, 1024, 1
87
+ stride_vz, stride_vh, stride_vn, stride_vk = 1024*ks1, 128, 1024, 1
88
+
89
+ ZQ = 1
90
+ HQ = 32
91
+ Q_LEN = ks0
92
+ ZKV = 1
93
+ KV_LEN = ks1
94
+
95
+ MATMUL_PRECISION = Q.dtype.element_ty
96
+
97
+ q_start = tl.program_id(0).to(INDEX_DTYPE)
98
+ off_zq = tl.program_id(1).to(INDEX_DTYPE)
99
+ off_hq = tl.program_id(2).to(INDEX_DTYPE)
100
+
101
+ # We support two cases for batch dimension. a) (ZKV == ZQ) where off_zkv = off_zq.
102
+ # b) (ZKV == 1 and ZQ > 1) where KV is broadcasted along the batch dimension and off_zkv=0.
103
+ off_zkv = off_zq % ZKV
104
+ off_hkv = off_hq // GQA_SHARED_HEADS
105
+ off_g = off_hq % GQA_SHARED_HEADS
106
+
107
+ q_offset = off_zq * stride_qz + off_hq * stride_qh
108
+ k_offset = off_zkv * stride_kz + off_hkv * stride_kh
109
+ v_offset = off_zkv * stride_vz + off_hkv * stride_vh
110
+
111
+ Q = Q + q_offset
112
+ K = K + k_offset
113
+ V = V + v_offset
114
+
115
+ # Setting up the TMA descriptors for Q, K, V
116
+ desc_q = None
117
+ desc_k = None
118
+ desc_v = None
119
+
120
+ SPARSE_Z = 1
121
+ SPARSE_HQ = 1
122
+
123
+ sparse_idx_z = off_zq % SPARSE_Z
124
+ sparse_idx_hq = off_hq % SPARSE_HQ
125
+
126
+ SPARSE_Q_MULTIPLE: tl.constexpr = (SPARSE_Q_BLOCK_SIZE // BLOCK_M)
127
+ SPARSE_KV_MULTIPLE: tl.constexpr = (SPARSE_KV_BLOCK_SIZE // BLOCK_N)
128
+
129
+ stride_kv_num_blks_h = 1
130
+ stride_kv_idx_h = 1
131
+ stride_kv_idx_m = 1
132
+
133
+ # initialize pointer to m and l
134
+ m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf")
135
+ l_i = tl.zeros([BLOCK_M], dtype=tl.float32)
136
+ acc = tl.zeros([BLOCK_M, V_HEAD_DIM_ROUNDED], dtype=tl.float32)
137
+
138
+ offs_m = q_start * BLOCK_M + tl.arange(0, BLOCK_M)
139
+
140
+ # KV_IDX and KV_NUM_BLKS are always contiguous.
141
+ sparse_hz_offset = sparse_idx_z * SPARSE_HQ + sparse_idx_hq
142
+ sparse_kv_num_blks_offset = sparse_hz_offset * stride_kv_num_blks_h + q_start // SPARSE_Q_MULTIPLE
143
+ sparse_kv_idx_offset = sparse_hz_offset * stride_kv_idx_h + (q_start // SPARSE_Q_MULTIPLE) * stride_kv_idx_m # noqa: B950
144
+ offs_m = q_start * BLOCK_M + tl.arange(0, BLOCK_M)
145
+ offs_k = tl.arange(0, QK_HEAD_DIM_ROUNDED)
146
+ q = load_checked_2d(Q, offs_m, offs_k, stride_qm, stride_qk, IS_DIVISIBLE, SAFE_HEAD_DIM, Q_LEN, QK_HEAD_DIM)
147
+
148
+ # ~~~~~~~~~~~~~~ normal blocks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
149
+ # We don't know anything "special" about these blocks, so we need to apply
150
+ # both score_mod and mask_mod to it
151
+ kv_indices = KV_IDX + sparse_kv_idx_offset
152
+ kv_start = tl.load(kv_indices) * SPARSE_KV_BLOCK_SIZE # first kv block we're loading
153
+ kv_num_blocks = tl.load(KV_NUM_BLKS + sparse_kv_num_blks_offset)
154
+ block_n_end = tl.minimum(kv_num_blocks * SPARSE_KV_MULTIPLE, tl.maximum(tl.cdiv(KV_LEN, BLOCK_N), 1))
155
+
156
+
157
+ # K and V pointers will be passed directly to forward_inner
158
+
159
+ offs_n = kv_start + tl.arange(0, BLOCK_N)
160
+
161
+
162
+ acc, l_i, m_i = forward_inner(
163
+ arg_Q, arg_K, arg_V, arg_LSE, arg_MAX, arg_KV_NUM_BLKS, arg_KV_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, out_ptr0, ks0, ks1,
164
+ q, K, V,
165
+ desc_k, desc_v, Q_LEN, KV_LEN,
166
+ acc, l_i, m_i,
167
+ off_zq, off_hq, offs_m[:, None], offs_n[None, :],
168
+ kv_start,
169
+ kv_indices, kv_num_blocks,
170
+ 0, block_n_end,
171
+ MATMUL_PRECISION,
172
+ stride_kk, stride_kn, stride_vn, stride_vk,
173
+ IS_FULL_BLOCKS=False,
174
+ )
175
+
176
+ # ~~~~~~~~~~~~~~ "full" blocks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
177
+ # We know these blocks are guaranteed to be "full", so we don't need to
178
+ # apply mask_mod to them - only score_mod
179
+ if HAS_FULL_BLOCKS:
180
+ # FULL_KV_IDX and FULL_KV_NUM_BLKS are always contiguous.
181
+ kv_indices = FULL_KV_IDX + sparse_kv_idx_offset
182
+ kv_start = tl.load(kv_indices) * SPARSE_KV_BLOCK_SIZE # first kv block we're loading
183
+ kv_num_blocks = tl.load(FULL_KV_NUM_BLKS + sparse_kv_num_blks_offset)
184
+ block_n_end = tl.minimum(kv_num_blocks * SPARSE_KV_MULTIPLE, tl.maximum(tl.cdiv(KV_LEN, BLOCK_N), 1))
185
+ # K and V pointers will be passed directly to forward_inner
186
+ offs_n = kv_start + tl.arange(0, BLOCK_N)
187
+
188
+ acc, l_i, m_i = forward_inner(
189
+ arg_Q, arg_K, arg_V, arg_LSE, arg_MAX, arg_KV_NUM_BLKS, arg_KV_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, out_ptr0, ks0, ks1,
190
+ q, K, V,
191
+ desc_k, desc_v, Q_LEN, KV_LEN,
192
+ acc, l_i, m_i,
193
+ off_zq, off_hq, offs_m[:, None], offs_n[None, :],
194
+ kv_start,
195
+ kv_indices, kv_num_blocks,
196
+ 0, block_n_end,
197
+ MATMUL_PRECISION,
198
+ stride_kk, stride_kn, stride_vn, stride_vk,
199
+ IS_FULL_BLOCKS=True,
200
+ )
201
+
202
+
203
+ # [Note] Handle fully masked out rows:
204
+ # Li will be the sum(e^(-inf)) == 0.0 for masked out rows, mi will be -inf.
205
+ # We set Li to 1.0 which will result in lse/out = 0.0 | after the log(li) + mi(0.0) step
206
+ l_i = tl.where(l_i == 0.0, 1, l_i)
207
+
208
+ acc = acc / l_i[:, None]
209
+ idx_zq = tl.program_id(1).to(INDEX_DTYPE)
210
+ idx_hq = tl.program_id(2).to(INDEX_DTYPE)
211
+ idx_m = offs_m[:, None].to(INDEX_DTYPE)
212
+ idx_d = tl.arange(0, V_HEAD_DIM_ROUNDED)[None, :].to(INDEX_DTYPE)
213
+
214
+ mask = (idx_m < Q_LEN) & (idx_d < V_HEAD_DIM)
215
+
216
+ tl.static_assert(acc.shape == [BLOCK_M, V_HEAD_DIM_ROUNDED])
217
+ xindex = idx_d + 128*idx_m + 128*idx_hq*ks0 + 4096*idx_zq*ks0
218
+ tl.store(out_ptr0 + (tl.broadcast_to(idx_d + 128*idx_hq + 4096*idx_m, acc.shape)), acc, mask)
219
+
220
+ if OUTPUT_LOGSUMEXP:
221
+ off_hz = off_zq * HQ + off_hq
222
+ l_ptrs = LSE + off_hz * Q_LEN + offs_m
223
+ lse = m_i + tl.math.log2(l_i)
224
+ if IS_DIVISIBLE:
225
+ tl.store(l_ptrs, lse)
226
+ else:
227
+ tl.store(l_ptrs, lse, mask=offs_m < Q_LEN)
228
+
229
+ if OUTPUT_MAX:
230
+ off_hz = off_zq * HQ + off_hq
231
+ max_ptrs = MAX + off_hz * Q_LEN + offs_m
232
+ if IS_DIVISIBLE:
233
+ tl.store(max_ptrs, m_i)
234
+ else:
235
+ tl.store(max_ptrs, m_i, mask=offs_m < Q_LEN)
236
+
237
+
238
+ # Utility triton funcs
239
+ @triton.jit
240
+ def get_offset_for_next_block(
241
+ loop_iter, col_indices, total_blocks,
242
+ SPARSE_BLOCK, SPARSE_BLOCK_MULTIPLE, BLOCK,
243
+ BLOCKS_ARE_CONTIGUOUS: tl.constexpr
244
+ ):
245
+ if BLOCKS_ARE_CONTIGUOUS:
246
+ return BLOCK
247
+ cur_block_idx = loop_iter // SPARSE_BLOCK_MULTIPLE
248
+ cur_block = tl.load(col_indices + cur_block_idx, eviction_policy="evict_last")
249
+ next_block = tl.load(col_indices + cur_block_idx + 1, eviction_policy="evict_last", mask=cur_block_idx + 1 < total_blocks)
250
+ needs_jump = (loop_iter + 1) % SPARSE_BLOCK_MULTIPLE == 0
251
+ jump_to_block = (next_block - cur_block ) * SPARSE_BLOCK - (SPARSE_BLOCK_MULTIPLE - 1) * BLOCK
252
+ offset = jump_to_block * needs_jump + (1 - needs_jump) * BLOCK
253
+ return offset
254
+
255
+ @triton.jit
256
+ def get_bounded_indices(indices, max_len=None):
257
+ return indices % max_len if max_len is not None else indices
258
+
259
+ @triton.jit
260
+ def load_checked_block(block_ptr, IS_DIVISIBLE: tl.constexpr, SAFE_HEAD_DIM: tl.constexpr):
261
+ if IS_DIVISIBLE and SAFE_HEAD_DIM:
262
+ return tl.load(block_ptr)
263
+ elif IS_DIVISIBLE and not SAFE_HEAD_DIM:
264
+ return tl.load(block_ptr, boundary_check=(1,), padding_option="zero")
265
+ elif not IS_DIVISIBLE and SAFE_HEAD_DIM:
266
+ return tl.load(block_ptr, boundary_check=(0,), padding_option="zero")
267
+ else:
268
+ return tl.load(block_ptr, boundary_check=(0, 1), padding_option="zero")
269
+
270
+ @triton.jit
271
+ def load_checked_2d(
272
+ ptr,
273
+ offs_m,
274
+ offs_n,
275
+ stride_m,
276
+ stride_n,
277
+ IS_DIVISIBLE_M: tl.constexpr,
278
+ IS_DIVISIBLE_N: tl.constexpr,
279
+ M_LEN: tl.constexpr,
280
+ N_LEN: tl.constexpr,
281
+ ):
282
+ # Calculate final pointer if strides are provided
283
+ if stride_m is not None and stride_n is not None:
284
+ ptr = ptr + offs_m[:, None] * stride_m + offs_n[None, :] * stride_n
285
+
286
+ # Handle all masking cases
287
+ if not IS_DIVISIBLE_M and not IS_DIVISIBLE_N:
288
+ return tl.load(ptr, mask=(offs_m[:, None] < M_LEN) & (offs_n[None, :] < N_LEN), other=0.0)
289
+ elif IS_DIVISIBLE_M and not IS_DIVISIBLE_N:
290
+ return tl.load(ptr, mask=(offs_n[None, :] < N_LEN), other=0.0)
291
+ elif not IS_DIVISIBLE_M and IS_DIVISIBLE_N:
292
+ return tl.load(ptr, mask=(offs_m[:, None] < M_LEN), other=0.0)
293
+ else: # Both divisible
294
+ return tl.load(ptr)
295
+
296
+
297
+ # Common Imports
298
+ @triton.jit
299
+ def forward_block_mn(
300
+ arg_Q, arg_K, arg_V, arg_LSE, arg_MAX, arg_KV_NUM_BLKS, arg_KV_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, out_ptr0, ks0, ks1,
301
+ q, K, V, desc_k, desc_v, Q_LEN, KV_LEN,
302
+ # accumulated values
303
+ acc, l_i, m_i,
304
+ # Offsets
305
+ off_z, off_h, offs_m, offs_n,
306
+ # Offsets needed for TMA loads
307
+ kv_start,
308
+ kv_offset,
309
+ MATMUL_PRECISION, RCP_LN2,
310
+ # Strides for K and V
311
+ stride_kk, stride_kn, stride_vn, stride_vk,
312
+ IS_FULL_BLOCKS, CHECK_BLOCK_BOUNDARY=False,
313
+
314
+ ):
315
+ # Redefines all kernel parameters (BLOCK_M, etc.) so we don't need to plumb them all through
316
+ PRESCALE_QK : tl.constexpr = False
317
+ ROWS_GUARANTEED_SAFE : tl.constexpr = False
318
+ BLOCKS_ARE_CONTIGUOUS : tl.constexpr = False
319
+ WRITE_DQ : tl.constexpr = True
320
+ OUTPUT_LOGSUMEXP : tl.constexpr = True
321
+ OUTPUT_MAX : tl.constexpr = False
322
+ FLOAT32_PRECISION : tl.constexpr = 'tf32'
323
+ IS_DIVISIBLE : tl.constexpr = False
324
+ SM_SCALE : tl.constexpr = 0.08838834764831845
325
+ GQA_SHARED_HEADS : tl.constexpr = 4
326
+ HAS_FULL_BLOCKS : tl.constexpr = True
327
+ QK_HEAD_DIM : tl.constexpr = 128
328
+ QK_HEAD_DIM_ROUNDED : tl.constexpr = 128
329
+ V_HEAD_DIM : tl.constexpr = 128
330
+ V_HEAD_DIM_ROUNDED : tl.constexpr = 128
331
+ SAFE_HEAD_DIM : tl.constexpr = True
332
+ USE_TMA : tl.constexpr = False
333
+ BLOCK_M : tl.constexpr = 128
334
+ BLOCK_N : tl.constexpr = 64
335
+ SPARSE_Q_BLOCK_SIZE : tl.constexpr = 128
336
+ SPARSE_KV_BLOCK_SIZE : tl.constexpr = 128
337
+ INDEX_DTYPE : tl.constexpr = tl.int32
338
+
339
+
340
+ # -- load k --
341
+ # NB reversed order to since K is transposed
342
+ kv_base_offset = kv_start + kv_offset
343
+
344
+ # Load K as [BLOCK_N, QK_HEAD_DIM_ROUNDED] then transpose to [QK_HEAD_DIM_ROUNDED, BLOCK_N]
345
+ offs_k = tl.arange(0, QK_HEAD_DIM_ROUNDED)
346
+ offs_n_load = kv_base_offset + tl.arange(0, BLOCK_N)
347
+ k = load_checked_2d(K, offs_n_load, offs_k, stride_kn, stride_kk, IS_DIVISIBLE, SAFE_HEAD_DIM, KV_LEN, QK_HEAD_DIM)
348
+
349
+ k = tl.trans(k)
350
+ # -- compute qk ---
351
+ qk = tl.dot(q, k, input_precision=FLOAT32_PRECISION) # TODO: use cuda matmul when q_len <= 2.
352
+ if not PRESCALE_QK:
353
+ qk *= SM_SCALE
354
+ # ~~~~~~~~~~~~~~~~~~~ Apply score modification ~~~~~~~~~~~~~~~~~~~
355
+ # If this is the last block of a non divisible seqlen, we still need to load [BLOCK_M, BLOCK_N] elements,
356
+ # which is larger than the actual number of elements. To avoid access memory out of bound,
357
+ # we need to mask out the elements that are out of Q_LEN & KV_LEN.
358
+ m = get_bounded_indices(offs_m, Q_LEN if CHECK_BLOCK_BOUNDARY else None)
359
+ n = get_bounded_indices(offs_n, KV_LEN if CHECK_BLOCK_BOUNDARY else None)
360
+
361
+ tmp0 = (qk)
362
+ post_mod_scores = tmp0
363
+
364
+
365
+ if CHECK_BLOCK_BOUNDARY:
366
+ # Mask out the elements that are out of the KV_LEN for non divisible seqlen.
367
+ post_mod_scores = tl.where(offs_n < KV_LEN, post_mod_scores, float("-inf"))
368
+
369
+ if not IS_FULL_BLOCKS:
370
+ tmp1 = (m)
371
+ tmp2 = tl.full([1], 0, tl.int32)
372
+ tmp3 = tmp1 < tmp2
373
+ tmp4 = (n)
374
+ tmp5 = tmp4 <= tmp1
375
+ tmp6 = tmp3 & tmp5
376
+ tmp7 = tmp1 >= tmp2
377
+ tmp8 = tmp4 < tmp2
378
+ tmp9 = tmp7 & tmp8
379
+ tmp10 = tmp8 == 0
380
+ tmp11 = tmp7 & tmp10
381
+ tmp12 = tmp1 - tmp2
382
+ tmp13 = tl.full([1], 16, tl.int32)
383
+ tmp14 = tl.where((tmp12 < 0) != (tmp13 < 0), tl.where(tmp12 % tmp13 != 0, tmp12 // tmp13 - 1, tmp12 // tmp13), tmp12 // tmp13)
384
+ tmp15 = tmp4 - tmp2
385
+ tmp16 = tl.where((tmp15 < 0) != (tmp13 < 0), tl.where(tmp15 % tmp13 != 0, tmp15 // tmp13 - 1, tmp15 // tmp13), tmp15 // tmp13)
386
+ tmp17 = tmp14 == tmp16
387
+ tmp18 = tmp11 & tmp17
388
+ tmp19 = tmp9 | tmp18
389
+ tmp20 = tmp6 | tmp19
390
+ mask_mod_output = tmp20
391
+
392
+
393
+ if CHECK_BLOCK_BOUNDARY:
394
+ mask_mod_output = tl.where(offs_n < KV_LEN, mask_mod_output, False)
395
+ # apply mask for partially unmasked blocks
396
+ post_mod_scores = tl.where(mask_mod_output, post_mod_scores, float("-inf"))
397
+
398
+ if not PRESCALE_QK:
399
+ post_mod_scores *= RCP_LN2
400
+ # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
401
+
402
+ # -- compute scaling constant ---
403
+ m_ij = tl.maximum(m_i, tl.max(post_mod_scores, 1))
404
+ if not ROWS_GUARANTEED_SAFE:
405
+ masked_out_rows = (m_ij == float("-inf"))
406
+ m_ij_masked = tl.where(masked_out_rows, 0, m_ij)
407
+ else:
408
+ m_ij_masked = m_ij
409
+
410
+ alpha = tl.math.exp2(m_i - m_ij_masked)
411
+ p = tl.math.exp2(post_mod_scores - m_ij_masked[:, None])
412
+
413
+ # NB: l_i update is pulled up here since it's a bit faster
414
+ # NB: For headdim=256, it's faster to move it back down to after m_i =
415
+ # m_ij
416
+ l_i = l_i * alpha + tl.sum(p, 1)
417
+ # # -- scale and update acc --
418
+ acc = acc * alpha[:, None]
419
+ # Calculate offsets for V loading - reuse kv_base_offset from K loading
420
+ offs_v = tl.arange(0, V_HEAD_DIM_ROUNDED)
421
+ v = load_checked_2d(V, offs_n_load, offs_v, stride_vn, stride_vk, IS_DIVISIBLE, SAFE_HEAD_DIM, KV_LEN, V_HEAD_DIM)
422
+ acc = tl.dot(p.to(MATMUL_PRECISION), v, acc, input_precision=FLOAT32_PRECISION)
423
+
424
+ # -- update m_i
425
+ m_i = m_ij
426
+
427
+ return acc, l_i, m_i
428
+
429
+ @triton.jit
430
+ def forward_inner(
431
+ arg_Q, arg_K, arg_V, arg_LSE, arg_MAX, arg_KV_NUM_BLKS, arg_KV_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, out_ptr0, ks0, ks1,
432
+ q, K, V,
433
+ desc_k, desc_v, Q_LEN, KV_LEN,
434
+ # accumulated values
435
+ acc, l_i, m_i,
436
+ # Offsets used as inputs to score_mod & mask_mod
437
+ # of size [BLOCK_M, BLOCK_N] or scalar.
438
+ off_z, off_h, offs_m, offs_n,
439
+ # Offsets needed for TMA loads
440
+ kv_start,
441
+ # blocksparse data
442
+ kv_indices, kv_num_blocks,
443
+ # start kv and end kv block
444
+ block_n_start, block_n_end,
445
+ MATMUL_PRECISION,
446
+ # Strides for K and V
447
+ stride_kk, stride_kn, stride_vn, stride_vk,
448
+ IS_FULL_BLOCKS,
449
+ ):
450
+ # Redefines all kernel parameters (BLOCK_M, etc.) so we don't need to plumb them all through
451
+ PRESCALE_QK : tl.constexpr = False
452
+ ROWS_GUARANTEED_SAFE : tl.constexpr = False
453
+ BLOCKS_ARE_CONTIGUOUS : tl.constexpr = False
454
+ WRITE_DQ : tl.constexpr = True
455
+ OUTPUT_LOGSUMEXP : tl.constexpr = True
456
+ OUTPUT_MAX : tl.constexpr = False
457
+ FLOAT32_PRECISION : tl.constexpr = 'tf32'
458
+ IS_DIVISIBLE : tl.constexpr = False
459
+ SM_SCALE : tl.constexpr = 0.08838834764831845
460
+ GQA_SHARED_HEADS : tl.constexpr = 4
461
+ HAS_FULL_BLOCKS : tl.constexpr = True
462
+ QK_HEAD_DIM : tl.constexpr = 128
463
+ QK_HEAD_DIM_ROUNDED : tl.constexpr = 128
464
+ V_HEAD_DIM : tl.constexpr = 128
465
+ V_HEAD_DIM_ROUNDED : tl.constexpr = 128
466
+ SAFE_HEAD_DIM : tl.constexpr = True
467
+ USE_TMA : tl.constexpr = False
468
+ BLOCK_M : tl.constexpr = 128
469
+ BLOCK_N : tl.constexpr = 64
470
+ SPARSE_Q_BLOCK_SIZE : tl.constexpr = 128
471
+ SPARSE_KV_BLOCK_SIZE : tl.constexpr = 128
472
+ INDEX_DTYPE : tl.constexpr = tl.int32
473
+
474
+
475
+ SPARSE_KV_MULTIPLE: tl.constexpr = (SPARSE_KV_BLOCK_SIZE // BLOCK_N)
476
+ RCP_LN2: tl.constexpr = 1.44269504
477
+
478
+ if PRESCALE_QK:
479
+ q = (q * SM_SCALE * RCP_LN2).to(MATMUL_PRECISION)
480
+
481
+ kv_offset = 0
482
+
483
+ # loop over k, v and update accumulator until block_n_end
484
+ for start_n in range(block_n_start, block_n_end):
485
+ # Here IS_DIVISIBLE acts are the start_n = tl.multiple_of(start_n, BLOCK_N) from triton_fused_attention.
486
+ if IS_DIVISIBLE:
487
+ acc, l_i, m_i = forward_block_mn(
488
+ arg_Q, arg_K, arg_V, arg_LSE, arg_MAX, arg_KV_NUM_BLKS, arg_KV_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, out_ptr0, ks0, ks1,
489
+ q, K, V, desc_k, desc_v, Q_LEN, KV_LEN,
490
+ # accumulated values
491
+ acc, l_i, m_i,
492
+ # Offsets
493
+ off_z, off_h, offs_m, offs_n,
494
+ # Offsets needed for TMA loads
495
+ kv_start,
496
+ kv_offset,
497
+ MATMUL_PRECISION, RCP_LN2,
498
+ # Strides for K and V
499
+ stride_kk, stride_kn, stride_vn, stride_vk,
500
+ IS_FULL_BLOCKS,
501
+ )
502
+ else:
503
+ # Benchmark shows even we applied mod & mask to each block for non divisible seqlen,
504
+ # it's on par or slightly faster than only applying to the last block in fwd.
505
+ # However, we choose different strategy for bwd, where we only apply mod & mask
506
+ # to the last block because it's faster a lot.
507
+ acc, l_i, m_i = forward_block_mn(
508
+ arg_Q, arg_K, arg_V, arg_LSE, arg_MAX, arg_KV_NUM_BLKS, arg_KV_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, out_ptr0, ks0, ks1,
509
+ q, K, V, desc_k, desc_v, Q_LEN, KV_LEN,
510
+ # accumulated values
511
+ acc, l_i, m_i,
512
+ # Offsets
513
+ off_z, off_h, offs_m, offs_n,
514
+ # Offsets needed for TMA loads
515
+ kv_start,
516
+ kv_offset,
517
+ MATMUL_PRECISION, RCP_LN2,
518
+ # Strides for K and V
519
+ stride_kk, stride_kn, stride_vn, stride_vk,
520
+ IS_FULL_BLOCKS, CHECK_BLOCK_BOUNDARY=True,
521
+ )
522
+
523
+
524
+
525
+ offset = get_offset_for_next_block(
526
+ start_n, kv_indices, kv_num_blocks,
527
+ SPARSE_KV_BLOCK_SIZE, SPARSE_KV_MULTIPLE, BLOCK_N, BLOCKS_ARE_CONTIGUOUS
528
+ )
529
+
530
+ offs_n = offs_n + offset
531
+ kv_offset += offset
532
+
533
+
534
+ return acc, l_i, m_i
progress/SpecForge/cache/compiled_kernels/2j/4b74fa21eaaf86b6290185f6fe50aec9b905d858a087238ceddb52477f3f6acb.best_config ADDED
@@ -0,0 +1 @@
 
 
1
+ {"XBLOCK": 128, "num_warps": 4, "num_stages": 1, "configs_hash": "1b2cc4dbebb9680d3ce31843331593b159e4046c056f195ca1ccf2464d5b37d1", "found_by_coordesc": false, "time_taken_ms": 11, "triton_cache_hash": "2ZIFGDABR2MKMG7ESWF67GBZDP27JEZIQWMBXPOUZFGMG5PW5DSA"}
progress/SpecForge/cache/compiled_kernels/2j/c2j3mtk3thi6sn2hxiuhuigjw43spiu74mxdervpgpfrtos7u2qh.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import triton
3
+ import triton.language as tl
4
+
5
+ from torch._inductor.runtime import triton_helpers, triton_heuristics
6
+ from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
7
+ from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
8
+ triton_helpers.set_driver_to_gpu()
9
+
10
+ @triton_heuristics.pointwise(
11
+ size_hints={'x': 4096},
12
+ filename=__file__,
13
+ triton_meta={'signature': {'in_ptr0': '*fp32', 'out_ptr0': '*fp32', 'ks0': 'i64', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=2, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]]}]},
14
+ inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_mul_1', 'mutated_arg_names': [], 'optimize_mem': False, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': 'B0E5936CA26D1BCD1B577D0B65F13FE7553DC941EB93358973E9F1902BC212C5', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False},
15
+ min_elem_per_thread=0
16
+ )
17
+ @triton.jit
18
+ def triton_poi_fused_mul_1(in_ptr0, out_ptr0, ks0, xnumel, XBLOCK : tl.constexpr):
19
+ xoffset = tl.program_id(0) * XBLOCK
20
+ xindex = xoffset + tl.arange(0, XBLOCK)[:]
21
+ xmask = xindex < xnumel
22
+ x2 = xindex
23
+ x0 = (xindex % ks0)
24
+ x1 = triton_helpers.div_floor_integer(xindex, ks0)
25
+ tmp0 = tl.load(in_ptr0 + (x2), xmask, eviction_policy='evict_last')
26
+ tmp1 = 0.6931471805599453
27
+ tmp2 = tmp0 * tmp1
28
+ tl.store(out_ptr0 + (x0 + x1*((1) * ((1) >= (ks0)) + (ks0) * ((ks0) > (1)))), tmp2, xmask)
progress/SpecForge/cache/compiled_kernels/2n/c2ngvuchx6agpdr6v7awl3qgblaehfzaauoxn6camwvtk7syoxsk.py ADDED
@@ -0,0 +1,715 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # AOT ID: ['4_inference']
2
+ from ctypes import c_void_p, c_long, c_int
3
+ import torch
4
+ import math
5
+ import random
6
+ import os
7
+ import tempfile
8
+ from math import inf, nan
9
+ from cmath import nanj
10
+ from torch._inductor.hooks import run_intermediate_hooks
11
+ from torch._inductor.utils import maybe_profile
12
+ from torch._inductor.codegen.memory_planning import _align as align
13
+ from torch import device, empty_strided
14
+ from torch._inductor.async_compile import AsyncCompile
15
+ from torch._inductor.select_algorithm import extern_kernels
16
+ import triton
17
+ import triton.language as tl
18
+ from torch._inductor.runtime.triton_heuristics import start_graph, end_graph
19
+ from torch._C import _cuda_getCurrentRawStream as get_raw_stream
20
+
21
+ aten = torch.ops.aten
22
+ inductor_ops = torch.ops.inductor
23
+ _quantized = torch.ops._quantized
24
+ assert_size_stride = torch._C._dynamo.guards.assert_size_stride
25
+ assert_alignment = torch._C._dynamo.guards.assert_alignment
26
+ empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
27
+ empty_strided_cpu_pinned = torch._C._dynamo.guards._empty_strided_cpu_pinned
28
+ empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
29
+ empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu
30
+ empty_strided_mtia = torch._C._dynamo.guards._empty_strided_mtia
31
+ reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor
32
+ alloc_from_pool = torch.ops.inductor._alloc_from_pool
33
+ async_compile = AsyncCompile()
34
+ empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p
35
+
36
+
37
+ # kernel path: /workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6u/c6uror2yjtc6vpcc3on3oq3lwi6yghlxrmwz5rocw5haxvfiz47e.py
38
+ # Topologically Sorted Source Nodes: [flex_attention], Original ATen: []
39
+ # Source node to ATen node mapping:
40
+ # flex_attention => flex_attention
41
+ # Graph fragment:
42
+ # %arg1_1 : Tensor "bf16[1, 32, s37, 128][4096*s37, 128, 4096, 1]cuda:2" = PlaceHolder[target=arg1_1]
43
+ # %arg3_1 : Tensor "bf16[1, 8, s0, 128][1024*s0, 128, 1024, 1]cuda:2" = PlaceHolder[target=arg3_1]
44
+ # %arg5_1 : Tensor "bf16[1, 8, s43, 128][1024*s43, 128, 1024, 1]cuda:2" = PlaceHolder[target=arg5_1]
45
+ # %getitem_1 : Tensor "f32[1, 32, s37][32*s37, s37, 1]cuda:2" = PlaceHolder[target=getitem_1]
46
+ # %buf1 : Tensor "f32[1, 32, s37][32*s37, s37, 1]cuda:2" = PlaceHolder[target=buf1]
47
+ # %arg9_1 : Tensor "i32[1, 1, 1][1, 1, 1]cuda:2" = PlaceHolder[target=arg9_1]
48
+ # %arg6_1 : Tensor "i32[1, 1, 1, 1][1, 1, 1, 1]cuda:2" = PlaceHolder[target=arg6_1]
49
+ # %arg10_1 : Tensor "i32[1, 1, 1][1, 1, 1]cuda:2" = PlaceHolder[target=arg10_1]
50
+ # %arg11_1 : Tensor "i32[1, 1, 1, 1][1, 1, 1, 1]cuda:2" = PlaceHolder[target=arg11_1]
51
+ # %flex_attention : [num_users=2] = call_function[target=torch.ops.higher_order.flex_attention](args = (%arg1_1, %arg3_1, %arg5_1, %sdpa_score0, (%arg7_1, %arg8_1, %arg9_1, %arg6_1, %arg10_1, %arg11_1, %arg12_1, %arg13_1, %arg14_1, %arg15_1, 128, 128, %sdpa_mask0), 0.08838834764831845, {PRESCALE_QK: False, ROWS_GUARANTEED_SAFE: False, BLOCKS_ARE_CONTIGUOUS: False, WRITE_DQ: True, OUTPUT_LOGSUMEXP: True, OUTPUT_MAX: False}, (), ()), kwargs = {})
52
+ # return %getitem
53
+ triton_tem_fused_0 = async_compile.triton('triton_tem_fused_0', '''
54
+ import triton
55
+ import triton.language as tl
56
+
57
+ from torch._inductor.runtime import triton_helpers, triton_heuristics
58
+ from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
59
+ from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
60
+
61
+ @triton_heuristics.template(
62
+
63
+ num_stages=3,
64
+ num_warps=8,
65
+ triton_meta={'signature': {'arg_Q': '*bf16', 'arg_K': '*bf16', 'arg_V': '*bf16', 'arg_LSE': '*fp32', 'arg_MAX': '*fp32', 'arg_KV_NUM_BLKS': '*i32', 'arg_KV_IDX': '*i32', 'arg_FULL_KV_NUM_BLKS': '*i32', 'arg_FULL_KV_IDX': '*i32', 'out_ptr0': '*bf16', 'ks0': 'i32', 'ks1': 'i32', 'ks2': 'i32'}, 'device': DeviceProperties(type='cuda', index=2, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]], (7,): [['tt.divisibility', 16]], (8,): [['tt.divisibility', 16]], (9,): [['tt.divisibility', 16]]}]},
66
+ inductor_meta={'kernel_name': 'triton_tem_fused_0', 'backend_hash': 'B0E5936CA26D1BCD1B577D0B65F13FE7553DC941EB93358973E9F1902BC212C5', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'grid_type': 'FixedGrid', 'fixed_grid': ['_grid_0', '_grid_1', '_grid_2'], 'extra_launcher_args': ['_grid_0', '_grid_1', '_grid_2'], 'config_args': {'PRESCALE_QK': False, 'ROWS_GUARANTEED_SAFE': False, 'BLOCKS_ARE_CONTIGUOUS': False, 'WRITE_DQ': True, 'OUTPUT_LOGSUMEXP': True, 'OUTPUT_MAX': False, 'FLOAT32_PRECISION': "'ieee'", 'IS_DIVISIBLE': False, 'SM_SCALE': 0.08838834764831845, 'GQA_SHARED_HEADS': 4, 'HAS_FULL_BLOCKS': True, 'QK_HEAD_DIM': 128, 'QK_HEAD_DIM_ROUNDED': 128, 'V_HEAD_DIM': 128, 'V_HEAD_DIM_ROUNDED': 128, 'SAFE_HEAD_DIM': True, 'USE_TMA': False, 'BLOCK_M': 128, 'BLOCK_N': 64, 'SPARSE_Q_BLOCK_SIZE': 128, 'SPARSE_KV_BLOCK_SIZE': 128}},
67
+
68
+ )
69
+ @triton.jit
70
+ def triton_tem_fused_0(arg_Q, arg_K, arg_V, arg_LSE, arg_MAX, arg_KV_NUM_BLKS, arg_KV_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, out_ptr0, ks0, ks1, ks2):
71
+ PRESCALE_QK : tl.constexpr = False
72
+ ROWS_GUARANTEED_SAFE : tl.constexpr = False
73
+ BLOCKS_ARE_CONTIGUOUS : tl.constexpr = False
74
+ WRITE_DQ : tl.constexpr = True
75
+ OUTPUT_LOGSUMEXP : tl.constexpr = True
76
+ OUTPUT_MAX : tl.constexpr = False
77
+ FLOAT32_PRECISION : tl.constexpr = 'ieee'
78
+ IS_DIVISIBLE : tl.constexpr = False
79
+ SM_SCALE : tl.constexpr = 0.08838834764831845
80
+ GQA_SHARED_HEADS : tl.constexpr = 4
81
+ HAS_FULL_BLOCKS : tl.constexpr = True
82
+ QK_HEAD_DIM : tl.constexpr = 128
83
+ QK_HEAD_DIM_ROUNDED : tl.constexpr = 128
84
+ V_HEAD_DIM : tl.constexpr = 128
85
+ V_HEAD_DIM_ROUNDED : tl.constexpr = 128
86
+ SAFE_HEAD_DIM : tl.constexpr = True
87
+ USE_TMA : tl.constexpr = False
88
+ BLOCK_M : tl.constexpr = 128
89
+ BLOCK_N : tl.constexpr = 64
90
+ SPARSE_Q_BLOCK_SIZE : tl.constexpr = 128
91
+ SPARSE_KV_BLOCK_SIZE : tl.constexpr = 128
92
+ INDEX_DTYPE : tl.constexpr = tl.int32
93
+ Q = arg_Q
94
+ K = arg_K
95
+ V = arg_V
96
+ LSE = arg_LSE
97
+ MAX = arg_MAX
98
+ KV_NUM_BLKS = arg_KV_NUM_BLKS
99
+ KV_IDX = arg_KV_IDX
100
+ FULL_KV_NUM_BLKS = arg_FULL_KV_NUM_BLKS
101
+ FULL_KV_IDX = arg_FULL_KV_IDX
102
+
103
+ # Sub notation for this kernel:
104
+ #
105
+ # Q: Query, K: Key, V: Value
106
+ # M: Number of queries, N: Number of keys/values, D: Model dimension
107
+ # QK_HEAD_DIM: The dimension of the query and key embeddings
108
+ # V_HEAD_DIM: The dimension of the value embeddings
109
+ # z: Batch size, h: Number of heads, m: Number of queries per head, k: Number of keys per head
110
+ # GQA_SHARED_HEADS: number of query heads sharing one kv head in GQA setups.
111
+ #
112
+ # The following FULL_* and PARTIAL_* is defined in the block sparse mask grid, rather than the thread block grid.
113
+ # KV_NUM_BLKS: The number of KV blocks (that may or may not require masking) for each query.
114
+ # KV_IDX: The indices of KV blocks (that may or may not require masking) for each query.
115
+ # FULL_KV_NUM_BLKS: The number of fully unmasked KV blocks (so we don't need masking) for each query.
116
+ # FULL_KV_IDX: The indices of fully unmasked KV blocks (so we don't need masking) for each query.
117
+ #
118
+ # OUTPUT_LOGSUMEXP: We only need to store the logsumexp if we require grad
119
+ #
120
+ # (Modifiable) Performance tuning options
121
+ # BLOCK_M: The thread block size across the seqlen dim of Q.
122
+ # BLOCK_N: Iterate over BLOCK_N across the seqlen dim of K/V in each thread block.
123
+
124
+ # The below are kernel options that can be applied for certain score_mods,
125
+ # or involve a numerics vs. perf tradeoff
126
+ # PRESCALE_QK: Whether to pre-scale QK by 1/sqrt(d) and change of base. Has
127
+ # about 20% more numerical error, but slightly faster.
128
+ # ROWS_GUARANTEED_SAFE: Is it guaranteed that at least one value in each row
129
+ # is not masked out? If so, we can skip an extra safety check
130
+ # BLOCKS_ARE_CONTIGUOUS: Is it guaranteed that all blocks in the mask are
131
+ # contiguous? If so, we don't need to do an indirect jump for every block
132
+
133
+ tl.static_assert(SPARSE_Q_BLOCK_SIZE >= BLOCK_M and SPARSE_Q_BLOCK_SIZE % BLOCK_M == 0)
134
+ tl.static_assert(SPARSE_KV_BLOCK_SIZE >= BLOCK_N and SPARSE_KV_BLOCK_SIZE % BLOCK_N == 0)
135
+
136
+ # Define strides of inputs
137
+ stride_qz, stride_qh, stride_qm, stride_qk = 4096*ks0, 128, 4096, 1
138
+ stride_kz, stride_kh, stride_kn, stride_kk = 1024*ks1, 128, 1024, 1
139
+ stride_vz, stride_vh, stride_vn, stride_vk = 1024*ks2, 128, 1024, 1
140
+
141
+ ZQ = 1
142
+ HQ = 32
143
+ Q_LEN = ks0
144
+ ZKV = 1
145
+ KV_LEN = ks1
146
+
147
+ MATMUL_PRECISION = Q.dtype.element_ty
148
+
149
+ q_start = tl.program_id(0).to(INDEX_DTYPE)
150
+ off_zq = tl.program_id(1).to(INDEX_DTYPE)
151
+ off_hq = tl.program_id(2).to(INDEX_DTYPE)
152
+
153
+ # We support two cases for batch dimension. a) (ZKV == ZQ) where off_zkv = off_zq.
154
+ # b) (ZKV == 1 and ZQ > 1) where KV is broadcasted along the batch dimension and off_zkv=0.
155
+ off_zkv = off_zq % ZKV
156
+ off_hkv = off_hq // GQA_SHARED_HEADS
157
+ off_g = off_hq % GQA_SHARED_HEADS
158
+
159
+ q_offset = off_zq * stride_qz + off_hq * stride_qh
160
+ k_offset = off_zkv * stride_kz + off_hkv * stride_kh
161
+ v_offset = off_zkv * stride_vz + off_hkv * stride_vh
162
+
163
+ Q = Q + q_offset
164
+ K = K + k_offset
165
+ V = V + v_offset
166
+
167
+ # Setting up the TMA descriptors for Q, K, V
168
+ desc_q = None
169
+ desc_k = None
170
+ desc_v = None
171
+
172
+ SPARSE_Z = 1
173
+ SPARSE_HQ = 1
174
+
175
+ sparse_idx_z = off_zq % SPARSE_Z
176
+ sparse_idx_hq = off_hq % SPARSE_HQ
177
+
178
+ SPARSE_Q_MULTIPLE: tl.constexpr = (SPARSE_Q_BLOCK_SIZE // BLOCK_M)
179
+ SPARSE_KV_MULTIPLE: tl.constexpr = (SPARSE_KV_BLOCK_SIZE // BLOCK_N)
180
+
181
+ stride_kv_num_blks_h = 1
182
+ stride_kv_idx_h = 1
183
+ stride_kv_idx_m = 1
184
+
185
+ # initialize pointer to m and l
186
+ m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf")
187
+ l_i = tl.zeros([BLOCK_M], dtype=tl.float32)
188
+ acc = tl.zeros([BLOCK_M, V_HEAD_DIM_ROUNDED], dtype=tl.float32)
189
+
190
+ offs_m = q_start * BLOCK_M + tl.arange(0, BLOCK_M)
191
+
192
+ # KV_IDX and KV_NUM_BLKS are always contiguous.
193
+ sparse_hz_offset = sparse_idx_z * SPARSE_HQ + sparse_idx_hq
194
+ sparse_kv_num_blks_offset = sparse_hz_offset * stride_kv_num_blks_h + q_start // SPARSE_Q_MULTIPLE
195
+ sparse_kv_idx_offset = sparse_hz_offset * stride_kv_idx_h + (q_start // SPARSE_Q_MULTIPLE) * stride_kv_idx_m # noqa: B950
196
+ offs_m = q_start * BLOCK_M + tl.arange(0, BLOCK_M)
197
+ offs_k = tl.arange(0, QK_HEAD_DIM_ROUNDED)
198
+ q = load_checked_2d(Q, offs_m, offs_k, stride_qm, stride_qk, IS_DIVISIBLE, SAFE_HEAD_DIM, Q_LEN, QK_HEAD_DIM)
199
+
200
+ # ~~~~~~~~~~~~~~ normal blocks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
201
+ # We don't know anything "special" about these blocks, so we need to apply
202
+ # both score_mod and mask_mod to it
203
+ kv_indices = KV_IDX + sparse_kv_idx_offset
204
+ kv_start = tl.load(kv_indices) * SPARSE_KV_BLOCK_SIZE # first kv block we're loading
205
+ kv_num_blocks = tl.load(KV_NUM_BLKS + sparse_kv_num_blks_offset)
206
+ block_n_end = tl.minimum(kv_num_blocks * SPARSE_KV_MULTIPLE, tl.maximum(tl.cdiv(KV_LEN, BLOCK_N), 1))
207
+
208
+
209
+ # K and V pointers will be passed directly to forward_inner
210
+
211
+ offs_n = kv_start + tl.arange(0, BLOCK_N)
212
+
213
+
214
+ acc, l_i, m_i = forward_inner(
215
+ arg_Q, arg_K, arg_V, arg_LSE, arg_MAX, arg_KV_NUM_BLKS, arg_KV_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, out_ptr0, ks0, ks1, ks2,
216
+ q, K, V,
217
+ desc_k, desc_v, Q_LEN, KV_LEN,
218
+ acc, l_i, m_i,
219
+ off_zq, off_hq, offs_m[:, None], offs_n[None, :],
220
+ kv_start,
221
+ kv_indices, kv_num_blocks,
222
+ 0, block_n_end,
223
+ MATMUL_PRECISION,
224
+ stride_kk, stride_kn, stride_vn, stride_vk,
225
+ IS_FULL_BLOCKS=False,
226
+ )
227
+
228
+ # ~~~~~~~~~~~~~~ "full" blocks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
229
+ # We know these blocks are guaranteed to be "full", so we don't need to
230
+ # apply mask_mod to them - only score_mod
231
+ if HAS_FULL_BLOCKS:
232
+ # FULL_KV_IDX and FULL_KV_NUM_BLKS are always contiguous.
233
+ kv_indices = FULL_KV_IDX + sparse_kv_idx_offset
234
+ kv_start = tl.load(kv_indices) * SPARSE_KV_BLOCK_SIZE # first kv block we're loading
235
+ kv_num_blocks = tl.load(FULL_KV_NUM_BLKS + sparse_kv_num_blks_offset)
236
+ block_n_end = tl.minimum(kv_num_blocks * SPARSE_KV_MULTIPLE, tl.maximum(tl.cdiv(KV_LEN, BLOCK_N), 1))
237
+ # K and V pointers will be passed directly to forward_inner
238
+ offs_n = kv_start + tl.arange(0, BLOCK_N)
239
+
240
+ acc, l_i, m_i = forward_inner(
241
+ arg_Q, arg_K, arg_V, arg_LSE, arg_MAX, arg_KV_NUM_BLKS, arg_KV_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, out_ptr0, ks0, ks1, ks2,
242
+ q, K, V,
243
+ desc_k, desc_v, Q_LEN, KV_LEN,
244
+ acc, l_i, m_i,
245
+ off_zq, off_hq, offs_m[:, None], offs_n[None, :],
246
+ kv_start,
247
+ kv_indices, kv_num_blocks,
248
+ 0, block_n_end,
249
+ MATMUL_PRECISION,
250
+ stride_kk, stride_kn, stride_vn, stride_vk,
251
+ IS_FULL_BLOCKS=True,
252
+ )
253
+
254
+
255
+ # [Note] Handle fully masked out rows:
256
+ # Li will be the sum(e^(-inf)) == 0.0 for masked out rows, mi will be -inf.
257
+ # We set Li to 1.0 which will result in lse/out = 0.0 | after the log(li) + mi(0.0) step
258
+ l_i = tl.where(l_i == 0.0, 1, l_i)
259
+
260
+ acc = acc / l_i[:, None]
261
+ idx_zq = tl.program_id(1).to(INDEX_DTYPE)
262
+ idx_hq = tl.program_id(2).to(INDEX_DTYPE)
263
+ idx_m = offs_m[:, None].to(INDEX_DTYPE)
264
+ idx_d = tl.arange(0, V_HEAD_DIM_ROUNDED)[None, :].to(INDEX_DTYPE)
265
+
266
+ mask = (idx_m < Q_LEN) & (idx_d < V_HEAD_DIM)
267
+
268
+ tl.static_assert(acc.shape == [BLOCK_M, V_HEAD_DIM_ROUNDED])
269
+ xindex = idx_d + 128*idx_m + 128*idx_hq*ks0 + 4096*idx_zq*ks0
270
+ tl.store(out_ptr0 + (tl.broadcast_to(idx_d + 128*idx_hq + 4096*idx_m, acc.shape)), acc, mask)
271
+
272
+ if OUTPUT_LOGSUMEXP:
273
+ off_hz = off_zq * HQ + off_hq
274
+ l_ptrs = LSE + off_hz * Q_LEN + offs_m
275
+ lse = m_i + tl.math.log2(l_i)
276
+ if IS_DIVISIBLE:
277
+ tl.store(l_ptrs, lse)
278
+ else:
279
+ tl.store(l_ptrs, lse, mask=offs_m < Q_LEN)
280
+
281
+ if OUTPUT_MAX:
282
+ off_hz = off_zq * HQ + off_hq
283
+ max_ptrs = MAX + off_hz * Q_LEN + offs_m
284
+ if IS_DIVISIBLE:
285
+ tl.store(max_ptrs, m_i)
286
+ else:
287
+ tl.store(max_ptrs, m_i, mask=offs_m < Q_LEN)
288
+
289
+
290
+ # Utility triton funcs
291
+ @triton.jit
292
+ def get_offset_for_next_block(
293
+ loop_iter, col_indices, total_blocks,
294
+ SPARSE_BLOCK, SPARSE_BLOCK_MULTIPLE, BLOCK,
295
+ BLOCKS_ARE_CONTIGUOUS: tl.constexpr
296
+ ):
297
+ if BLOCKS_ARE_CONTIGUOUS:
298
+ return BLOCK
299
+ cur_block_idx = loop_iter // SPARSE_BLOCK_MULTIPLE
300
+ cur_block = tl.load(col_indices + cur_block_idx, eviction_policy="evict_last")
301
+ next_block = tl.load(col_indices + cur_block_idx + 1, eviction_policy="evict_last", mask=cur_block_idx + 1 < total_blocks)
302
+ needs_jump = (loop_iter + 1) % SPARSE_BLOCK_MULTIPLE == 0
303
+ jump_to_block = (next_block - cur_block ) * SPARSE_BLOCK - (SPARSE_BLOCK_MULTIPLE - 1) * BLOCK
304
+ offset = jump_to_block * needs_jump + (1 - needs_jump) * BLOCK
305
+ return offset
306
+
307
+ @triton.jit
308
+ def get_bounded_indices(indices, max_len=None):
309
+ return indices % max_len if max_len is not None else indices
310
+
311
+ @triton.jit
312
+ def load_checked_block(block_ptr, IS_DIVISIBLE: tl.constexpr, SAFE_HEAD_DIM: tl.constexpr):
313
+ if IS_DIVISIBLE and SAFE_HEAD_DIM:
314
+ return tl.load(block_ptr)
315
+ elif IS_DIVISIBLE and not SAFE_HEAD_DIM:
316
+ return tl.load(block_ptr, boundary_check=(1,), padding_option="zero")
317
+ elif not IS_DIVISIBLE and SAFE_HEAD_DIM:
318
+ return tl.load(block_ptr, boundary_check=(0,), padding_option="zero")
319
+ else:
320
+ return tl.load(block_ptr, boundary_check=(0, 1), padding_option="zero")
321
+
322
+ @triton.jit
323
+ def load_checked_2d(
324
+ ptr,
325
+ offs_m,
326
+ offs_n,
327
+ stride_m,
328
+ stride_n,
329
+ IS_DIVISIBLE_M: tl.constexpr,
330
+ IS_DIVISIBLE_N: tl.constexpr,
331
+ M_LEN: tl.constexpr,
332
+ N_LEN: tl.constexpr,
333
+ ):
334
+ # Calculate final pointer if strides are provided
335
+ if stride_m is not None and stride_n is not None:
336
+ ptr = ptr + offs_m[:, None] * stride_m + offs_n[None, :] * stride_n
337
+
338
+ # Handle all masking cases
339
+ if not IS_DIVISIBLE_M and not IS_DIVISIBLE_N:
340
+ return tl.load(ptr, mask=(offs_m[:, None] < M_LEN) & (offs_n[None, :] < N_LEN), other=0.0)
341
+ elif IS_DIVISIBLE_M and not IS_DIVISIBLE_N:
342
+ return tl.load(ptr, mask=(offs_n[None, :] < N_LEN), other=0.0)
343
+ elif not IS_DIVISIBLE_M and IS_DIVISIBLE_N:
344
+ return tl.load(ptr, mask=(offs_m[:, None] < M_LEN), other=0.0)
345
+ else: # Both divisible
346
+ return tl.load(ptr)
347
+
348
+
349
+ # Common Imports
350
+ @triton.jit
351
+ def forward_block_mn(
352
+ arg_Q, arg_K, arg_V, arg_LSE, arg_MAX, arg_KV_NUM_BLKS, arg_KV_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, out_ptr0, ks0, ks1, ks2,
353
+ q, K, V, desc_k, desc_v, Q_LEN, KV_LEN,
354
+ # accumulated values
355
+ acc, l_i, m_i,
356
+ # Offsets
357
+ off_z, off_h, offs_m, offs_n,
358
+ # Offsets needed for TMA loads
359
+ kv_start,
360
+ kv_offset,
361
+ MATMUL_PRECISION, RCP_LN2,
362
+ # Strides for K and V
363
+ stride_kk, stride_kn, stride_vn, stride_vk,
364
+ IS_FULL_BLOCKS, CHECK_BLOCK_BOUNDARY=False,
365
+
366
+ ):
367
+ # Redefines all kernel parameters (BLOCK_M, etc.) so we don't need to plumb them all through
368
+ PRESCALE_QK : tl.constexpr = False
369
+ ROWS_GUARANTEED_SAFE : tl.constexpr = False
370
+ BLOCKS_ARE_CONTIGUOUS : tl.constexpr = False
371
+ WRITE_DQ : tl.constexpr = True
372
+ OUTPUT_LOGSUMEXP : tl.constexpr = True
373
+ OUTPUT_MAX : tl.constexpr = False
374
+ FLOAT32_PRECISION : tl.constexpr = 'ieee'
375
+ IS_DIVISIBLE : tl.constexpr = False
376
+ SM_SCALE : tl.constexpr = 0.08838834764831845
377
+ GQA_SHARED_HEADS : tl.constexpr = 4
378
+ HAS_FULL_BLOCKS : tl.constexpr = True
379
+ QK_HEAD_DIM : tl.constexpr = 128
380
+ QK_HEAD_DIM_ROUNDED : tl.constexpr = 128
381
+ V_HEAD_DIM : tl.constexpr = 128
382
+ V_HEAD_DIM_ROUNDED : tl.constexpr = 128
383
+ SAFE_HEAD_DIM : tl.constexpr = True
384
+ USE_TMA : tl.constexpr = False
385
+ BLOCK_M : tl.constexpr = 128
386
+ BLOCK_N : tl.constexpr = 64
387
+ SPARSE_Q_BLOCK_SIZE : tl.constexpr = 128
388
+ SPARSE_KV_BLOCK_SIZE : tl.constexpr = 128
389
+ INDEX_DTYPE : tl.constexpr = tl.int32
390
+
391
+
392
+ # -- load k --
393
+ # NB reversed order to since K is transposed
394
+ kv_base_offset = kv_start + kv_offset
395
+
396
+ # Load K as [BLOCK_N, QK_HEAD_DIM_ROUNDED] then transpose to [QK_HEAD_DIM_ROUNDED, BLOCK_N]
397
+ offs_k = tl.arange(0, QK_HEAD_DIM_ROUNDED)
398
+ offs_n_load = kv_base_offset + tl.arange(0, BLOCK_N)
399
+ k = load_checked_2d(K, offs_n_load, offs_k, stride_kn, stride_kk, IS_DIVISIBLE, SAFE_HEAD_DIM, KV_LEN, QK_HEAD_DIM)
400
+
401
+ k = tl.trans(k)
402
+ # -- compute qk ---
403
+ qk = tl.dot(q, k, input_precision=FLOAT32_PRECISION) # TODO: use cuda matmul when q_len <= 2.
404
+ if not PRESCALE_QK:
405
+ qk *= SM_SCALE
406
+ # ~~~~~~~~~~~~~~~~~~~ Apply score modification ~~~~~~~~~~~~~~~~~~~
407
+ # If this is the last block of a non divisible seqlen, we still need to load [BLOCK_M, BLOCK_N] elements,
408
+ # which is larger than the actual number of elements. To avoid access memory out of bound,
409
+ # we need to mask out the elements that are out of Q_LEN & KV_LEN.
410
+ m = get_bounded_indices(offs_m, Q_LEN if CHECK_BLOCK_BOUNDARY else None)
411
+ n = get_bounded_indices(offs_n, KV_LEN if CHECK_BLOCK_BOUNDARY else None)
412
+
413
+ tmp0 = (qk)
414
+ post_mod_scores = tmp0
415
+
416
+
417
+ if CHECK_BLOCK_BOUNDARY:
418
+ # Mask out the elements that are out of the KV_LEN for non divisible seqlen.
419
+ post_mod_scores = tl.where(offs_n < KV_LEN, post_mod_scores, float("-inf"))
420
+
421
+ if not IS_FULL_BLOCKS:
422
+ tmp1 = (m)
423
+ tmp2 = tl.full([1], 0, tl.int32)
424
+ tmp3 = tmp1 < tmp2
425
+ tmp4 = (n)
426
+ tmp5 = tmp4 <= tmp1
427
+ tmp6 = tmp3 & tmp5
428
+ tmp7 = tmp1 >= tmp2
429
+ tmp8 = tmp4 < tmp2
430
+ tmp9 = tmp7 & tmp8
431
+ tmp10 = tmp8 == 0
432
+ tmp11 = tmp7 & tmp10
433
+ tmp12 = tmp1 - tmp2
434
+ tmp13 = tl.full([1], 16, tl.int32)
435
+ tmp14 = tl.where((tmp12 < 0) != (tmp13 < 0), tl.where(tmp12 % tmp13 != 0, tmp12 // tmp13 - 1, tmp12 // tmp13), tmp12 // tmp13)
436
+ tmp15 = tmp4 - tmp2
437
+ tmp16 = tl.where((tmp15 < 0) != (tmp13 < 0), tl.where(tmp15 % tmp13 != 0, tmp15 // tmp13 - 1, tmp15 // tmp13), tmp15 // tmp13)
438
+ tmp17 = tmp14 == tmp16
439
+ tmp18 = tmp11 & tmp17
440
+ tmp19 = tmp9 | tmp18
441
+ tmp20 = tmp6 | tmp19
442
+ mask_mod_output = tmp20
443
+
444
+
445
+ if CHECK_BLOCK_BOUNDARY:
446
+ mask_mod_output = tl.where(offs_n < KV_LEN, mask_mod_output, False)
447
+ # apply mask for partially unmasked blocks
448
+ post_mod_scores = tl.where(mask_mod_output, post_mod_scores, float("-inf"))
449
+
450
+ if not PRESCALE_QK:
451
+ post_mod_scores *= RCP_LN2
452
+ # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
453
+
454
+ # -- compute scaling constant ---
455
+ m_ij = tl.maximum(m_i, tl.max(post_mod_scores, 1))
456
+ if not ROWS_GUARANTEED_SAFE:
457
+ masked_out_rows = (m_ij == float("-inf"))
458
+ m_ij_masked = tl.where(masked_out_rows, 0, m_ij)
459
+ else:
460
+ m_ij_masked = m_ij
461
+
462
+ alpha = tl.math.exp2(m_i - m_ij_masked)
463
+ p = tl.math.exp2(post_mod_scores - m_ij_masked[:, None])
464
+
465
+ # NB: l_i update is pulled up here since it's a bit faster
466
+ # NB: For headdim=256, it's faster to move it back down to after m_i =
467
+ # m_ij
468
+ l_i = l_i * alpha + tl.sum(p, 1)
469
+ # # -- scale and update acc --
470
+ acc = acc * alpha[:, None]
471
+ # Calculate offsets for V loading - reuse kv_base_offset from K loading
472
+ offs_v = tl.arange(0, V_HEAD_DIM_ROUNDED)
473
+ v = load_checked_2d(V, offs_n_load, offs_v, stride_vn, stride_vk, IS_DIVISIBLE, SAFE_HEAD_DIM, KV_LEN, V_HEAD_DIM)
474
+ acc = tl.dot(p.to(MATMUL_PRECISION), v, acc, input_precision=FLOAT32_PRECISION)
475
+
476
+ # -- update m_i
477
+ m_i = m_ij
478
+
479
+ return acc, l_i, m_i
480
+
481
+ @triton.jit
482
+ def forward_inner(
483
+ arg_Q, arg_K, arg_V, arg_LSE, arg_MAX, arg_KV_NUM_BLKS, arg_KV_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, out_ptr0, ks0, ks1, ks2,
484
+ q, K, V,
485
+ desc_k, desc_v, Q_LEN, KV_LEN,
486
+ # accumulated values
487
+ acc, l_i, m_i,
488
+ # Offsets used as inputs to score_mod & mask_mod
489
+ # of size [BLOCK_M, BLOCK_N] or scalar.
490
+ off_z, off_h, offs_m, offs_n,
491
+ # Offsets needed for TMA loads
492
+ kv_start,
493
+ # blocksparse data
494
+ kv_indices, kv_num_blocks,
495
+ # start kv and end kv block
496
+ block_n_start, block_n_end,
497
+ MATMUL_PRECISION,
498
+ # Strides for K and V
499
+ stride_kk, stride_kn, stride_vn, stride_vk,
500
+ IS_FULL_BLOCKS,
501
+ ):
502
+ # Redefines all kernel parameters (BLOCK_M, etc.) so we don't need to plumb them all through
503
+ PRESCALE_QK : tl.constexpr = False
504
+ ROWS_GUARANTEED_SAFE : tl.constexpr = False
505
+ BLOCKS_ARE_CONTIGUOUS : tl.constexpr = False
506
+ WRITE_DQ : tl.constexpr = True
507
+ OUTPUT_LOGSUMEXP : tl.constexpr = True
508
+ OUTPUT_MAX : tl.constexpr = False
509
+ FLOAT32_PRECISION : tl.constexpr = 'ieee'
510
+ IS_DIVISIBLE : tl.constexpr = False
511
+ SM_SCALE : tl.constexpr = 0.08838834764831845
512
+ GQA_SHARED_HEADS : tl.constexpr = 4
513
+ HAS_FULL_BLOCKS : tl.constexpr = True
514
+ QK_HEAD_DIM : tl.constexpr = 128
515
+ QK_HEAD_DIM_ROUNDED : tl.constexpr = 128
516
+ V_HEAD_DIM : tl.constexpr = 128
517
+ V_HEAD_DIM_ROUNDED : tl.constexpr = 128
518
+ SAFE_HEAD_DIM : tl.constexpr = True
519
+ USE_TMA : tl.constexpr = False
520
+ BLOCK_M : tl.constexpr = 128
521
+ BLOCK_N : tl.constexpr = 64
522
+ SPARSE_Q_BLOCK_SIZE : tl.constexpr = 128
523
+ SPARSE_KV_BLOCK_SIZE : tl.constexpr = 128
524
+ INDEX_DTYPE : tl.constexpr = tl.int32
525
+
526
+
527
+ SPARSE_KV_MULTIPLE: tl.constexpr = (SPARSE_KV_BLOCK_SIZE // BLOCK_N)
528
+ RCP_LN2: tl.constexpr = 1.44269504
529
+
530
+ if PRESCALE_QK:
531
+ q = (q * SM_SCALE * RCP_LN2).to(MATMUL_PRECISION)
532
+
533
+ kv_offset = 0
534
+
535
+ # loop over k, v and update accumulator until block_n_end
536
+ for start_n in range(block_n_start, block_n_end):
537
+ # Here IS_DIVISIBLE acts are the start_n = tl.multiple_of(start_n, BLOCK_N) from triton_fused_attention.
538
+ if IS_DIVISIBLE:
539
+ acc, l_i, m_i = forward_block_mn(
540
+ arg_Q, arg_K, arg_V, arg_LSE, arg_MAX, arg_KV_NUM_BLKS, arg_KV_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, out_ptr0, ks0, ks1, ks2,
541
+ q, K, V, desc_k, desc_v, Q_LEN, KV_LEN,
542
+ # accumulated values
543
+ acc, l_i, m_i,
544
+ # Offsets
545
+ off_z, off_h, offs_m, offs_n,
546
+ # Offsets needed for TMA loads
547
+ kv_start,
548
+ kv_offset,
549
+ MATMUL_PRECISION, RCP_LN2,
550
+ # Strides for K and V
551
+ stride_kk, stride_kn, stride_vn, stride_vk,
552
+ IS_FULL_BLOCKS,
553
+ )
554
+ else:
555
+ # Benchmark shows even we applied mod & mask to each block for non divisible seqlen,
556
+ # it's on par or slightly faster than only applying to the last block in fwd.
557
+ # However, we choose different strategy for bwd, where we only apply mod & mask
558
+ # to the last block because it's faster a lot.
559
+ acc, l_i, m_i = forward_block_mn(
560
+ arg_Q, arg_K, arg_V, arg_LSE, arg_MAX, arg_KV_NUM_BLKS, arg_KV_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, out_ptr0, ks0, ks1, ks2,
561
+ q, K, V, desc_k, desc_v, Q_LEN, KV_LEN,
562
+ # accumulated values
563
+ acc, l_i, m_i,
564
+ # Offsets
565
+ off_z, off_h, offs_m, offs_n,
566
+ # Offsets needed for TMA loads
567
+ kv_start,
568
+ kv_offset,
569
+ MATMUL_PRECISION, RCP_LN2,
570
+ # Strides for K and V
571
+ stride_kk, stride_kn, stride_vn, stride_vk,
572
+ IS_FULL_BLOCKS, CHECK_BLOCK_BOUNDARY=True,
573
+ )
574
+
575
+
576
+
577
+ offset = get_offset_for_next_block(
578
+ start_n, kv_indices, kv_num_blocks,
579
+ SPARSE_KV_BLOCK_SIZE, SPARSE_KV_MULTIPLE, BLOCK_N, BLOCKS_ARE_CONTIGUOUS
580
+ )
581
+
582
+ offs_n = offs_n + offset
583
+ kv_offset += offset
584
+
585
+
586
+ return acc, l_i, m_i
587
+ ''', device_str='cuda')
588
+
589
+
590
+ # kernel path: /workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sc/cscnwzzlpcjsqvndc4tlfwact2ecwdimqtwu2vya2cnto5t7c7pi.py
591
+ # Topologically Sorted Source Nodes: [lse_scaled], Original ATen: [aten.mul]
592
+ # Source node to ATen node mapping:
593
+ # lse_scaled => mul_9
594
+ # Graph fragment:
595
+ # %buf3 : Tensor = PlaceHolder[target=buf3]
596
+ # %mul_9 : Tensor "f32[1, 32, s37][32*Max(1, s37), Max(1, s37), 1]cuda:2"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%getitem_1, 0.6931471805599453), kwargs = {})
597
+ # return %mul_9
598
+ triton_poi_fused_mul_1 = async_compile.triton('triton_poi_fused_mul_1', '''
599
+ import triton
600
+ import triton.language as tl
601
+
602
+ from torch._inductor.runtime import triton_helpers, triton_heuristics
603
+ from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
604
+ from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
605
+ triton_helpers.set_driver_to_gpu()
606
+
607
+ @triton_heuristics.pointwise(
608
+ size_hints={'x': 4096},
609
+ filename=__file__,
610
+ triton_meta={'signature': {'in_ptr0': '*fp32', 'out_ptr0': '*fp32', 'ks0': 'i64', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=2, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]]}]},
611
+ inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_mul_1', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': 'B0E5936CA26D1BCD1B577D0B65F13FE7553DC941EB93358973E9F1902BC212C5', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False},
612
+ min_elem_per_thread=0
613
+ )
614
+ @triton.jit
615
+ def triton_poi_fused_mul_1(in_ptr0, out_ptr0, ks0, xnumel, XBLOCK : tl.constexpr):
616
+ xoffset = tl.program_id(0) * XBLOCK
617
+ xindex = xoffset + tl.arange(0, XBLOCK)[:]
618
+ xmask = xindex < xnumel
619
+ x2 = xindex
620
+ x0 = (xindex % ks0)
621
+ x1 = triton_helpers.div_floor_integer(xindex, ks0)
622
+ tmp0 = tl.load(in_ptr0 + (x2), xmask, eviction_policy='evict_last')
623
+ tmp1 = 0.6931471805599453
624
+ tmp2 = tmp0 * tmp1
625
+ tl.store(out_ptr0 + (x0 + x1*((1) * ((1) >= (ks0)) + (ks0) * ((ks0) > (1)))), tmp2, xmask)
626
+ ''', device_str='cuda')
627
+
628
+
629
+ async_compile.wait(globals())
630
+ del async_compile
631
+
632
+ class Runner:
633
+ def __init__(self, partitions):
634
+ self.partitions = partitions
635
+
636
+ def recursively_apply_fns(self, fns):
637
+ new_callables = []
638
+ for fn, c in zip(fns, self.partitions):
639
+ new_callables.append(fn(c))
640
+ self.partitions = new_callables
641
+
642
+ def call(self, args):
643
+ arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1, arg11_1, arg12_1, arg13_1, arg14_1, arg15_1 = args
644
+ args.clear()
645
+ s50 = arg0_1
646
+ s0 = arg2_1
647
+ s43 = arg4_1
648
+ s37 = arg7_1
649
+ s71 = arg8_1
650
+ assert_size_stride(arg1_1, (1, 32, s37, 128), (4096*s37, 128, 4096, 1))
651
+ assert_size_stride(arg3_1, (1, 8, s0, 128), (1024*s0, 128, 1024, 1))
652
+ assert_size_stride(arg5_1, (1, 8, s43, 128), (1024*s43, 128, 1024, 1))
653
+ assert_size_stride(arg6_1, (1, 1, 1, 1), (1, 1, 1, 1))
654
+ assert_size_stride(arg9_1, (1, 1, 1), (1, 1, 1))
655
+ assert_size_stride(arg10_1, (1, 1, 1), (1, 1, 1))
656
+ assert_size_stride(arg11_1, (1, 1, 1, 1), (1, 1, 1, 1))
657
+ assert_size_stride(arg12_1, (1, 1, 1), (1, 1, 1))
658
+ assert_size_stride(arg13_1, (1, 1, 1, 1), (1, 1, 1, 1))
659
+ assert_size_stride(arg14_1, (1, 1, 1), (1, 1, 1))
660
+ assert_size_stride(arg15_1, (1, 1, 1, 1), (1, 1, 1, 1))
661
+ with torch.cuda._DeviceGuard(2):
662
+ torch.cuda.set_device(2)
663
+ buf0 = empty_strided_cuda((1, 32, s37), (32*s37, s37, 1), torch.float32)
664
+ buf1 = empty_strided_cuda((1, 32, s37), (32*s37, s37, 1), torch.float32)
665
+ buf2 = empty_strided_cuda((1, 32, s37, 128), (4096*s37, 128, 4096, 1), torch.bfloat16)
666
+ # Topologically Sorted Source Nodes: [flex_attention], Original ATen: []
667
+ stream2 = get_raw_stream(2)
668
+ triton_tem_fused_0.run(arg1_1, arg3_1, arg5_1, buf0, buf1, arg9_1, arg6_1, arg10_1, arg11_1, buf2, s37, s0, s43, (127 + s37) // 128, 1, 32, stream=stream2)
669
+ del arg10_1
670
+ del arg11_1
671
+ del arg1_1
672
+ del arg3_1
673
+ del arg5_1
674
+ del arg6_1
675
+ del arg9_1
676
+ del buf1
677
+ buf5 = empty_strided_cuda((1, 32, s37), (32*max(1, s37), max(1, s37), 1), torch.float32)
678
+ # Topologically Sorted Source Nodes: [lse_scaled], Original ATen: [aten.mul]
679
+ triton_poi_fused_mul_1_xnumel = 32*s37
680
+ stream2 = get_raw_stream(2)
681
+ triton_poi_fused_mul_1.run(buf0, buf5, s37, triton_poi_fused_mul_1_xnumel, stream=stream2)
682
+ del buf0
683
+ return (buf2, buf5, )
684
+
685
+ runner = Runner(partitions=[])
686
+ call = runner.call
687
+ recursively_apply_fns = runner.recursively_apply_fns
688
+
689
+
690
+ def benchmark_compiled_module(times=10, repeat=10):
691
+ from torch._dynamo.testing import rand_strided
692
+ from torch._inductor.utils import print_performance
693
+ arg0_1 = 128
694
+ arg1_1 = rand_strided((1, 32, 128, 128), (524288, 128, 4096, 1), device='cuda:2', dtype=torch.bfloat16)
695
+ arg2_1 = 128
696
+ arg3_1 = rand_strided((1, 8, 128, 128), (131072, 128, 1024, 1), device='cuda:2', dtype=torch.bfloat16)
697
+ arg4_1 = 128
698
+ arg5_1 = rand_strided((1, 8, 128, 128), (131072, 128, 1024, 1), device='cuda:2', dtype=torch.bfloat16)
699
+ arg6_1 = rand_strided((1, 1, 1, 1), (1, 1, 1, 1), device='cuda:2', dtype=torch.int32)
700
+ arg7_1 = 128
701
+ arg8_1 = 128
702
+ arg9_1 = rand_strided((1, 1, 1), (1, 1, 1), device='cuda:2', dtype=torch.int32)
703
+ arg10_1 = rand_strided((1, 1, 1), (1, 1, 1), device='cuda:2', dtype=torch.int32)
704
+ arg11_1 = rand_strided((1, 1, 1, 1), (1, 1, 1, 1), device='cuda:2', dtype=torch.int32)
705
+ arg12_1 = rand_strided((1, 1, 1), (1, 1, 1), device='cuda:2', dtype=torch.int32)
706
+ arg13_1 = rand_strided((1, 1, 1, 1), (1, 1, 1, 1), device='cuda:2', dtype=torch.int32)
707
+ arg14_1 = rand_strided((1, 1, 1), (1, 1, 1), device='cuda:2', dtype=torch.int32)
708
+ arg15_1 = rand_strided((1, 1, 1, 1), (1, 1, 1, 1), device='cuda:2', dtype=torch.int32)
709
+ fn = lambda: call([arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1, arg11_1, arg12_1, arg13_1, arg14_1, arg15_1])
710
+ return print_performance(fn, times=times, repeat=repeat)
711
+
712
+
713
+ if __name__ == "__main__":
714
+ from torch._inductor.wrapper_benchmark import compiled_module_main
715
+ compiled_module_main('None', benchmark_compiled_module)
progress/SpecForge/cache/compiled_kernels/2n/c2nooi7ekpz4qvmvghggbegd5cyfspb27jmq2snbi26zbrpoibnx.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import triton
3
+ import triton.language as tl
4
+
5
+ from torch._inductor.runtime import triton_helpers, triton_heuristics
6
+ from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
7
+ from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
8
+ triton_helpers.set_driver_to_gpu()
9
+
10
+ @triton_heuristics.persistent_reduction(
11
+ size_hints={'x': 4096, 'r0_': 128},
12
+ reduction_hint=ReductionHint.INNER,
13
+ filename=__file__,
14
+ triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'in_ptr2': '*fp32', 'out_ptr1': '*fp32', 'ks0': 'i64', 'xnumel': 'i32', 'r0_numel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=1, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]]}]},
15
+ inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_per_fused_mul_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': None, 'num_load': 3, 'num_reduction': 1, 'backend_hash': 'B0E5936CA26D1BCD1B577D0B65F13FE7553DC941EB93358973E9F1902BC212C5', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False}
16
+ )
17
+ @triton.jit
18
+ def triton_per_fused_mul_0(in_ptr0, in_ptr1, in_ptr2, out_ptr1, ks0, xnumel, r0_numel, XBLOCK : tl.constexpr):
19
+ r0_numel = 128
20
+ R0_BLOCK: tl.constexpr = 128
21
+ rnumel = r0_numel
22
+ RBLOCK: tl.constexpr = R0_BLOCK
23
+ xoffset = tl.program_id(0) * XBLOCK
24
+ xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
25
+ xmask = xindex < xnumel
26
+ r0_index = tl.arange(0, R0_BLOCK)[None, :]
27
+ r0_offset = 0
28
+ r0_mask = tl.full([XBLOCK, R0_BLOCK], True, tl.int1)
29
+ roffset = r0_offset
30
+ rindex = r0_index
31
+ r0_2 = r0_index
32
+ x0 = (xindex % ks0)
33
+ x1 = triton_helpers.div_floor_integer(xindex, ks0)
34
+ x3 = xindex
35
+ tmp0 = tl.load(in_ptr0 + (r0_2 + 128*x1 + 4096*x0), xmask, other=0.0).to(tl.float32)
36
+ tmp1 = tl.load(in_ptr1 + (r0_2 + 128*x0 + 128*x1*((1) * ((1) >= (ks0)) + (ks0) * ((ks0) > (1)))), xmask, other=0.0).to(tl.float32)
37
+ tmp8 = tl.load(in_ptr2 + (x0 + x1*((1) * ((1) >= (ks0)) + (ks0) * ((ks0) > (1)))), xmask, eviction_policy='evict_last')
38
+ tmp2 = tmp0 * tmp1
39
+ tmp3 = tl.broadcast_to(tmp2, [XBLOCK, R0_BLOCK])
40
+ tmp5 = tl.where(xmask, tmp3, 0)
41
+ tmp6 = tl.sum(tmp5, 1)[:, None].to(tl.float32)
42
+ tmp7 = tmp6.to(tl.float32)
43
+ tmp9 = 0.6931471805599453
44
+ tmp10 = tmp8 * tmp9
45
+ tmp11 = 1.4426950408889634
46
+ tmp12 = tmp10 * tmp11
47
+ tmp13 = tmp7 - tmp12
48
+ tl.store(out_ptr1 + (x3), tmp13, xmask)
progress/SpecForge/cache/compiled_kernels/2n/d17ff4e7bb44e5ae89a267ef332bb7c074804ce0942fc0694c3ef15b05f7854a.best_config ADDED
@@ -0,0 +1 @@
 
 
1
+ {"XBLOCK": 8, "num_warps": 8, "num_stages": 1, "configs_hash": "22b8c9e89632e6687ce26aaad980a76bbf5ee683fff317f3a6d7989c7528ff63", "found_by_coordesc": false, "time_taken_ms": 18, "triton_cache_hash": "WJHIHLPATQZBKSQZSWJ5BD3ABYGFUF3YD6VF633RGCNWMMKVXCCA"}
progress/SpecForge/cache/compiled_kernels/2o/c2oashzxz74kzyuwo67tuhk32cike37ysabriftachdv7lf2qxgs.py ADDED
@@ -0,0 +1,799 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import triton
3
+ import triton.language as tl
4
+
5
+ from torch._inductor.runtime import triton_helpers, triton_heuristics
6
+ from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
7
+ from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
8
+
9
+ @triton_heuristics.template(
10
+
11
+ num_stages=3,
12
+ num_warps=8,
13
+ triton_meta={'signature': {'arg_Q': '*bf16', 'arg_K': '*bf16', 'arg_V': '*bf16', 'arg_LSE': '*fp32', 'arg_DELTA': '*fp32', 'arg_DO': '*bf16', 'arg_DQ': '*bf16', 'arg_DV': '*bf16', 'arg_KV_NUM_BLKS': '*i32', 'arg_KV_IDX': '*i32', 'arg_Q_NUM_BLKS': '*i32', 'arg_Q_IDX': '*i32', 'arg_FULL_KV_NUM_BLKS': '*i32', 'arg_FULL_KV_IDX': '*i32', 'arg_FULL_Q_NUM_BLKS': '*i32', 'arg_FULL_Q_IDX': '*i32', 'out_ptr0': '*bf16', 'ks0': 'i32', 'ks1': 'i32', 'ks2': 'i32', 'ks3': 'i32', 'ks4': 'i32', 'ks5': 'i32', 'ks6': 'i32', 'ks7': 'i32'}, 'device': DeviceProperties(type='cuda', index=2, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]], (7,): [['tt.divisibility', 16]], (8,): [['tt.divisibility', 16]], (9,): [['tt.divisibility', 16]], (10,): [['tt.divisibility', 16]], (11,): [['tt.divisibility', 16]], (12,): [['tt.divisibility', 16]], (13,): [['tt.divisibility', 16]], (14,): [['tt.divisibility', 16]], (15,): [['tt.divisibility', 16]], (16,): [['tt.divisibility', 16]]}]},
14
+ inductor_meta={'kernel_name': 'triton_tem_fused_mul_1', 'backend_hash': 'B0E5936CA26D1BCD1B577D0B65F13FE7553DC941EB93358973E9F1902BC212C5', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'grid_type': 'FixedGrid', 'fixed_grid': ['_grid_0', '_grid_1', '_grid_2'], 'extra_launcher_args': ['_grid_0', '_grid_1', '_grid_2'], 'config_args': {'PRESCALE_QK': False, 'ROWS_GUARANTEED_SAFE': False, 'BLOCKS_ARE_CONTIGUOUS': False, 'WRITE_DQ': True, 'OUTPUT_LOGSUMEXP': True, 'OUTPUT_MAX': False, 'FLOAT32_PRECISION': "'tf32'", 'IS_DIVISIBLE': False, 'SM_SCALE': 0.08838834764831845, 'GQA_SHARED_HEADS': 4, 'HAS_FULL_BLOCKS': True, 'QK_HEAD_DIM': 128, 'QK_HEAD_DIM_ROUNDED': 128, 'V_HEAD_DIM': 128, 'V_HEAD_DIM_ROUNDED': 128, 'SAFE_HEAD_DIM': True, 'BLOCK_M1': 64, 'BLOCK_N1': 128, 'BLOCK_M2': 128, 'BLOCK_N2': 64, 'SPARSE_Q_BLOCK_SIZE': 128, 'SPARSE_KV_BLOCK_SIZE': 128}},
15
+
16
+ )
17
+ @triton.jit
18
+ def triton_tem_fused_mul_1(arg_Q, arg_K, arg_V, arg_LSE, arg_DELTA, arg_DO, arg_DQ, arg_DV, arg_KV_NUM_BLKS, arg_KV_IDX, arg_Q_NUM_BLKS, arg_Q_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, arg_FULL_Q_NUM_BLKS, arg_FULL_Q_IDX, out_ptr0, ks0, ks1, ks2, ks3, ks4, ks5, ks6, ks7):
19
+ PRESCALE_QK : tl.constexpr = False
20
+ ROWS_GUARANTEED_SAFE : tl.constexpr = False
21
+ BLOCKS_ARE_CONTIGUOUS : tl.constexpr = False
22
+ WRITE_DQ : tl.constexpr = True
23
+ OUTPUT_LOGSUMEXP : tl.constexpr = True
24
+ OUTPUT_MAX : tl.constexpr = False
25
+ FLOAT32_PRECISION : tl.constexpr = 'tf32'
26
+ IS_DIVISIBLE : tl.constexpr = False
27
+ SM_SCALE : tl.constexpr = 0.08838834764831845
28
+ GQA_SHARED_HEADS : tl.constexpr = 4
29
+ HAS_FULL_BLOCKS : tl.constexpr = True
30
+ QK_HEAD_DIM : tl.constexpr = 128
31
+ QK_HEAD_DIM_ROUNDED : tl.constexpr = 128
32
+ V_HEAD_DIM : tl.constexpr = 128
33
+ V_HEAD_DIM_ROUNDED : tl.constexpr = 128
34
+ SAFE_HEAD_DIM : tl.constexpr = True
35
+ BLOCK_M1 : tl.constexpr = 64
36
+ BLOCK_N1 : tl.constexpr = 128
37
+ BLOCK_M2 : tl.constexpr = 128
38
+ BLOCK_N2 : tl.constexpr = 64
39
+ SPARSE_Q_BLOCK_SIZE : tl.constexpr = 128
40
+ SPARSE_KV_BLOCK_SIZE : tl.constexpr = 128
41
+ INDEX_DTYPE : tl.constexpr = tl.int32
42
+ Q = arg_Q
43
+ K = arg_K
44
+ V = arg_V
45
+ LSE = arg_LSE
46
+ DELTA = arg_DELTA
47
+ DO = arg_DO
48
+ DQ = arg_DQ
49
+ DV = arg_DV
50
+ KV_NUM_BLKS = arg_KV_NUM_BLKS
51
+ KV_IDX = arg_KV_IDX
52
+ Q_NUM_BLKS = arg_Q_NUM_BLKS
53
+ Q_IDX = arg_Q_IDX
54
+ FULL_KV_NUM_BLKS = arg_FULL_KV_NUM_BLKS
55
+ FULL_KV_IDX = arg_FULL_KV_IDX
56
+ FULL_Q_NUM_BLKS = arg_FULL_Q_NUM_BLKS
57
+ FULL_Q_IDX = arg_FULL_Q_IDX
58
+
59
+ # Sub notation for this kernel:
60
+ #
61
+ # Q: Query, K: Key, V: Value
62
+ # LSE: logsumexp (logsumexp is always stored in fp32 regardless of the input dtype)
63
+ # DELTA: Precomputed sum(OUT*DO, axis=-1)
64
+ # DO: Derivative of Output, DQ: Derivative of Query, DV: Derivative of Value
65
+ # DK: Derivative of Key, is the written to via the store_output call due to some limitations with
66
+ # inductor codegen
67
+ # M: Number of queries, N: Number of keys/values
68
+ # QK_HEAD_DIM: The dimension of the query and key embeddings
69
+ # V_HEAD_DIM: The dimension of the value embeddings
70
+ # z: Batch size, h: Number of heads, m: Number of queries or keys/values, d: Head dim
71
+ # GQA_SHARED_HEADS: number of query heads sharing one kv head in GQA setups.
72
+ # (Modifiable) Performance tuning options
73
+ # BLOCK_M1: when calculating DK & DV, iterate over BLOCK_M1 across the seqlen dim of Q in each thread block.
74
+ # BLOCK_N1: when calculating DK & DV, the thread block size across the seqlen dim of K/V.
75
+ # BLOCK_M2: when calculating DQ, the thread block size across the seqlen dim of Q.
76
+ # BLOCK_N2: when calculating DQ, iterate over BLOCK_N2 across the seqlen dim of K/V in each thread block.
77
+ #
78
+ # The following FULL_* and PARTIAL_* is defined in the block sparse mask grid, rather than the thread block grid.
79
+ # KV_NUM_BLKS: The number of KV blocks (that may or may not require masking) for each query.
80
+ # KV_IDX: The indices of KV blocks (that may or may not require masking) for each query.
81
+ # Q_NUM_BLKS: The number of Q blocks (that may or may not require masking) for each query.
82
+ # Q_IDX: The indices of Q blocks (that may or may not require masking) for each query.
83
+ # FULL_KV_NUM_BLKS: The number of fully unmasked KV blocks (so we don't need masking) for each query.
84
+ # FULL_KV_IDX: The indices of fully unmasked KV blocks (so we don't need masking) for each query.
85
+ # FULL_Q_NUM_BLKS: The number of fully unmasked Q blocks (so we don't need masking) for each query.
86
+ # FULL_Q_IDX: The indices of fully unmasked Q blocks (so we don't need masking) for each query.
87
+
88
+ # The below are kernel options that can be applied for certain score_mods,
89
+ # or involve a numerics vs. perf tradeoff
90
+ # PRESCALE_QK: Whether to pre-scale QK by 1/sqrt(d) and change of base. Has
91
+ # about 20% more numerical error, but slightly faster.
92
+
93
+ # Define strides of inputs
94
+ stride_qz, stride_qh, stride_qm, stride_qd = 4096*ks0, 128, 4096, 1
95
+ stride_kz, stride_kh, stride_kn, stride_kd = 1024*ks1, 128, 1024, 1
96
+ stride_vz, stride_vh, stride_vn, stride_vd = 1024*ks1, 128, 1024, 1
97
+ stride_doz, stride_doh, stride_dom, stride_dod = 4096*((1) * ((1) >= (ks0)) + (ks0) * ((ks0) > (1))), 128*((1) * ((1) >= (ks0)) + (ks0) * ((ks0) > (1))), 128, 1
98
+
99
+ stride_dqz, stride_dqh, stride_dqm, stride_dqd = 4096*ks0, 128, 4096, 1
100
+ stride_dvz, stride_dvh, stride_dvm, stride_dvd = 1024*ks1, 128, 1024, 1
101
+
102
+ ZQ = 1
103
+ HQ = 32
104
+ HKV = 8
105
+ Q_LEN = ks0
106
+ ZKV = 1
107
+ KV_LEN = ks1
108
+
109
+ MATMUL_PRECISION = Q.dtype.element_ty
110
+
111
+ pid = tl.program_id(0).to(INDEX_DTYPE)
112
+ NUM_KV_BLOCKS = tl.cdiv(KV_LEN, BLOCK_N1)
113
+ NUM_Q_BLOCKS = tl.cdiv(Q_LEN, BLOCK_M2)
114
+
115
+ off_zq = tl.program_id(1).to(INDEX_DTYPE) # q batch idx
116
+ off_hkv = tl.program_id(2).to(INDEX_DTYPE) # kv head idx
117
+ off_zkv = off_zq % ZKV # kv batch idx
118
+
119
+ SPARSE_Z = 1
120
+ SPARSE_HQ = 1
121
+
122
+ sparse_idx_z = off_zq % SPARSE_Z
123
+
124
+ k_adj = (stride_kh * off_hkv + stride_kz * off_zkv).to(tl.int64)
125
+ v_adj = (stride_vh * off_hkv + stride_vz * off_zkv).to(tl.int64)
126
+ # first compute broadcasted dv of shape [Bq, Hkv, KV_LEN, V_HEAD_DIM]
127
+ # then reduce to dv of shape [Bkv, Hkv, KV_LEN, V_HEAD_DIM]
128
+ dv_adj = (stride_dvh * off_hkv + stride_dvz * off_zq).to(tl.int64)
129
+
130
+ # offset K, V, DV pointers for batch/kv-head
131
+ K += k_adj
132
+ V += v_adj
133
+ DV += dv_adj
134
+
135
+ RCP_LN2 = 1.44269504
136
+ offs_k = tl.arange(0, QK_HEAD_DIM_ROUNDED)
137
+ offs_v = tl.arange(0, V_HEAD_DIM_ROUNDED)
138
+
139
+ if pid >= NUM_KV_BLOCKS:
140
+ off_pid = pid - NUM_KV_BLOCKS
141
+ # THIS BLOCK DOES DQ
142
+ SPARSE_Q_MULTIPLE = (SPARSE_Q_BLOCK_SIZE // BLOCK_M2)
143
+ SPARSE_KV_MULTIPLE = (SPARSE_KV_BLOCK_SIZE // BLOCK_N2)
144
+ off_hq2 = off_pid // NUM_Q_BLOCKS + off_hkv * GQA_SHARED_HEADS
145
+ start_m2_block = off_pid % NUM_Q_BLOCKS
146
+ off_pid_mask = start_m2_block // SPARSE_Q_MULTIPLE
147
+ stride_kv_num_blks_h = ks2
148
+ stride_kv_idx_h = ks3*ks4
149
+ stride_kv_idx_m = ks4
150
+
151
+ sparse_idx_hq2 = off_hq2 % SPARSE_HQ
152
+ sparse_hz_offset = sparse_idx_z * SPARSE_HQ + sparse_idx_hq2
153
+
154
+ sparse_kv_num_blks_offset = sparse_hz_offset * stride_kv_num_blks_h + off_pid_mask
155
+ sparse_kv_idx_offset = sparse_hz_offset * stride_kv_idx_h + off_pid_mask * stride_kv_idx_m # noqa: B950
156
+
157
+ # Offset Q, DQ, DO, DELTA & LSE. These inputs are offsetted by query heads.
158
+ q_adj2 = (stride_qh * off_hq2 + stride_qz * off_zq).to(tl.int64)
159
+ do_adj2 = (stride_doh * off_hq2 + stride_doz * off_zq).to(tl.int64)
160
+ dq_adj2 = (stride_dqh * off_hq2 + stride_dqz * off_zq).to(tl.int64)
161
+ off_chz2 = ((off_zq * HQ + off_hq2) * Q_LEN).to(tl.int64)
162
+
163
+ Q2 = Q + q_adj2
164
+ DO2 = DO + do_adj2
165
+ # TODO: This does not work if DQ is not the same layout as Q (for example,
166
+ # if Q is broadcasted)
167
+ DQ2 = DQ + dq_adj2
168
+ LSE2 = LSE + off_chz2
169
+ DELTA2 = DELTA + off_chz2
170
+
171
+ # dq = tl.zeros([BLOCK_M2, QK_HEAD_DIM], dtype=tl.float32)
172
+ dq = tl.zeros([BLOCK_M2, QK_HEAD_DIM_ROUNDED], dtype=tl.float32)
173
+
174
+ start_m2 = start_m2_block * BLOCK_M2
175
+ offs_m2 = start_m2 + tl.arange(0, BLOCK_M2)
176
+
177
+ # load Q and do: they stay in SRAM throughout the inner loop.
178
+ q = load_checked_2d(Q2, offs_m2, offs_k, stride_qm, stride_qd, IS_DIVISIBLE, SAFE_HEAD_DIM, Q_LEN, QK_HEAD_DIM)
179
+ do = load_checked_2d(DO2, offs_m2, offs_v, stride_dom, stride_dod, IS_DIVISIBLE, SAFE_HEAD_DIM, Q_LEN, V_HEAD_DIM)
180
+
181
+ if PRESCALE_QK:
182
+ q = (q * SM_SCALE * RCP_LN2).to(MATMUL_PRECISION)
183
+
184
+ if IS_DIVISIBLE:
185
+ Di = tl.load(DELTA2 + offs_m2)
186
+ lse = tl.load(LSE2 + offs_m2)
187
+ else:
188
+ Di = tl.load(DELTA2 + offs_m2, mask=offs_m2 < Q_LEN)
189
+ lse = tl.load(LSE2 + offs_m2, mask=offs_m2 < Q_LEN)
190
+ lse = tl.where(lse == -float("inf"), 0.0, lse)
191
+ lse = lse[:, None]
192
+
193
+ # ~~~~~~~~~~~ fully unmasked blocks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
194
+ # KV_IDX and KV_NUM_BLKS are always contiguous.
195
+ kv_indices = KV_IDX + sparse_kv_idx_offset
196
+ kv_start = tl.load(kv_indices) * SPARSE_KV_BLOCK_SIZE # first kv block we're loading
197
+ sparse_kv_num_blocks = tl.load(KV_NUM_BLKS + sparse_kv_num_blks_offset)
198
+
199
+ offs_n2 = kv_start + tl.arange(0, BLOCK_N2)
200
+ dq = bwd_dq_inner(
201
+ arg_Q, arg_K, arg_V, arg_LSE, arg_DELTA, arg_DO, arg_DQ, arg_DV, arg_KV_NUM_BLKS, arg_KV_IDX, arg_Q_NUM_BLKS, arg_Q_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, arg_FULL_Q_NUM_BLKS, arg_FULL_Q_IDX, out_ptr0, ks0, ks1, ks2, ks3, ks4, ks5, ks6, ks7,
202
+ K, V,
203
+ dq, q, do, Di, lse,
204
+ off_zq, off_hq2, offs_m2, offs_n2,
205
+ stride_kn, stride_kd, stride_vn, stride_vd,
206
+ kv_indices, sparse_kv_num_blocks,
207
+ MATMUL_PRECISION,
208
+ IS_FULL_BLOCKS=False,
209
+ )
210
+
211
+ if HAS_FULL_BLOCKS:
212
+ # ~~~~~~~~~~~ partial unmasked blocks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
213
+ # FULL_KV_IDX and FULL_KV_NUM_BLKS are always contiguous.
214
+ kv_indices = FULL_KV_IDX + sparse_kv_idx_offset
215
+ kv_start = tl.load(kv_indices) * SPARSE_KV_BLOCK_SIZE # first kv block we're loading
216
+ sparse_kv_num_blocks = tl.load(FULL_KV_NUM_BLKS + sparse_kv_num_blks_offset)
217
+
218
+ offs_n2 = kv_start + tl.arange(0, BLOCK_N2)
219
+ dq = bwd_dq_inner(
220
+ arg_Q, arg_K, arg_V, arg_LSE, arg_DELTA, arg_DO, arg_DQ, arg_DV, arg_KV_NUM_BLKS, arg_KV_IDX, arg_Q_NUM_BLKS, arg_Q_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, arg_FULL_Q_NUM_BLKS, arg_FULL_Q_IDX, out_ptr0, ks0, ks1, ks2, ks3, ks4, ks5, ks6, ks7,
221
+ K, V,
222
+ dq, q, do, Di, lse,
223
+ off_zq, off_hq2, offs_m2, offs_n2,
224
+ stride_kn, stride_kd, stride_vn, stride_vd,
225
+ kv_indices, sparse_kv_num_blocks,
226
+ MATMUL_PRECISION,
227
+ IS_FULL_BLOCKS=True,
228
+ )
229
+
230
+ # Write back dQ.
231
+ dq_ptrs = DQ2 + offs_m2[:, None] * stride_dqm + offs_k[None, :] * stride_dqd
232
+ dq *= SM_SCALE
233
+ if IS_DIVISIBLE and SAFE_HEAD_DIM:
234
+ tl.store(dq_ptrs, dq)
235
+ else:
236
+ tl.store(dq_ptrs, dq, mask=(offs_m2[:, None] < Q_LEN) & (offs_k[None, :] < QK_HEAD_DIM))
237
+ else:
238
+ # THIS BLOCK DOES DK & DV
239
+ SPARSE_Q_MULTIPLE = (SPARSE_Q_BLOCK_SIZE // BLOCK_M1)
240
+ SPARSE_KV_MULTIPLE = (SPARSE_KV_BLOCK_SIZE // BLOCK_N1)
241
+
242
+ pid_mask = pid // SPARSE_KV_MULTIPLE
243
+
244
+ stride_q_num_blks_h = ks5
245
+ stride_q_idx_h = ks6*ks7
246
+ stride_q_idx_n = ks6
247
+
248
+
249
+ dv = tl.zeros([BLOCK_N1, V_HEAD_DIM_ROUNDED], dtype=tl.float32)
250
+ dk = tl.zeros([BLOCK_N1, QK_HEAD_DIM_ROUNDED], dtype=tl.float32)
251
+
252
+ start_n1 = pid * BLOCK_N1
253
+ offs_n1 = start_n1 + tl.arange(0, BLOCK_N1)
254
+
255
+ # load K and V: they stay in SRAM throughout the inner loop.
256
+ k = load_checked_2d(K, offs_n1, offs_k, stride_kn, stride_kd, IS_DIVISIBLE, SAFE_HEAD_DIM, KV_LEN, QK_HEAD_DIM)
257
+ v = load_checked_2d(V, offs_n1, offs_v, stride_vn, stride_vd, IS_DIVISIBLE, SAFE_HEAD_DIM, KV_LEN, V_HEAD_DIM)
258
+
259
+ if PRESCALE_QK:
260
+ k = (k * SM_SCALE * RCP_LN2).to(MATMUL_PRECISION)
261
+
262
+ for off_g in range(0, GQA_SHARED_HEADS):
263
+ off_hq1 = off_hkv * GQA_SHARED_HEADS + off_g
264
+
265
+ # Offset Q, DQ, DO, DELTA & LSE. These inputs are offsetted by query heads.
266
+ q_adj1 = (stride_qh * off_hq1 + stride_qz * off_zq).to(tl.int64)
267
+ do_adj1 = (stride_doh * off_hq1 + stride_doz * off_zq).to(tl.int64)
268
+ dq_adj1 = (stride_dqh * off_hq1 + stride_dqz * off_zq).to(tl.int64)
269
+ off_chz1 = ((off_zq * HQ + off_hq1) * Q_LEN).to(tl.int64)
270
+
271
+ Q1 = Q + q_adj1
272
+ DO1 = DO + do_adj1
273
+ # TODO: This does not work if DQ is not the same layout as Q (for example,
274
+ # if Q is broadcasted)
275
+ LSE1 = LSE + off_chz1
276
+ DELTA1 = DELTA + off_chz1
277
+
278
+ sparse_idx_hq1 = off_hq1 % SPARSE_HQ
279
+ sparse_hz_offset = sparse_idx_z * SPARSE_HQ + sparse_idx_hq1
280
+
281
+ sparse_q_num_blks_offset = sparse_hz_offset * stride_q_num_blks_h + pid_mask
282
+ sparse_q_idx_offset = sparse_hz_offset * stride_q_idx_h + pid_mask * stride_q_idx_n # noqa: B950
283
+
284
+ # ~~~~~~~~~~~~~~~ fully unmasked blocks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
285
+ # Q_IDX and Q_NUM_BLKS are always contiguous.
286
+ q_indices = Q_IDX + sparse_q_idx_offset
287
+ q_start = tl.load(q_indices) * SPARSE_Q_BLOCK_SIZE # first q block we're loading
288
+ sparse_q_num_blocks = tl.load(Q_NUM_BLKS + sparse_q_num_blks_offset)
289
+
290
+ offs_m1 = q_start + tl.arange(0, BLOCK_M1)
291
+ dk, dv = bwd_dkdv_inner(
292
+ arg_Q, arg_K, arg_V, arg_LSE, arg_DELTA, arg_DO, arg_DQ, arg_DV, arg_KV_NUM_BLKS, arg_KV_IDX, arg_Q_NUM_BLKS, arg_Q_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, arg_FULL_Q_NUM_BLKS, arg_FULL_Q_IDX, out_ptr0, ks0, ks1, ks2, ks3, ks4, ks5, ks6, ks7,
293
+ Q1, DO1, DELTA1, LSE1,
294
+ dk, dv, k, v,
295
+ off_zq, off_hq1, offs_n1, offs_m1,
296
+ stride_qm, stride_qd, stride_dom, stride_dod,
297
+ q_indices, sparse_q_num_blocks,
298
+ MATMUL_PRECISION,
299
+ IS_FULL_BLOCKS=False,
300
+ )
301
+
302
+
303
+ if HAS_FULL_BLOCKS:
304
+ # ~~~~~~~~~~~~~~~ fully unmasked blocks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
305
+ # FULL_Q_IDX and FULL_Q_NUM_BLKS are always contiguous.
306
+ q_indices = FULL_Q_IDX + sparse_q_idx_offset
307
+ q_start = tl.load(q_indices) * SPARSE_Q_BLOCK_SIZE # first q block we're loading
308
+ sparse_q_num_blocks = tl.load(FULL_Q_NUM_BLKS + sparse_q_num_blks_offset)
309
+
310
+ offs_m1 = q_start + tl.arange(0, BLOCK_M1)
311
+ dk, dv = bwd_dkdv_inner(
312
+ arg_Q, arg_K, arg_V, arg_LSE, arg_DELTA, arg_DO, arg_DQ, arg_DV, arg_KV_NUM_BLKS, arg_KV_IDX, arg_Q_NUM_BLKS, arg_Q_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, arg_FULL_Q_NUM_BLKS, arg_FULL_Q_IDX, out_ptr0, ks0, ks1, ks2, ks3, ks4, ks5, ks6, ks7,
313
+ Q1, DO1, DELTA1, LSE1,
314
+ dk, dv, k, v,
315
+ off_zq, off_hq1, offs_n1, offs_m1,
316
+ stride_qm, stride_qd, stride_dom, stride_dod,
317
+ q_indices, sparse_q_num_blocks,
318
+ MATMUL_PRECISION,
319
+ IS_FULL_BLOCKS=True,
320
+ )
321
+
322
+ # Write back dV and dK.
323
+ dv_ptrs = DV + offs_n1[:, None] * stride_dvm + offs_v[None, :] * stride_dvd
324
+
325
+ index_n = offs_n1[:, None]
326
+ index_k = offs_k[None, :]
327
+ index_v = offs_v[None, :]
328
+
329
+ if IS_DIVISIBLE and SAFE_HEAD_DIM:
330
+ tl.store(dv_ptrs, dv)
331
+ else:
332
+ tl.store(dv_ptrs, dv, mask=(index_n < KV_LEN) & (index_v < V_HEAD_DIM))
333
+
334
+ dk *= SM_SCALE
335
+
336
+ if SAFE_HEAD_DIM:
337
+ mask = index_n < KV_LEN
338
+ else:
339
+ mask = (index_n < KV_LEN) & (index_k < QK_HEAD_DIM)
340
+
341
+ # first compute broadcasted dk of shape [Bq, Hkv, KV_LEN, V_HEAD_DIM]
342
+ # then reduce to dk of shape [Bkv, Hkv, KV_LEN, V_HEAD_DIM]
343
+ tl.static_assert(dk.shape == [BLOCK_N1, QK_HEAD_DIM_ROUNDED])
344
+ xindex = index_k + 128*index_n + 128*off_hkv*ks1 + 1024*off_zq*ks1
345
+ tl.store(out_ptr0 + (tl.broadcast_to(index_k + 128*off_hkv + 1024*index_n, dk.shape)), dk, mask)
346
+
347
+ @triton.jit
348
+ def bwd_dq_inner(
349
+ arg_Q, arg_K, arg_V, arg_LSE, arg_DELTA, arg_DO, arg_DQ, arg_DV, arg_KV_NUM_BLKS, arg_KV_IDX, arg_Q_NUM_BLKS, arg_Q_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, arg_FULL_Q_NUM_BLKS, arg_FULL_Q_IDX, out_ptr0, ks0, ks1, ks2, ks3, ks4, ks5, ks6, ks7,
350
+ K, V, # pointers
351
+ dq, q, do, Di, lse,
352
+ off_z, off_hq, offs_m2, offs_n2,
353
+ stride_kn, stride_kd, stride_vn, stride_vd,
354
+ kv_indices, sparse_kv_num_blocks,
355
+ MATMUL_PRECISION,
356
+ IS_FULL_BLOCKS,
357
+ ):
358
+ PRESCALE_QK : tl.constexpr = False
359
+ ROWS_GUARANTEED_SAFE : tl.constexpr = False
360
+ BLOCKS_ARE_CONTIGUOUS : tl.constexpr = False
361
+ WRITE_DQ : tl.constexpr = True
362
+ OUTPUT_LOGSUMEXP : tl.constexpr = True
363
+ OUTPUT_MAX : tl.constexpr = False
364
+ FLOAT32_PRECISION : tl.constexpr = 'tf32'
365
+ IS_DIVISIBLE : tl.constexpr = False
366
+ SM_SCALE : tl.constexpr = 0.08838834764831845
367
+ GQA_SHARED_HEADS : tl.constexpr = 4
368
+ HAS_FULL_BLOCKS : tl.constexpr = True
369
+ QK_HEAD_DIM : tl.constexpr = 128
370
+ QK_HEAD_DIM_ROUNDED : tl.constexpr = 128
371
+ V_HEAD_DIM : tl.constexpr = 128
372
+ V_HEAD_DIM_ROUNDED : tl.constexpr = 128
373
+ SAFE_HEAD_DIM : tl.constexpr = True
374
+ BLOCK_M1 : tl.constexpr = 64
375
+ BLOCK_N1 : tl.constexpr = 128
376
+ BLOCK_M2 : tl.constexpr = 128
377
+ BLOCK_N2 : tl.constexpr = 64
378
+ SPARSE_Q_BLOCK_SIZE : tl.constexpr = 128
379
+ SPARSE_KV_BLOCK_SIZE : tl.constexpr = 128
380
+ INDEX_DTYPE : tl.constexpr = tl.int32
381
+
382
+ SPARSE_KV_MULTIPLE: tl.constexpr = (SPARSE_KV_BLOCK_SIZE // BLOCK_N2)
383
+ RCP_LN2: tl.constexpr = 1.44269504
384
+ Q_LEN = ks0
385
+ KV_LEN = ks1
386
+
387
+ offs_k = tl.arange(0, QK_HEAD_DIM_ROUNDED)
388
+ offs_v = tl.arange(0, V_HEAD_DIM_ROUNDED)
389
+
390
+ kT_ptrs = K + offs_n2[None, :] * stride_kn + offs_k[:, None] * stride_kd
391
+ vT_ptrs = V + offs_n2[None, :] * stride_vn + offs_v[:, None] * stride_vd
392
+ # BLOCK_M2 must be a multiple of BLOCK_N2, otherwise the code wouldn't work.
393
+ tl.static_assert(BLOCK_M2 % BLOCK_N2 == 0)
394
+
395
+ hi = tl.minimum(sparse_kv_num_blocks * SPARSE_KV_MULTIPLE, tl.maximum(tl.cdiv(KV_LEN, BLOCK_N2), 1))
396
+
397
+ for start_n in range(0, hi):
398
+ dq = bwd_dq_block_mn(
399
+ arg_Q, arg_K, arg_V, arg_LSE, arg_DELTA, arg_DO, arg_DQ, arg_DV, arg_KV_NUM_BLKS, arg_KV_IDX, arg_Q_NUM_BLKS, arg_Q_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, arg_FULL_Q_NUM_BLKS, arg_FULL_Q_IDX, out_ptr0, ks0, ks1, ks2, ks3, ks4, ks5, ks6, ks7,
400
+ dq, q, kT_ptrs, vT_ptrs, do, Di, lse, Q_LEN, KV_LEN,
401
+ off_z, off_hq, offs_m2, offs_n2, offs_k, offs_v,
402
+ stride_kn, stride_kd, stride_vn, stride_vd,
403
+ kv_indices, sparse_kv_num_blocks,
404
+ MATMUL_PRECISION, RCP_LN2,
405
+ IS_FULL_BLOCKS,
406
+ )
407
+
408
+ # Increment pointers.
409
+ offset = get_offset_for_next_block(
410
+ start_n, kv_indices, sparse_kv_num_blocks,
411
+ SPARSE_KV_BLOCK_SIZE, SPARSE_KV_MULTIPLE, BLOCK_N2, BLOCKS_ARE_CONTIGUOUS
412
+ )
413
+
414
+ kT_ptrs += offset * stride_kn
415
+ vT_ptrs += offset * stride_vn
416
+
417
+ offs_n2 += offset
418
+
419
+ return dq
420
+
421
+
422
+ @triton.jit
423
+ def bwd_dq_block_mn(
424
+ arg_Q, arg_K, arg_V, arg_LSE, arg_DELTA, arg_DO, arg_DQ, arg_DV, arg_KV_NUM_BLKS, arg_KV_IDX, arg_Q_NUM_BLKS, arg_Q_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, arg_FULL_Q_NUM_BLKS, arg_FULL_Q_IDX, out_ptr0, ks0, ks1, ks2, ks3, ks4, ks5, ks6, ks7,
425
+ dq, q, kT_ptrs, vT_ptrs, do, Di, lse, Q_LEN, KV_LEN,
426
+ off_z, off_hq, offs_m2, offs_n2, offs_k, offs_v,
427
+ stride_kn, stride_kd, stride_vn, stride_vd,
428
+ kv_indices, sparse_kv_num_blocks,
429
+ MATMUL_PRECISION, RCP_LN2,
430
+ IS_FULL_BLOCKS,
431
+ ):
432
+ PRESCALE_QK : tl.constexpr = False
433
+ ROWS_GUARANTEED_SAFE : tl.constexpr = False
434
+ BLOCKS_ARE_CONTIGUOUS : tl.constexpr = False
435
+ WRITE_DQ : tl.constexpr = True
436
+ OUTPUT_LOGSUMEXP : tl.constexpr = True
437
+ OUTPUT_MAX : tl.constexpr = False
438
+ FLOAT32_PRECISION : tl.constexpr = 'tf32'
439
+ IS_DIVISIBLE : tl.constexpr = False
440
+ SM_SCALE : tl.constexpr = 0.08838834764831845
441
+ GQA_SHARED_HEADS : tl.constexpr = 4
442
+ HAS_FULL_BLOCKS : tl.constexpr = True
443
+ QK_HEAD_DIM : tl.constexpr = 128
444
+ QK_HEAD_DIM_ROUNDED : tl.constexpr = 128
445
+ V_HEAD_DIM : tl.constexpr = 128
446
+ V_HEAD_DIM_ROUNDED : tl.constexpr = 128
447
+ SAFE_HEAD_DIM : tl.constexpr = True
448
+ BLOCK_M1 : tl.constexpr = 64
449
+ BLOCK_N1 : tl.constexpr = 128
450
+ BLOCK_M2 : tl.constexpr = 128
451
+ BLOCK_N2 : tl.constexpr = 64
452
+ SPARSE_Q_BLOCK_SIZE : tl.constexpr = 128
453
+ SPARSE_KV_BLOCK_SIZE : tl.constexpr = 128
454
+ INDEX_DTYPE : tl.constexpr = tl.int32
455
+
456
+
457
+ # NB reversed order to since K is transposed
458
+ kT = load_checked_2d(kT_ptrs, offs_k, offs_n2, None, None, SAFE_HEAD_DIM, IS_DIVISIBLE, QK_HEAD_DIM, KV_LEN)
459
+ qk = tl.dot(q, kT, input_precision=FLOAT32_PRECISION)
460
+ if not PRESCALE_QK:
461
+ qk *= SM_SCALE
462
+ # ~~~~~~~~~~~~~~~~~~~ Apply score modification ~~~~~~~~~~~~~~~~~~~
463
+ pre_mod_scores = qk
464
+ n = get_bounded_indices(offs_n2[None, :], KV_LEN if not IS_DIVISIBLE else None)
465
+ # The boundary check is done for the outer loop, but here it's possible since we're iterating across N dim
466
+ # that the M reads out of bounds for the PIDS spanning the Q_LEN boundary
467
+ m = get_bounded_indices(offs_m2[:, None], Q_LEN if not IS_DIVISIBLE else None)
468
+
469
+ tmp0 = (qk)
470
+ post_mod_scores = tmp0
471
+
472
+
473
+
474
+
475
+ if not IS_DIVISIBLE:
476
+ post_mod_scores = tl.where(offs_n2[None, :] < KV_LEN, post_mod_scores, float("-inf"))
477
+
478
+ if not IS_FULL_BLOCKS:
479
+ tmp1 = (m)
480
+ tmp2 = tl.full([1], 0, tl.int32)
481
+ tmp3 = tmp1 < tmp2
482
+ tmp4 = (n)
483
+ tmp5 = tmp4 <= tmp1
484
+ tmp6 = tmp3 & tmp5
485
+ tmp7 = tmp1 >= tmp2
486
+ tmp8 = tmp4 < tmp2
487
+ tmp9 = tmp7 & tmp8
488
+ tmp10 = tmp8 == 0
489
+ tmp11 = tmp7 & tmp10
490
+ tmp12 = tmp1 - tmp2
491
+ tmp13 = tl.full([1], 16, tl.int32)
492
+ tmp14 = tl.where((tmp12 < 0) != (tmp13 < 0), tl.where(tmp12 % tmp13 != 0, tmp12 // tmp13 - 1, tmp12 // tmp13), tmp12 // tmp13)
493
+ tmp15 = tmp4 - tmp2
494
+ tmp16 = tl.where((tmp15 < 0) != (tmp13 < 0), tl.where(tmp15 % tmp13 != 0, tmp15 // tmp13 - 1, tmp15 // tmp13), tmp15 // tmp13)
495
+ tmp17 = tmp14 == tmp16
496
+ tmp18 = tmp11 & tmp17
497
+ tmp19 = tmp9 | tmp18
498
+ tmp20 = tmp6 | tmp19
499
+ mask_mod_output = tmp20
500
+
501
+
502
+ # apply mask for partial masked block
503
+ post_mod_scores = tl.where(mask_mod_output, post_mod_scores, float("-inf"))
504
+ # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
505
+ if not PRESCALE_QK:
506
+ post_mod_scores *= RCP_LN2
507
+ p = tl.math.exp2(post_mod_scores - lse)
508
+ # Compute dP and dS.
509
+ # NB reversed order to since V is transposed
510
+ vT = load_checked_2d(vT_ptrs, offs_v, offs_n2, None, None, SAFE_HEAD_DIM, IS_DIVISIBLE, V_HEAD_DIM, KV_LEN)
511
+
512
+ dp = tl.dot(do, vT, input_precision=FLOAT32_PRECISION)
513
+ ds = p * (dp - Di[:, None])
514
+ # ~~~~~~~~~~~~~~~~~~~ Apply joint modification ~~~~~~~~~~~~~~~~~~~
515
+ tmp21 = (ds)
516
+ grad_scores = tmp21
517
+
518
+
519
+ if not IS_DIVISIBLE:
520
+ grad_scores = tl.where(offs_n2[None, :] < KV_LEN, grad_scores, 0.0)
521
+
522
+ # ~~~~~~~~~~~~~~~~~~~ Apply other buffer grad writes ~~~~~~~~~~~~~
523
+ if WRITE_DQ:
524
+ scatter_mask = (offs_m2[:, None] < Q_LEN ) & (offs_n2[None, :] < KV_LEN)
525
+
526
+ # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
527
+ ds = grad_scores
528
+
529
+ if not IS_FULL_BLOCKS:
530
+ # (grads) apply mask for partially unmasked block
531
+ ds = tl.where(mask_mod_output, ds, 0.0)
532
+ # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
533
+ ds = ds.to(MATMUL_PRECISION)
534
+ # Compute dQ.
535
+ dq += tl.dot(ds, tl.trans(kT), input_precision=FLOAT32_PRECISION)
536
+
537
+ return dq
538
+
539
+
540
+ @triton.jit
541
+ def bwd_dkdv_inner(
542
+ arg_Q, arg_K, arg_V, arg_LSE, arg_DELTA, arg_DO, arg_DQ, arg_DV, arg_KV_NUM_BLKS, arg_KV_IDX, arg_Q_NUM_BLKS, arg_Q_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, arg_FULL_Q_NUM_BLKS, arg_FULL_Q_IDX, out_ptr0, ks0, ks1, ks2, ks3, ks4, ks5, ks6, ks7,
543
+ Q, DO, DELTA, LSE, # pointers
544
+ dk, dv, k, v,
545
+ off_z, off_hq, offs_n1, offs_m1,
546
+ stride_qm, stride_qd, stride_dom, stride_dod,
547
+ q_indices, sparse_q_num_blocks,
548
+ MATMUL_PRECISION,
549
+ IS_FULL_BLOCKS,
550
+ ):
551
+ PRESCALE_QK : tl.constexpr = False
552
+ ROWS_GUARANTEED_SAFE : tl.constexpr = False
553
+ BLOCKS_ARE_CONTIGUOUS : tl.constexpr = False
554
+ WRITE_DQ : tl.constexpr = True
555
+ OUTPUT_LOGSUMEXP : tl.constexpr = True
556
+ OUTPUT_MAX : tl.constexpr = False
557
+ FLOAT32_PRECISION : tl.constexpr = 'tf32'
558
+ IS_DIVISIBLE : tl.constexpr = False
559
+ SM_SCALE : tl.constexpr = 0.08838834764831845
560
+ GQA_SHARED_HEADS : tl.constexpr = 4
561
+ HAS_FULL_BLOCKS : tl.constexpr = True
562
+ QK_HEAD_DIM : tl.constexpr = 128
563
+ QK_HEAD_DIM_ROUNDED : tl.constexpr = 128
564
+ V_HEAD_DIM : tl.constexpr = 128
565
+ V_HEAD_DIM_ROUNDED : tl.constexpr = 128
566
+ SAFE_HEAD_DIM : tl.constexpr = True
567
+ BLOCK_M1 : tl.constexpr = 64
568
+ BLOCK_N1 : tl.constexpr = 128
569
+ BLOCK_M2 : tl.constexpr = 128
570
+ BLOCK_N2 : tl.constexpr = 64
571
+ SPARSE_Q_BLOCK_SIZE : tl.constexpr = 128
572
+ SPARSE_KV_BLOCK_SIZE : tl.constexpr = 128
573
+ INDEX_DTYPE : tl.constexpr = tl.int32
574
+
575
+ SPARSE_Q_MULTIPLE: tl.constexpr = (SPARSE_Q_BLOCK_SIZE // BLOCK_M1)
576
+ RCP_LN2: tl.constexpr = 1.44269504
577
+ Q_LEN = ks0
578
+ KV_LEN = ks1
579
+
580
+ offs_k = tl.arange(0, QK_HEAD_DIM_ROUNDED)
581
+ offs_v = tl.arange(0, V_HEAD_DIM_ROUNDED)
582
+
583
+ qT_ptrs = Q + offs_m1[None, :] * stride_qm + offs_k[:, None] * stride_qd
584
+ do_ptrs = DO + offs_m1[:, None] * stride_dom + offs_v[None, :] * stride_dod
585
+ # BLOCK_N1 must be a multiple of BLOCK_M1, otherwise the code wouldn't work.
586
+ tl.static_assert(BLOCK_N1 % BLOCK_M1 == 0)
587
+
588
+ # The minimum is needed to handle the case where we run with a super large
589
+ # SPARSE_BLOCK_SIZE (i.e. no block-mask!)
590
+ hi = tl.minimum(sparse_q_num_blocks * SPARSE_Q_MULTIPLE, tl.maximum(tl.cdiv(Q_LEN, BLOCK_M1), 1))
591
+
592
+ for start_m in range(0, hi):
593
+ dk, dv = bwd_dkdv_block_mn(
594
+ arg_Q, arg_K, arg_V, arg_LSE, arg_DELTA, arg_DO, arg_DQ, arg_DV, arg_KV_NUM_BLKS, arg_KV_IDX, arg_Q_NUM_BLKS, arg_Q_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, arg_FULL_Q_NUM_BLKS, arg_FULL_Q_IDX, out_ptr0, ks0, ks1, ks2, ks3, ks4, ks5, ks6, ks7,
595
+ dk, dv, qT_ptrs, k, v, do_ptrs, DELTA, LSE, Q_LEN, KV_LEN,
596
+ off_z, off_hq, offs_n1, offs_m1, offs_k, offs_v,
597
+ stride_qm, stride_qd, stride_dom, stride_dod,
598
+ q_indices, sparse_q_num_blocks,
599
+ MATMUL_PRECISION, RCP_LN2,
600
+ IS_FULL_BLOCKS,
601
+ )
602
+ # Increment pointers.
603
+ offset = get_offset_for_next_block(
604
+ start_m, q_indices, sparse_q_num_blocks,
605
+ SPARSE_Q_BLOCK_SIZE, SPARSE_Q_MULTIPLE, BLOCK_M1, BLOCKS_ARE_CONTIGUOUS
606
+ )
607
+
608
+ qT_ptrs += offset * stride_qm
609
+ do_ptrs += offset * stride_dom
610
+ offs_m1 += offset
611
+
612
+ return dk, dv
613
+
614
+
615
+ @triton.jit
616
+ def bwd_dkdv_block_mn(
617
+ arg_Q, arg_K, arg_V, arg_LSE, arg_DELTA, arg_DO, arg_DQ, arg_DV, arg_KV_NUM_BLKS, arg_KV_IDX, arg_Q_NUM_BLKS, arg_Q_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, arg_FULL_Q_NUM_BLKS, arg_FULL_Q_IDX, out_ptr0, ks0, ks1, ks2, ks3, ks4, ks5, ks6, ks7,
618
+ dk, dv, qT_ptrs, k, v, do_ptrs, DELTA, LSE, Q_LEN, KV_LEN,
619
+ off_z, off_hq, offs_n1, offs_m1, offs_k, offs_v,
620
+ stride_qm, stride_qd, stride_dom, stride_dod,
621
+ q_indices, sparse_q_num_blocks,
622
+ MATMUL_PRECISION, RCP_LN2,
623
+ IS_FULL_BLOCKS,
624
+ ):
625
+ PRESCALE_QK : tl.constexpr = False
626
+ ROWS_GUARANTEED_SAFE : tl.constexpr = False
627
+ BLOCKS_ARE_CONTIGUOUS : tl.constexpr = False
628
+ WRITE_DQ : tl.constexpr = True
629
+ OUTPUT_LOGSUMEXP : tl.constexpr = True
630
+ OUTPUT_MAX : tl.constexpr = False
631
+ FLOAT32_PRECISION : tl.constexpr = 'tf32'
632
+ IS_DIVISIBLE : tl.constexpr = False
633
+ SM_SCALE : tl.constexpr = 0.08838834764831845
634
+ GQA_SHARED_HEADS : tl.constexpr = 4
635
+ HAS_FULL_BLOCKS : tl.constexpr = True
636
+ QK_HEAD_DIM : tl.constexpr = 128
637
+ QK_HEAD_DIM_ROUNDED : tl.constexpr = 128
638
+ V_HEAD_DIM : tl.constexpr = 128
639
+ V_HEAD_DIM_ROUNDED : tl.constexpr = 128
640
+ SAFE_HEAD_DIM : tl.constexpr = True
641
+ BLOCK_M1 : tl.constexpr = 64
642
+ BLOCK_N1 : tl.constexpr = 128
643
+ BLOCK_M2 : tl.constexpr = 128
644
+ BLOCK_N2 : tl.constexpr = 64
645
+ SPARSE_Q_BLOCK_SIZE : tl.constexpr = 128
646
+ SPARSE_KV_BLOCK_SIZE : tl.constexpr = 128
647
+ INDEX_DTYPE : tl.constexpr = tl.int32
648
+
649
+
650
+ # NB reversed order since Q is transposed
651
+ qT = load_checked_2d(qT_ptrs, offs_k, offs_m1, None, None, SAFE_HEAD_DIM, IS_DIVISIBLE, QK_HEAD_DIM, Q_LEN)
652
+ # Load LSE before computing qk to reduce pipeline stall.
653
+ if IS_DIVISIBLE:
654
+ lse = tl.load(LSE + offs_m1)
655
+ else:
656
+ lse = tl.load(LSE + offs_m1, mask=offs_m1 < Q_LEN)
657
+ lse = tl.where(lse == -float("inf"), 0.0, lse)
658
+ qkT = tl.dot(k, qT, input_precision=FLOAT32_PRECISION)
659
+ if not PRESCALE_QK:
660
+ qkT *= SM_SCALE
661
+ # ~~~~~~~~~~~~~~~~~~~ Apply score modification ~~~~~~~~~~~~~~~~~~~
662
+ m = get_bounded_indices(offs_m1[None, :], Q_LEN if not IS_DIVISIBLE else None)
663
+ # The boundary check is done for the outer loop, but here it's possible since we're iterating across M dim
664
+ # that the n reads out of bounds for the PIDS spanning the KV_LEN boundary
665
+ n = get_bounded_indices(offs_n1[:, None], KV_LEN if not IS_DIVISIBLE else None)
666
+
667
+ pre_mod_scores = qkT
668
+ tmp22 = (qkT)
669
+ post_mod_scores = tmp22
670
+
671
+
672
+
673
+ if not IS_DIVISIBLE:
674
+ post_mod_scores = tl.where(offs_m1[None, :] < Q_LEN, post_mod_scores, float("-inf"))
675
+
676
+ if not IS_FULL_BLOCKS:
677
+ tmp23 = (m)
678
+ tmp24 = tl.full([1], 0, tl.int32)
679
+ tmp25 = tmp23 < tmp24
680
+ tmp26 = (n)
681
+ tmp27 = tmp26 <= tmp23
682
+ tmp28 = tmp25 & tmp27
683
+ tmp29 = tmp23 >= tmp24
684
+ tmp30 = tmp26 < tmp24
685
+ tmp31 = tmp29 & tmp30
686
+ tmp32 = tmp30 == 0
687
+ tmp33 = tmp29 & tmp32
688
+ tmp34 = tmp23 - tmp24
689
+ tmp35 = tl.full([1], 16, tl.int32)
690
+ tmp36 = tl.where((tmp34 < 0) != (tmp35 < 0), tl.where(tmp34 % tmp35 != 0, tmp34 // tmp35 - 1, tmp34 // tmp35), tmp34 // tmp35)
691
+ tmp37 = tmp26 - tmp24
692
+ tmp38 = tl.where((tmp37 < 0) != (tmp35 < 0), tl.where(tmp37 % tmp35 != 0, tmp37 // tmp35 - 1, tmp37 // tmp35), tmp37 // tmp35)
693
+ tmp39 = tmp36 == tmp38
694
+ tmp40 = tmp33 & tmp39
695
+ tmp41 = tmp31 | tmp40
696
+ tmp42 = tmp28 | tmp41
697
+ mask_mod_output = tmp42
698
+
699
+ # (grads) apply mask for fully masked block
700
+ post_mod_scores = tl.where(mask_mod_output, post_mod_scores, float("-inf"))
701
+ # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
702
+ if not PRESCALE_QK:
703
+ post_mod_scores *= RCP_LN2
704
+ pT = tl.math.exp2(post_mod_scores - lse[None, :])
705
+ do = load_checked_2d(do_ptrs, offs_m1, offs_v, None, None, IS_DIVISIBLE, SAFE_HEAD_DIM, Q_LEN, V_HEAD_DIM)
706
+ # Compute dV.
707
+ ppT = pT
708
+ dv += tl.dot(ppT.to(MATMUL_PRECISION), do, input_precision=FLOAT32_PRECISION)
709
+ if IS_DIVISIBLE:
710
+ Di = tl.load(DELTA + offs_m1)
711
+ else:
712
+ Di = tl.load(DELTA + offs_m1, mask=offs_m1 < Q_LEN)
713
+ # Compute dP and dS.
714
+ dpT = tl.dot(v, tl.trans(do), input_precision=FLOAT32_PRECISION)
715
+ dsT = pT * (dpT - Di[None, :])
716
+ # ~~~~~~~~~~~~~~~~~~~ Apply joint modification ~~~~~~~~~~~~~~~~~~~
717
+ tmp43 = (dsT)
718
+ grad_scores = tmp43
719
+
720
+
721
+
722
+ if not IS_DIVISIBLE:
723
+ grad_scores = tl.where(offs_m1[None, :] < Q_LEN, grad_scores, 0.0)
724
+
725
+ # ~~~~~~~~~~~~~~~~~~~ Apply other buffer grad writes ~~~~~~~~~~~~~
726
+ if not WRITE_DQ:
727
+ idx_b = off_z
728
+ idx_h = off_hq
729
+ idx_m = m
730
+ idx_n = n
731
+ scatter_mask = (offs_m1[None, :] < Q_LEN) & (offs_n1[:, None] < KV_LEN)
732
+
733
+ # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
734
+ dsT = grad_scores
735
+ if not IS_FULL_BLOCKS:
736
+ # (grads) apply mask for partially unmasked block
737
+ dsT = tl.where(mask_mod_output, dsT, 0.0)
738
+ # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
739
+ dk += tl.dot(dsT.to(MATMUL_PRECISION), tl.trans(qT), input_precision=FLOAT32_PRECISION)
740
+
741
+ return dk, dv
742
+
743
+ # Utility triton funcs
744
+ @triton.jit
745
+ def get_offset_for_next_block(
746
+ loop_iter, col_indices, total_blocks,
747
+ SPARSE_BLOCK, SPARSE_BLOCK_MULTIPLE, BLOCK,
748
+ BLOCKS_ARE_CONTIGUOUS: tl.constexpr
749
+ ):
750
+ if BLOCKS_ARE_CONTIGUOUS:
751
+ return BLOCK
752
+ cur_block_idx = loop_iter // SPARSE_BLOCK_MULTIPLE
753
+ cur_block = tl.load(col_indices + cur_block_idx, eviction_policy="evict_last")
754
+ next_block = tl.load(col_indices + cur_block_idx + 1, eviction_policy="evict_last", mask=cur_block_idx + 1 < total_blocks)
755
+ needs_jump = (loop_iter + 1) % SPARSE_BLOCK_MULTIPLE == 0
756
+ jump_to_block = (next_block - cur_block ) * SPARSE_BLOCK - (SPARSE_BLOCK_MULTIPLE - 1) * BLOCK
757
+ offset = jump_to_block * needs_jump + (1 - needs_jump) * BLOCK
758
+ return offset
759
+
760
+ @triton.jit
761
+ def get_bounded_indices(indices, max_len=None):
762
+ return indices % max_len if max_len is not None else indices
763
+
764
+ @triton.jit
765
+ def load_checked_block(block_ptr, IS_DIVISIBLE: tl.constexpr, SAFE_HEAD_DIM: tl.constexpr):
766
+ if IS_DIVISIBLE and SAFE_HEAD_DIM:
767
+ return tl.load(block_ptr)
768
+ elif IS_DIVISIBLE and not SAFE_HEAD_DIM:
769
+ return tl.load(block_ptr, boundary_check=(1,), padding_option="zero")
770
+ elif not IS_DIVISIBLE and SAFE_HEAD_DIM:
771
+ return tl.load(block_ptr, boundary_check=(0,), padding_option="zero")
772
+ else:
773
+ return tl.load(block_ptr, boundary_check=(0, 1), padding_option="zero")
774
+
775
+ @triton.jit
776
+ def load_checked_2d(
777
+ ptr,
778
+ offs_m,
779
+ offs_n,
780
+ stride_m,
781
+ stride_n,
782
+ IS_DIVISIBLE_M: tl.constexpr,
783
+ IS_DIVISIBLE_N: tl.constexpr,
784
+ M_LEN: tl.constexpr,
785
+ N_LEN: tl.constexpr,
786
+ ):
787
+ # Calculate final pointer if strides are provided
788
+ if stride_m is not None and stride_n is not None:
789
+ ptr = ptr + offs_m[:, None] * stride_m + offs_n[None, :] * stride_n
790
+
791
+ # Handle all masking cases
792
+ if not IS_DIVISIBLE_M and not IS_DIVISIBLE_N:
793
+ return tl.load(ptr, mask=(offs_m[:, None] < M_LEN) & (offs_n[None, :] < N_LEN), other=0.0)
794
+ elif IS_DIVISIBLE_M and not IS_DIVISIBLE_N:
795
+ return tl.load(ptr, mask=(offs_n[None, :] < N_LEN), other=0.0)
796
+ elif not IS_DIVISIBLE_M and IS_DIVISIBLE_N:
797
+ return tl.load(ptr, mask=(offs_m[:, None] < M_LEN), other=0.0)
798
+ else: # Both divisible
799
+ return tl.load(ptr)
progress/SpecForge/cache/compiled_kernels/2v/c2vob47d7sxpitzmofyr55f5hvxsitxjhpyv5hdiqcdjgbwmxk76.py ADDED
@@ -0,0 +1,799 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import triton
3
+ import triton.language as tl
4
+
5
+ from torch._inductor.runtime import triton_helpers, triton_heuristics
6
+ from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
7
+ from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
8
+
9
+ @triton_heuristics.template(
10
+
11
+ num_stages=3,
12
+ num_warps=8,
13
+ triton_meta={'signature': {'arg_Q': '*bf16', 'arg_K': '*bf16', 'arg_V': '*bf16', 'arg_LSE': '*fp32', 'arg_DELTA': '*fp32', 'arg_DO': '*bf16', 'arg_DQ': '*bf16', 'arg_DV': '*bf16', 'arg_KV_NUM_BLKS': '*i32', 'arg_KV_IDX': '*i32', 'arg_Q_NUM_BLKS': '*i32', 'arg_Q_IDX': '*i32', 'arg_FULL_KV_NUM_BLKS': '*i32', 'arg_FULL_KV_IDX': '*i32', 'arg_FULL_Q_NUM_BLKS': '*i32', 'arg_FULL_Q_IDX': '*i32', 'out_ptr0': '*bf16', 'ks0': 'i32', 'ks1': 'i32'}, 'device': DeviceProperties(type='cuda', index=1, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]], (7,): [['tt.divisibility', 16]], (8,): [['tt.divisibility', 16]], (9,): [['tt.divisibility', 16]], (10,): [['tt.divisibility', 16]], (11,): [['tt.divisibility', 16]], (12,): [['tt.divisibility', 16]], (13,): [['tt.divisibility', 16]], (14,): [['tt.divisibility', 16]], (15,): [['tt.divisibility', 16]], (16,): [['tt.divisibility', 16]]}]},
14
+ inductor_meta={'kernel_name': 'triton_tem_fused_mul_1', 'backend_hash': 'B0E5936CA26D1BCD1B577D0B65F13FE7553DC941EB93358973E9F1902BC212C5', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'grid_type': 'FixedGrid', 'fixed_grid': ['_grid_0', '_grid_1', '_grid_2'], 'extra_launcher_args': ['_grid_0', '_grid_1', '_grid_2'], 'config_args': {'PRESCALE_QK': False, 'ROWS_GUARANTEED_SAFE': False, 'BLOCKS_ARE_CONTIGUOUS': False, 'WRITE_DQ': True, 'OUTPUT_LOGSUMEXP': True, 'OUTPUT_MAX': False, 'FLOAT32_PRECISION': "'tf32'", 'IS_DIVISIBLE': False, 'SM_SCALE': 0.08838834764831845, 'GQA_SHARED_HEADS': 4, 'HAS_FULL_BLOCKS': True, 'QK_HEAD_DIM': 128, 'QK_HEAD_DIM_ROUNDED': 128, 'V_HEAD_DIM': 128, 'V_HEAD_DIM_ROUNDED': 128, 'SAFE_HEAD_DIM': True, 'BLOCK_M1': 64, 'BLOCK_N1': 128, 'BLOCK_M2': 128, 'BLOCK_N2': 64, 'SPARSE_Q_BLOCK_SIZE': 128, 'SPARSE_KV_BLOCK_SIZE': 128}},
15
+
16
+ )
17
+ @triton.jit
18
+ def triton_tem_fused_mul_1(arg_Q, arg_K, arg_V, arg_LSE, arg_DELTA, arg_DO, arg_DQ, arg_DV, arg_KV_NUM_BLKS, arg_KV_IDX, arg_Q_NUM_BLKS, arg_Q_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, arg_FULL_Q_NUM_BLKS, arg_FULL_Q_IDX, out_ptr0, ks0, ks1):
19
+ PRESCALE_QK : tl.constexpr = False
20
+ ROWS_GUARANTEED_SAFE : tl.constexpr = False
21
+ BLOCKS_ARE_CONTIGUOUS : tl.constexpr = False
22
+ WRITE_DQ : tl.constexpr = True
23
+ OUTPUT_LOGSUMEXP : tl.constexpr = True
24
+ OUTPUT_MAX : tl.constexpr = False
25
+ FLOAT32_PRECISION : tl.constexpr = 'tf32'
26
+ IS_DIVISIBLE : tl.constexpr = False
27
+ SM_SCALE : tl.constexpr = 0.08838834764831845
28
+ GQA_SHARED_HEADS : tl.constexpr = 4
29
+ HAS_FULL_BLOCKS : tl.constexpr = True
30
+ QK_HEAD_DIM : tl.constexpr = 128
31
+ QK_HEAD_DIM_ROUNDED : tl.constexpr = 128
32
+ V_HEAD_DIM : tl.constexpr = 128
33
+ V_HEAD_DIM_ROUNDED : tl.constexpr = 128
34
+ SAFE_HEAD_DIM : tl.constexpr = True
35
+ BLOCK_M1 : tl.constexpr = 64
36
+ BLOCK_N1 : tl.constexpr = 128
37
+ BLOCK_M2 : tl.constexpr = 128
38
+ BLOCK_N2 : tl.constexpr = 64
39
+ SPARSE_Q_BLOCK_SIZE : tl.constexpr = 128
40
+ SPARSE_KV_BLOCK_SIZE : tl.constexpr = 128
41
+ INDEX_DTYPE : tl.constexpr = tl.int32
42
+ Q = arg_Q
43
+ K = arg_K
44
+ V = arg_V
45
+ LSE = arg_LSE
46
+ DELTA = arg_DELTA
47
+ DO = arg_DO
48
+ DQ = arg_DQ
49
+ DV = arg_DV
50
+ KV_NUM_BLKS = arg_KV_NUM_BLKS
51
+ KV_IDX = arg_KV_IDX
52
+ Q_NUM_BLKS = arg_Q_NUM_BLKS
53
+ Q_IDX = arg_Q_IDX
54
+ FULL_KV_NUM_BLKS = arg_FULL_KV_NUM_BLKS
55
+ FULL_KV_IDX = arg_FULL_KV_IDX
56
+ FULL_Q_NUM_BLKS = arg_FULL_Q_NUM_BLKS
57
+ FULL_Q_IDX = arg_FULL_Q_IDX
58
+
59
+ # Sub notation for this kernel:
60
+ #
61
+ # Q: Query, K: Key, V: Value
62
+ # LSE: logsumexp (logsumexp is always stored in fp32 regardless of the input dtype)
63
+ # DELTA: Precomputed sum(OUT*DO, axis=-1)
64
+ # DO: Derivative of Output, DQ: Derivative of Query, DV: Derivative of Value
65
+ # DK: Derivative of Key, is the written to via the store_output call due to some limitations with
66
+ # inductor codegen
67
+ # M: Number of queries, N: Number of keys/values
68
+ # QK_HEAD_DIM: The dimension of the query and key embeddings
69
+ # V_HEAD_DIM: The dimension of the value embeddings
70
+ # z: Batch size, h: Number of heads, m: Number of queries or keys/values, d: Head dim
71
+ # GQA_SHARED_HEADS: number of query heads sharing one kv head in GQA setups.
72
+ # (Modifiable) Performance tuning options
73
+ # BLOCK_M1: when calculating DK & DV, iterate over BLOCK_M1 across the seqlen dim of Q in each thread block.
74
+ # BLOCK_N1: when calculating DK & DV, the thread block size across the seqlen dim of K/V.
75
+ # BLOCK_M2: when calculating DQ, the thread block size across the seqlen dim of Q.
76
+ # BLOCK_N2: when calculating DQ, iterate over BLOCK_N2 across the seqlen dim of K/V in each thread block.
77
+ #
78
+ # The following FULL_* and PARTIAL_* is defined in the block sparse mask grid, rather than the thread block grid.
79
+ # KV_NUM_BLKS: The number of KV blocks (that may or may not require masking) for each query.
80
+ # KV_IDX: The indices of KV blocks (that may or may not require masking) for each query.
81
+ # Q_NUM_BLKS: The number of Q blocks (that may or may not require masking) for each query.
82
+ # Q_IDX: The indices of Q blocks (that may or may not require masking) for each query.
83
+ # FULL_KV_NUM_BLKS: The number of fully unmasked KV blocks (so we don't need masking) for each query.
84
+ # FULL_KV_IDX: The indices of fully unmasked KV blocks (so we don't need masking) for each query.
85
+ # FULL_Q_NUM_BLKS: The number of fully unmasked Q blocks (so we don't need masking) for each query.
86
+ # FULL_Q_IDX: The indices of fully unmasked Q blocks (so we don't need masking) for each query.
87
+
88
+ # The below are kernel options that can be applied for certain score_mods,
89
+ # or involve a numerics vs. perf tradeoff
90
+ # PRESCALE_QK: Whether to pre-scale QK by 1/sqrt(d) and change of base. Has
91
+ # about 20% more numerical error, but slightly faster.
92
+
93
+ # Define strides of inputs
94
+ stride_qz, stride_qh, stride_qm, stride_qd = 4096*ks0, 128, 4096, 1
95
+ stride_kz, stride_kh, stride_kn, stride_kd = 1024*ks1, 128, 1024, 1
96
+ stride_vz, stride_vh, stride_vn, stride_vd = 1024*ks1, 128, 1024, 1
97
+ stride_doz, stride_doh, stride_dom, stride_dod = 4096*((1) * ((1) >= (ks0)) + (ks0) * ((ks0) > (1))), 128*((1) * ((1) >= (ks0)) + (ks0) * ((ks0) > (1))), 128, 1
98
+
99
+ stride_dqz, stride_dqh, stride_dqm, stride_dqd = 4096*ks0, 128, 4096, 1
100
+ stride_dvz, stride_dvh, stride_dvm, stride_dvd = 1024*ks1, 128, 1024, 1
101
+
102
+ ZQ = 1
103
+ HQ = 32
104
+ HKV = 8
105
+ Q_LEN = ks0
106
+ ZKV = 1
107
+ KV_LEN = ks1
108
+
109
+ MATMUL_PRECISION = Q.dtype.element_ty
110
+
111
+ pid = tl.program_id(0).to(INDEX_DTYPE)
112
+ NUM_KV_BLOCKS = tl.cdiv(KV_LEN, BLOCK_N1)
113
+ NUM_Q_BLOCKS = tl.cdiv(Q_LEN, BLOCK_M2)
114
+
115
+ off_zq = tl.program_id(1).to(INDEX_DTYPE) # q batch idx
116
+ off_hkv = tl.program_id(2).to(INDEX_DTYPE) # kv head idx
117
+ off_zkv = off_zq % ZKV # kv batch idx
118
+
119
+ SPARSE_Z = 1
120
+ SPARSE_HQ = 1
121
+
122
+ sparse_idx_z = off_zq % SPARSE_Z
123
+
124
+ k_adj = (stride_kh * off_hkv + stride_kz * off_zkv).to(tl.int64)
125
+ v_adj = (stride_vh * off_hkv + stride_vz * off_zkv).to(tl.int64)
126
+ # first compute broadcasted dv of shape [Bq, Hkv, KV_LEN, V_HEAD_DIM]
127
+ # then reduce to dv of shape [Bkv, Hkv, KV_LEN, V_HEAD_DIM]
128
+ dv_adj = (stride_dvh * off_hkv + stride_dvz * off_zq).to(tl.int64)
129
+
130
+ # offset K, V, DV pointers for batch/kv-head
131
+ K += k_adj
132
+ V += v_adj
133
+ DV += dv_adj
134
+
135
+ RCP_LN2 = 1.44269504
136
+ offs_k = tl.arange(0, QK_HEAD_DIM_ROUNDED)
137
+ offs_v = tl.arange(0, V_HEAD_DIM_ROUNDED)
138
+
139
+ if pid >= NUM_KV_BLOCKS:
140
+ off_pid = pid - NUM_KV_BLOCKS
141
+ # THIS BLOCK DOES DQ
142
+ SPARSE_Q_MULTIPLE = (SPARSE_Q_BLOCK_SIZE // BLOCK_M2)
143
+ SPARSE_KV_MULTIPLE = (SPARSE_KV_BLOCK_SIZE // BLOCK_N2)
144
+ off_hq2 = off_pid // NUM_Q_BLOCKS + off_hkv * GQA_SHARED_HEADS
145
+ start_m2_block = off_pid % NUM_Q_BLOCKS
146
+ off_pid_mask = start_m2_block // SPARSE_Q_MULTIPLE
147
+ stride_kv_num_blks_h = 1
148
+ stride_kv_idx_h = 1
149
+ stride_kv_idx_m = 1
150
+
151
+ sparse_idx_hq2 = off_hq2 % SPARSE_HQ
152
+ sparse_hz_offset = sparse_idx_z * SPARSE_HQ + sparse_idx_hq2
153
+
154
+ sparse_kv_num_blks_offset = sparse_hz_offset * stride_kv_num_blks_h + off_pid_mask
155
+ sparse_kv_idx_offset = sparse_hz_offset * stride_kv_idx_h + off_pid_mask * stride_kv_idx_m # noqa: B950
156
+
157
+ # Offset Q, DQ, DO, DELTA & LSE. These inputs are offsetted by query heads.
158
+ q_adj2 = (stride_qh * off_hq2 + stride_qz * off_zq).to(tl.int64)
159
+ do_adj2 = (stride_doh * off_hq2 + stride_doz * off_zq).to(tl.int64)
160
+ dq_adj2 = (stride_dqh * off_hq2 + stride_dqz * off_zq).to(tl.int64)
161
+ off_chz2 = ((off_zq * HQ + off_hq2) * Q_LEN).to(tl.int64)
162
+
163
+ Q2 = Q + q_adj2
164
+ DO2 = DO + do_adj2
165
+ # TODO: This does not work if DQ is not the same layout as Q (for example,
166
+ # if Q is broadcasted)
167
+ DQ2 = DQ + dq_adj2
168
+ LSE2 = LSE + off_chz2
169
+ DELTA2 = DELTA + off_chz2
170
+
171
+ # dq = tl.zeros([BLOCK_M2, QK_HEAD_DIM], dtype=tl.float32)
172
+ dq = tl.zeros([BLOCK_M2, QK_HEAD_DIM_ROUNDED], dtype=tl.float32)
173
+
174
+ start_m2 = start_m2_block * BLOCK_M2
175
+ offs_m2 = start_m2 + tl.arange(0, BLOCK_M2)
176
+
177
+ # load Q and do: they stay in SRAM throughout the inner loop.
178
+ q = load_checked_2d(Q2, offs_m2, offs_k, stride_qm, stride_qd, IS_DIVISIBLE, SAFE_HEAD_DIM, Q_LEN, QK_HEAD_DIM)
179
+ do = load_checked_2d(DO2, offs_m2, offs_v, stride_dom, stride_dod, IS_DIVISIBLE, SAFE_HEAD_DIM, Q_LEN, V_HEAD_DIM)
180
+
181
+ if PRESCALE_QK:
182
+ q = (q * SM_SCALE * RCP_LN2).to(MATMUL_PRECISION)
183
+
184
+ if IS_DIVISIBLE:
185
+ Di = tl.load(DELTA2 + offs_m2)
186
+ lse = tl.load(LSE2 + offs_m2)
187
+ else:
188
+ Di = tl.load(DELTA2 + offs_m2, mask=offs_m2 < Q_LEN)
189
+ lse = tl.load(LSE2 + offs_m2, mask=offs_m2 < Q_LEN)
190
+ lse = tl.where(lse == -float("inf"), 0.0, lse)
191
+ lse = lse[:, None]
192
+
193
+ # ~~~~~~~~~~~ fully unmasked blocks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
194
+ # KV_IDX and KV_NUM_BLKS are always contiguous.
195
+ kv_indices = KV_IDX + sparse_kv_idx_offset
196
+ kv_start = tl.load(kv_indices) * SPARSE_KV_BLOCK_SIZE # first kv block we're loading
197
+ sparse_kv_num_blocks = tl.load(KV_NUM_BLKS + sparse_kv_num_blks_offset)
198
+
199
+ offs_n2 = kv_start + tl.arange(0, BLOCK_N2)
200
+ dq = bwd_dq_inner(
201
+ arg_Q, arg_K, arg_V, arg_LSE, arg_DELTA, arg_DO, arg_DQ, arg_DV, arg_KV_NUM_BLKS, arg_KV_IDX, arg_Q_NUM_BLKS, arg_Q_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, arg_FULL_Q_NUM_BLKS, arg_FULL_Q_IDX, out_ptr0, ks0, ks1,
202
+ K, V,
203
+ dq, q, do, Di, lse,
204
+ off_zq, off_hq2, offs_m2, offs_n2,
205
+ stride_kn, stride_kd, stride_vn, stride_vd,
206
+ kv_indices, sparse_kv_num_blocks,
207
+ MATMUL_PRECISION,
208
+ IS_FULL_BLOCKS=False,
209
+ )
210
+
211
+ if HAS_FULL_BLOCKS:
212
+ # ~~~~~~~~~~~ partial unmasked blocks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
213
+ # FULL_KV_IDX and FULL_KV_NUM_BLKS are always contiguous.
214
+ kv_indices = FULL_KV_IDX + sparse_kv_idx_offset
215
+ kv_start = tl.load(kv_indices) * SPARSE_KV_BLOCK_SIZE # first kv block we're loading
216
+ sparse_kv_num_blocks = tl.load(FULL_KV_NUM_BLKS + sparse_kv_num_blks_offset)
217
+
218
+ offs_n2 = kv_start + tl.arange(0, BLOCK_N2)
219
+ dq = bwd_dq_inner(
220
+ arg_Q, arg_K, arg_V, arg_LSE, arg_DELTA, arg_DO, arg_DQ, arg_DV, arg_KV_NUM_BLKS, arg_KV_IDX, arg_Q_NUM_BLKS, arg_Q_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, arg_FULL_Q_NUM_BLKS, arg_FULL_Q_IDX, out_ptr0, ks0, ks1,
221
+ K, V,
222
+ dq, q, do, Di, lse,
223
+ off_zq, off_hq2, offs_m2, offs_n2,
224
+ stride_kn, stride_kd, stride_vn, stride_vd,
225
+ kv_indices, sparse_kv_num_blocks,
226
+ MATMUL_PRECISION,
227
+ IS_FULL_BLOCKS=True,
228
+ )
229
+
230
+ # Write back dQ.
231
+ dq_ptrs = DQ2 + offs_m2[:, None] * stride_dqm + offs_k[None, :] * stride_dqd
232
+ dq *= SM_SCALE
233
+ if IS_DIVISIBLE and SAFE_HEAD_DIM:
234
+ tl.store(dq_ptrs, dq)
235
+ else:
236
+ tl.store(dq_ptrs, dq, mask=(offs_m2[:, None] < Q_LEN) & (offs_k[None, :] < QK_HEAD_DIM))
237
+ else:
238
+ # THIS BLOCK DOES DK & DV
239
+ SPARSE_Q_MULTIPLE = (SPARSE_Q_BLOCK_SIZE // BLOCK_M1)
240
+ SPARSE_KV_MULTIPLE = (SPARSE_KV_BLOCK_SIZE // BLOCK_N1)
241
+
242
+ pid_mask = pid // SPARSE_KV_MULTIPLE
243
+
244
+ stride_q_num_blks_h = 1
245
+ stride_q_idx_h = 1
246
+ stride_q_idx_n = 1
247
+
248
+
249
+ dv = tl.zeros([BLOCK_N1, V_HEAD_DIM_ROUNDED], dtype=tl.float32)
250
+ dk = tl.zeros([BLOCK_N1, QK_HEAD_DIM_ROUNDED], dtype=tl.float32)
251
+
252
+ start_n1 = pid * BLOCK_N1
253
+ offs_n1 = start_n1 + tl.arange(0, BLOCK_N1)
254
+
255
+ # load K and V: they stay in SRAM throughout the inner loop.
256
+ k = load_checked_2d(K, offs_n1, offs_k, stride_kn, stride_kd, IS_DIVISIBLE, SAFE_HEAD_DIM, KV_LEN, QK_HEAD_DIM)
257
+ v = load_checked_2d(V, offs_n1, offs_v, stride_vn, stride_vd, IS_DIVISIBLE, SAFE_HEAD_DIM, KV_LEN, V_HEAD_DIM)
258
+
259
+ if PRESCALE_QK:
260
+ k = (k * SM_SCALE * RCP_LN2).to(MATMUL_PRECISION)
261
+
262
+ for off_g in range(0, GQA_SHARED_HEADS):
263
+ off_hq1 = off_hkv * GQA_SHARED_HEADS + off_g
264
+
265
+ # Offset Q, DQ, DO, DELTA & LSE. These inputs are offsetted by query heads.
266
+ q_adj1 = (stride_qh * off_hq1 + stride_qz * off_zq).to(tl.int64)
267
+ do_adj1 = (stride_doh * off_hq1 + stride_doz * off_zq).to(tl.int64)
268
+ dq_adj1 = (stride_dqh * off_hq1 + stride_dqz * off_zq).to(tl.int64)
269
+ off_chz1 = ((off_zq * HQ + off_hq1) * Q_LEN).to(tl.int64)
270
+
271
+ Q1 = Q + q_adj1
272
+ DO1 = DO + do_adj1
273
+ # TODO: This does not work if DQ is not the same layout as Q (for example,
274
+ # if Q is broadcasted)
275
+ LSE1 = LSE + off_chz1
276
+ DELTA1 = DELTA + off_chz1
277
+
278
+ sparse_idx_hq1 = off_hq1 % SPARSE_HQ
279
+ sparse_hz_offset = sparse_idx_z * SPARSE_HQ + sparse_idx_hq1
280
+
281
+ sparse_q_num_blks_offset = sparse_hz_offset * stride_q_num_blks_h + pid_mask
282
+ sparse_q_idx_offset = sparse_hz_offset * stride_q_idx_h + pid_mask * stride_q_idx_n # noqa: B950
283
+
284
+ # ~~~~~~~~~~~~~~~ fully unmasked blocks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
285
+ # Q_IDX and Q_NUM_BLKS are always contiguous.
286
+ q_indices = Q_IDX + sparse_q_idx_offset
287
+ q_start = tl.load(q_indices) * SPARSE_Q_BLOCK_SIZE # first q block we're loading
288
+ sparse_q_num_blocks = tl.load(Q_NUM_BLKS + sparse_q_num_blks_offset)
289
+
290
+ offs_m1 = q_start + tl.arange(0, BLOCK_M1)
291
+ dk, dv = bwd_dkdv_inner(
292
+ arg_Q, arg_K, arg_V, arg_LSE, arg_DELTA, arg_DO, arg_DQ, arg_DV, arg_KV_NUM_BLKS, arg_KV_IDX, arg_Q_NUM_BLKS, arg_Q_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, arg_FULL_Q_NUM_BLKS, arg_FULL_Q_IDX, out_ptr0, ks0, ks1,
293
+ Q1, DO1, DELTA1, LSE1,
294
+ dk, dv, k, v,
295
+ off_zq, off_hq1, offs_n1, offs_m1,
296
+ stride_qm, stride_qd, stride_dom, stride_dod,
297
+ q_indices, sparse_q_num_blocks,
298
+ MATMUL_PRECISION,
299
+ IS_FULL_BLOCKS=False,
300
+ )
301
+
302
+
303
+ if HAS_FULL_BLOCKS:
304
+ # ~~~~~~~~~~~~~~~ fully unmasked blocks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
305
+ # FULL_Q_IDX and FULL_Q_NUM_BLKS are always contiguous.
306
+ q_indices = FULL_Q_IDX + sparse_q_idx_offset
307
+ q_start = tl.load(q_indices) * SPARSE_Q_BLOCK_SIZE # first q block we're loading
308
+ sparse_q_num_blocks = tl.load(FULL_Q_NUM_BLKS + sparse_q_num_blks_offset)
309
+
310
+ offs_m1 = q_start + tl.arange(0, BLOCK_M1)
311
+ dk, dv = bwd_dkdv_inner(
312
+ arg_Q, arg_K, arg_V, arg_LSE, arg_DELTA, arg_DO, arg_DQ, arg_DV, arg_KV_NUM_BLKS, arg_KV_IDX, arg_Q_NUM_BLKS, arg_Q_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, arg_FULL_Q_NUM_BLKS, arg_FULL_Q_IDX, out_ptr0, ks0, ks1,
313
+ Q1, DO1, DELTA1, LSE1,
314
+ dk, dv, k, v,
315
+ off_zq, off_hq1, offs_n1, offs_m1,
316
+ stride_qm, stride_qd, stride_dom, stride_dod,
317
+ q_indices, sparse_q_num_blocks,
318
+ MATMUL_PRECISION,
319
+ IS_FULL_BLOCKS=True,
320
+ )
321
+
322
+ # Write back dV and dK.
323
+ dv_ptrs = DV + offs_n1[:, None] * stride_dvm + offs_v[None, :] * stride_dvd
324
+
325
+ index_n = offs_n1[:, None]
326
+ index_k = offs_k[None, :]
327
+ index_v = offs_v[None, :]
328
+
329
+ if IS_DIVISIBLE and SAFE_HEAD_DIM:
330
+ tl.store(dv_ptrs, dv)
331
+ else:
332
+ tl.store(dv_ptrs, dv, mask=(index_n < KV_LEN) & (index_v < V_HEAD_DIM))
333
+
334
+ dk *= SM_SCALE
335
+
336
+ if SAFE_HEAD_DIM:
337
+ mask = index_n < KV_LEN
338
+ else:
339
+ mask = (index_n < KV_LEN) & (index_k < QK_HEAD_DIM)
340
+
341
+ # first compute broadcasted dk of shape [Bq, Hkv, KV_LEN, V_HEAD_DIM]
342
+ # then reduce to dk of shape [Bkv, Hkv, KV_LEN, V_HEAD_DIM]
343
+ tl.static_assert(dk.shape == [BLOCK_N1, QK_HEAD_DIM_ROUNDED])
344
+ xindex = index_k + 128*index_n + 128*off_hkv*ks1 + 1024*off_zq*ks1
345
+ tl.store(out_ptr0 + (tl.broadcast_to(index_k + 128*off_hkv + 1024*index_n, dk.shape)), dk, mask)
346
+
347
+ @triton.jit
348
+ def bwd_dq_inner(
349
+ arg_Q, arg_K, arg_V, arg_LSE, arg_DELTA, arg_DO, arg_DQ, arg_DV, arg_KV_NUM_BLKS, arg_KV_IDX, arg_Q_NUM_BLKS, arg_Q_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, arg_FULL_Q_NUM_BLKS, arg_FULL_Q_IDX, out_ptr0, ks0, ks1,
350
+ K, V, # pointers
351
+ dq, q, do, Di, lse,
352
+ off_z, off_hq, offs_m2, offs_n2,
353
+ stride_kn, stride_kd, stride_vn, stride_vd,
354
+ kv_indices, sparse_kv_num_blocks,
355
+ MATMUL_PRECISION,
356
+ IS_FULL_BLOCKS,
357
+ ):
358
+ PRESCALE_QK : tl.constexpr = False
359
+ ROWS_GUARANTEED_SAFE : tl.constexpr = False
360
+ BLOCKS_ARE_CONTIGUOUS : tl.constexpr = False
361
+ WRITE_DQ : tl.constexpr = True
362
+ OUTPUT_LOGSUMEXP : tl.constexpr = True
363
+ OUTPUT_MAX : tl.constexpr = False
364
+ FLOAT32_PRECISION : tl.constexpr = 'tf32'
365
+ IS_DIVISIBLE : tl.constexpr = False
366
+ SM_SCALE : tl.constexpr = 0.08838834764831845
367
+ GQA_SHARED_HEADS : tl.constexpr = 4
368
+ HAS_FULL_BLOCKS : tl.constexpr = True
369
+ QK_HEAD_DIM : tl.constexpr = 128
370
+ QK_HEAD_DIM_ROUNDED : tl.constexpr = 128
371
+ V_HEAD_DIM : tl.constexpr = 128
372
+ V_HEAD_DIM_ROUNDED : tl.constexpr = 128
373
+ SAFE_HEAD_DIM : tl.constexpr = True
374
+ BLOCK_M1 : tl.constexpr = 64
375
+ BLOCK_N1 : tl.constexpr = 128
376
+ BLOCK_M2 : tl.constexpr = 128
377
+ BLOCK_N2 : tl.constexpr = 64
378
+ SPARSE_Q_BLOCK_SIZE : tl.constexpr = 128
379
+ SPARSE_KV_BLOCK_SIZE : tl.constexpr = 128
380
+ INDEX_DTYPE : tl.constexpr = tl.int32
381
+
382
+ SPARSE_KV_MULTIPLE: tl.constexpr = (SPARSE_KV_BLOCK_SIZE // BLOCK_N2)
383
+ RCP_LN2: tl.constexpr = 1.44269504
384
+ Q_LEN = ks0
385
+ KV_LEN = ks1
386
+
387
+ offs_k = tl.arange(0, QK_HEAD_DIM_ROUNDED)
388
+ offs_v = tl.arange(0, V_HEAD_DIM_ROUNDED)
389
+
390
+ kT_ptrs = K + offs_n2[None, :] * stride_kn + offs_k[:, None] * stride_kd
391
+ vT_ptrs = V + offs_n2[None, :] * stride_vn + offs_v[:, None] * stride_vd
392
+ # BLOCK_M2 must be a multiple of BLOCK_N2, otherwise the code wouldn't work.
393
+ tl.static_assert(BLOCK_M2 % BLOCK_N2 == 0)
394
+
395
+ hi = tl.minimum(sparse_kv_num_blocks * SPARSE_KV_MULTIPLE, tl.maximum(tl.cdiv(KV_LEN, BLOCK_N2), 1))
396
+
397
+ for start_n in range(0, hi):
398
+ dq = bwd_dq_block_mn(
399
+ arg_Q, arg_K, arg_V, arg_LSE, arg_DELTA, arg_DO, arg_DQ, arg_DV, arg_KV_NUM_BLKS, arg_KV_IDX, arg_Q_NUM_BLKS, arg_Q_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, arg_FULL_Q_NUM_BLKS, arg_FULL_Q_IDX, out_ptr0, ks0, ks1,
400
+ dq, q, kT_ptrs, vT_ptrs, do, Di, lse, Q_LEN, KV_LEN,
401
+ off_z, off_hq, offs_m2, offs_n2, offs_k, offs_v,
402
+ stride_kn, stride_kd, stride_vn, stride_vd,
403
+ kv_indices, sparse_kv_num_blocks,
404
+ MATMUL_PRECISION, RCP_LN2,
405
+ IS_FULL_BLOCKS,
406
+ )
407
+
408
+ # Increment pointers.
409
+ offset = get_offset_for_next_block(
410
+ start_n, kv_indices, sparse_kv_num_blocks,
411
+ SPARSE_KV_BLOCK_SIZE, SPARSE_KV_MULTIPLE, BLOCK_N2, BLOCKS_ARE_CONTIGUOUS
412
+ )
413
+
414
+ kT_ptrs += offset * stride_kn
415
+ vT_ptrs += offset * stride_vn
416
+
417
+ offs_n2 += offset
418
+
419
+ return dq
420
+
421
+
422
+ @triton.jit
423
+ def bwd_dq_block_mn(
424
+ arg_Q, arg_K, arg_V, arg_LSE, arg_DELTA, arg_DO, arg_DQ, arg_DV, arg_KV_NUM_BLKS, arg_KV_IDX, arg_Q_NUM_BLKS, arg_Q_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, arg_FULL_Q_NUM_BLKS, arg_FULL_Q_IDX, out_ptr0, ks0, ks1,
425
+ dq, q, kT_ptrs, vT_ptrs, do, Di, lse, Q_LEN, KV_LEN,
426
+ off_z, off_hq, offs_m2, offs_n2, offs_k, offs_v,
427
+ stride_kn, stride_kd, stride_vn, stride_vd,
428
+ kv_indices, sparse_kv_num_blocks,
429
+ MATMUL_PRECISION, RCP_LN2,
430
+ IS_FULL_BLOCKS,
431
+ ):
432
+ PRESCALE_QK : tl.constexpr = False
433
+ ROWS_GUARANTEED_SAFE : tl.constexpr = False
434
+ BLOCKS_ARE_CONTIGUOUS : tl.constexpr = False
435
+ WRITE_DQ : tl.constexpr = True
436
+ OUTPUT_LOGSUMEXP : tl.constexpr = True
437
+ OUTPUT_MAX : tl.constexpr = False
438
+ FLOAT32_PRECISION : tl.constexpr = 'tf32'
439
+ IS_DIVISIBLE : tl.constexpr = False
440
+ SM_SCALE : tl.constexpr = 0.08838834764831845
441
+ GQA_SHARED_HEADS : tl.constexpr = 4
442
+ HAS_FULL_BLOCKS : tl.constexpr = True
443
+ QK_HEAD_DIM : tl.constexpr = 128
444
+ QK_HEAD_DIM_ROUNDED : tl.constexpr = 128
445
+ V_HEAD_DIM : tl.constexpr = 128
446
+ V_HEAD_DIM_ROUNDED : tl.constexpr = 128
447
+ SAFE_HEAD_DIM : tl.constexpr = True
448
+ BLOCK_M1 : tl.constexpr = 64
449
+ BLOCK_N1 : tl.constexpr = 128
450
+ BLOCK_M2 : tl.constexpr = 128
451
+ BLOCK_N2 : tl.constexpr = 64
452
+ SPARSE_Q_BLOCK_SIZE : tl.constexpr = 128
453
+ SPARSE_KV_BLOCK_SIZE : tl.constexpr = 128
454
+ INDEX_DTYPE : tl.constexpr = tl.int32
455
+
456
+
457
+ # NB reversed order to since K is transposed
458
+ kT = load_checked_2d(kT_ptrs, offs_k, offs_n2, None, None, SAFE_HEAD_DIM, IS_DIVISIBLE, QK_HEAD_DIM, KV_LEN)
459
+ qk = tl.dot(q, kT, input_precision=FLOAT32_PRECISION)
460
+ if not PRESCALE_QK:
461
+ qk *= SM_SCALE
462
+ # ~~~~~~~~~~~~~~~~~~~ Apply score modification ~~~~~~~~~~~~~~~~~~~
463
+ pre_mod_scores = qk
464
+ n = get_bounded_indices(offs_n2[None, :], KV_LEN if not IS_DIVISIBLE else None)
465
+ # The boundary check is done for the outer loop, but here it's possible since we're iterating across N dim
466
+ # that the M reads out of bounds for the PIDS spanning the Q_LEN boundary
467
+ m = get_bounded_indices(offs_m2[:, None], Q_LEN if not IS_DIVISIBLE else None)
468
+
469
+ tmp0 = (qk)
470
+ post_mod_scores = tmp0
471
+
472
+
473
+
474
+
475
+ if not IS_DIVISIBLE:
476
+ post_mod_scores = tl.where(offs_n2[None, :] < KV_LEN, post_mod_scores, float("-inf"))
477
+
478
+ if not IS_FULL_BLOCKS:
479
+ tmp1 = (m)
480
+ tmp2 = tl.full([1], 0, tl.int32)
481
+ tmp3 = tmp1 < tmp2
482
+ tmp4 = (n)
483
+ tmp5 = tmp4 <= tmp1
484
+ tmp6 = tmp3 & tmp5
485
+ tmp7 = tmp1 >= tmp2
486
+ tmp8 = tmp4 < tmp2
487
+ tmp9 = tmp7 & tmp8
488
+ tmp10 = tmp8 == 0
489
+ tmp11 = tmp7 & tmp10
490
+ tmp12 = tmp1 - tmp2
491
+ tmp13 = tl.full([1], 16, tl.int32)
492
+ tmp14 = tl.where((tmp12 < 0) != (tmp13 < 0), tl.where(tmp12 % tmp13 != 0, tmp12 // tmp13 - 1, tmp12 // tmp13), tmp12 // tmp13)
493
+ tmp15 = tmp4 - tmp2
494
+ tmp16 = tl.where((tmp15 < 0) != (tmp13 < 0), tl.where(tmp15 % tmp13 != 0, tmp15 // tmp13 - 1, tmp15 // tmp13), tmp15 // tmp13)
495
+ tmp17 = tmp14 == tmp16
496
+ tmp18 = tmp11 & tmp17
497
+ tmp19 = tmp9 | tmp18
498
+ tmp20 = tmp6 | tmp19
499
+ mask_mod_output = tmp20
500
+
501
+
502
+ # apply mask for partial masked block
503
+ post_mod_scores = tl.where(mask_mod_output, post_mod_scores, float("-inf"))
504
+ # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
505
+ if not PRESCALE_QK:
506
+ post_mod_scores *= RCP_LN2
507
+ p = tl.math.exp2(post_mod_scores - lse)
508
+ # Compute dP and dS.
509
+ # NB reversed order to since V is transposed
510
+ vT = load_checked_2d(vT_ptrs, offs_v, offs_n2, None, None, SAFE_HEAD_DIM, IS_DIVISIBLE, V_HEAD_DIM, KV_LEN)
511
+
512
+ dp = tl.dot(do, vT, input_precision=FLOAT32_PRECISION)
513
+ ds = p * (dp - Di[:, None])
514
+ # ~~~~~~~~~~~~~~~~~~~ Apply joint modification ~~~~~~~~~~~~~~~~~~~
515
+ tmp21 = (ds)
516
+ grad_scores = tmp21
517
+
518
+
519
+ if not IS_DIVISIBLE:
520
+ grad_scores = tl.where(offs_n2[None, :] < KV_LEN, grad_scores, 0.0)
521
+
522
+ # ~~~~~~~~~~~~~~~~~~~ Apply other buffer grad writes ~~~~~~~~~~~~~
523
+ if WRITE_DQ:
524
+ scatter_mask = (offs_m2[:, None] < Q_LEN ) & (offs_n2[None, :] < KV_LEN)
525
+
526
+ # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
527
+ ds = grad_scores
528
+
529
+ if not IS_FULL_BLOCKS:
530
+ # (grads) apply mask for partially unmasked block
531
+ ds = tl.where(mask_mod_output, ds, 0.0)
532
+ # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
533
+ ds = ds.to(MATMUL_PRECISION)
534
+ # Compute dQ.
535
+ dq += tl.dot(ds, tl.trans(kT), input_precision=FLOAT32_PRECISION)
536
+
537
+ return dq
538
+
539
+
540
+ @triton.jit
541
+ def bwd_dkdv_inner(
542
+ arg_Q, arg_K, arg_V, arg_LSE, arg_DELTA, arg_DO, arg_DQ, arg_DV, arg_KV_NUM_BLKS, arg_KV_IDX, arg_Q_NUM_BLKS, arg_Q_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, arg_FULL_Q_NUM_BLKS, arg_FULL_Q_IDX, out_ptr0, ks0, ks1,
543
+ Q, DO, DELTA, LSE, # pointers
544
+ dk, dv, k, v,
545
+ off_z, off_hq, offs_n1, offs_m1,
546
+ stride_qm, stride_qd, stride_dom, stride_dod,
547
+ q_indices, sparse_q_num_blocks,
548
+ MATMUL_PRECISION,
549
+ IS_FULL_BLOCKS,
550
+ ):
551
+ PRESCALE_QK : tl.constexpr = False
552
+ ROWS_GUARANTEED_SAFE : tl.constexpr = False
553
+ BLOCKS_ARE_CONTIGUOUS : tl.constexpr = False
554
+ WRITE_DQ : tl.constexpr = True
555
+ OUTPUT_LOGSUMEXP : tl.constexpr = True
556
+ OUTPUT_MAX : tl.constexpr = False
557
+ FLOAT32_PRECISION : tl.constexpr = 'tf32'
558
+ IS_DIVISIBLE : tl.constexpr = False
559
+ SM_SCALE : tl.constexpr = 0.08838834764831845
560
+ GQA_SHARED_HEADS : tl.constexpr = 4
561
+ HAS_FULL_BLOCKS : tl.constexpr = True
562
+ QK_HEAD_DIM : tl.constexpr = 128
563
+ QK_HEAD_DIM_ROUNDED : tl.constexpr = 128
564
+ V_HEAD_DIM : tl.constexpr = 128
565
+ V_HEAD_DIM_ROUNDED : tl.constexpr = 128
566
+ SAFE_HEAD_DIM : tl.constexpr = True
567
+ BLOCK_M1 : tl.constexpr = 64
568
+ BLOCK_N1 : tl.constexpr = 128
569
+ BLOCK_M2 : tl.constexpr = 128
570
+ BLOCK_N2 : tl.constexpr = 64
571
+ SPARSE_Q_BLOCK_SIZE : tl.constexpr = 128
572
+ SPARSE_KV_BLOCK_SIZE : tl.constexpr = 128
573
+ INDEX_DTYPE : tl.constexpr = tl.int32
574
+
575
+ SPARSE_Q_MULTIPLE: tl.constexpr = (SPARSE_Q_BLOCK_SIZE // BLOCK_M1)
576
+ RCP_LN2: tl.constexpr = 1.44269504
577
+ Q_LEN = ks0
578
+ KV_LEN = ks1
579
+
580
+ offs_k = tl.arange(0, QK_HEAD_DIM_ROUNDED)
581
+ offs_v = tl.arange(0, V_HEAD_DIM_ROUNDED)
582
+
583
+ qT_ptrs = Q + offs_m1[None, :] * stride_qm + offs_k[:, None] * stride_qd
584
+ do_ptrs = DO + offs_m1[:, None] * stride_dom + offs_v[None, :] * stride_dod
585
+ # BLOCK_N1 must be a multiple of BLOCK_M1, otherwise the code wouldn't work.
586
+ tl.static_assert(BLOCK_N1 % BLOCK_M1 == 0)
587
+
588
+ # The minimum is needed to handle the case where we run with a super large
589
+ # SPARSE_BLOCK_SIZE (i.e. no block-mask!)
590
+ hi = tl.minimum(sparse_q_num_blocks * SPARSE_Q_MULTIPLE, tl.maximum(tl.cdiv(Q_LEN, BLOCK_M1), 1))
591
+
592
+ for start_m in range(0, hi):
593
+ dk, dv = bwd_dkdv_block_mn(
594
+ arg_Q, arg_K, arg_V, arg_LSE, arg_DELTA, arg_DO, arg_DQ, arg_DV, arg_KV_NUM_BLKS, arg_KV_IDX, arg_Q_NUM_BLKS, arg_Q_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, arg_FULL_Q_NUM_BLKS, arg_FULL_Q_IDX, out_ptr0, ks0, ks1,
595
+ dk, dv, qT_ptrs, k, v, do_ptrs, DELTA, LSE, Q_LEN, KV_LEN,
596
+ off_z, off_hq, offs_n1, offs_m1, offs_k, offs_v,
597
+ stride_qm, stride_qd, stride_dom, stride_dod,
598
+ q_indices, sparse_q_num_blocks,
599
+ MATMUL_PRECISION, RCP_LN2,
600
+ IS_FULL_BLOCKS,
601
+ )
602
+ # Increment pointers.
603
+ offset = get_offset_for_next_block(
604
+ start_m, q_indices, sparse_q_num_blocks,
605
+ SPARSE_Q_BLOCK_SIZE, SPARSE_Q_MULTIPLE, BLOCK_M1, BLOCKS_ARE_CONTIGUOUS
606
+ )
607
+
608
+ qT_ptrs += offset * stride_qm
609
+ do_ptrs += offset * stride_dom
610
+ offs_m1 += offset
611
+
612
+ return dk, dv
613
+
614
+
615
+ @triton.jit
616
+ def bwd_dkdv_block_mn(
617
+ arg_Q, arg_K, arg_V, arg_LSE, arg_DELTA, arg_DO, arg_DQ, arg_DV, arg_KV_NUM_BLKS, arg_KV_IDX, arg_Q_NUM_BLKS, arg_Q_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, arg_FULL_Q_NUM_BLKS, arg_FULL_Q_IDX, out_ptr0, ks0, ks1,
618
+ dk, dv, qT_ptrs, k, v, do_ptrs, DELTA, LSE, Q_LEN, KV_LEN,
619
+ off_z, off_hq, offs_n1, offs_m1, offs_k, offs_v,
620
+ stride_qm, stride_qd, stride_dom, stride_dod,
621
+ q_indices, sparse_q_num_blocks,
622
+ MATMUL_PRECISION, RCP_LN2,
623
+ IS_FULL_BLOCKS,
624
+ ):
625
+ PRESCALE_QK : tl.constexpr = False
626
+ ROWS_GUARANTEED_SAFE : tl.constexpr = False
627
+ BLOCKS_ARE_CONTIGUOUS : tl.constexpr = False
628
+ WRITE_DQ : tl.constexpr = True
629
+ OUTPUT_LOGSUMEXP : tl.constexpr = True
630
+ OUTPUT_MAX : tl.constexpr = False
631
+ FLOAT32_PRECISION : tl.constexpr = 'tf32'
632
+ IS_DIVISIBLE : tl.constexpr = False
633
+ SM_SCALE : tl.constexpr = 0.08838834764831845
634
+ GQA_SHARED_HEADS : tl.constexpr = 4
635
+ HAS_FULL_BLOCKS : tl.constexpr = True
636
+ QK_HEAD_DIM : tl.constexpr = 128
637
+ QK_HEAD_DIM_ROUNDED : tl.constexpr = 128
638
+ V_HEAD_DIM : tl.constexpr = 128
639
+ V_HEAD_DIM_ROUNDED : tl.constexpr = 128
640
+ SAFE_HEAD_DIM : tl.constexpr = True
641
+ BLOCK_M1 : tl.constexpr = 64
642
+ BLOCK_N1 : tl.constexpr = 128
643
+ BLOCK_M2 : tl.constexpr = 128
644
+ BLOCK_N2 : tl.constexpr = 64
645
+ SPARSE_Q_BLOCK_SIZE : tl.constexpr = 128
646
+ SPARSE_KV_BLOCK_SIZE : tl.constexpr = 128
647
+ INDEX_DTYPE : tl.constexpr = tl.int32
648
+
649
+
650
+ # NB reversed order since Q is transposed
651
+ qT = load_checked_2d(qT_ptrs, offs_k, offs_m1, None, None, SAFE_HEAD_DIM, IS_DIVISIBLE, QK_HEAD_DIM, Q_LEN)
652
+ # Load LSE before computing qk to reduce pipeline stall.
653
+ if IS_DIVISIBLE:
654
+ lse = tl.load(LSE + offs_m1)
655
+ else:
656
+ lse = tl.load(LSE + offs_m1, mask=offs_m1 < Q_LEN)
657
+ lse = tl.where(lse == -float("inf"), 0.0, lse)
658
+ qkT = tl.dot(k, qT, input_precision=FLOAT32_PRECISION)
659
+ if not PRESCALE_QK:
660
+ qkT *= SM_SCALE
661
+ # ~~~~~~~~~~~~~~~~~~~ Apply score modification ~~~~~~~~~~~~~~~~~~~
662
+ m = get_bounded_indices(offs_m1[None, :], Q_LEN if not IS_DIVISIBLE else None)
663
+ # The boundary check is done for the outer loop, but here it's possible since we're iterating across M dim
664
+ # that the n reads out of bounds for the PIDS spanning the KV_LEN boundary
665
+ n = get_bounded_indices(offs_n1[:, None], KV_LEN if not IS_DIVISIBLE else None)
666
+
667
+ pre_mod_scores = qkT
668
+ tmp22 = (qkT)
669
+ post_mod_scores = tmp22
670
+
671
+
672
+
673
+ if not IS_DIVISIBLE:
674
+ post_mod_scores = tl.where(offs_m1[None, :] < Q_LEN, post_mod_scores, float("-inf"))
675
+
676
+ if not IS_FULL_BLOCKS:
677
+ tmp23 = (m)
678
+ tmp24 = tl.full([1], 0, tl.int32)
679
+ tmp25 = tmp23 < tmp24
680
+ tmp26 = (n)
681
+ tmp27 = tmp26 <= tmp23
682
+ tmp28 = tmp25 & tmp27
683
+ tmp29 = tmp23 >= tmp24
684
+ tmp30 = tmp26 < tmp24
685
+ tmp31 = tmp29 & tmp30
686
+ tmp32 = tmp30 == 0
687
+ tmp33 = tmp29 & tmp32
688
+ tmp34 = tmp23 - tmp24
689
+ tmp35 = tl.full([1], 16, tl.int32)
690
+ tmp36 = tl.where((tmp34 < 0) != (tmp35 < 0), tl.where(tmp34 % tmp35 != 0, tmp34 // tmp35 - 1, tmp34 // tmp35), tmp34 // tmp35)
691
+ tmp37 = tmp26 - tmp24
692
+ tmp38 = tl.where((tmp37 < 0) != (tmp35 < 0), tl.where(tmp37 % tmp35 != 0, tmp37 // tmp35 - 1, tmp37 // tmp35), tmp37 // tmp35)
693
+ tmp39 = tmp36 == tmp38
694
+ tmp40 = tmp33 & tmp39
695
+ tmp41 = tmp31 | tmp40
696
+ tmp42 = tmp28 | tmp41
697
+ mask_mod_output = tmp42
698
+
699
+ # (grads) apply mask for fully masked block
700
+ post_mod_scores = tl.where(mask_mod_output, post_mod_scores, float("-inf"))
701
+ # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
702
+ if not PRESCALE_QK:
703
+ post_mod_scores *= RCP_LN2
704
+ pT = tl.math.exp2(post_mod_scores - lse[None, :])
705
+ do = load_checked_2d(do_ptrs, offs_m1, offs_v, None, None, IS_DIVISIBLE, SAFE_HEAD_DIM, Q_LEN, V_HEAD_DIM)
706
+ # Compute dV.
707
+ ppT = pT
708
+ dv += tl.dot(ppT.to(MATMUL_PRECISION), do, input_precision=FLOAT32_PRECISION)
709
+ if IS_DIVISIBLE:
710
+ Di = tl.load(DELTA + offs_m1)
711
+ else:
712
+ Di = tl.load(DELTA + offs_m1, mask=offs_m1 < Q_LEN)
713
+ # Compute dP and dS.
714
+ dpT = tl.dot(v, tl.trans(do), input_precision=FLOAT32_PRECISION)
715
+ dsT = pT * (dpT - Di[None, :])
716
+ # ~~~~~~~~~~~~~~~~~~~ Apply joint modification ~~~~~~~~~~~~~~~~~~~
717
+ tmp43 = (dsT)
718
+ grad_scores = tmp43
719
+
720
+
721
+
722
+ if not IS_DIVISIBLE:
723
+ grad_scores = tl.where(offs_m1[None, :] < Q_LEN, grad_scores, 0.0)
724
+
725
+ # ~~~~~~~~~~~~~~~~~~~ Apply other buffer grad writes ~~~~~~~~~~~~~
726
+ if not WRITE_DQ:
727
+ idx_b = off_z
728
+ idx_h = off_hq
729
+ idx_m = m
730
+ idx_n = n
731
+ scatter_mask = (offs_m1[None, :] < Q_LEN) & (offs_n1[:, None] < KV_LEN)
732
+
733
+ # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
734
+ dsT = grad_scores
735
+ if not IS_FULL_BLOCKS:
736
+ # (grads) apply mask for partially unmasked block
737
+ dsT = tl.where(mask_mod_output, dsT, 0.0)
738
+ # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
739
+ dk += tl.dot(dsT.to(MATMUL_PRECISION), tl.trans(qT), input_precision=FLOAT32_PRECISION)
740
+
741
+ return dk, dv
742
+
743
+ # Utility triton funcs
744
+ @triton.jit
745
+ def get_offset_for_next_block(
746
+ loop_iter, col_indices, total_blocks,
747
+ SPARSE_BLOCK, SPARSE_BLOCK_MULTIPLE, BLOCK,
748
+ BLOCKS_ARE_CONTIGUOUS: tl.constexpr
749
+ ):
750
+ if BLOCKS_ARE_CONTIGUOUS:
751
+ return BLOCK
752
+ cur_block_idx = loop_iter // SPARSE_BLOCK_MULTIPLE
753
+ cur_block = tl.load(col_indices + cur_block_idx, eviction_policy="evict_last")
754
+ next_block = tl.load(col_indices + cur_block_idx + 1, eviction_policy="evict_last", mask=cur_block_idx + 1 < total_blocks)
755
+ needs_jump = (loop_iter + 1) % SPARSE_BLOCK_MULTIPLE == 0
756
+ jump_to_block = (next_block - cur_block ) * SPARSE_BLOCK - (SPARSE_BLOCK_MULTIPLE - 1) * BLOCK
757
+ offset = jump_to_block * needs_jump + (1 - needs_jump) * BLOCK
758
+ return offset
759
+
760
+ @triton.jit
761
+ def get_bounded_indices(indices, max_len=None):
762
+ return indices % max_len if max_len is not None else indices
763
+
764
+ @triton.jit
765
+ def load_checked_block(block_ptr, IS_DIVISIBLE: tl.constexpr, SAFE_HEAD_DIM: tl.constexpr):
766
+ if IS_DIVISIBLE and SAFE_HEAD_DIM:
767
+ return tl.load(block_ptr)
768
+ elif IS_DIVISIBLE and not SAFE_HEAD_DIM:
769
+ return tl.load(block_ptr, boundary_check=(1,), padding_option="zero")
770
+ elif not IS_DIVISIBLE and SAFE_HEAD_DIM:
771
+ return tl.load(block_ptr, boundary_check=(0,), padding_option="zero")
772
+ else:
773
+ return tl.load(block_ptr, boundary_check=(0, 1), padding_option="zero")
774
+
775
+ @triton.jit
776
+ def load_checked_2d(
777
+ ptr,
778
+ offs_m,
779
+ offs_n,
780
+ stride_m,
781
+ stride_n,
782
+ IS_DIVISIBLE_M: tl.constexpr,
783
+ IS_DIVISIBLE_N: tl.constexpr,
784
+ M_LEN: tl.constexpr,
785
+ N_LEN: tl.constexpr,
786
+ ):
787
+ # Calculate final pointer if strides are provided
788
+ if stride_m is not None and stride_n is not None:
789
+ ptr = ptr + offs_m[:, None] * stride_m + offs_n[None, :] * stride_n
790
+
791
+ # Handle all masking cases
792
+ if not IS_DIVISIBLE_M and not IS_DIVISIBLE_N:
793
+ return tl.load(ptr, mask=(offs_m[:, None] < M_LEN) & (offs_n[None, :] < N_LEN), other=0.0)
794
+ elif IS_DIVISIBLE_M and not IS_DIVISIBLE_N:
795
+ return tl.load(ptr, mask=(offs_n[None, :] < N_LEN), other=0.0)
796
+ elif not IS_DIVISIBLE_M and IS_DIVISIBLE_N:
797
+ return tl.load(ptr, mask=(offs_m[:, None] < M_LEN), other=0.0)
798
+ else: # Both divisible
799
+ return tl.load(ptr)
progress/SpecForge/cache/compiled_kernels/2y/c2yhndikcsebqfmbw7l44gmcdoyw7ogaqt7quyeygz3mp5w6u6ke.py ADDED
@@ -0,0 +1,715 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # AOT ID: ['4_inference']
2
+ from ctypes import c_void_p, c_long, c_int
3
+ import torch
4
+ import math
5
+ import random
6
+ import os
7
+ import tempfile
8
+ from math import inf, nan
9
+ from cmath import nanj
10
+ from torch._inductor.hooks import run_intermediate_hooks
11
+ from torch._inductor.utils import maybe_profile
12
+ from torch._inductor.codegen.memory_planning import _align as align
13
+ from torch import device, empty_strided
14
+ from torch._inductor.async_compile import AsyncCompile
15
+ from torch._inductor.select_algorithm import extern_kernels
16
+ import triton
17
+ import triton.language as tl
18
+ from torch._inductor.runtime.triton_heuristics import start_graph, end_graph
19
+ from torch._C import _cuda_getCurrentRawStream as get_raw_stream
20
+
21
+ aten = torch.ops.aten
22
+ inductor_ops = torch.ops.inductor
23
+ _quantized = torch.ops._quantized
24
+ assert_size_stride = torch._C._dynamo.guards.assert_size_stride
25
+ assert_alignment = torch._C._dynamo.guards.assert_alignment
26
+ empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
27
+ empty_strided_cpu_pinned = torch._C._dynamo.guards._empty_strided_cpu_pinned
28
+ empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
29
+ empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu
30
+ empty_strided_mtia = torch._C._dynamo.guards._empty_strided_mtia
31
+ reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor
32
+ alloc_from_pool = torch.ops.inductor._alloc_from_pool
33
+ async_compile = AsyncCompile()
34
+ empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p
35
+
36
+
37
+ # kernel path: /workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6n/c6n4rf57opno6rcuedu4jk4etcok4ti2tlaztx2ht3z5eydc3vae.py
38
+ # Topologically Sorted Source Nodes: [flex_attention], Original ATen: []
39
+ # Source node to ATen node mapping:
40
+ # flex_attention => flex_attention
41
+ # Graph fragment:
42
+ # %arg1_1 : Tensor "bf16[1, 32, s37, 128][4096*s37, 128, 4096, 1]cuda:1" = PlaceHolder[target=arg1_1]
43
+ # %arg3_1 : Tensor "bf16[1, 8, s0, 128][1024*s0, 128, 1024, 1]cuda:1" = PlaceHolder[target=arg3_1]
44
+ # %arg5_1 : Tensor "bf16[1, 8, s43, 128][1024*s43, 128, 1024, 1]cuda:1" = PlaceHolder[target=arg5_1]
45
+ # %getitem_1 : Tensor "f32[1, 32, s37][32*s37, s37, 1]cuda:1" = PlaceHolder[target=getitem_1]
46
+ # %buf1 : Tensor "f32[1, 32, s37][32*s37, s37, 1]cuda:1" = PlaceHolder[target=buf1]
47
+ # %arg9_1 : Tensor "i32[1, 1, 1][1, 1, 1]cuda:1" = PlaceHolder[target=arg9_1]
48
+ # %arg6_1 : Tensor "i32[1, 1, 1, 1][1, 1, 1, 1]cuda:1" = PlaceHolder[target=arg6_1]
49
+ # %arg10_1 : Tensor "i32[1, 1, 1][1, 1, 1]cuda:1" = PlaceHolder[target=arg10_1]
50
+ # %arg11_1 : Tensor "i32[1, 1, 1, 1][1, 1, 1, 1]cuda:1" = PlaceHolder[target=arg11_1]
51
+ # %flex_attention : [num_users=2] = call_function[target=torch.ops.higher_order.flex_attention](args = (%arg1_1, %arg3_1, %arg5_1, %sdpa_score0, (%arg7_1, %arg8_1, %arg9_1, %arg6_1, %arg10_1, %arg11_1, %arg12_1, %arg13_1, %arg14_1, %arg15_1, 128, 128, %sdpa_mask0), 0.08838834764831845, {PRESCALE_QK: False, ROWS_GUARANTEED_SAFE: False, BLOCKS_ARE_CONTIGUOUS: False, WRITE_DQ: True, OUTPUT_LOGSUMEXP: True, OUTPUT_MAX: False}, (), ()), kwargs = {})
52
+ # return %getitem
53
+ triton_tem_fused_0 = async_compile.triton('triton_tem_fused_0', '''
54
+ import triton
55
+ import triton.language as tl
56
+
57
+ from torch._inductor.runtime import triton_helpers, triton_heuristics
58
+ from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
59
+ from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
60
+
61
+ @triton_heuristics.template(
62
+
63
+ num_stages=3,
64
+ num_warps=8,
65
+ triton_meta={'signature': {'arg_Q': '*bf16', 'arg_K': '*bf16', 'arg_V': '*bf16', 'arg_LSE': '*fp32', 'arg_MAX': '*fp32', 'arg_KV_NUM_BLKS': '*i32', 'arg_KV_IDX': '*i32', 'arg_FULL_KV_NUM_BLKS': '*i32', 'arg_FULL_KV_IDX': '*i32', 'out_ptr0': '*bf16', 'ks0': 'i32', 'ks1': 'i32', 'ks2': 'i32'}, 'device': DeviceProperties(type='cuda', index=1, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]], (7,): [['tt.divisibility', 16]], (8,): [['tt.divisibility', 16]], (9,): [['tt.divisibility', 16]]}]},
66
+ inductor_meta={'kernel_name': 'triton_tem_fused_0', 'backend_hash': 'B0E5936CA26D1BCD1B577D0B65F13FE7553DC941EB93358973E9F1902BC212C5', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'grid_type': 'FixedGrid', 'fixed_grid': ['_grid_0', '_grid_1', '_grid_2'], 'extra_launcher_args': ['_grid_0', '_grid_1', '_grid_2'], 'config_args': {'PRESCALE_QK': False, 'ROWS_GUARANTEED_SAFE': False, 'BLOCKS_ARE_CONTIGUOUS': False, 'WRITE_DQ': True, 'OUTPUT_LOGSUMEXP': True, 'OUTPUT_MAX': False, 'FLOAT32_PRECISION': "'ieee'", 'IS_DIVISIBLE': False, 'SM_SCALE': 0.08838834764831845, 'GQA_SHARED_HEADS': 4, 'HAS_FULL_BLOCKS': True, 'QK_HEAD_DIM': 128, 'QK_HEAD_DIM_ROUNDED': 128, 'V_HEAD_DIM': 128, 'V_HEAD_DIM_ROUNDED': 128, 'SAFE_HEAD_DIM': True, 'USE_TMA': False, 'BLOCK_M': 128, 'BLOCK_N': 64, 'SPARSE_Q_BLOCK_SIZE': 128, 'SPARSE_KV_BLOCK_SIZE': 128}},
67
+
68
+ )
69
+ @triton.jit
70
+ def triton_tem_fused_0(arg_Q, arg_K, arg_V, arg_LSE, arg_MAX, arg_KV_NUM_BLKS, arg_KV_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, out_ptr0, ks0, ks1, ks2):
71
+ PRESCALE_QK : tl.constexpr = False
72
+ ROWS_GUARANTEED_SAFE : tl.constexpr = False
73
+ BLOCKS_ARE_CONTIGUOUS : tl.constexpr = False
74
+ WRITE_DQ : tl.constexpr = True
75
+ OUTPUT_LOGSUMEXP : tl.constexpr = True
76
+ OUTPUT_MAX : tl.constexpr = False
77
+ FLOAT32_PRECISION : tl.constexpr = 'ieee'
78
+ IS_DIVISIBLE : tl.constexpr = False
79
+ SM_SCALE : tl.constexpr = 0.08838834764831845
80
+ GQA_SHARED_HEADS : tl.constexpr = 4
81
+ HAS_FULL_BLOCKS : tl.constexpr = True
82
+ QK_HEAD_DIM : tl.constexpr = 128
83
+ QK_HEAD_DIM_ROUNDED : tl.constexpr = 128
84
+ V_HEAD_DIM : tl.constexpr = 128
85
+ V_HEAD_DIM_ROUNDED : tl.constexpr = 128
86
+ SAFE_HEAD_DIM : tl.constexpr = True
87
+ USE_TMA : tl.constexpr = False
88
+ BLOCK_M : tl.constexpr = 128
89
+ BLOCK_N : tl.constexpr = 64
90
+ SPARSE_Q_BLOCK_SIZE : tl.constexpr = 128
91
+ SPARSE_KV_BLOCK_SIZE : tl.constexpr = 128
92
+ INDEX_DTYPE : tl.constexpr = tl.int32
93
+ Q = arg_Q
94
+ K = arg_K
95
+ V = arg_V
96
+ LSE = arg_LSE
97
+ MAX = arg_MAX
98
+ KV_NUM_BLKS = arg_KV_NUM_BLKS
99
+ KV_IDX = arg_KV_IDX
100
+ FULL_KV_NUM_BLKS = arg_FULL_KV_NUM_BLKS
101
+ FULL_KV_IDX = arg_FULL_KV_IDX
102
+
103
+ # Sub notation for this kernel:
104
+ #
105
+ # Q: Query, K: Key, V: Value
106
+ # M: Number of queries, N: Number of keys/values, D: Model dimension
107
+ # QK_HEAD_DIM: The dimension of the query and key embeddings
108
+ # V_HEAD_DIM: The dimension of the value embeddings
109
+ # z: Batch size, h: Number of heads, m: Number of queries per head, k: Number of keys per head
110
+ # GQA_SHARED_HEADS: number of query heads sharing one kv head in GQA setups.
111
+ #
112
+ # The following FULL_* and PARTIAL_* is defined in the block sparse mask grid, rather than the thread block grid.
113
+ # KV_NUM_BLKS: The number of KV blocks (that may or may not require masking) for each query.
114
+ # KV_IDX: The indices of KV blocks (that may or may not require masking) for each query.
115
+ # FULL_KV_NUM_BLKS: The number of fully unmasked KV blocks (so we don't need masking) for each query.
116
+ # FULL_KV_IDX: The indices of fully unmasked KV blocks (so we don't need masking) for each query.
117
+ #
118
+ # OUTPUT_LOGSUMEXP: We only need to store the logsumexp if we require grad
119
+ #
120
+ # (Modifiable) Performance tuning options
121
+ # BLOCK_M: The thread block size across the seqlen dim of Q.
122
+ # BLOCK_N: Iterate over BLOCK_N across the seqlen dim of K/V in each thread block.
123
+
124
+ # The below are kernel options that can be applied for certain score_mods,
125
+ # or involve a numerics vs. perf tradeoff
126
+ # PRESCALE_QK: Whether to pre-scale QK by 1/sqrt(d) and change of base. Has
127
+ # about 20% more numerical error, but slightly faster.
128
+ # ROWS_GUARANTEED_SAFE: Is it guaranteed that at least one value in each row
129
+ # is not masked out? If so, we can skip an extra safety check
130
+ # BLOCKS_ARE_CONTIGUOUS: Is it guaranteed that all blocks in the mask are
131
+ # contiguous? If so, we don't need to do an indirect jump for every block
132
+
133
+ tl.static_assert(SPARSE_Q_BLOCK_SIZE >= BLOCK_M and SPARSE_Q_BLOCK_SIZE % BLOCK_M == 0)
134
+ tl.static_assert(SPARSE_KV_BLOCK_SIZE >= BLOCK_N and SPARSE_KV_BLOCK_SIZE % BLOCK_N == 0)
135
+
136
+ # Define strides of inputs
137
+ stride_qz, stride_qh, stride_qm, stride_qk = 4096*ks0, 128, 4096, 1
138
+ stride_kz, stride_kh, stride_kn, stride_kk = 1024*ks1, 128, 1024, 1
139
+ stride_vz, stride_vh, stride_vn, stride_vk = 1024*ks2, 128, 1024, 1
140
+
141
+ ZQ = 1
142
+ HQ = 32
143
+ Q_LEN = ks0
144
+ ZKV = 1
145
+ KV_LEN = ks1
146
+
147
+ MATMUL_PRECISION = Q.dtype.element_ty
148
+
149
+ q_start = tl.program_id(0).to(INDEX_DTYPE)
150
+ off_zq = tl.program_id(1).to(INDEX_DTYPE)
151
+ off_hq = tl.program_id(2).to(INDEX_DTYPE)
152
+
153
+ # We support two cases for batch dimension. a) (ZKV == ZQ) where off_zkv = off_zq.
154
+ # b) (ZKV == 1 and ZQ > 1) where KV is broadcasted along the batch dimension and off_zkv=0.
155
+ off_zkv = off_zq % ZKV
156
+ off_hkv = off_hq // GQA_SHARED_HEADS
157
+ off_g = off_hq % GQA_SHARED_HEADS
158
+
159
+ q_offset = off_zq * stride_qz + off_hq * stride_qh
160
+ k_offset = off_zkv * stride_kz + off_hkv * stride_kh
161
+ v_offset = off_zkv * stride_vz + off_hkv * stride_vh
162
+
163
+ Q = Q + q_offset
164
+ K = K + k_offset
165
+ V = V + v_offset
166
+
167
+ # Setting up the TMA descriptors for Q, K, V
168
+ desc_q = None
169
+ desc_k = None
170
+ desc_v = None
171
+
172
+ SPARSE_Z = 1
173
+ SPARSE_HQ = 1
174
+
175
+ sparse_idx_z = off_zq % SPARSE_Z
176
+ sparse_idx_hq = off_hq % SPARSE_HQ
177
+
178
+ SPARSE_Q_MULTIPLE: tl.constexpr = (SPARSE_Q_BLOCK_SIZE // BLOCK_M)
179
+ SPARSE_KV_MULTIPLE: tl.constexpr = (SPARSE_KV_BLOCK_SIZE // BLOCK_N)
180
+
181
+ stride_kv_num_blks_h = 1
182
+ stride_kv_idx_h = 1
183
+ stride_kv_idx_m = 1
184
+
185
+ # initialize pointer to m and l
186
+ m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf")
187
+ l_i = tl.zeros([BLOCK_M], dtype=tl.float32)
188
+ acc = tl.zeros([BLOCK_M, V_HEAD_DIM_ROUNDED], dtype=tl.float32)
189
+
190
+ offs_m = q_start * BLOCK_M + tl.arange(0, BLOCK_M)
191
+
192
+ # KV_IDX and KV_NUM_BLKS are always contiguous.
193
+ sparse_hz_offset = sparse_idx_z * SPARSE_HQ + sparse_idx_hq
194
+ sparse_kv_num_blks_offset = sparse_hz_offset * stride_kv_num_blks_h + q_start // SPARSE_Q_MULTIPLE
195
+ sparse_kv_idx_offset = sparse_hz_offset * stride_kv_idx_h + (q_start // SPARSE_Q_MULTIPLE) * stride_kv_idx_m # noqa: B950
196
+ offs_m = q_start * BLOCK_M + tl.arange(0, BLOCK_M)
197
+ offs_k = tl.arange(0, QK_HEAD_DIM_ROUNDED)
198
+ q = load_checked_2d(Q, offs_m, offs_k, stride_qm, stride_qk, IS_DIVISIBLE, SAFE_HEAD_DIM, Q_LEN, QK_HEAD_DIM)
199
+
200
+ # ~~~~~~~~~~~~~~ normal blocks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
201
+ # We don't know anything "special" about these blocks, so we need to apply
202
+ # both score_mod and mask_mod to it
203
+ kv_indices = KV_IDX + sparse_kv_idx_offset
204
+ kv_start = tl.load(kv_indices) * SPARSE_KV_BLOCK_SIZE # first kv block we're loading
205
+ kv_num_blocks = tl.load(KV_NUM_BLKS + sparse_kv_num_blks_offset)
206
+ block_n_end = tl.minimum(kv_num_blocks * SPARSE_KV_MULTIPLE, tl.maximum(tl.cdiv(KV_LEN, BLOCK_N), 1))
207
+
208
+
209
+ # K and V pointers will be passed directly to forward_inner
210
+
211
+ offs_n = kv_start + tl.arange(0, BLOCK_N)
212
+
213
+
214
+ acc, l_i, m_i = forward_inner(
215
+ arg_Q, arg_K, arg_V, arg_LSE, arg_MAX, arg_KV_NUM_BLKS, arg_KV_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, out_ptr0, ks0, ks1, ks2,
216
+ q, K, V,
217
+ desc_k, desc_v, Q_LEN, KV_LEN,
218
+ acc, l_i, m_i,
219
+ off_zq, off_hq, offs_m[:, None], offs_n[None, :],
220
+ kv_start,
221
+ kv_indices, kv_num_blocks,
222
+ 0, block_n_end,
223
+ MATMUL_PRECISION,
224
+ stride_kk, stride_kn, stride_vn, stride_vk,
225
+ IS_FULL_BLOCKS=False,
226
+ )
227
+
228
+ # ~~~~~~~~~~~~~~ "full" blocks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
229
+ # We know these blocks are guaranteed to be "full", so we don't need to
230
+ # apply mask_mod to them - only score_mod
231
+ if HAS_FULL_BLOCKS:
232
+ # FULL_KV_IDX and FULL_KV_NUM_BLKS are always contiguous.
233
+ kv_indices = FULL_KV_IDX + sparse_kv_idx_offset
234
+ kv_start = tl.load(kv_indices) * SPARSE_KV_BLOCK_SIZE # first kv block we're loading
235
+ kv_num_blocks = tl.load(FULL_KV_NUM_BLKS + sparse_kv_num_blks_offset)
236
+ block_n_end = tl.minimum(kv_num_blocks * SPARSE_KV_MULTIPLE, tl.maximum(tl.cdiv(KV_LEN, BLOCK_N), 1))
237
+ # K and V pointers will be passed directly to forward_inner
238
+ offs_n = kv_start + tl.arange(0, BLOCK_N)
239
+
240
+ acc, l_i, m_i = forward_inner(
241
+ arg_Q, arg_K, arg_V, arg_LSE, arg_MAX, arg_KV_NUM_BLKS, arg_KV_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, out_ptr0, ks0, ks1, ks2,
242
+ q, K, V,
243
+ desc_k, desc_v, Q_LEN, KV_LEN,
244
+ acc, l_i, m_i,
245
+ off_zq, off_hq, offs_m[:, None], offs_n[None, :],
246
+ kv_start,
247
+ kv_indices, kv_num_blocks,
248
+ 0, block_n_end,
249
+ MATMUL_PRECISION,
250
+ stride_kk, stride_kn, stride_vn, stride_vk,
251
+ IS_FULL_BLOCKS=True,
252
+ )
253
+
254
+
255
+ # [Note] Handle fully masked out rows:
256
+ # Li will be the sum(e^(-inf)) == 0.0 for masked out rows, mi will be -inf.
257
+ # We set Li to 1.0 which will result in lse/out = 0.0 | after the log(li) + mi(0.0) step
258
+ l_i = tl.where(l_i == 0.0, 1, l_i)
259
+
260
+ acc = acc / l_i[:, None]
261
+ idx_zq = tl.program_id(1).to(INDEX_DTYPE)
262
+ idx_hq = tl.program_id(2).to(INDEX_DTYPE)
263
+ idx_m = offs_m[:, None].to(INDEX_DTYPE)
264
+ idx_d = tl.arange(0, V_HEAD_DIM_ROUNDED)[None, :].to(INDEX_DTYPE)
265
+
266
+ mask = (idx_m < Q_LEN) & (idx_d < V_HEAD_DIM)
267
+
268
+ tl.static_assert(acc.shape == [BLOCK_M, V_HEAD_DIM_ROUNDED])
269
+ xindex = idx_d + 128*idx_m + 128*idx_hq*ks0 + 4096*idx_zq*ks0
270
+ tl.store(out_ptr0 + (tl.broadcast_to(idx_d + 128*idx_hq + 4096*idx_m, acc.shape)), acc, mask)
271
+
272
+ if OUTPUT_LOGSUMEXP:
273
+ off_hz = off_zq * HQ + off_hq
274
+ l_ptrs = LSE + off_hz * Q_LEN + offs_m
275
+ lse = m_i + tl.math.log2(l_i)
276
+ if IS_DIVISIBLE:
277
+ tl.store(l_ptrs, lse)
278
+ else:
279
+ tl.store(l_ptrs, lse, mask=offs_m < Q_LEN)
280
+
281
+ if OUTPUT_MAX:
282
+ off_hz = off_zq * HQ + off_hq
283
+ max_ptrs = MAX + off_hz * Q_LEN + offs_m
284
+ if IS_DIVISIBLE:
285
+ tl.store(max_ptrs, m_i)
286
+ else:
287
+ tl.store(max_ptrs, m_i, mask=offs_m < Q_LEN)
288
+
289
+
290
+ # Utility triton funcs
291
+ @triton.jit
292
+ def get_offset_for_next_block(
293
+ loop_iter, col_indices, total_blocks,
294
+ SPARSE_BLOCK, SPARSE_BLOCK_MULTIPLE, BLOCK,
295
+ BLOCKS_ARE_CONTIGUOUS: tl.constexpr
296
+ ):
297
+ if BLOCKS_ARE_CONTIGUOUS:
298
+ return BLOCK
299
+ cur_block_idx = loop_iter // SPARSE_BLOCK_MULTIPLE
300
+ cur_block = tl.load(col_indices + cur_block_idx, eviction_policy="evict_last")
301
+ next_block = tl.load(col_indices + cur_block_idx + 1, eviction_policy="evict_last", mask=cur_block_idx + 1 < total_blocks)
302
+ needs_jump = (loop_iter + 1) % SPARSE_BLOCK_MULTIPLE == 0
303
+ jump_to_block = (next_block - cur_block ) * SPARSE_BLOCK - (SPARSE_BLOCK_MULTIPLE - 1) * BLOCK
304
+ offset = jump_to_block * needs_jump + (1 - needs_jump) * BLOCK
305
+ return offset
306
+
307
+ @triton.jit
308
+ def get_bounded_indices(indices, max_len=None):
309
+ return indices % max_len if max_len is not None else indices
310
+
311
+ @triton.jit
312
+ def load_checked_block(block_ptr, IS_DIVISIBLE: tl.constexpr, SAFE_HEAD_DIM: tl.constexpr):
313
+ if IS_DIVISIBLE and SAFE_HEAD_DIM:
314
+ return tl.load(block_ptr)
315
+ elif IS_DIVISIBLE and not SAFE_HEAD_DIM:
316
+ return tl.load(block_ptr, boundary_check=(1,), padding_option="zero")
317
+ elif not IS_DIVISIBLE and SAFE_HEAD_DIM:
318
+ return tl.load(block_ptr, boundary_check=(0,), padding_option="zero")
319
+ else:
320
+ return tl.load(block_ptr, boundary_check=(0, 1), padding_option="zero")
321
+
322
+ @triton.jit
323
+ def load_checked_2d(
324
+ ptr,
325
+ offs_m,
326
+ offs_n,
327
+ stride_m,
328
+ stride_n,
329
+ IS_DIVISIBLE_M: tl.constexpr,
330
+ IS_DIVISIBLE_N: tl.constexpr,
331
+ M_LEN: tl.constexpr,
332
+ N_LEN: tl.constexpr,
333
+ ):
334
+ # Calculate final pointer if strides are provided
335
+ if stride_m is not None and stride_n is not None:
336
+ ptr = ptr + offs_m[:, None] * stride_m + offs_n[None, :] * stride_n
337
+
338
+ # Handle all masking cases
339
+ if not IS_DIVISIBLE_M and not IS_DIVISIBLE_N:
340
+ return tl.load(ptr, mask=(offs_m[:, None] < M_LEN) & (offs_n[None, :] < N_LEN), other=0.0)
341
+ elif IS_DIVISIBLE_M and not IS_DIVISIBLE_N:
342
+ return tl.load(ptr, mask=(offs_n[None, :] < N_LEN), other=0.0)
343
+ elif not IS_DIVISIBLE_M and IS_DIVISIBLE_N:
344
+ return tl.load(ptr, mask=(offs_m[:, None] < M_LEN), other=0.0)
345
+ else: # Both divisible
346
+ return tl.load(ptr)
347
+
348
+
349
+ # Common Imports
350
+ @triton.jit
351
+ def forward_block_mn(
352
+ arg_Q, arg_K, arg_V, arg_LSE, arg_MAX, arg_KV_NUM_BLKS, arg_KV_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, out_ptr0, ks0, ks1, ks2,
353
+ q, K, V, desc_k, desc_v, Q_LEN, KV_LEN,
354
+ # accumulated values
355
+ acc, l_i, m_i,
356
+ # Offsets
357
+ off_z, off_h, offs_m, offs_n,
358
+ # Offsets needed for TMA loads
359
+ kv_start,
360
+ kv_offset,
361
+ MATMUL_PRECISION, RCP_LN2,
362
+ # Strides for K and V
363
+ stride_kk, stride_kn, stride_vn, stride_vk,
364
+ IS_FULL_BLOCKS, CHECK_BLOCK_BOUNDARY=False,
365
+
366
+ ):
367
+ # Redefines all kernel parameters (BLOCK_M, etc.) so we don't need to plumb them all through
368
+ PRESCALE_QK : tl.constexpr = False
369
+ ROWS_GUARANTEED_SAFE : tl.constexpr = False
370
+ BLOCKS_ARE_CONTIGUOUS : tl.constexpr = False
371
+ WRITE_DQ : tl.constexpr = True
372
+ OUTPUT_LOGSUMEXP : tl.constexpr = True
373
+ OUTPUT_MAX : tl.constexpr = False
374
+ FLOAT32_PRECISION : tl.constexpr = 'ieee'
375
+ IS_DIVISIBLE : tl.constexpr = False
376
+ SM_SCALE : tl.constexpr = 0.08838834764831845
377
+ GQA_SHARED_HEADS : tl.constexpr = 4
378
+ HAS_FULL_BLOCKS : tl.constexpr = True
379
+ QK_HEAD_DIM : tl.constexpr = 128
380
+ QK_HEAD_DIM_ROUNDED : tl.constexpr = 128
381
+ V_HEAD_DIM : tl.constexpr = 128
382
+ V_HEAD_DIM_ROUNDED : tl.constexpr = 128
383
+ SAFE_HEAD_DIM : tl.constexpr = True
384
+ USE_TMA : tl.constexpr = False
385
+ BLOCK_M : tl.constexpr = 128
386
+ BLOCK_N : tl.constexpr = 64
387
+ SPARSE_Q_BLOCK_SIZE : tl.constexpr = 128
388
+ SPARSE_KV_BLOCK_SIZE : tl.constexpr = 128
389
+ INDEX_DTYPE : tl.constexpr = tl.int32
390
+
391
+
392
+ # -- load k --
393
+ # NB reversed order to since K is transposed
394
+ kv_base_offset = kv_start + kv_offset
395
+
396
+ # Load K as [BLOCK_N, QK_HEAD_DIM_ROUNDED] then transpose to [QK_HEAD_DIM_ROUNDED, BLOCK_N]
397
+ offs_k = tl.arange(0, QK_HEAD_DIM_ROUNDED)
398
+ offs_n_load = kv_base_offset + tl.arange(0, BLOCK_N)
399
+ k = load_checked_2d(K, offs_n_load, offs_k, stride_kn, stride_kk, IS_DIVISIBLE, SAFE_HEAD_DIM, KV_LEN, QK_HEAD_DIM)
400
+
401
+ k = tl.trans(k)
402
+ # -- compute qk ---
403
+ qk = tl.dot(q, k, input_precision=FLOAT32_PRECISION) # TODO: use cuda matmul when q_len <= 2.
404
+ if not PRESCALE_QK:
405
+ qk *= SM_SCALE
406
+ # ~~~~~~~~~~~~~~~~~~~ Apply score modification ~~~~~~~~~~~~~~~~~~~
407
+ # If this is the last block of a non divisible seqlen, we still need to load [BLOCK_M, BLOCK_N] elements,
408
+ # which is larger than the actual number of elements. To avoid access memory out of bound,
409
+ # we need to mask out the elements that are out of Q_LEN & KV_LEN.
410
+ m = get_bounded_indices(offs_m, Q_LEN if CHECK_BLOCK_BOUNDARY else None)
411
+ n = get_bounded_indices(offs_n, KV_LEN if CHECK_BLOCK_BOUNDARY else None)
412
+
413
+ tmp0 = (qk)
414
+ post_mod_scores = tmp0
415
+
416
+
417
+ if CHECK_BLOCK_BOUNDARY:
418
+ # Mask out the elements that are out of the KV_LEN for non divisible seqlen.
419
+ post_mod_scores = tl.where(offs_n < KV_LEN, post_mod_scores, float("-inf"))
420
+
421
+ if not IS_FULL_BLOCKS:
422
+ tmp1 = (m)
423
+ tmp2 = tl.full([1], 0, tl.int32)
424
+ tmp3 = tmp1 < tmp2
425
+ tmp4 = (n)
426
+ tmp5 = tmp4 <= tmp1
427
+ tmp6 = tmp3 & tmp5
428
+ tmp7 = tmp1 >= tmp2
429
+ tmp8 = tmp4 < tmp2
430
+ tmp9 = tmp7 & tmp8
431
+ tmp10 = tmp8 == 0
432
+ tmp11 = tmp7 & tmp10
433
+ tmp12 = tmp1 - tmp2
434
+ tmp13 = tl.full([1], 16, tl.int32)
435
+ tmp14 = tl.where((tmp12 < 0) != (tmp13 < 0), tl.where(tmp12 % tmp13 != 0, tmp12 // tmp13 - 1, tmp12 // tmp13), tmp12 // tmp13)
436
+ tmp15 = tmp4 - tmp2
437
+ tmp16 = tl.where((tmp15 < 0) != (tmp13 < 0), tl.where(tmp15 % tmp13 != 0, tmp15 // tmp13 - 1, tmp15 // tmp13), tmp15 // tmp13)
438
+ tmp17 = tmp14 == tmp16
439
+ tmp18 = tmp11 & tmp17
440
+ tmp19 = tmp9 | tmp18
441
+ tmp20 = tmp6 | tmp19
442
+ mask_mod_output = tmp20
443
+
444
+
445
+ if CHECK_BLOCK_BOUNDARY:
446
+ mask_mod_output = tl.where(offs_n < KV_LEN, mask_mod_output, False)
447
+ # apply mask for partially unmasked blocks
448
+ post_mod_scores = tl.where(mask_mod_output, post_mod_scores, float("-inf"))
449
+
450
+ if not PRESCALE_QK:
451
+ post_mod_scores *= RCP_LN2
452
+ # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
453
+
454
+ # -- compute scaling constant ---
455
+ m_ij = tl.maximum(m_i, tl.max(post_mod_scores, 1))
456
+ if not ROWS_GUARANTEED_SAFE:
457
+ masked_out_rows = (m_ij == float("-inf"))
458
+ m_ij_masked = tl.where(masked_out_rows, 0, m_ij)
459
+ else:
460
+ m_ij_masked = m_ij
461
+
462
+ alpha = tl.math.exp2(m_i - m_ij_masked)
463
+ p = tl.math.exp2(post_mod_scores - m_ij_masked[:, None])
464
+
465
+ # NB: l_i update is pulled up here since it's a bit faster
466
+ # NB: For headdim=256, it's faster to move it back down to after m_i =
467
+ # m_ij
468
+ l_i = l_i * alpha + tl.sum(p, 1)
469
+ # # -- scale and update acc --
470
+ acc = acc * alpha[:, None]
471
+ # Calculate offsets for V loading - reuse kv_base_offset from K loading
472
+ offs_v = tl.arange(0, V_HEAD_DIM_ROUNDED)
473
+ v = load_checked_2d(V, offs_n_load, offs_v, stride_vn, stride_vk, IS_DIVISIBLE, SAFE_HEAD_DIM, KV_LEN, V_HEAD_DIM)
474
+ acc = tl.dot(p.to(MATMUL_PRECISION), v, acc, input_precision=FLOAT32_PRECISION)
475
+
476
+ # -- update m_i
477
+ m_i = m_ij
478
+
479
+ return acc, l_i, m_i
480
+
481
+ @triton.jit
482
+ def forward_inner(
483
+ arg_Q, arg_K, arg_V, arg_LSE, arg_MAX, arg_KV_NUM_BLKS, arg_KV_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, out_ptr0, ks0, ks1, ks2,
484
+ q, K, V,
485
+ desc_k, desc_v, Q_LEN, KV_LEN,
486
+ # accumulated values
487
+ acc, l_i, m_i,
488
+ # Offsets used as inputs to score_mod & mask_mod
489
+ # of size [BLOCK_M, BLOCK_N] or scalar.
490
+ off_z, off_h, offs_m, offs_n,
491
+ # Offsets needed for TMA loads
492
+ kv_start,
493
+ # blocksparse data
494
+ kv_indices, kv_num_blocks,
495
+ # start kv and end kv block
496
+ block_n_start, block_n_end,
497
+ MATMUL_PRECISION,
498
+ # Strides for K and V
499
+ stride_kk, stride_kn, stride_vn, stride_vk,
500
+ IS_FULL_BLOCKS,
501
+ ):
502
+ # Redefines all kernel parameters (BLOCK_M, etc.) so we don't need to plumb them all through
503
+ PRESCALE_QK : tl.constexpr = False
504
+ ROWS_GUARANTEED_SAFE : tl.constexpr = False
505
+ BLOCKS_ARE_CONTIGUOUS : tl.constexpr = False
506
+ WRITE_DQ : tl.constexpr = True
507
+ OUTPUT_LOGSUMEXP : tl.constexpr = True
508
+ OUTPUT_MAX : tl.constexpr = False
509
+ FLOAT32_PRECISION : tl.constexpr = 'ieee'
510
+ IS_DIVISIBLE : tl.constexpr = False
511
+ SM_SCALE : tl.constexpr = 0.08838834764831845
512
+ GQA_SHARED_HEADS : tl.constexpr = 4
513
+ HAS_FULL_BLOCKS : tl.constexpr = True
514
+ QK_HEAD_DIM : tl.constexpr = 128
515
+ QK_HEAD_DIM_ROUNDED : tl.constexpr = 128
516
+ V_HEAD_DIM : tl.constexpr = 128
517
+ V_HEAD_DIM_ROUNDED : tl.constexpr = 128
518
+ SAFE_HEAD_DIM : tl.constexpr = True
519
+ USE_TMA : tl.constexpr = False
520
+ BLOCK_M : tl.constexpr = 128
521
+ BLOCK_N : tl.constexpr = 64
522
+ SPARSE_Q_BLOCK_SIZE : tl.constexpr = 128
523
+ SPARSE_KV_BLOCK_SIZE : tl.constexpr = 128
524
+ INDEX_DTYPE : tl.constexpr = tl.int32
525
+
526
+
527
+ SPARSE_KV_MULTIPLE: tl.constexpr = (SPARSE_KV_BLOCK_SIZE // BLOCK_N)
528
+ RCP_LN2: tl.constexpr = 1.44269504
529
+
530
+ if PRESCALE_QK:
531
+ q = (q * SM_SCALE * RCP_LN2).to(MATMUL_PRECISION)
532
+
533
+ kv_offset = 0
534
+
535
+ # loop over k, v and update accumulator until block_n_end
536
+ for start_n in range(block_n_start, block_n_end):
537
+ # Here IS_DIVISIBLE acts are the start_n = tl.multiple_of(start_n, BLOCK_N) from triton_fused_attention.
538
+ if IS_DIVISIBLE:
539
+ acc, l_i, m_i = forward_block_mn(
540
+ arg_Q, arg_K, arg_V, arg_LSE, arg_MAX, arg_KV_NUM_BLKS, arg_KV_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, out_ptr0, ks0, ks1, ks2,
541
+ q, K, V, desc_k, desc_v, Q_LEN, KV_LEN,
542
+ # accumulated values
543
+ acc, l_i, m_i,
544
+ # Offsets
545
+ off_z, off_h, offs_m, offs_n,
546
+ # Offsets needed for TMA loads
547
+ kv_start,
548
+ kv_offset,
549
+ MATMUL_PRECISION, RCP_LN2,
550
+ # Strides for K and V
551
+ stride_kk, stride_kn, stride_vn, stride_vk,
552
+ IS_FULL_BLOCKS,
553
+ )
554
+ else:
555
+ # Benchmark shows even we applied mod & mask to each block for non divisible seqlen,
556
+ # it's on par or slightly faster than only applying to the last block in fwd.
557
+ # However, we choose different strategy for bwd, where we only apply mod & mask
558
+ # to the last block because it's faster a lot.
559
+ acc, l_i, m_i = forward_block_mn(
560
+ arg_Q, arg_K, arg_V, arg_LSE, arg_MAX, arg_KV_NUM_BLKS, arg_KV_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, out_ptr0, ks0, ks1, ks2,
561
+ q, K, V, desc_k, desc_v, Q_LEN, KV_LEN,
562
+ # accumulated values
563
+ acc, l_i, m_i,
564
+ # Offsets
565
+ off_z, off_h, offs_m, offs_n,
566
+ # Offsets needed for TMA loads
567
+ kv_start,
568
+ kv_offset,
569
+ MATMUL_PRECISION, RCP_LN2,
570
+ # Strides for K and V
571
+ stride_kk, stride_kn, stride_vn, stride_vk,
572
+ IS_FULL_BLOCKS, CHECK_BLOCK_BOUNDARY=True,
573
+ )
574
+
575
+
576
+
577
+ offset = get_offset_for_next_block(
578
+ start_n, kv_indices, kv_num_blocks,
579
+ SPARSE_KV_BLOCK_SIZE, SPARSE_KV_MULTIPLE, BLOCK_N, BLOCKS_ARE_CONTIGUOUS
580
+ )
581
+
582
+ offs_n = offs_n + offset
583
+ kv_offset += offset
584
+
585
+
586
+ return acc, l_i, m_i
587
+ ''', device_str='cuda')
588
+
589
+
590
+ # kernel path: /workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/qr/cqrng7hmawuvea5b46xnw26e3vaokywqdqnuhn4vt7tmtdoleeab.py
591
+ # Topologically Sorted Source Nodes: [lse_scaled], Original ATen: [aten.mul]
592
+ # Source node to ATen node mapping:
593
+ # lse_scaled => mul_9
594
+ # Graph fragment:
595
+ # %buf3 : Tensor = PlaceHolder[target=buf3]
596
+ # %mul_9 : Tensor "f32[1, 32, s37][32*Max(1, s37), Max(1, s37), 1]cuda:1"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%getitem_1, 0.6931471805599453), kwargs = {})
597
+ # return %mul_9
598
+ triton_poi_fused_mul_1 = async_compile.triton('triton_poi_fused_mul_1', '''
599
+ import triton
600
+ import triton.language as tl
601
+
602
+ from torch._inductor.runtime import triton_helpers, triton_heuristics
603
+ from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
604
+ from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
605
+ triton_helpers.set_driver_to_gpu()
606
+
607
+ @triton_heuristics.pointwise(
608
+ size_hints={'x': 4096},
609
+ filename=__file__,
610
+ triton_meta={'signature': {'in_ptr0': '*fp32', 'out_ptr0': '*fp32', 'ks0': 'i64', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=1, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]]}]},
611
+ inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_mul_1', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': 'B0E5936CA26D1BCD1B577D0B65F13FE7553DC941EB93358973E9F1902BC212C5', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False},
612
+ min_elem_per_thread=0
613
+ )
614
+ @triton.jit
615
+ def triton_poi_fused_mul_1(in_ptr0, out_ptr0, ks0, xnumel, XBLOCK : tl.constexpr):
616
+ xoffset = tl.program_id(0) * XBLOCK
617
+ xindex = xoffset + tl.arange(0, XBLOCK)[:]
618
+ xmask = xindex < xnumel
619
+ x2 = xindex
620
+ x0 = (xindex % ks0)
621
+ x1 = triton_helpers.div_floor_integer(xindex, ks0)
622
+ tmp0 = tl.load(in_ptr0 + (x2), xmask, eviction_policy='evict_last')
623
+ tmp1 = 0.6931471805599453
624
+ tmp2 = tmp0 * tmp1
625
+ tl.store(out_ptr0 + (x0 + x1*((1) * ((1) >= (ks0)) + (ks0) * ((ks0) > (1)))), tmp2, xmask)
626
+ ''', device_str='cuda')
627
+
628
+
629
+ async_compile.wait(globals())
630
+ del async_compile
631
+
632
+ class Runner:
633
+ def __init__(self, partitions):
634
+ self.partitions = partitions
635
+
636
+ def recursively_apply_fns(self, fns):
637
+ new_callables = []
638
+ for fn, c in zip(fns, self.partitions):
639
+ new_callables.append(fn(c))
640
+ self.partitions = new_callables
641
+
642
+ def call(self, args):
643
+ arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1, arg11_1, arg12_1, arg13_1, arg14_1, arg15_1 = args
644
+ args.clear()
645
+ s50 = arg0_1
646
+ s0 = arg2_1
647
+ s43 = arg4_1
648
+ s37 = arg7_1
649
+ s71 = arg8_1
650
+ assert_size_stride(arg1_1, (1, 32, s37, 128), (4096*s37, 128, 4096, 1))
651
+ assert_size_stride(arg3_1, (1, 8, s0, 128), (1024*s0, 128, 1024, 1))
652
+ assert_size_stride(arg5_1, (1, 8, s43, 128), (1024*s43, 128, 1024, 1))
653
+ assert_size_stride(arg6_1, (1, 1, 1, 1), (1, 1, 1, 1))
654
+ assert_size_stride(arg9_1, (1, 1, 1), (1, 1, 1))
655
+ assert_size_stride(arg10_1, (1, 1, 1), (1, 1, 1))
656
+ assert_size_stride(arg11_1, (1, 1, 1, 1), (1, 1, 1, 1))
657
+ assert_size_stride(arg12_1, (1, 1, 1), (1, 1, 1))
658
+ assert_size_stride(arg13_1, (1, 1, 1, 1), (1, 1, 1, 1))
659
+ assert_size_stride(arg14_1, (1, 1, 1), (1, 1, 1))
660
+ assert_size_stride(arg15_1, (1, 1, 1, 1), (1, 1, 1, 1))
661
+ with torch.cuda._DeviceGuard(1):
662
+ torch.cuda.set_device(1)
663
+ buf0 = empty_strided_cuda((1, 32, s37), (32*s37, s37, 1), torch.float32)
664
+ buf1 = empty_strided_cuda((1, 32, s37), (32*s37, s37, 1), torch.float32)
665
+ buf2 = empty_strided_cuda((1, 32, s37, 128), (4096*s37, 128, 4096, 1), torch.bfloat16)
666
+ # Topologically Sorted Source Nodes: [flex_attention], Original ATen: []
667
+ stream1 = get_raw_stream(1)
668
+ triton_tem_fused_0.run(arg1_1, arg3_1, arg5_1, buf0, buf1, arg9_1, arg6_1, arg10_1, arg11_1, buf2, s37, s0, s43, (127 + s37) // 128, 1, 32, stream=stream1)
669
+ del arg10_1
670
+ del arg11_1
671
+ del arg1_1
672
+ del arg3_1
673
+ del arg5_1
674
+ del arg6_1
675
+ del arg9_1
676
+ del buf1
677
+ buf5 = empty_strided_cuda((1, 32, s37), (32*max(1, s37), max(1, s37), 1), torch.float32)
678
+ # Topologically Sorted Source Nodes: [lse_scaled], Original ATen: [aten.mul]
679
+ triton_poi_fused_mul_1_xnumel = 32*s37
680
+ stream1 = get_raw_stream(1)
681
+ triton_poi_fused_mul_1.run(buf0, buf5, s37, triton_poi_fused_mul_1_xnumel, stream=stream1)
682
+ del buf0
683
+ return (buf2, buf5, )
684
+
685
+ runner = Runner(partitions=[])
686
+ call = runner.call
687
+ recursively_apply_fns = runner.recursively_apply_fns
688
+
689
+
690
+ def benchmark_compiled_module(times=10, repeat=10):
691
+ from torch._dynamo.testing import rand_strided
692
+ from torch._inductor.utils import print_performance
693
+ arg0_1 = 128
694
+ arg1_1 = rand_strided((1, 32, 128, 128), (524288, 128, 4096, 1), device='cuda:1', dtype=torch.bfloat16)
695
+ arg2_1 = 128
696
+ arg3_1 = rand_strided((1, 8, 128, 128), (131072, 128, 1024, 1), device='cuda:1', dtype=torch.bfloat16)
697
+ arg4_1 = 128
698
+ arg5_1 = rand_strided((1, 8, 128, 128), (131072, 128, 1024, 1), device='cuda:1', dtype=torch.bfloat16)
699
+ arg6_1 = rand_strided((1, 1, 1, 1), (1, 1, 1, 1), device='cuda:1', dtype=torch.int32)
700
+ arg7_1 = 128
701
+ arg8_1 = 128
702
+ arg9_1 = rand_strided((1, 1, 1), (1, 1, 1), device='cuda:1', dtype=torch.int32)
703
+ arg10_1 = rand_strided((1, 1, 1), (1, 1, 1), device='cuda:1', dtype=torch.int32)
704
+ arg11_1 = rand_strided((1, 1, 1, 1), (1, 1, 1, 1), device='cuda:1', dtype=torch.int32)
705
+ arg12_1 = rand_strided((1, 1, 1), (1, 1, 1), device='cuda:1', dtype=torch.int32)
706
+ arg13_1 = rand_strided((1, 1, 1, 1), (1, 1, 1, 1), device='cuda:1', dtype=torch.int32)
707
+ arg14_1 = rand_strided((1, 1, 1), (1, 1, 1), device='cuda:1', dtype=torch.int32)
708
+ arg15_1 = rand_strided((1, 1, 1, 1), (1, 1, 1, 1), device='cuda:1', dtype=torch.int32)
709
+ fn = lambda: call([arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1, arg11_1, arg12_1, arg13_1, arg14_1, arg15_1])
710
+ return print_performance(fn, times=times, repeat=repeat)
711
+
712
+
713
+ if __name__ == "__main__":
714
+ from torch._inductor.wrapper_benchmark import compiled_module_main
715
+ compiled_module_main('None', benchmark_compiled_module)
progress/SpecForge/cache/compiled_kernels/2z/c2zdv5arszdl6ednyphqfnib6jwgzomr6zt6536b7gq75kp67uvh.py ADDED
@@ -0,0 +1,1046 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # AOT ID: ['2_backward']
2
+ from ctypes import c_void_p, c_long, c_int
3
+ import torch
4
+ import math
5
+ import random
6
+ import os
7
+ import tempfile
8
+ from math import inf, nan
9
+ from cmath import nanj
10
+ from torch._inductor.hooks import run_intermediate_hooks
11
+ from torch._inductor.utils import maybe_profile
12
+ from torch._inductor.codegen.memory_planning import _align as align
13
+ from torch import device, empty_strided
14
+ from torch._inductor.async_compile import AsyncCompile
15
+ from torch._inductor.select_algorithm import extern_kernels
16
+ import triton
17
+ import triton.language as tl
18
+ from torch._inductor.runtime.triton_heuristics import start_graph, end_graph
19
+ from torch._C import _cuda_getCurrentRawStream as get_raw_stream
20
+
21
+ aten = torch.ops.aten
22
+ inductor_ops = torch.ops.inductor
23
+ _quantized = torch.ops._quantized
24
+ assert_size_stride = torch._C._dynamo.guards.assert_size_stride
25
+ assert_alignment = torch._C._dynamo.guards.assert_alignment
26
+ empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
27
+ empty_strided_cpu_pinned = torch._C._dynamo.guards._empty_strided_cpu_pinned
28
+ empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
29
+ empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu
30
+ empty_strided_mtia = torch._C._dynamo.guards._empty_strided_mtia
31
+ reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor
32
+ alloc_from_pool = torch.ops.inductor._alloc_from_pool
33
+ async_compile = AsyncCompile()
34
+ empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p
35
+
36
+
37
+ # kernel path: /workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sh/csh76hcjkj7bc6jvydzdmaapo6vnfxlvc3xvqexzngu63td4qnjk.py
38
+ # Topologically Sorted Source Nodes: [], Original ATen: [aten.mul]
39
+ # Source node to ATen node mapping:
40
+ # Graph fragment:
41
+ # %getitem : Tensor "bf16[1, 32, s37, 128][4096*s37, 128, 4096, 1]cuda:4" = PlaceHolder[target=getitem]
42
+ # %tangents_1 : Tensor "bf16[1, 32, s37, 128][4096*Max(1, s37), 128*Max(1, s37), 128, 1]cuda:4" = PlaceHolder[target=tangents_1]
43
+ # %buf0 : Tensor "bf16[1, 32, s37][32*s37, s37, 1]cuda:4" = PlaceHolder[target=buf0]
44
+ # %tangents_2 : Tensor "f32[1, 32, s37][32*Max(1, s37), Max(1, s37), 1]cuda:4" = PlaceHolder[target=tangents_2]
45
+ # %mul_19 : Tensor "f32[1, 32, s37][32*Max(1, s37), Max(1, s37), 1]cuda:4"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%tangents_2, 0.6931471805599453), kwargs = {})
46
+ # %flex_attention_backward : [num_users=3] = call_function[target=torch.ops.higher_order.flex_attention_backward](args = (%primals_2, %primals_4, %primals_6, %getitem, %getitem_1, %tangents_1, %mul_19, %fw_graph0, %joint_graph0, (%primals_10, %primals_11, %primals_13, %primals_9, %primals_15, %primals_18, %primals_20, %primals_23, %primals_25, %primals_28, 128, 128, %mask_graph0), 0.08838834764831845, {PRESCALE_QK: False, ROWS_GUARANTEED_SAFE: False, BLOCKS_ARE_CONTIGUOUS: False, WRITE_DQ: True, OUTPUT_LOGSUMEXP: True, OUTPUT_MAX: False}, (), ()), kwargs = {})
47
+ # return %buf0,%buf1
48
+ triton_red_fused_mul_0 = async_compile.triton('triton_red_fused_mul_0', '''
49
+ import triton
50
+ import triton.language as tl
51
+
52
+ from torch._inductor.runtime import triton_helpers, triton_heuristics
53
+ from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
54
+ from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
55
+ triton_helpers.set_driver_to_gpu()
56
+
57
+ @triton_heuristics.reduction(
58
+ size_hints={'x': 32768, 'r0_': 128},
59
+ reduction_hint=ReductionHint.DEFAULT,
60
+ filename=__file__,
61
+ triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'in_ptr2': '*fp32', 'out_ptr1': '*fp32', 'ks0': 'i64', 'xnumel': 'i32', 'r0_numel': 'i32', 'XBLOCK': 'constexpr', 'R0_BLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=4, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]]}]},
62
+ inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_red_fused_mul_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 3, 'num_reduction': 1, 'backend_hash': 'B0E5936CA26D1BCD1B577D0B65F13FE7553DC941EB93358973E9F1902BC212C5', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False}
63
+ )
64
+ @triton.jit
65
+ def triton_red_fused_mul_0(in_ptr0, in_ptr1, in_ptr2, out_ptr1, ks0, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr):
66
+ r0_numel = 128
67
+ rnumel = r0_numel
68
+ RBLOCK: tl.constexpr = R0_BLOCK
69
+ xoffset = tl.program_id(0) * XBLOCK
70
+ xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
71
+ xmask = xindex < xnumel
72
+ r0_base = tl.arange(0, R0_BLOCK)[None, :]
73
+ rbase = r0_base
74
+ x0 = (xindex % ks0)
75
+ x1 = triton_helpers.div_floor_integer(xindex, ks0)
76
+ _tmp4 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32)
77
+ x3 = xindex
78
+ for r0_offset in range(0, r0_numel, R0_BLOCK):
79
+ r0_index = r0_offset + r0_base
80
+ r0_mask = r0_index < r0_numel
81
+ roffset = r0_offset
82
+ rindex = r0_index
83
+ r0_2 = r0_index
84
+ tmp0 = tl.load(in_ptr0 + (r0_2 + 128*x1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_first', other=0.0).to(tl.float32)
85
+ tmp1 = tl.load(in_ptr1 + (r0_2 + 128*x0 + 128*x1*((1) * ((1) >= (ks0)) + (ks0) * ((ks0) > (1)))), r0_mask & xmask, eviction_policy='evict_first', other=0.0).to(tl.float32)
86
+ tmp2 = tmp0 * tmp1
87
+ tmp3 = tl.broadcast_to(tmp2, [XBLOCK, R0_BLOCK])
88
+ tmp5 = _tmp4 + tmp3
89
+ _tmp4 = tl.where(r0_mask & xmask, tmp5, _tmp4)
90
+ tmp4 = tl.sum(_tmp4, 1)[:, None]
91
+ tmp7 = tl.load(in_ptr2 + (x0 + x1*((1) * ((1) >= (ks0)) + (ks0) * ((ks0) > (1)))), xmask, eviction_policy='evict_last')
92
+ tmp6 = tmp4.to(tl.float32)
93
+ tmp8 = 0.6931471805599453
94
+ tmp9 = tmp7 * tmp8
95
+ tmp10 = 1.4426950408889634
96
+ tmp11 = tmp9 * tmp10
97
+ tmp12 = tmp6 - tmp11
98
+ tl.store(out_ptr1 + (x3), tmp12, xmask)
99
+ ''', device_str='cuda')
100
+
101
+
102
+ # kernel path: /workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/ek/cekdygnnt4twwaq4fpapciid2veg5uc5gzwte4mymxe7ertv26cs.py
103
+ # Topologically Sorted Source Nodes: [], Original ATen: [aten.mul]
104
+ # Source node to ATen node mapping:
105
+ # Graph fragment:
106
+ # %primals_2 : Tensor "bf16[1, 32, s37, 128][4096*s37, 128, 4096, 1]cuda:4" = PlaceHolder[target=primals_2]
107
+ # %primals_4 : Tensor "bf16[1, 8, s0, 128][1024*s0, 128, 1024, 1]cuda:4" = PlaceHolder[target=primals_4]
108
+ # %primals_6 : Tensor "bf16[1, 8, s0, 128][1024*s0, 128, 1024, 1]cuda:4" = PlaceHolder[target=primals_6]
109
+ # %getitem_1 : Tensor "f32[1, 32, s37][32*Max(1, s37), Max(1, s37), 1]cuda:4" = PlaceHolder[target=getitem_1]
110
+ # %buf1 : Tensor "f32[1, 32, s37][32*s37, s37, 1]cuda:4" = PlaceHolder[target=buf1]
111
+ # %tangents_1 : Tensor "bf16[1, 32, s37, 128][4096*Max(1, s37), 128*Max(1, s37), 128, 1]cuda:4" = PlaceHolder[target=tangents_1]
112
+ # %getitem_3 : Tensor "bf16[1, 32, s37, 128][4096*s37, 128, 4096, 1]cuda:4" = PlaceHolder[target=getitem_3]
113
+ # %getitem_5 : Tensor "bf16[1, 8, s0, 128][1024*s0, 128, 1024, 1]cuda:4" = PlaceHolder[target=getitem_5]
114
+ # %primals_13 : Tensor "i32[1, 1, s99][s99, s99, 1]cuda:4" = PlaceHolder[target=primals_13]
115
+ # %primals_9 : Tensor "i32[1, 1, s22, s72][s22*s72, s22*s72, s72, 1]cuda:4" = PlaceHolder[target=primals_9]
116
+ # %primals_20 : Tensor "i32[1, 1, s56][s56, s56, 1]cuda:4" = PlaceHolder[target=primals_20]
117
+ # %primals_23 : Tensor "i32[1, 1, s84, s53][s53*s84, s53*s84, s53, 1]cuda:4" = PlaceHolder[target=primals_23]
118
+ # %primals_15 : Tensor "i32[1, 1, s94][s94, s94, 1]cuda:4" = PlaceHolder[target=primals_15]
119
+ # %primals_18 : Tensor "i32[1, 1, s28, s4][s28*s4, s28*s4, s4, 1]cuda:4" = PlaceHolder[target=primals_18]
120
+ # %primals_25 : Tensor "i32[1, 1, s100][s100, s100, 1]cuda:4" = PlaceHolder[target=primals_25]
121
+ # %primals_28 : Tensor "i32[1, 1, s5, s10][s10*s5, s10*s5, s10, 1]cuda:4" = PlaceHolder[target=primals_28]
122
+ # %mul_19 : Tensor "f32[1, 32, s37][32*Max(1, s37), Max(1, s37), 1]cuda:4"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%tangents_2, 0.6931471805599453), kwargs = {})
123
+ # %flex_attention_backward : [num_users=3] = call_function[target=torch.ops.higher_order.flex_attention_backward](args = (%primals_2, %primals_4, %primals_6, %getitem, %getitem_1, %tangents_1, %mul_19, %fw_graph0, %joint_graph0, (%primals_10, %primals_11, %primals_13, %primals_9, %primals_15, %primals_18, %primals_20, %primals_23, %primals_25, %primals_28, 128, 128, %mask_graph0), 0.08838834764831845, {PRESCALE_QK: False, ROWS_GUARANTEED_SAFE: False, BLOCKS_ARE_CONTIGUOUS: False, WRITE_DQ: True, OUTPUT_LOGSUMEXP: True, OUTPUT_MAX: False}, (), ()), kwargs = {})
124
+ # return %getitem_4
125
+ triton_tem_fused_mul_1 = async_compile.triton('triton_tem_fused_mul_1', '''
126
+ import triton
127
+ import triton.language as tl
128
+
129
+ from torch._inductor.runtime import triton_helpers, triton_heuristics
130
+ from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
131
+ from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
132
+
133
+ @triton_heuristics.template(
134
+
135
+ num_stages=3,
136
+ num_warps=8,
137
+ triton_meta={'signature': {'arg_Q': '*bf16', 'arg_K': '*bf16', 'arg_V': '*bf16', 'arg_LSE': '*fp32', 'arg_DELTA': '*fp32', 'arg_DO': '*bf16', 'arg_DQ': '*bf16', 'arg_DV': '*bf16', 'arg_KV_NUM_BLKS': '*i32', 'arg_KV_IDX': '*i32', 'arg_Q_NUM_BLKS': '*i32', 'arg_Q_IDX': '*i32', 'arg_FULL_KV_NUM_BLKS': '*i32', 'arg_FULL_KV_IDX': '*i32', 'arg_FULL_Q_NUM_BLKS': '*i32', 'arg_FULL_Q_IDX': '*i32', 'out_ptr0': '*bf16', 'ks0': 'i32', 'ks1': 'i32', 'ks2': 'i32', 'ks3': 'i32', 'ks4': 'i32', 'ks5': 'i32', 'ks6': 'i32', 'ks7': 'i32'}, 'device': DeviceProperties(type='cuda', index=4, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]], (7,): [['tt.divisibility', 16]], (8,): [['tt.divisibility', 16]], (9,): [['tt.divisibility', 16]], (10,): [['tt.divisibility', 16]], (11,): [['tt.divisibility', 16]], (12,): [['tt.divisibility', 16]], (13,): [['tt.divisibility', 16]], (14,): [['tt.divisibility', 16]], (15,): [['tt.divisibility', 16]], (16,): [['tt.divisibility', 16]]}]},
138
+ inductor_meta={'kernel_name': 'triton_tem_fused_mul_1', 'backend_hash': 'B0E5936CA26D1BCD1B577D0B65F13FE7553DC941EB93358973E9F1902BC212C5', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'grid_type': 'FixedGrid', 'fixed_grid': ['_grid_0', '_grid_1', '_grid_2'], 'extra_launcher_args': ['_grid_0', '_grid_1', '_grid_2'], 'config_args': {'PRESCALE_QK': False, 'ROWS_GUARANTEED_SAFE': False, 'BLOCKS_ARE_CONTIGUOUS': False, 'WRITE_DQ': True, 'OUTPUT_LOGSUMEXP': True, 'OUTPUT_MAX': False, 'FLOAT32_PRECISION': "'tf32'", 'IS_DIVISIBLE': False, 'SM_SCALE': 0.08838834764831845, 'GQA_SHARED_HEADS': 4, 'HAS_FULL_BLOCKS': True, 'QK_HEAD_DIM': 128, 'QK_HEAD_DIM_ROUNDED': 128, 'V_HEAD_DIM': 128, 'V_HEAD_DIM_ROUNDED': 128, 'SAFE_HEAD_DIM': True, 'BLOCK_M1': 64, 'BLOCK_N1': 128, 'BLOCK_M2': 128, 'BLOCK_N2': 64, 'SPARSE_Q_BLOCK_SIZE': 128, 'SPARSE_KV_BLOCK_SIZE': 128}},
139
+
140
+ )
141
+ @triton.jit
142
+ def triton_tem_fused_mul_1(arg_Q, arg_K, arg_V, arg_LSE, arg_DELTA, arg_DO, arg_DQ, arg_DV, arg_KV_NUM_BLKS, arg_KV_IDX, arg_Q_NUM_BLKS, arg_Q_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, arg_FULL_Q_NUM_BLKS, arg_FULL_Q_IDX, out_ptr0, ks0, ks1, ks2, ks3, ks4, ks5, ks6, ks7):
143
+ PRESCALE_QK : tl.constexpr = False
144
+ ROWS_GUARANTEED_SAFE : tl.constexpr = False
145
+ BLOCKS_ARE_CONTIGUOUS : tl.constexpr = False
146
+ WRITE_DQ : tl.constexpr = True
147
+ OUTPUT_LOGSUMEXP : tl.constexpr = True
148
+ OUTPUT_MAX : tl.constexpr = False
149
+ FLOAT32_PRECISION : tl.constexpr = 'tf32'
150
+ IS_DIVISIBLE : tl.constexpr = False
151
+ SM_SCALE : tl.constexpr = 0.08838834764831845
152
+ GQA_SHARED_HEADS : tl.constexpr = 4
153
+ HAS_FULL_BLOCKS : tl.constexpr = True
154
+ QK_HEAD_DIM : tl.constexpr = 128
155
+ QK_HEAD_DIM_ROUNDED : tl.constexpr = 128
156
+ V_HEAD_DIM : tl.constexpr = 128
157
+ V_HEAD_DIM_ROUNDED : tl.constexpr = 128
158
+ SAFE_HEAD_DIM : tl.constexpr = True
159
+ BLOCK_M1 : tl.constexpr = 64
160
+ BLOCK_N1 : tl.constexpr = 128
161
+ BLOCK_M2 : tl.constexpr = 128
162
+ BLOCK_N2 : tl.constexpr = 64
163
+ SPARSE_Q_BLOCK_SIZE : tl.constexpr = 128
164
+ SPARSE_KV_BLOCK_SIZE : tl.constexpr = 128
165
+ INDEX_DTYPE : tl.constexpr = tl.int32
166
+ Q = arg_Q
167
+ K = arg_K
168
+ V = arg_V
169
+ LSE = arg_LSE
170
+ DELTA = arg_DELTA
171
+ DO = arg_DO
172
+ DQ = arg_DQ
173
+ DV = arg_DV
174
+ KV_NUM_BLKS = arg_KV_NUM_BLKS
175
+ KV_IDX = arg_KV_IDX
176
+ Q_NUM_BLKS = arg_Q_NUM_BLKS
177
+ Q_IDX = arg_Q_IDX
178
+ FULL_KV_NUM_BLKS = arg_FULL_KV_NUM_BLKS
179
+ FULL_KV_IDX = arg_FULL_KV_IDX
180
+ FULL_Q_NUM_BLKS = arg_FULL_Q_NUM_BLKS
181
+ FULL_Q_IDX = arg_FULL_Q_IDX
182
+
183
+ # Sub notation for this kernel:
184
+ #
185
+ # Q: Query, K: Key, V: Value
186
+ # LSE: logsumexp (logsumexp is always stored in fp32 regardless of the input dtype)
187
+ # DELTA: Precomputed sum(OUT*DO, axis=-1)
188
+ # DO: Derivative of Output, DQ: Derivative of Query, DV: Derivative of Value
189
+ # DK: Derivative of Key, is the written to via the store_output call due to some limitations with
190
+ # inductor codegen
191
+ # M: Number of queries, N: Number of keys/values
192
+ # QK_HEAD_DIM: The dimension of the query and key embeddings
193
+ # V_HEAD_DIM: The dimension of the value embeddings
194
+ # z: Batch size, h: Number of heads, m: Number of queries or keys/values, d: Head dim
195
+ # GQA_SHARED_HEADS: number of query heads sharing one kv head in GQA setups.
196
+ # (Modifiable) Performance tuning options
197
+ # BLOCK_M1: when calculating DK & DV, iterate over BLOCK_M1 across the seqlen dim of Q in each thread block.
198
+ # BLOCK_N1: when calculating DK & DV, the thread block size across the seqlen dim of K/V.
199
+ # BLOCK_M2: when calculating DQ, the thread block size across the seqlen dim of Q.
200
+ # BLOCK_N2: when calculating DQ, iterate over BLOCK_N2 across the seqlen dim of K/V in each thread block.
201
+ #
202
+ # The following FULL_* and PARTIAL_* is defined in the block sparse mask grid, rather than the thread block grid.
203
+ # KV_NUM_BLKS: The number of KV blocks (that may or may not require masking) for each query.
204
+ # KV_IDX: The indices of KV blocks (that may or may not require masking) for each query.
205
+ # Q_NUM_BLKS: The number of Q blocks (that may or may not require masking) for each query.
206
+ # Q_IDX: The indices of Q blocks (that may or may not require masking) for each query.
207
+ # FULL_KV_NUM_BLKS: The number of fully unmasked KV blocks (so we don't need masking) for each query.
208
+ # FULL_KV_IDX: The indices of fully unmasked KV blocks (so we don't need masking) for each query.
209
+ # FULL_Q_NUM_BLKS: The number of fully unmasked Q blocks (so we don't need masking) for each query.
210
+ # FULL_Q_IDX: The indices of fully unmasked Q blocks (so we don't need masking) for each query.
211
+
212
+ # The below are kernel options that can be applied for certain score_mods,
213
+ # or involve a numerics vs. perf tradeoff
214
+ # PRESCALE_QK: Whether to pre-scale QK by 1/sqrt(d) and change of base. Has
215
+ # about 20% more numerical error, but slightly faster.
216
+
217
+ # Define strides of inputs
218
+ stride_qz, stride_qh, stride_qm, stride_qd = 4096*ks0, 128, 4096, 1
219
+ stride_kz, stride_kh, stride_kn, stride_kd = 1024*ks1, 128, 1024, 1
220
+ stride_vz, stride_vh, stride_vn, stride_vd = 1024*ks1, 128, 1024, 1
221
+ stride_doz, stride_doh, stride_dom, stride_dod = 4096*((1) * ((1) >= (ks0)) + (ks0) * ((ks0) > (1))), 128*((1) * ((1) >= (ks0)) + (ks0) * ((ks0) > (1))), 128, 1
222
+
223
+ stride_dqz, stride_dqh, stride_dqm, stride_dqd = 4096*ks0, 128, 4096, 1
224
+ stride_dvz, stride_dvh, stride_dvm, stride_dvd = 1024*ks1, 128, 1024, 1
225
+
226
+ ZQ = 1
227
+ HQ = 32
228
+ HKV = 8
229
+ Q_LEN = ks0
230
+ ZKV = 1
231
+ KV_LEN = ks1
232
+
233
+ MATMUL_PRECISION = Q.dtype.element_ty
234
+
235
+ pid = tl.program_id(0).to(INDEX_DTYPE)
236
+ NUM_KV_BLOCKS = tl.cdiv(KV_LEN, BLOCK_N1)
237
+ NUM_Q_BLOCKS = tl.cdiv(Q_LEN, BLOCK_M2)
238
+
239
+ off_zq = tl.program_id(1).to(INDEX_DTYPE) # q batch idx
240
+ off_hkv = tl.program_id(2).to(INDEX_DTYPE) # kv head idx
241
+ off_zkv = off_zq % ZKV # kv batch idx
242
+
243
+ SPARSE_Z = 1
244
+ SPARSE_HQ = 1
245
+
246
+ sparse_idx_z = off_zq % SPARSE_Z
247
+
248
+ k_adj = (stride_kh * off_hkv + stride_kz * off_zkv).to(tl.int64)
249
+ v_adj = (stride_vh * off_hkv + stride_vz * off_zkv).to(tl.int64)
250
+ # first compute broadcasted dv of shape [Bq, Hkv, KV_LEN, V_HEAD_DIM]
251
+ # then reduce to dv of shape [Bkv, Hkv, KV_LEN, V_HEAD_DIM]
252
+ dv_adj = (stride_dvh * off_hkv + stride_dvz * off_zq).to(tl.int64)
253
+
254
+ # offset K, V, DV pointers for batch/kv-head
255
+ K += k_adj
256
+ V += v_adj
257
+ DV += dv_adj
258
+
259
+ RCP_LN2 = 1.44269504
260
+ offs_k = tl.arange(0, QK_HEAD_DIM_ROUNDED)
261
+ offs_v = tl.arange(0, V_HEAD_DIM_ROUNDED)
262
+
263
+ if pid >= NUM_KV_BLOCKS:
264
+ off_pid = pid - NUM_KV_BLOCKS
265
+ # THIS BLOCK DOES DQ
266
+ SPARSE_Q_MULTIPLE = (SPARSE_Q_BLOCK_SIZE // BLOCK_M2)
267
+ SPARSE_KV_MULTIPLE = (SPARSE_KV_BLOCK_SIZE // BLOCK_N2)
268
+ off_hq2 = off_pid // NUM_Q_BLOCKS + off_hkv * GQA_SHARED_HEADS
269
+ start_m2_block = off_pid % NUM_Q_BLOCKS
270
+ off_pid_mask = start_m2_block // SPARSE_Q_MULTIPLE
271
+ stride_kv_num_blks_h = ks2
272
+ stride_kv_idx_h = ks3*ks4
273
+ stride_kv_idx_m = ks4
274
+
275
+ sparse_idx_hq2 = off_hq2 % SPARSE_HQ
276
+ sparse_hz_offset = sparse_idx_z * SPARSE_HQ + sparse_idx_hq2
277
+
278
+ sparse_kv_num_blks_offset = sparse_hz_offset * stride_kv_num_blks_h + off_pid_mask
279
+ sparse_kv_idx_offset = sparse_hz_offset * stride_kv_idx_h + off_pid_mask * stride_kv_idx_m # noqa: B950
280
+
281
+ # Offset Q, DQ, DO, DELTA & LSE. These inputs are offsetted by query heads.
282
+ q_adj2 = (stride_qh * off_hq2 + stride_qz * off_zq).to(tl.int64)
283
+ do_adj2 = (stride_doh * off_hq2 + stride_doz * off_zq).to(tl.int64)
284
+ dq_adj2 = (stride_dqh * off_hq2 + stride_dqz * off_zq).to(tl.int64)
285
+ off_chz2 = ((off_zq * HQ + off_hq2) * Q_LEN).to(tl.int64)
286
+
287
+ Q2 = Q + q_adj2
288
+ DO2 = DO + do_adj2
289
+ # TODO: This does not work if DQ is not the same layout as Q (for example,
290
+ # if Q is broadcasted)
291
+ DQ2 = DQ + dq_adj2
292
+ LSE2 = LSE + off_chz2
293
+ DELTA2 = DELTA + off_chz2
294
+
295
+ # dq = tl.zeros([BLOCK_M2, QK_HEAD_DIM], dtype=tl.float32)
296
+ dq = tl.zeros([BLOCK_M2, QK_HEAD_DIM_ROUNDED], dtype=tl.float32)
297
+
298
+ start_m2 = start_m2_block * BLOCK_M2
299
+ offs_m2 = start_m2 + tl.arange(0, BLOCK_M2)
300
+
301
+ # load Q and do: they stay in SRAM throughout the inner loop.
302
+ q = load_checked_2d(Q2, offs_m2, offs_k, stride_qm, stride_qd, IS_DIVISIBLE, SAFE_HEAD_DIM, Q_LEN, QK_HEAD_DIM)
303
+ do = load_checked_2d(DO2, offs_m2, offs_v, stride_dom, stride_dod, IS_DIVISIBLE, SAFE_HEAD_DIM, Q_LEN, V_HEAD_DIM)
304
+
305
+ if PRESCALE_QK:
306
+ q = (q * SM_SCALE * RCP_LN2).to(MATMUL_PRECISION)
307
+
308
+ if IS_DIVISIBLE:
309
+ Di = tl.load(DELTA2 + offs_m2)
310
+ lse = tl.load(LSE2 + offs_m2)
311
+ else:
312
+ Di = tl.load(DELTA2 + offs_m2, mask=offs_m2 < Q_LEN)
313
+ lse = tl.load(LSE2 + offs_m2, mask=offs_m2 < Q_LEN)
314
+ lse = tl.where(lse == -float("inf"), 0.0, lse)
315
+ lse = lse[:, None]
316
+
317
+ # ~~~~~~~~~~~ fully unmasked blocks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
318
+ # KV_IDX and KV_NUM_BLKS are always contiguous.
319
+ kv_indices = KV_IDX + sparse_kv_idx_offset
320
+ kv_start = tl.load(kv_indices) * SPARSE_KV_BLOCK_SIZE # first kv block we're loading
321
+ sparse_kv_num_blocks = tl.load(KV_NUM_BLKS + sparse_kv_num_blks_offset)
322
+
323
+ offs_n2 = kv_start + tl.arange(0, BLOCK_N2)
324
+ dq = bwd_dq_inner(
325
+ arg_Q, arg_K, arg_V, arg_LSE, arg_DELTA, arg_DO, arg_DQ, arg_DV, arg_KV_NUM_BLKS, arg_KV_IDX, arg_Q_NUM_BLKS, arg_Q_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, arg_FULL_Q_NUM_BLKS, arg_FULL_Q_IDX, out_ptr0, ks0, ks1, ks2, ks3, ks4, ks5, ks6, ks7,
326
+ K, V,
327
+ dq, q, do, Di, lse,
328
+ off_zq, off_hq2, offs_m2, offs_n2,
329
+ stride_kn, stride_kd, stride_vn, stride_vd,
330
+ kv_indices, sparse_kv_num_blocks,
331
+ MATMUL_PRECISION,
332
+ IS_FULL_BLOCKS=False,
333
+ )
334
+
335
+ if HAS_FULL_BLOCKS:
336
+ # ~~~~~~~~~~~ partial unmasked blocks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
337
+ # FULL_KV_IDX and FULL_KV_NUM_BLKS are always contiguous.
338
+ kv_indices = FULL_KV_IDX + sparse_kv_idx_offset
339
+ kv_start = tl.load(kv_indices) * SPARSE_KV_BLOCK_SIZE # first kv block we're loading
340
+ sparse_kv_num_blocks = tl.load(FULL_KV_NUM_BLKS + sparse_kv_num_blks_offset)
341
+
342
+ offs_n2 = kv_start + tl.arange(0, BLOCK_N2)
343
+ dq = bwd_dq_inner(
344
+ arg_Q, arg_K, arg_V, arg_LSE, arg_DELTA, arg_DO, arg_DQ, arg_DV, arg_KV_NUM_BLKS, arg_KV_IDX, arg_Q_NUM_BLKS, arg_Q_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, arg_FULL_Q_NUM_BLKS, arg_FULL_Q_IDX, out_ptr0, ks0, ks1, ks2, ks3, ks4, ks5, ks6, ks7,
345
+ K, V,
346
+ dq, q, do, Di, lse,
347
+ off_zq, off_hq2, offs_m2, offs_n2,
348
+ stride_kn, stride_kd, stride_vn, stride_vd,
349
+ kv_indices, sparse_kv_num_blocks,
350
+ MATMUL_PRECISION,
351
+ IS_FULL_BLOCKS=True,
352
+ )
353
+
354
+ # Write back dQ.
355
+ dq_ptrs = DQ2 + offs_m2[:, None] * stride_dqm + offs_k[None, :] * stride_dqd
356
+ dq *= SM_SCALE
357
+ if IS_DIVISIBLE and SAFE_HEAD_DIM:
358
+ tl.store(dq_ptrs, dq)
359
+ else:
360
+ tl.store(dq_ptrs, dq, mask=(offs_m2[:, None] < Q_LEN) & (offs_k[None, :] < QK_HEAD_DIM))
361
+ else:
362
+ # THIS BLOCK DOES DK & DV
363
+ SPARSE_Q_MULTIPLE = (SPARSE_Q_BLOCK_SIZE // BLOCK_M1)
364
+ SPARSE_KV_MULTIPLE = (SPARSE_KV_BLOCK_SIZE // BLOCK_N1)
365
+
366
+ pid_mask = pid // SPARSE_KV_MULTIPLE
367
+
368
+ stride_q_num_blks_h = ks5
369
+ stride_q_idx_h = ks6*ks7
370
+ stride_q_idx_n = ks6
371
+
372
+
373
+ dv = tl.zeros([BLOCK_N1, V_HEAD_DIM_ROUNDED], dtype=tl.float32)
374
+ dk = tl.zeros([BLOCK_N1, QK_HEAD_DIM_ROUNDED], dtype=tl.float32)
375
+
376
+ start_n1 = pid * BLOCK_N1
377
+ offs_n1 = start_n1 + tl.arange(0, BLOCK_N1)
378
+
379
+ # load K and V: they stay in SRAM throughout the inner loop.
380
+ k = load_checked_2d(K, offs_n1, offs_k, stride_kn, stride_kd, IS_DIVISIBLE, SAFE_HEAD_DIM, KV_LEN, QK_HEAD_DIM)
381
+ v = load_checked_2d(V, offs_n1, offs_v, stride_vn, stride_vd, IS_DIVISIBLE, SAFE_HEAD_DIM, KV_LEN, V_HEAD_DIM)
382
+
383
+ if PRESCALE_QK:
384
+ k = (k * SM_SCALE * RCP_LN2).to(MATMUL_PRECISION)
385
+
386
+ for off_g in range(0, GQA_SHARED_HEADS):
387
+ off_hq1 = off_hkv * GQA_SHARED_HEADS + off_g
388
+
389
+ # Offset Q, DQ, DO, DELTA & LSE. These inputs are offsetted by query heads.
390
+ q_adj1 = (stride_qh * off_hq1 + stride_qz * off_zq).to(tl.int64)
391
+ do_adj1 = (stride_doh * off_hq1 + stride_doz * off_zq).to(tl.int64)
392
+ dq_adj1 = (stride_dqh * off_hq1 + stride_dqz * off_zq).to(tl.int64)
393
+ off_chz1 = ((off_zq * HQ + off_hq1) * Q_LEN).to(tl.int64)
394
+
395
+ Q1 = Q + q_adj1
396
+ DO1 = DO + do_adj1
397
+ # TODO: This does not work if DQ is not the same layout as Q (for example,
398
+ # if Q is broadcasted)
399
+ LSE1 = LSE + off_chz1
400
+ DELTA1 = DELTA + off_chz1
401
+
402
+ sparse_idx_hq1 = off_hq1 % SPARSE_HQ
403
+ sparse_hz_offset = sparse_idx_z * SPARSE_HQ + sparse_idx_hq1
404
+
405
+ sparse_q_num_blks_offset = sparse_hz_offset * stride_q_num_blks_h + pid_mask
406
+ sparse_q_idx_offset = sparse_hz_offset * stride_q_idx_h + pid_mask * stride_q_idx_n # noqa: B950
407
+
408
+ # ~~~~~~~~~~~~~~~ fully unmasked blocks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
409
+ # Q_IDX and Q_NUM_BLKS are always contiguous.
410
+ q_indices = Q_IDX + sparse_q_idx_offset
411
+ q_start = tl.load(q_indices) * SPARSE_Q_BLOCK_SIZE # first q block we're loading
412
+ sparse_q_num_blocks = tl.load(Q_NUM_BLKS + sparse_q_num_blks_offset)
413
+
414
+ offs_m1 = q_start + tl.arange(0, BLOCK_M1)
415
+ dk, dv = bwd_dkdv_inner(
416
+ arg_Q, arg_K, arg_V, arg_LSE, arg_DELTA, arg_DO, arg_DQ, arg_DV, arg_KV_NUM_BLKS, arg_KV_IDX, arg_Q_NUM_BLKS, arg_Q_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, arg_FULL_Q_NUM_BLKS, arg_FULL_Q_IDX, out_ptr0, ks0, ks1, ks2, ks3, ks4, ks5, ks6, ks7,
417
+ Q1, DO1, DELTA1, LSE1,
418
+ dk, dv, k, v,
419
+ off_zq, off_hq1, offs_n1, offs_m1,
420
+ stride_qm, stride_qd, stride_dom, stride_dod,
421
+ q_indices, sparse_q_num_blocks,
422
+ MATMUL_PRECISION,
423
+ IS_FULL_BLOCKS=False,
424
+ )
425
+
426
+
427
+ if HAS_FULL_BLOCKS:
428
+ # ~~~~~~~~~~~~~~~ fully unmasked blocks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
429
+ # FULL_Q_IDX and FULL_Q_NUM_BLKS are always contiguous.
430
+ q_indices = FULL_Q_IDX + sparse_q_idx_offset
431
+ q_start = tl.load(q_indices) * SPARSE_Q_BLOCK_SIZE # first q block we're loading
432
+ sparse_q_num_blocks = tl.load(FULL_Q_NUM_BLKS + sparse_q_num_blks_offset)
433
+
434
+ offs_m1 = q_start + tl.arange(0, BLOCK_M1)
435
+ dk, dv = bwd_dkdv_inner(
436
+ arg_Q, arg_K, arg_V, arg_LSE, arg_DELTA, arg_DO, arg_DQ, arg_DV, arg_KV_NUM_BLKS, arg_KV_IDX, arg_Q_NUM_BLKS, arg_Q_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, arg_FULL_Q_NUM_BLKS, arg_FULL_Q_IDX, out_ptr0, ks0, ks1, ks2, ks3, ks4, ks5, ks6, ks7,
437
+ Q1, DO1, DELTA1, LSE1,
438
+ dk, dv, k, v,
439
+ off_zq, off_hq1, offs_n1, offs_m1,
440
+ stride_qm, stride_qd, stride_dom, stride_dod,
441
+ q_indices, sparse_q_num_blocks,
442
+ MATMUL_PRECISION,
443
+ IS_FULL_BLOCKS=True,
444
+ )
445
+
446
+ # Write back dV and dK.
447
+ dv_ptrs = DV + offs_n1[:, None] * stride_dvm + offs_v[None, :] * stride_dvd
448
+
449
+ index_n = offs_n1[:, None]
450
+ index_k = offs_k[None, :]
451
+ index_v = offs_v[None, :]
452
+
453
+ if IS_DIVISIBLE and SAFE_HEAD_DIM:
454
+ tl.store(dv_ptrs, dv)
455
+ else:
456
+ tl.store(dv_ptrs, dv, mask=(index_n < KV_LEN) & (index_v < V_HEAD_DIM))
457
+
458
+ dk *= SM_SCALE
459
+
460
+ if SAFE_HEAD_DIM:
461
+ mask = index_n < KV_LEN
462
+ else:
463
+ mask = (index_n < KV_LEN) & (index_k < QK_HEAD_DIM)
464
+
465
+ # first compute broadcasted dk of shape [Bq, Hkv, KV_LEN, V_HEAD_DIM]
466
+ # then reduce to dk of shape [Bkv, Hkv, KV_LEN, V_HEAD_DIM]
467
+ tl.static_assert(dk.shape == [BLOCK_N1, QK_HEAD_DIM_ROUNDED])
468
+ xindex = index_k + 128*index_n + 128*off_hkv*ks1 + 1024*off_zq*ks1
469
+ tl.store(out_ptr0 + (tl.broadcast_to(index_k + 128*off_hkv + 1024*index_n, dk.shape)), dk, mask)
470
+
471
+ @triton.jit
472
+ def bwd_dq_inner(
473
+ arg_Q, arg_K, arg_V, arg_LSE, arg_DELTA, arg_DO, arg_DQ, arg_DV, arg_KV_NUM_BLKS, arg_KV_IDX, arg_Q_NUM_BLKS, arg_Q_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, arg_FULL_Q_NUM_BLKS, arg_FULL_Q_IDX, out_ptr0, ks0, ks1, ks2, ks3, ks4, ks5, ks6, ks7,
474
+ K, V, # pointers
475
+ dq, q, do, Di, lse,
476
+ off_z, off_hq, offs_m2, offs_n2,
477
+ stride_kn, stride_kd, stride_vn, stride_vd,
478
+ kv_indices, sparse_kv_num_blocks,
479
+ MATMUL_PRECISION,
480
+ IS_FULL_BLOCKS,
481
+ ):
482
+ PRESCALE_QK : tl.constexpr = False
483
+ ROWS_GUARANTEED_SAFE : tl.constexpr = False
484
+ BLOCKS_ARE_CONTIGUOUS : tl.constexpr = False
485
+ WRITE_DQ : tl.constexpr = True
486
+ OUTPUT_LOGSUMEXP : tl.constexpr = True
487
+ OUTPUT_MAX : tl.constexpr = False
488
+ FLOAT32_PRECISION : tl.constexpr = 'tf32'
489
+ IS_DIVISIBLE : tl.constexpr = False
490
+ SM_SCALE : tl.constexpr = 0.08838834764831845
491
+ GQA_SHARED_HEADS : tl.constexpr = 4
492
+ HAS_FULL_BLOCKS : tl.constexpr = True
493
+ QK_HEAD_DIM : tl.constexpr = 128
494
+ QK_HEAD_DIM_ROUNDED : tl.constexpr = 128
495
+ V_HEAD_DIM : tl.constexpr = 128
496
+ V_HEAD_DIM_ROUNDED : tl.constexpr = 128
497
+ SAFE_HEAD_DIM : tl.constexpr = True
498
+ BLOCK_M1 : tl.constexpr = 64
499
+ BLOCK_N1 : tl.constexpr = 128
500
+ BLOCK_M2 : tl.constexpr = 128
501
+ BLOCK_N2 : tl.constexpr = 64
502
+ SPARSE_Q_BLOCK_SIZE : tl.constexpr = 128
503
+ SPARSE_KV_BLOCK_SIZE : tl.constexpr = 128
504
+ INDEX_DTYPE : tl.constexpr = tl.int32
505
+
506
+ SPARSE_KV_MULTIPLE: tl.constexpr = (SPARSE_KV_BLOCK_SIZE // BLOCK_N2)
507
+ RCP_LN2: tl.constexpr = 1.44269504
508
+ Q_LEN = ks0
509
+ KV_LEN = ks1
510
+
511
+ offs_k = tl.arange(0, QK_HEAD_DIM_ROUNDED)
512
+ offs_v = tl.arange(0, V_HEAD_DIM_ROUNDED)
513
+
514
+ kT_ptrs = K + offs_n2[None, :] * stride_kn + offs_k[:, None] * stride_kd
515
+ vT_ptrs = V + offs_n2[None, :] * stride_vn + offs_v[:, None] * stride_vd
516
+ # BLOCK_M2 must be a multiple of BLOCK_N2, otherwise the code wouldn't work.
517
+ tl.static_assert(BLOCK_M2 % BLOCK_N2 == 0)
518
+
519
+ hi = tl.minimum(sparse_kv_num_blocks * SPARSE_KV_MULTIPLE, tl.maximum(tl.cdiv(KV_LEN, BLOCK_N2), 1))
520
+
521
+ for start_n in range(0, hi):
522
+ dq = bwd_dq_block_mn(
523
+ arg_Q, arg_K, arg_V, arg_LSE, arg_DELTA, arg_DO, arg_DQ, arg_DV, arg_KV_NUM_BLKS, arg_KV_IDX, arg_Q_NUM_BLKS, arg_Q_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, arg_FULL_Q_NUM_BLKS, arg_FULL_Q_IDX, out_ptr0, ks0, ks1, ks2, ks3, ks4, ks5, ks6, ks7,
524
+ dq, q, kT_ptrs, vT_ptrs, do, Di, lse, Q_LEN, KV_LEN,
525
+ off_z, off_hq, offs_m2, offs_n2, offs_k, offs_v,
526
+ stride_kn, stride_kd, stride_vn, stride_vd,
527
+ kv_indices, sparse_kv_num_blocks,
528
+ MATMUL_PRECISION, RCP_LN2,
529
+ IS_FULL_BLOCKS,
530
+ )
531
+
532
+ # Increment pointers.
533
+ offset = get_offset_for_next_block(
534
+ start_n, kv_indices, sparse_kv_num_blocks,
535
+ SPARSE_KV_BLOCK_SIZE, SPARSE_KV_MULTIPLE, BLOCK_N2, BLOCKS_ARE_CONTIGUOUS
536
+ )
537
+
538
+ kT_ptrs += offset * stride_kn
539
+ vT_ptrs += offset * stride_vn
540
+
541
+ offs_n2 += offset
542
+
543
+ return dq
544
+
545
+
546
+ @triton.jit
547
+ def bwd_dq_block_mn(
548
+ arg_Q, arg_K, arg_V, arg_LSE, arg_DELTA, arg_DO, arg_DQ, arg_DV, arg_KV_NUM_BLKS, arg_KV_IDX, arg_Q_NUM_BLKS, arg_Q_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, arg_FULL_Q_NUM_BLKS, arg_FULL_Q_IDX, out_ptr0, ks0, ks1, ks2, ks3, ks4, ks5, ks6, ks7,
549
+ dq, q, kT_ptrs, vT_ptrs, do, Di, lse, Q_LEN, KV_LEN,
550
+ off_z, off_hq, offs_m2, offs_n2, offs_k, offs_v,
551
+ stride_kn, stride_kd, stride_vn, stride_vd,
552
+ kv_indices, sparse_kv_num_blocks,
553
+ MATMUL_PRECISION, RCP_LN2,
554
+ IS_FULL_BLOCKS,
555
+ ):
556
+ PRESCALE_QK : tl.constexpr = False
557
+ ROWS_GUARANTEED_SAFE : tl.constexpr = False
558
+ BLOCKS_ARE_CONTIGUOUS : tl.constexpr = False
559
+ WRITE_DQ : tl.constexpr = True
560
+ OUTPUT_LOGSUMEXP : tl.constexpr = True
561
+ OUTPUT_MAX : tl.constexpr = False
562
+ FLOAT32_PRECISION : tl.constexpr = 'tf32'
563
+ IS_DIVISIBLE : tl.constexpr = False
564
+ SM_SCALE : tl.constexpr = 0.08838834764831845
565
+ GQA_SHARED_HEADS : tl.constexpr = 4
566
+ HAS_FULL_BLOCKS : tl.constexpr = True
567
+ QK_HEAD_DIM : tl.constexpr = 128
568
+ QK_HEAD_DIM_ROUNDED : tl.constexpr = 128
569
+ V_HEAD_DIM : tl.constexpr = 128
570
+ V_HEAD_DIM_ROUNDED : tl.constexpr = 128
571
+ SAFE_HEAD_DIM : tl.constexpr = True
572
+ BLOCK_M1 : tl.constexpr = 64
573
+ BLOCK_N1 : tl.constexpr = 128
574
+ BLOCK_M2 : tl.constexpr = 128
575
+ BLOCK_N2 : tl.constexpr = 64
576
+ SPARSE_Q_BLOCK_SIZE : tl.constexpr = 128
577
+ SPARSE_KV_BLOCK_SIZE : tl.constexpr = 128
578
+ INDEX_DTYPE : tl.constexpr = tl.int32
579
+
580
+
581
+ # NB reversed order to since K is transposed
582
+ kT = load_checked_2d(kT_ptrs, offs_k, offs_n2, None, None, SAFE_HEAD_DIM, IS_DIVISIBLE, QK_HEAD_DIM, KV_LEN)
583
+ qk = tl.dot(q, kT, input_precision=FLOAT32_PRECISION)
584
+ if not PRESCALE_QK:
585
+ qk *= SM_SCALE
586
+ # ~~~~~~~~~~~~~~~~~~~ Apply score modification ~~~~~~~~~~~~~~~~~~~
587
+ pre_mod_scores = qk
588
+ n = get_bounded_indices(offs_n2[None, :], KV_LEN if not IS_DIVISIBLE else None)
589
+ # The boundary check is done for the outer loop, but here it's possible since we're iterating across N dim
590
+ # that the M reads out of bounds for the PIDS spanning the Q_LEN boundary
591
+ m = get_bounded_indices(offs_m2[:, None], Q_LEN if not IS_DIVISIBLE else None)
592
+
593
+ tmp0 = (qk)
594
+ post_mod_scores = tmp0
595
+
596
+
597
+
598
+
599
+ if not IS_DIVISIBLE:
600
+ post_mod_scores = tl.where(offs_n2[None, :] < KV_LEN, post_mod_scores, float("-inf"))
601
+
602
+ if not IS_FULL_BLOCKS:
603
+ tmp1 = (m)
604
+ tmp2 = tl.full([1], 0, tl.int32)
605
+ tmp3 = tmp1 < tmp2
606
+ tmp4 = (n)
607
+ tmp5 = tmp4 <= tmp1
608
+ tmp6 = tmp3 & tmp5
609
+ tmp7 = tmp1 >= tmp2
610
+ tmp8 = tmp4 < tmp2
611
+ tmp9 = tmp7 & tmp8
612
+ tmp10 = tmp8 == 0
613
+ tmp11 = tmp7 & tmp10
614
+ tmp12 = tmp1 - tmp2
615
+ tmp13 = tl.full([1], 16, tl.int32)
616
+ tmp14 = tl.where((tmp12 < 0) != (tmp13 < 0), tl.where(tmp12 % tmp13 != 0, tmp12 // tmp13 - 1, tmp12 // tmp13), tmp12 // tmp13)
617
+ tmp15 = tmp4 - tmp2
618
+ tmp16 = tl.where((tmp15 < 0) != (tmp13 < 0), tl.where(tmp15 % tmp13 != 0, tmp15 // tmp13 - 1, tmp15 // tmp13), tmp15 // tmp13)
619
+ tmp17 = tmp14 == tmp16
620
+ tmp18 = tmp11 & tmp17
621
+ tmp19 = tmp9 | tmp18
622
+ tmp20 = tmp6 | tmp19
623
+ mask_mod_output = tmp20
624
+
625
+
626
+ # apply mask for partial masked block
627
+ post_mod_scores = tl.where(mask_mod_output, post_mod_scores, float("-inf"))
628
+ # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
629
+ if not PRESCALE_QK:
630
+ post_mod_scores *= RCP_LN2
631
+ p = tl.math.exp2(post_mod_scores - lse)
632
+ # Compute dP and dS.
633
+ # NB reversed order to since V is transposed
634
+ vT = load_checked_2d(vT_ptrs, offs_v, offs_n2, None, None, SAFE_HEAD_DIM, IS_DIVISIBLE, V_HEAD_DIM, KV_LEN)
635
+
636
+ dp = tl.dot(do, vT, input_precision=FLOAT32_PRECISION)
637
+ ds = p * (dp - Di[:, None])
638
+ # ~~~~~~~~~~~~~~~~~~~ Apply joint modification ~~~~~~~~~~~~~~~~~~~
639
+ tmp21 = (ds)
640
+ grad_scores = tmp21
641
+
642
+
643
+ if not IS_DIVISIBLE:
644
+ grad_scores = tl.where(offs_n2[None, :] < KV_LEN, grad_scores, 0.0)
645
+
646
+ # ~~~~~~~~~~~~~~~~~~~ Apply other buffer grad writes ~~~~~~~~~~~~~
647
+ if WRITE_DQ:
648
+ scatter_mask = (offs_m2[:, None] < Q_LEN ) & (offs_n2[None, :] < KV_LEN)
649
+
650
+ # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
651
+ ds = grad_scores
652
+
653
+ if not IS_FULL_BLOCKS:
654
+ # (grads) apply mask for partially unmasked block
655
+ ds = tl.where(mask_mod_output, ds, 0.0)
656
+ # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
657
+ ds = ds.to(MATMUL_PRECISION)
658
+ # Compute dQ.
659
+ dq += tl.dot(ds, tl.trans(kT), input_precision=FLOAT32_PRECISION)
660
+
661
+ return dq
662
+
663
+
664
+ @triton.jit
665
+ def bwd_dkdv_inner(
666
+ arg_Q, arg_K, arg_V, arg_LSE, arg_DELTA, arg_DO, arg_DQ, arg_DV, arg_KV_NUM_BLKS, arg_KV_IDX, arg_Q_NUM_BLKS, arg_Q_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, arg_FULL_Q_NUM_BLKS, arg_FULL_Q_IDX, out_ptr0, ks0, ks1, ks2, ks3, ks4, ks5, ks6, ks7,
667
+ Q, DO, DELTA, LSE, # pointers
668
+ dk, dv, k, v,
669
+ off_z, off_hq, offs_n1, offs_m1,
670
+ stride_qm, stride_qd, stride_dom, stride_dod,
671
+ q_indices, sparse_q_num_blocks,
672
+ MATMUL_PRECISION,
673
+ IS_FULL_BLOCKS,
674
+ ):
675
+ PRESCALE_QK : tl.constexpr = False
676
+ ROWS_GUARANTEED_SAFE : tl.constexpr = False
677
+ BLOCKS_ARE_CONTIGUOUS : tl.constexpr = False
678
+ WRITE_DQ : tl.constexpr = True
679
+ OUTPUT_LOGSUMEXP : tl.constexpr = True
680
+ OUTPUT_MAX : tl.constexpr = False
681
+ FLOAT32_PRECISION : tl.constexpr = 'tf32'
682
+ IS_DIVISIBLE : tl.constexpr = False
683
+ SM_SCALE : tl.constexpr = 0.08838834764831845
684
+ GQA_SHARED_HEADS : tl.constexpr = 4
685
+ HAS_FULL_BLOCKS : tl.constexpr = True
686
+ QK_HEAD_DIM : tl.constexpr = 128
687
+ QK_HEAD_DIM_ROUNDED : tl.constexpr = 128
688
+ V_HEAD_DIM : tl.constexpr = 128
689
+ V_HEAD_DIM_ROUNDED : tl.constexpr = 128
690
+ SAFE_HEAD_DIM : tl.constexpr = True
691
+ BLOCK_M1 : tl.constexpr = 64
692
+ BLOCK_N1 : tl.constexpr = 128
693
+ BLOCK_M2 : tl.constexpr = 128
694
+ BLOCK_N2 : tl.constexpr = 64
695
+ SPARSE_Q_BLOCK_SIZE : tl.constexpr = 128
696
+ SPARSE_KV_BLOCK_SIZE : tl.constexpr = 128
697
+ INDEX_DTYPE : tl.constexpr = tl.int32
698
+
699
+ SPARSE_Q_MULTIPLE: tl.constexpr = (SPARSE_Q_BLOCK_SIZE // BLOCK_M1)
700
+ RCP_LN2: tl.constexpr = 1.44269504
701
+ Q_LEN = ks0
702
+ KV_LEN = ks1
703
+
704
+ offs_k = tl.arange(0, QK_HEAD_DIM_ROUNDED)
705
+ offs_v = tl.arange(0, V_HEAD_DIM_ROUNDED)
706
+
707
+ qT_ptrs = Q + offs_m1[None, :] * stride_qm + offs_k[:, None] * stride_qd
708
+ do_ptrs = DO + offs_m1[:, None] * stride_dom + offs_v[None, :] * stride_dod
709
+ # BLOCK_N1 must be a multiple of BLOCK_M1, otherwise the code wouldn't work.
710
+ tl.static_assert(BLOCK_N1 % BLOCK_M1 == 0)
711
+
712
+ # The minimum is needed to handle the case where we run with a super large
713
+ # SPARSE_BLOCK_SIZE (i.e. no block-mask!)
714
+ hi = tl.minimum(sparse_q_num_blocks * SPARSE_Q_MULTIPLE, tl.maximum(tl.cdiv(Q_LEN, BLOCK_M1), 1))
715
+
716
+ for start_m in range(0, hi):
717
+ dk, dv = bwd_dkdv_block_mn(
718
+ arg_Q, arg_K, arg_V, arg_LSE, arg_DELTA, arg_DO, arg_DQ, arg_DV, arg_KV_NUM_BLKS, arg_KV_IDX, arg_Q_NUM_BLKS, arg_Q_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, arg_FULL_Q_NUM_BLKS, arg_FULL_Q_IDX, out_ptr0, ks0, ks1, ks2, ks3, ks4, ks5, ks6, ks7,
719
+ dk, dv, qT_ptrs, k, v, do_ptrs, DELTA, LSE, Q_LEN, KV_LEN,
720
+ off_z, off_hq, offs_n1, offs_m1, offs_k, offs_v,
721
+ stride_qm, stride_qd, stride_dom, stride_dod,
722
+ q_indices, sparse_q_num_blocks,
723
+ MATMUL_PRECISION, RCP_LN2,
724
+ IS_FULL_BLOCKS,
725
+ )
726
+ # Increment pointers.
727
+ offset = get_offset_for_next_block(
728
+ start_m, q_indices, sparse_q_num_blocks,
729
+ SPARSE_Q_BLOCK_SIZE, SPARSE_Q_MULTIPLE, BLOCK_M1, BLOCKS_ARE_CONTIGUOUS
730
+ )
731
+
732
+ qT_ptrs += offset * stride_qm
733
+ do_ptrs += offset * stride_dom
734
+ offs_m1 += offset
735
+
736
+ return dk, dv
737
+
738
+
739
+ @triton.jit
740
+ def bwd_dkdv_block_mn(
741
+ arg_Q, arg_K, arg_V, arg_LSE, arg_DELTA, arg_DO, arg_DQ, arg_DV, arg_KV_NUM_BLKS, arg_KV_IDX, arg_Q_NUM_BLKS, arg_Q_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, arg_FULL_Q_NUM_BLKS, arg_FULL_Q_IDX, out_ptr0, ks0, ks1, ks2, ks3, ks4, ks5, ks6, ks7,
742
+ dk, dv, qT_ptrs, k, v, do_ptrs, DELTA, LSE, Q_LEN, KV_LEN,
743
+ off_z, off_hq, offs_n1, offs_m1, offs_k, offs_v,
744
+ stride_qm, stride_qd, stride_dom, stride_dod,
745
+ q_indices, sparse_q_num_blocks,
746
+ MATMUL_PRECISION, RCP_LN2,
747
+ IS_FULL_BLOCKS,
748
+ ):
749
+ PRESCALE_QK : tl.constexpr = False
750
+ ROWS_GUARANTEED_SAFE : tl.constexpr = False
751
+ BLOCKS_ARE_CONTIGUOUS : tl.constexpr = False
752
+ WRITE_DQ : tl.constexpr = True
753
+ OUTPUT_LOGSUMEXP : tl.constexpr = True
754
+ OUTPUT_MAX : tl.constexpr = False
755
+ FLOAT32_PRECISION : tl.constexpr = 'tf32'
756
+ IS_DIVISIBLE : tl.constexpr = False
757
+ SM_SCALE : tl.constexpr = 0.08838834764831845
758
+ GQA_SHARED_HEADS : tl.constexpr = 4
759
+ HAS_FULL_BLOCKS : tl.constexpr = True
760
+ QK_HEAD_DIM : tl.constexpr = 128
761
+ QK_HEAD_DIM_ROUNDED : tl.constexpr = 128
762
+ V_HEAD_DIM : tl.constexpr = 128
763
+ V_HEAD_DIM_ROUNDED : tl.constexpr = 128
764
+ SAFE_HEAD_DIM : tl.constexpr = True
765
+ BLOCK_M1 : tl.constexpr = 64
766
+ BLOCK_N1 : tl.constexpr = 128
767
+ BLOCK_M2 : tl.constexpr = 128
768
+ BLOCK_N2 : tl.constexpr = 64
769
+ SPARSE_Q_BLOCK_SIZE : tl.constexpr = 128
770
+ SPARSE_KV_BLOCK_SIZE : tl.constexpr = 128
771
+ INDEX_DTYPE : tl.constexpr = tl.int32
772
+
773
+
774
+ # NB reversed order since Q is transposed
775
+ qT = load_checked_2d(qT_ptrs, offs_k, offs_m1, None, None, SAFE_HEAD_DIM, IS_DIVISIBLE, QK_HEAD_DIM, Q_LEN)
776
+ # Load LSE before computing qk to reduce pipeline stall.
777
+ if IS_DIVISIBLE:
778
+ lse = tl.load(LSE + offs_m1)
779
+ else:
780
+ lse = tl.load(LSE + offs_m1, mask=offs_m1 < Q_LEN)
781
+ lse = tl.where(lse == -float("inf"), 0.0, lse)
782
+ qkT = tl.dot(k, qT, input_precision=FLOAT32_PRECISION)
783
+ if not PRESCALE_QK:
784
+ qkT *= SM_SCALE
785
+ # ~~~~~~~~~~~~~~~~~~~ Apply score modification ~~~~~~~~~~~~~~~~~~~
786
+ m = get_bounded_indices(offs_m1[None, :], Q_LEN if not IS_DIVISIBLE else None)
787
+ # The boundary check is done for the outer loop, but here it's possible since we're iterating across M dim
788
+ # that the n reads out of bounds for the PIDS spanning the KV_LEN boundary
789
+ n = get_bounded_indices(offs_n1[:, None], KV_LEN if not IS_DIVISIBLE else None)
790
+
791
+ pre_mod_scores = qkT
792
+ tmp22 = (qkT)
793
+ post_mod_scores = tmp22
794
+
795
+
796
+
797
+ if not IS_DIVISIBLE:
798
+ post_mod_scores = tl.where(offs_m1[None, :] < Q_LEN, post_mod_scores, float("-inf"))
799
+
800
+ if not IS_FULL_BLOCKS:
801
+ tmp23 = (m)
802
+ tmp24 = tl.full([1], 0, tl.int32)
803
+ tmp25 = tmp23 < tmp24
804
+ tmp26 = (n)
805
+ tmp27 = tmp26 <= tmp23
806
+ tmp28 = tmp25 & tmp27
807
+ tmp29 = tmp23 >= tmp24
808
+ tmp30 = tmp26 < tmp24
809
+ tmp31 = tmp29 & tmp30
810
+ tmp32 = tmp30 == 0
811
+ tmp33 = tmp29 & tmp32
812
+ tmp34 = tmp23 - tmp24
813
+ tmp35 = tl.full([1], 16, tl.int32)
814
+ tmp36 = tl.where((tmp34 < 0) != (tmp35 < 0), tl.where(tmp34 % tmp35 != 0, tmp34 // tmp35 - 1, tmp34 // tmp35), tmp34 // tmp35)
815
+ tmp37 = tmp26 - tmp24
816
+ tmp38 = tl.where((tmp37 < 0) != (tmp35 < 0), tl.where(tmp37 % tmp35 != 0, tmp37 // tmp35 - 1, tmp37 // tmp35), tmp37 // tmp35)
817
+ tmp39 = tmp36 == tmp38
818
+ tmp40 = tmp33 & tmp39
819
+ tmp41 = tmp31 | tmp40
820
+ tmp42 = tmp28 | tmp41
821
+ mask_mod_output = tmp42
822
+
823
+ # (grads) apply mask for fully masked block
824
+ post_mod_scores = tl.where(mask_mod_output, post_mod_scores, float("-inf"))
825
+ # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
826
+ if not PRESCALE_QK:
827
+ post_mod_scores *= RCP_LN2
828
+ pT = tl.math.exp2(post_mod_scores - lse[None, :])
829
+ do = load_checked_2d(do_ptrs, offs_m1, offs_v, None, None, IS_DIVISIBLE, SAFE_HEAD_DIM, Q_LEN, V_HEAD_DIM)
830
+ # Compute dV.
831
+ ppT = pT
832
+ dv += tl.dot(ppT.to(MATMUL_PRECISION), do, input_precision=FLOAT32_PRECISION)
833
+ if IS_DIVISIBLE:
834
+ Di = tl.load(DELTA + offs_m1)
835
+ else:
836
+ Di = tl.load(DELTA + offs_m1, mask=offs_m1 < Q_LEN)
837
+ # Compute dP and dS.
838
+ dpT = tl.dot(v, tl.trans(do), input_precision=FLOAT32_PRECISION)
839
+ dsT = pT * (dpT - Di[None, :])
840
+ # ~~~~~~~~~~~~~~~~~~~ Apply joint modification ~~~~~~~~~~~~~~~~~~~
841
+ tmp43 = (dsT)
842
+ grad_scores = tmp43
843
+
844
+
845
+
846
+ if not IS_DIVISIBLE:
847
+ grad_scores = tl.where(offs_m1[None, :] < Q_LEN, grad_scores, 0.0)
848
+
849
+ # ~~~~~~~~~~~~~~~~~~~ Apply other buffer grad writes ~~~~~~~~~~~~~
850
+ if not WRITE_DQ:
851
+ idx_b = off_z
852
+ idx_h = off_hq
853
+ idx_m = m
854
+ idx_n = n
855
+ scatter_mask = (offs_m1[None, :] < Q_LEN) & (offs_n1[:, None] < KV_LEN)
856
+
857
+ # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
858
+ dsT = grad_scores
859
+ if not IS_FULL_BLOCKS:
860
+ # (grads) apply mask for partially unmasked block
861
+ dsT = tl.where(mask_mod_output, dsT, 0.0)
862
+ # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
863
+ dk += tl.dot(dsT.to(MATMUL_PRECISION), tl.trans(qT), input_precision=FLOAT32_PRECISION)
864
+
865
+ return dk, dv
866
+
867
+ # Utility triton funcs
868
+ @triton.jit
869
+ def get_offset_for_next_block(
870
+ loop_iter, col_indices, total_blocks,
871
+ SPARSE_BLOCK, SPARSE_BLOCK_MULTIPLE, BLOCK,
872
+ BLOCKS_ARE_CONTIGUOUS: tl.constexpr
873
+ ):
874
+ if BLOCKS_ARE_CONTIGUOUS:
875
+ return BLOCK
876
+ cur_block_idx = loop_iter // SPARSE_BLOCK_MULTIPLE
877
+ cur_block = tl.load(col_indices + cur_block_idx, eviction_policy="evict_last")
878
+ next_block = tl.load(col_indices + cur_block_idx + 1, eviction_policy="evict_last", mask=cur_block_idx + 1 < total_blocks)
879
+ needs_jump = (loop_iter + 1) % SPARSE_BLOCK_MULTIPLE == 0
880
+ jump_to_block = (next_block - cur_block ) * SPARSE_BLOCK - (SPARSE_BLOCK_MULTIPLE - 1) * BLOCK
881
+ offset = jump_to_block * needs_jump + (1 - needs_jump) * BLOCK
882
+ return offset
883
+
884
+ @triton.jit
885
+ def get_bounded_indices(indices, max_len=None):
886
+ return indices % max_len if max_len is not None else indices
887
+
888
+ @triton.jit
889
+ def load_checked_block(block_ptr, IS_DIVISIBLE: tl.constexpr, SAFE_HEAD_DIM: tl.constexpr):
890
+ if IS_DIVISIBLE and SAFE_HEAD_DIM:
891
+ return tl.load(block_ptr)
892
+ elif IS_DIVISIBLE and not SAFE_HEAD_DIM:
893
+ return tl.load(block_ptr, boundary_check=(1,), padding_option="zero")
894
+ elif not IS_DIVISIBLE and SAFE_HEAD_DIM:
895
+ return tl.load(block_ptr, boundary_check=(0,), padding_option="zero")
896
+ else:
897
+ return tl.load(block_ptr, boundary_check=(0, 1), padding_option="zero")
898
+
899
+ @triton.jit
900
+ def load_checked_2d(
901
+ ptr,
902
+ offs_m,
903
+ offs_n,
904
+ stride_m,
905
+ stride_n,
906
+ IS_DIVISIBLE_M: tl.constexpr,
907
+ IS_DIVISIBLE_N: tl.constexpr,
908
+ M_LEN: tl.constexpr,
909
+ N_LEN: tl.constexpr,
910
+ ):
911
+ # Calculate final pointer if strides are provided
912
+ if stride_m is not None and stride_n is not None:
913
+ ptr = ptr + offs_m[:, None] * stride_m + offs_n[None, :] * stride_n
914
+
915
+ # Handle all masking cases
916
+ if not IS_DIVISIBLE_M and not IS_DIVISIBLE_N:
917
+ return tl.load(ptr, mask=(offs_m[:, None] < M_LEN) & (offs_n[None, :] < N_LEN), other=0.0)
918
+ elif IS_DIVISIBLE_M and not IS_DIVISIBLE_N:
919
+ return tl.load(ptr, mask=(offs_n[None, :] < N_LEN), other=0.0)
920
+ elif not IS_DIVISIBLE_M and IS_DIVISIBLE_N:
921
+ return tl.load(ptr, mask=(offs_m[:, None] < M_LEN), other=0.0)
922
+ else: # Both divisible
923
+ return tl.load(ptr)
924
+ ''', device_str='cuda')
925
+
926
+
927
+ async_compile.wait(globals())
928
+ del async_compile
929
+
930
+ class Runner:
931
+ def __init__(self, partitions):
932
+ self.partitions = partitions
933
+
934
+ def recursively_apply_fns(self, fns):
935
+ new_callables = []
936
+ for fn, c in zip(fns, self.partitions):
937
+ new_callables.append(fn(c))
938
+ self.partitions = new_callables
939
+
940
+ def call(self, args):
941
+ primals_10, primals_11, primals_7, primals_8, primals_12, primals_14, primals_16, primals_17, primals_19, primals_22, primals_21, primals_24, primals_27, primals_26, primals_2, primals_4, primals_6, primals_9, primals_13, primals_15, primals_18, primals_20, primals_23, primals_25, primals_28, getitem, getitem_1, tangents_1, tangents_2 = args
942
+ args.clear()
943
+ s37 = primals_10
944
+ s0 = primals_11
945
+ s22 = primals_7
946
+ s72 = primals_8
947
+ s99 = primals_12
948
+ s94 = primals_14
949
+ s28 = primals_16
950
+ s4 = primals_17
951
+ s56 = primals_19
952
+ s53 = primals_22
953
+ s84 = primals_21
954
+ s100 = primals_24
955
+ s10 = primals_27
956
+ s5 = primals_26
957
+ assert_size_stride(primals_2, (1, 32, s37, 128), (4096*s37, 128, 4096, 1))
958
+ assert_size_stride(primals_4, (1, 8, s0, 128), (1024*s0, 128, 1024, 1))
959
+ assert_size_stride(primals_6, (1, 8, s0, 128), (1024*s0, 128, 1024, 1))
960
+ assert_size_stride(primals_9, (1, 1, s22, s72), (s22*s72, s22*s72, s72, 1))
961
+ assert_size_stride(primals_13, (1, 1, s99), (s99, s99, 1))
962
+ assert_size_stride(primals_15, (1, 1, s94), (s94, s94, 1))
963
+ assert_size_stride(primals_18, (1, 1, s28, s4), (s28*s4, s28*s4, s4, 1))
964
+ assert_size_stride(primals_20, (1, 1, s56), (s56, s56, 1))
965
+ assert_size_stride(primals_23, (1, 1, s84, s53), (s53*s84, s53*s84, s53, 1))
966
+ assert_size_stride(primals_25, (1, 1, s100), (s100, s100, 1))
967
+ assert_size_stride(primals_28, (1, 1, s5, s10), (s10*s5, s10*s5, s10, 1))
968
+ assert_size_stride(getitem, (1, 32, s37, 128), (4096*s37, 128, 4096, 1))
969
+ assert_size_stride(getitem_1, (1, 32, s37), (32*max(1, s37), max(1, s37), 1))
970
+ assert_size_stride(tangents_1, (1, 32, s37, 128), (4096*max(1, s37), 128*max(1, s37), 128, 1))
971
+ assert_size_stride(tangents_2, (1, 32, s37), (32*max(1, s37), max(1, s37), 1))
972
+ with torch.cuda._DeviceGuard(4):
973
+ torch.cuda.set_device(4)
974
+ buf1 = empty_strided_cuda((1, 32, s37), (32*s37, s37, 1), torch.float32)
975
+ # Topologically Sorted Source Nodes: [], Original ATen: [aten.mul]
976
+ triton_red_fused_mul_0_xnumel = 32*s37
977
+ stream4 = get_raw_stream(4)
978
+ triton_red_fused_mul_0.run(getitem, tangents_1, tangents_2, buf1, s37, triton_red_fused_mul_0_xnumel, 128, stream=stream4)
979
+ del getitem
980
+ del tangents_2
981
+ buf3 = empty_strided_cuda((1, 32, s37, 128), (4096*s37, 128, 4096, 1), torch.bfloat16)
982
+ buf4 = empty_strided_cuda((1, 8, s0, 128), (1024*s0, 128, 1024, 1), torch.bfloat16)
983
+ buf5 = empty_strided_cuda((1, 8, s0, 128), (1024*s0, 128, 1024, 1), torch.bfloat16)
984
+ # Topologically Sorted Source Nodes: [], Original ATen: [aten.mul]
985
+ stream4 = get_raw_stream(4)
986
+ triton_tem_fused_mul_1.run(primals_2, primals_4, primals_6, getitem_1, buf1, tangents_1, buf3, buf4, primals_13, primals_9, primals_20, primals_23, primals_15, primals_18, primals_25, primals_28, buf5, s37, s0, s99, s22, s72, s56, s53, s84, 4*((127 + s37) // 128) + ((127 + s0) // 128), 1, 8, stream=stream4)
987
+ del buf1
988
+ del getitem_1
989
+ del primals_13
990
+ del primals_15
991
+ del primals_18
992
+ del primals_2
993
+ del primals_20
994
+ del primals_23
995
+ del primals_25
996
+ del primals_28
997
+ del primals_4
998
+ del primals_6
999
+ del primals_9
1000
+ del tangents_1
1001
+ return (None, buf3, None, buf5, None, buf4, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, )
1002
+
1003
+ runner = Runner(partitions=[])
1004
+ call = runner.call
1005
+ recursively_apply_fns = runner.recursively_apply_fns
1006
+
1007
+
1008
+ def benchmark_compiled_module(times=10, repeat=10):
1009
+ from torch._dynamo.testing import rand_strided
1010
+ from torch._inductor.utils import print_performance
1011
+ primals_10 = 960
1012
+ primals_11 = 960
1013
+ primals_7 = 8
1014
+ primals_8 = 8
1015
+ primals_12 = 8
1016
+ primals_14 = 8
1017
+ primals_16 = 8
1018
+ primals_17 = 8
1019
+ primals_19 = 8
1020
+ primals_22 = 8
1021
+ primals_21 = 8
1022
+ primals_24 = 8
1023
+ primals_27 = 8
1024
+ primals_26 = 8
1025
+ primals_2 = rand_strided((1, 32, 960, 128), (3932160, 128, 4096, 1), device='cuda:4', dtype=torch.bfloat16)
1026
+ primals_4 = rand_strided((1, 8, 960, 128), (983040, 128, 1024, 1), device='cuda:4', dtype=torch.bfloat16)
1027
+ primals_6 = rand_strided((1, 8, 960, 128), (983040, 128, 1024, 1), device='cuda:4', dtype=torch.bfloat16)
1028
+ primals_9 = rand_strided((1, 1, 8, 8), (64, 64, 8, 1), device='cuda:4', dtype=torch.int32)
1029
+ primals_13 = rand_strided((1, 1, 8), (8, 8, 1), device='cuda:4', dtype=torch.int32)
1030
+ primals_15 = rand_strided((1, 1, 8), (8, 8, 1), device='cuda:4', dtype=torch.int32)
1031
+ primals_18 = rand_strided((1, 1, 8, 8), (64, 64, 8, 1), device='cuda:4', dtype=torch.int32)
1032
+ primals_20 = rand_strided((1, 1, 8), (8, 8, 1), device='cuda:4', dtype=torch.int32)
1033
+ primals_23 = rand_strided((1, 1, 8, 8), (64, 64, 8, 1), device='cuda:4', dtype=torch.int32)
1034
+ primals_25 = rand_strided((1, 1, 8), (8, 8, 1), device='cuda:4', dtype=torch.int32)
1035
+ primals_28 = rand_strided((1, 1, 8, 8), (64, 64, 8, 1), device='cuda:4', dtype=torch.int32)
1036
+ getitem = rand_strided((1, 32, 960, 128), (3932160, 128, 4096, 1), device='cuda:4', dtype=torch.bfloat16)
1037
+ getitem_1 = rand_strided((1, 32, 960), (30720, 960, 1), device='cuda:4', dtype=torch.float32)
1038
+ tangents_1 = rand_strided((1, 32, 960, 128), (3932160, 122880, 128, 1), device='cuda:4', dtype=torch.bfloat16)
1039
+ tangents_2 = rand_strided((1, 32, 960), (30720, 960, 1), device='cuda:4', dtype=torch.float32)
1040
+ fn = lambda: call([primals_10, primals_11, primals_7, primals_8, primals_12, primals_14, primals_16, primals_17, primals_19, primals_22, primals_21, primals_24, primals_27, primals_26, primals_2, primals_4, primals_6, primals_9, primals_13, primals_15, primals_18, primals_20, primals_23, primals_25, primals_28, getitem, getitem_1, tangents_1, tangents_2])
1041
+ return print_performance(fn, times=times, repeat=repeat)
1042
+
1043
+
1044
+ if __name__ == "__main__":
1045
+ from torch._inductor.wrapper_benchmark import compiled_module_main
1046
+ compiled_module_main('None', benchmark_compiled_module)
progress/SpecForge/cache/compiled_kernels/2z/c2zqq6qyjomc7iflknbqr7yjdhjux47hzv4nnsi5qfbeqglaip2h.py ADDED
@@ -0,0 +1,707 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # AOT ID: ['4_forward']
2
+ from ctypes import c_void_p, c_long, c_int
3
+ import torch
4
+ import math
5
+ import random
6
+ import os
7
+ import tempfile
8
+ from math import inf, nan
9
+ from cmath import nanj
10
+ from torch._inductor.hooks import run_intermediate_hooks
11
+ from torch._inductor.utils import maybe_profile
12
+ from torch._inductor.codegen.memory_planning import _align as align
13
+ from torch import device, empty_strided
14
+ from torch._inductor.async_compile import AsyncCompile
15
+ from torch._inductor.select_algorithm import extern_kernels
16
+ import triton
17
+ import triton.language as tl
18
+ from torch._inductor.runtime.triton_heuristics import start_graph, end_graph
19
+ from torch._C import _cuda_getCurrentRawStream as get_raw_stream
20
+
21
+ aten = torch.ops.aten
22
+ inductor_ops = torch.ops.inductor
23
+ _quantized = torch.ops._quantized
24
+ assert_size_stride = torch._C._dynamo.guards.assert_size_stride
25
+ assert_alignment = torch._C._dynamo.guards.assert_alignment
26
+ empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
27
+ empty_strided_cpu_pinned = torch._C._dynamo.guards._empty_strided_cpu_pinned
28
+ empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
29
+ empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu
30
+ empty_strided_mtia = torch._C._dynamo.guards._empty_strided_mtia
31
+ reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor
32
+ alloc_from_pool = torch.ops.inductor._alloc_from_pool
33
+ async_compile = AsyncCompile()
34
+ empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p
35
+
36
+
37
+ # kernel path: /workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/cf/ccftanvnrini6kruughcnjtpfiarn7zwa2sdotthpo3wbbjituv3.py
38
+ # Topologically Sorted Source Nodes: [flex_attention], Original ATen: []
39
+ # Source node to ATen node mapping:
40
+ # flex_attention => flex_attention
41
+ # Graph fragment:
42
+ # %primals_2 : Tensor "bf16[1, 32, s37, 128][4096*s37, 128, 4096, 1]cuda:0" = PlaceHolder[target=primals_2]
43
+ # %primals_4 : Tensor "bf16[1, 8, s0, 128][1024*s0, 128, 1024, 1]cuda:0" = PlaceHolder[target=primals_4]
44
+ # %primals_6 : Tensor "bf16[1, 8, s0, 128][1024*s0, 128, 1024, 1]cuda:0" = PlaceHolder[target=primals_6]
45
+ # %getitem_1 : Tensor "f32[1, 32, s37][32*s37, s37, 1]cuda:0" = PlaceHolder[target=getitem_1]
46
+ # %buf1 : Tensor "f32[1, 32, s37][32*s37, s37, 1]cuda:0" = PlaceHolder[target=buf1]
47
+ # %primals_10 : Tensor "i32[1, 1, 1][1, 1, 1]cuda:0" = PlaceHolder[target=primals_10]
48
+ # %primals_7 : Tensor "i32[1, 1, 1, 1][1, 1, 1, 1]cuda:0" = PlaceHolder[target=primals_7]
49
+ # %primals_11 : Tensor "i32[1, 1, 1][1, 1, 1]cuda:0" = PlaceHolder[target=primals_11]
50
+ # %primals_12 : Tensor "i32[1, 1, 1, 1][1, 1, 1, 1]cuda:0" = PlaceHolder[target=primals_12]
51
+ # %flex_attention : [num_users=2] = call_function[target=torch.ops.higher_order.flex_attention](args = (%primals_2, %primals_4, %primals_6, %sdpa_score0, (%primals_8, %primals_9, %primals_10, %primals_7, %primals_11, %primals_12, %primals_13, %primals_14, %primals_15, %primals_16, 128, 128, %sdpa_mask0), 0.08838834764831845, {PRESCALE_QK: False, ROWS_GUARANTEED_SAFE: False, BLOCKS_ARE_CONTIGUOUS: False, WRITE_DQ: True, OUTPUT_LOGSUMEXP: True, OUTPUT_MAX: False}, (), ()), kwargs = {})
52
+ # return %getitem
53
+ triton_tem_fused_0 = async_compile.triton('triton_tem_fused_0', '''
54
+ import triton
55
+ import triton.language as tl
56
+
57
+ from torch._inductor.runtime import triton_helpers, triton_heuristics
58
+ from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
59
+ from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
60
+
61
+ @triton_heuristics.template(
62
+
63
+ num_stages=3,
64
+ num_warps=8,
65
+ triton_meta={'signature': {'arg_Q': '*bf16', 'arg_K': '*bf16', 'arg_V': '*bf16', 'arg_LSE': '*fp32', 'arg_MAX': '*fp32', 'arg_KV_NUM_BLKS': '*i32', 'arg_KV_IDX': '*i32', 'arg_FULL_KV_NUM_BLKS': '*i32', 'arg_FULL_KV_IDX': '*i32', 'out_ptr0': '*bf16', 'ks0': 'i32', 'ks1': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]], (7,): [['tt.divisibility', 16]], (8,): [['tt.divisibility', 16]], (9,): [['tt.divisibility', 16]]}]},
66
+ inductor_meta={'kernel_name': 'triton_tem_fused_0', 'backend_hash': 'B0E5936CA26D1BCD1B577D0B65F13FE7553DC941EB93358973E9F1902BC212C5', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'grid_type': 'FixedGrid', 'fixed_grid': ['_grid_0', '_grid_1', '_grid_2'], 'extra_launcher_args': ['_grid_0', '_grid_1', '_grid_2'], 'config_args': {'PRESCALE_QK': False, 'ROWS_GUARANTEED_SAFE': False, 'BLOCKS_ARE_CONTIGUOUS': False, 'WRITE_DQ': True, 'OUTPUT_LOGSUMEXP': True, 'OUTPUT_MAX': False, 'FLOAT32_PRECISION': "'tf32'", 'IS_DIVISIBLE': False, 'SM_SCALE': 0.08838834764831845, 'GQA_SHARED_HEADS': 4, 'HAS_FULL_BLOCKS': True, 'QK_HEAD_DIM': 128, 'QK_HEAD_DIM_ROUNDED': 128, 'V_HEAD_DIM': 128, 'V_HEAD_DIM_ROUNDED': 128, 'SAFE_HEAD_DIM': True, 'USE_TMA': False, 'BLOCK_M': 128, 'BLOCK_N': 64, 'SPARSE_Q_BLOCK_SIZE': 128, 'SPARSE_KV_BLOCK_SIZE': 128}},
67
+
68
+ )
69
+ @triton.jit
70
+ def triton_tem_fused_0(arg_Q, arg_K, arg_V, arg_LSE, arg_MAX, arg_KV_NUM_BLKS, arg_KV_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, out_ptr0, ks0, ks1):
71
+ PRESCALE_QK : tl.constexpr = False
72
+ ROWS_GUARANTEED_SAFE : tl.constexpr = False
73
+ BLOCKS_ARE_CONTIGUOUS : tl.constexpr = False
74
+ WRITE_DQ : tl.constexpr = True
75
+ OUTPUT_LOGSUMEXP : tl.constexpr = True
76
+ OUTPUT_MAX : tl.constexpr = False
77
+ FLOAT32_PRECISION : tl.constexpr = 'tf32'
78
+ IS_DIVISIBLE : tl.constexpr = False
79
+ SM_SCALE : tl.constexpr = 0.08838834764831845
80
+ GQA_SHARED_HEADS : tl.constexpr = 4
81
+ HAS_FULL_BLOCKS : tl.constexpr = True
82
+ QK_HEAD_DIM : tl.constexpr = 128
83
+ QK_HEAD_DIM_ROUNDED : tl.constexpr = 128
84
+ V_HEAD_DIM : tl.constexpr = 128
85
+ V_HEAD_DIM_ROUNDED : tl.constexpr = 128
86
+ SAFE_HEAD_DIM : tl.constexpr = True
87
+ USE_TMA : tl.constexpr = False
88
+ BLOCK_M : tl.constexpr = 128
89
+ BLOCK_N : tl.constexpr = 64
90
+ SPARSE_Q_BLOCK_SIZE : tl.constexpr = 128
91
+ SPARSE_KV_BLOCK_SIZE : tl.constexpr = 128
92
+ INDEX_DTYPE : tl.constexpr = tl.int32
93
+ Q = arg_Q
94
+ K = arg_K
95
+ V = arg_V
96
+ LSE = arg_LSE
97
+ MAX = arg_MAX
98
+ KV_NUM_BLKS = arg_KV_NUM_BLKS
99
+ KV_IDX = arg_KV_IDX
100
+ FULL_KV_NUM_BLKS = arg_FULL_KV_NUM_BLKS
101
+ FULL_KV_IDX = arg_FULL_KV_IDX
102
+
103
+ # Sub notation for this kernel:
104
+ #
105
+ # Q: Query, K: Key, V: Value
106
+ # M: Number of queries, N: Number of keys/values, D: Model dimension
107
+ # QK_HEAD_DIM: The dimension of the query and key embeddings
108
+ # V_HEAD_DIM: The dimension of the value embeddings
109
+ # z: Batch size, h: Number of heads, m: Number of queries per head, k: Number of keys per head
110
+ # GQA_SHARED_HEADS: number of query heads sharing one kv head in GQA setups.
111
+ #
112
+ # The following FULL_* and PARTIAL_* is defined in the block sparse mask grid, rather than the thread block grid.
113
+ # KV_NUM_BLKS: The number of KV blocks (that may or may not require masking) for each query.
114
+ # KV_IDX: The indices of KV blocks (that may or may not require masking) for each query.
115
+ # FULL_KV_NUM_BLKS: The number of fully unmasked KV blocks (so we don't need masking) for each query.
116
+ # FULL_KV_IDX: The indices of fully unmasked KV blocks (so we don't need masking) for each query.
117
+ #
118
+ # OUTPUT_LOGSUMEXP: We only need to store the logsumexp if we require grad
119
+ #
120
+ # (Modifiable) Performance tuning options
121
+ # BLOCK_M: The thread block size across the seqlen dim of Q.
122
+ # BLOCK_N: Iterate over BLOCK_N across the seqlen dim of K/V in each thread block.
123
+
124
+ # The below are kernel options that can be applied for certain score_mods,
125
+ # or involve a numerics vs. perf tradeoff
126
+ # PRESCALE_QK: Whether to pre-scale QK by 1/sqrt(d) and change of base. Has
127
+ # about 20% more numerical error, but slightly faster.
128
+ # ROWS_GUARANTEED_SAFE: Is it guaranteed that at least one value in each row
129
+ # is not masked out? If so, we can skip an extra safety check
130
+ # BLOCKS_ARE_CONTIGUOUS: Is it guaranteed that all blocks in the mask are
131
+ # contiguous? If so, we don't need to do an indirect jump for every block
132
+
133
+ tl.static_assert(SPARSE_Q_BLOCK_SIZE >= BLOCK_M and SPARSE_Q_BLOCK_SIZE % BLOCK_M == 0)
134
+ tl.static_assert(SPARSE_KV_BLOCK_SIZE >= BLOCK_N and SPARSE_KV_BLOCK_SIZE % BLOCK_N == 0)
135
+
136
+ # Define strides of inputs
137
+ stride_qz, stride_qh, stride_qm, stride_qk = 4096*ks0, 128, 4096, 1
138
+ stride_kz, stride_kh, stride_kn, stride_kk = 1024*ks1, 128, 1024, 1
139
+ stride_vz, stride_vh, stride_vn, stride_vk = 1024*ks1, 128, 1024, 1
140
+
141
+ ZQ = 1
142
+ HQ = 32
143
+ Q_LEN = ks0
144
+ ZKV = 1
145
+ KV_LEN = ks1
146
+
147
+ MATMUL_PRECISION = Q.dtype.element_ty
148
+
149
+ q_start = tl.program_id(0).to(INDEX_DTYPE)
150
+ off_zq = tl.program_id(1).to(INDEX_DTYPE)
151
+ off_hq = tl.program_id(2).to(INDEX_DTYPE)
152
+
153
+ # We support two cases for batch dimension. a) (ZKV == ZQ) where off_zkv = off_zq.
154
+ # b) (ZKV == 1 and ZQ > 1) where KV is broadcasted along the batch dimension and off_zkv=0.
155
+ off_zkv = off_zq % ZKV
156
+ off_hkv = off_hq // GQA_SHARED_HEADS
157
+ off_g = off_hq % GQA_SHARED_HEADS
158
+
159
+ q_offset = off_zq * stride_qz + off_hq * stride_qh
160
+ k_offset = off_zkv * stride_kz + off_hkv * stride_kh
161
+ v_offset = off_zkv * stride_vz + off_hkv * stride_vh
162
+
163
+ Q = Q + q_offset
164
+ K = K + k_offset
165
+ V = V + v_offset
166
+
167
+ # Setting up the TMA descriptors for Q, K, V
168
+ desc_q = None
169
+ desc_k = None
170
+ desc_v = None
171
+
172
+ SPARSE_Z = 1
173
+ SPARSE_HQ = 1
174
+
175
+ sparse_idx_z = off_zq % SPARSE_Z
176
+ sparse_idx_hq = off_hq % SPARSE_HQ
177
+
178
+ SPARSE_Q_MULTIPLE: tl.constexpr = (SPARSE_Q_BLOCK_SIZE // BLOCK_M)
179
+ SPARSE_KV_MULTIPLE: tl.constexpr = (SPARSE_KV_BLOCK_SIZE // BLOCK_N)
180
+
181
+ stride_kv_num_blks_h = 1
182
+ stride_kv_idx_h = 1
183
+ stride_kv_idx_m = 1
184
+
185
+ # initialize pointer to m and l
186
+ m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf")
187
+ l_i = tl.zeros([BLOCK_M], dtype=tl.float32)
188
+ acc = tl.zeros([BLOCK_M, V_HEAD_DIM_ROUNDED], dtype=tl.float32)
189
+
190
+ offs_m = q_start * BLOCK_M + tl.arange(0, BLOCK_M)
191
+
192
+ # KV_IDX and KV_NUM_BLKS are always contiguous.
193
+ sparse_hz_offset = sparse_idx_z * SPARSE_HQ + sparse_idx_hq
194
+ sparse_kv_num_blks_offset = sparse_hz_offset * stride_kv_num_blks_h + q_start // SPARSE_Q_MULTIPLE
195
+ sparse_kv_idx_offset = sparse_hz_offset * stride_kv_idx_h + (q_start // SPARSE_Q_MULTIPLE) * stride_kv_idx_m # noqa: B950
196
+ offs_m = q_start * BLOCK_M + tl.arange(0, BLOCK_M)
197
+ offs_k = tl.arange(0, QK_HEAD_DIM_ROUNDED)
198
+ q = load_checked_2d(Q, offs_m, offs_k, stride_qm, stride_qk, IS_DIVISIBLE, SAFE_HEAD_DIM, Q_LEN, QK_HEAD_DIM)
199
+
200
+ # ~~~~~~~~~~~~~~ normal blocks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
201
+ # We don't know anything "special" about these blocks, so we need to apply
202
+ # both score_mod and mask_mod to it
203
+ kv_indices = KV_IDX + sparse_kv_idx_offset
204
+ kv_start = tl.load(kv_indices) * SPARSE_KV_BLOCK_SIZE # first kv block we're loading
205
+ kv_num_blocks = tl.load(KV_NUM_BLKS + sparse_kv_num_blks_offset)
206
+ block_n_end = tl.minimum(kv_num_blocks * SPARSE_KV_MULTIPLE, tl.maximum(tl.cdiv(KV_LEN, BLOCK_N), 1))
207
+
208
+
209
+ # K and V pointers will be passed directly to forward_inner
210
+
211
+ offs_n = kv_start + tl.arange(0, BLOCK_N)
212
+
213
+
214
+ acc, l_i, m_i = forward_inner(
215
+ arg_Q, arg_K, arg_V, arg_LSE, arg_MAX, arg_KV_NUM_BLKS, arg_KV_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, out_ptr0, ks0, ks1,
216
+ q, K, V,
217
+ desc_k, desc_v, Q_LEN, KV_LEN,
218
+ acc, l_i, m_i,
219
+ off_zq, off_hq, offs_m[:, None], offs_n[None, :],
220
+ kv_start,
221
+ kv_indices, kv_num_blocks,
222
+ 0, block_n_end,
223
+ MATMUL_PRECISION,
224
+ stride_kk, stride_kn, stride_vn, stride_vk,
225
+ IS_FULL_BLOCKS=False,
226
+ )
227
+
228
+ # ~~~~~~~~~~~~~~ "full" blocks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
229
+ # We know these blocks are guaranteed to be "full", so we don't need to
230
+ # apply mask_mod to them - only score_mod
231
+ if HAS_FULL_BLOCKS:
232
+ # FULL_KV_IDX and FULL_KV_NUM_BLKS are always contiguous.
233
+ kv_indices = FULL_KV_IDX + sparse_kv_idx_offset
234
+ kv_start = tl.load(kv_indices) * SPARSE_KV_BLOCK_SIZE # first kv block we're loading
235
+ kv_num_blocks = tl.load(FULL_KV_NUM_BLKS + sparse_kv_num_blks_offset)
236
+ block_n_end = tl.minimum(kv_num_blocks * SPARSE_KV_MULTIPLE, tl.maximum(tl.cdiv(KV_LEN, BLOCK_N), 1))
237
+ # K and V pointers will be passed directly to forward_inner
238
+ offs_n = kv_start + tl.arange(0, BLOCK_N)
239
+
240
+ acc, l_i, m_i = forward_inner(
241
+ arg_Q, arg_K, arg_V, arg_LSE, arg_MAX, arg_KV_NUM_BLKS, arg_KV_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, out_ptr0, ks0, ks1,
242
+ q, K, V,
243
+ desc_k, desc_v, Q_LEN, KV_LEN,
244
+ acc, l_i, m_i,
245
+ off_zq, off_hq, offs_m[:, None], offs_n[None, :],
246
+ kv_start,
247
+ kv_indices, kv_num_blocks,
248
+ 0, block_n_end,
249
+ MATMUL_PRECISION,
250
+ stride_kk, stride_kn, stride_vn, stride_vk,
251
+ IS_FULL_BLOCKS=True,
252
+ )
253
+
254
+
255
+ # [Note] Handle fully masked out rows:
256
+ # Li will be the sum(e^(-inf)) == 0.0 for masked out rows, mi will be -inf.
257
+ # We set Li to 1.0 which will result in lse/out = 0.0 | after the log(li) + mi(0.0) step
258
+ l_i = tl.where(l_i == 0.0, 1, l_i)
259
+
260
+ acc = acc / l_i[:, None]
261
+ idx_zq = tl.program_id(1).to(INDEX_DTYPE)
262
+ idx_hq = tl.program_id(2).to(INDEX_DTYPE)
263
+ idx_m = offs_m[:, None].to(INDEX_DTYPE)
264
+ idx_d = tl.arange(0, V_HEAD_DIM_ROUNDED)[None, :].to(INDEX_DTYPE)
265
+
266
+ mask = (idx_m < Q_LEN) & (idx_d < V_HEAD_DIM)
267
+
268
+ tl.static_assert(acc.shape == [BLOCK_M, V_HEAD_DIM_ROUNDED])
269
+ xindex = idx_d + 128*idx_m + 128*idx_hq*ks0 + 4096*idx_zq*ks0
270
+ tl.store(out_ptr0 + (tl.broadcast_to(idx_d + 128*idx_hq + 4096*idx_m, acc.shape)), acc, mask)
271
+
272
+ if OUTPUT_LOGSUMEXP:
273
+ off_hz = off_zq * HQ + off_hq
274
+ l_ptrs = LSE + off_hz * Q_LEN + offs_m
275
+ lse = m_i + tl.math.log2(l_i)
276
+ if IS_DIVISIBLE:
277
+ tl.store(l_ptrs, lse)
278
+ else:
279
+ tl.store(l_ptrs, lse, mask=offs_m < Q_LEN)
280
+
281
+ if OUTPUT_MAX:
282
+ off_hz = off_zq * HQ + off_hq
283
+ max_ptrs = MAX + off_hz * Q_LEN + offs_m
284
+ if IS_DIVISIBLE:
285
+ tl.store(max_ptrs, m_i)
286
+ else:
287
+ tl.store(max_ptrs, m_i, mask=offs_m < Q_LEN)
288
+
289
+
290
+ # Utility triton funcs
291
+ @triton.jit
292
+ def get_offset_for_next_block(
293
+ loop_iter, col_indices, total_blocks,
294
+ SPARSE_BLOCK, SPARSE_BLOCK_MULTIPLE, BLOCK,
295
+ BLOCKS_ARE_CONTIGUOUS: tl.constexpr
296
+ ):
297
+ if BLOCKS_ARE_CONTIGUOUS:
298
+ return BLOCK
299
+ cur_block_idx = loop_iter // SPARSE_BLOCK_MULTIPLE
300
+ cur_block = tl.load(col_indices + cur_block_idx, eviction_policy="evict_last")
301
+ next_block = tl.load(col_indices + cur_block_idx + 1, eviction_policy="evict_last", mask=cur_block_idx + 1 < total_blocks)
302
+ needs_jump = (loop_iter + 1) % SPARSE_BLOCK_MULTIPLE == 0
303
+ jump_to_block = (next_block - cur_block ) * SPARSE_BLOCK - (SPARSE_BLOCK_MULTIPLE - 1) * BLOCK
304
+ offset = jump_to_block * needs_jump + (1 - needs_jump) * BLOCK
305
+ return offset
306
+
307
+ @triton.jit
308
+ def get_bounded_indices(indices, max_len=None):
309
+ return indices % max_len if max_len is not None else indices
310
+
311
+ @triton.jit
312
+ def load_checked_block(block_ptr, IS_DIVISIBLE: tl.constexpr, SAFE_HEAD_DIM: tl.constexpr):
313
+ if IS_DIVISIBLE and SAFE_HEAD_DIM:
314
+ return tl.load(block_ptr)
315
+ elif IS_DIVISIBLE and not SAFE_HEAD_DIM:
316
+ return tl.load(block_ptr, boundary_check=(1,), padding_option="zero")
317
+ elif not IS_DIVISIBLE and SAFE_HEAD_DIM:
318
+ return tl.load(block_ptr, boundary_check=(0,), padding_option="zero")
319
+ else:
320
+ return tl.load(block_ptr, boundary_check=(0, 1), padding_option="zero")
321
+
322
+ @triton.jit
323
+ def load_checked_2d(
324
+ ptr,
325
+ offs_m,
326
+ offs_n,
327
+ stride_m,
328
+ stride_n,
329
+ IS_DIVISIBLE_M: tl.constexpr,
330
+ IS_DIVISIBLE_N: tl.constexpr,
331
+ M_LEN: tl.constexpr,
332
+ N_LEN: tl.constexpr,
333
+ ):
334
+ # Calculate final pointer if strides are provided
335
+ if stride_m is not None and stride_n is not None:
336
+ ptr = ptr + offs_m[:, None] * stride_m + offs_n[None, :] * stride_n
337
+
338
+ # Handle all masking cases
339
+ if not IS_DIVISIBLE_M and not IS_DIVISIBLE_N:
340
+ return tl.load(ptr, mask=(offs_m[:, None] < M_LEN) & (offs_n[None, :] < N_LEN), other=0.0)
341
+ elif IS_DIVISIBLE_M and not IS_DIVISIBLE_N:
342
+ return tl.load(ptr, mask=(offs_n[None, :] < N_LEN), other=0.0)
343
+ elif not IS_DIVISIBLE_M and IS_DIVISIBLE_N:
344
+ return tl.load(ptr, mask=(offs_m[:, None] < M_LEN), other=0.0)
345
+ else: # Both divisible
346
+ return tl.load(ptr)
347
+
348
+
349
+ # Common Imports
350
+ @triton.jit
351
+ def forward_block_mn(
352
+ arg_Q, arg_K, arg_V, arg_LSE, arg_MAX, arg_KV_NUM_BLKS, arg_KV_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, out_ptr0, ks0, ks1,
353
+ q, K, V, desc_k, desc_v, Q_LEN, KV_LEN,
354
+ # accumulated values
355
+ acc, l_i, m_i,
356
+ # Offsets
357
+ off_z, off_h, offs_m, offs_n,
358
+ # Offsets needed for TMA loads
359
+ kv_start,
360
+ kv_offset,
361
+ MATMUL_PRECISION, RCP_LN2,
362
+ # Strides for K and V
363
+ stride_kk, stride_kn, stride_vn, stride_vk,
364
+ IS_FULL_BLOCKS, CHECK_BLOCK_BOUNDARY=False,
365
+
366
+ ):
367
+ # Redefines all kernel parameters (BLOCK_M, etc.) so we don't need to plumb them all through
368
+ PRESCALE_QK : tl.constexpr = False
369
+ ROWS_GUARANTEED_SAFE : tl.constexpr = False
370
+ BLOCKS_ARE_CONTIGUOUS : tl.constexpr = False
371
+ WRITE_DQ : tl.constexpr = True
372
+ OUTPUT_LOGSUMEXP : tl.constexpr = True
373
+ OUTPUT_MAX : tl.constexpr = False
374
+ FLOAT32_PRECISION : tl.constexpr = 'tf32'
375
+ IS_DIVISIBLE : tl.constexpr = False
376
+ SM_SCALE : tl.constexpr = 0.08838834764831845
377
+ GQA_SHARED_HEADS : tl.constexpr = 4
378
+ HAS_FULL_BLOCKS : tl.constexpr = True
379
+ QK_HEAD_DIM : tl.constexpr = 128
380
+ QK_HEAD_DIM_ROUNDED : tl.constexpr = 128
381
+ V_HEAD_DIM : tl.constexpr = 128
382
+ V_HEAD_DIM_ROUNDED : tl.constexpr = 128
383
+ SAFE_HEAD_DIM : tl.constexpr = True
384
+ USE_TMA : tl.constexpr = False
385
+ BLOCK_M : tl.constexpr = 128
386
+ BLOCK_N : tl.constexpr = 64
387
+ SPARSE_Q_BLOCK_SIZE : tl.constexpr = 128
388
+ SPARSE_KV_BLOCK_SIZE : tl.constexpr = 128
389
+ INDEX_DTYPE : tl.constexpr = tl.int32
390
+
391
+
392
+ # -- load k --
393
+ # NB reversed order to since K is transposed
394
+ kv_base_offset = kv_start + kv_offset
395
+
396
+ # Load K as [BLOCK_N, QK_HEAD_DIM_ROUNDED] then transpose to [QK_HEAD_DIM_ROUNDED, BLOCK_N]
397
+ offs_k = tl.arange(0, QK_HEAD_DIM_ROUNDED)
398
+ offs_n_load = kv_base_offset + tl.arange(0, BLOCK_N)
399
+ k = load_checked_2d(K, offs_n_load, offs_k, stride_kn, stride_kk, IS_DIVISIBLE, SAFE_HEAD_DIM, KV_LEN, QK_HEAD_DIM)
400
+
401
+ k = tl.trans(k)
402
+ # -- compute qk ---
403
+ qk = tl.dot(q, k, input_precision=FLOAT32_PRECISION) # TODO: use cuda matmul when q_len <= 2.
404
+ if not PRESCALE_QK:
405
+ qk *= SM_SCALE
406
+ # ~~~~~~~~~~~~~~~~~~~ Apply score modification ~~~~~~~~~~~~~~~~~~~
407
+ # If this is the last block of a non divisible seqlen, we still need to load [BLOCK_M, BLOCK_N] elements,
408
+ # which is larger than the actual number of elements. To avoid access memory out of bound,
409
+ # we need to mask out the elements that are out of Q_LEN & KV_LEN.
410
+ m = get_bounded_indices(offs_m, Q_LEN if CHECK_BLOCK_BOUNDARY else None)
411
+ n = get_bounded_indices(offs_n, KV_LEN if CHECK_BLOCK_BOUNDARY else None)
412
+
413
+ tmp0 = (qk)
414
+ post_mod_scores = tmp0
415
+
416
+
417
+ if CHECK_BLOCK_BOUNDARY:
418
+ # Mask out the elements that are out of the KV_LEN for non divisible seqlen.
419
+ post_mod_scores = tl.where(offs_n < KV_LEN, post_mod_scores, float("-inf"))
420
+
421
+ if not IS_FULL_BLOCKS:
422
+ tmp1 = (m)
423
+ tmp2 = tl.full([1], 0, tl.int32)
424
+ tmp3 = tmp1 < tmp2
425
+ tmp4 = (n)
426
+ tmp5 = tmp4 <= tmp1
427
+ tmp6 = tmp3 & tmp5
428
+ tmp7 = tmp1 >= tmp2
429
+ tmp8 = tmp4 < tmp2
430
+ tmp9 = tmp7 & tmp8
431
+ tmp10 = tmp8 == 0
432
+ tmp11 = tmp7 & tmp10
433
+ tmp12 = tmp1 - tmp2
434
+ tmp13 = tl.full([1], 16, tl.int32)
435
+ tmp14 = tl.where((tmp12 < 0) != (tmp13 < 0), tl.where(tmp12 % tmp13 != 0, tmp12 // tmp13 - 1, tmp12 // tmp13), tmp12 // tmp13)
436
+ tmp15 = tmp4 - tmp2
437
+ tmp16 = tl.where((tmp15 < 0) != (tmp13 < 0), tl.where(tmp15 % tmp13 != 0, tmp15 // tmp13 - 1, tmp15 // tmp13), tmp15 // tmp13)
438
+ tmp17 = tmp14 == tmp16
439
+ tmp18 = tmp11 & tmp17
440
+ tmp19 = tmp9 | tmp18
441
+ tmp20 = tmp6 | tmp19
442
+ mask_mod_output = tmp20
443
+
444
+
445
+ if CHECK_BLOCK_BOUNDARY:
446
+ mask_mod_output = tl.where(offs_n < KV_LEN, mask_mod_output, False)
447
+ # apply mask for partially unmasked blocks
448
+ post_mod_scores = tl.where(mask_mod_output, post_mod_scores, float("-inf"))
449
+
450
+ if not PRESCALE_QK:
451
+ post_mod_scores *= RCP_LN2
452
+ # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
453
+
454
+ # -- compute scaling constant ---
455
+ m_ij = tl.maximum(m_i, tl.max(post_mod_scores, 1))
456
+ if not ROWS_GUARANTEED_SAFE:
457
+ masked_out_rows = (m_ij == float("-inf"))
458
+ m_ij_masked = tl.where(masked_out_rows, 0, m_ij)
459
+ else:
460
+ m_ij_masked = m_ij
461
+
462
+ alpha = tl.math.exp2(m_i - m_ij_masked)
463
+ p = tl.math.exp2(post_mod_scores - m_ij_masked[:, None])
464
+
465
+ # NB: l_i update is pulled up here since it's a bit faster
466
+ # NB: For headdim=256, it's faster to move it back down to after m_i =
467
+ # m_ij
468
+ l_i = l_i * alpha + tl.sum(p, 1)
469
+ # # -- scale and update acc --
470
+ acc = acc * alpha[:, None]
471
+ # Calculate offsets for V loading - reuse kv_base_offset from K loading
472
+ offs_v = tl.arange(0, V_HEAD_DIM_ROUNDED)
473
+ v = load_checked_2d(V, offs_n_load, offs_v, stride_vn, stride_vk, IS_DIVISIBLE, SAFE_HEAD_DIM, KV_LEN, V_HEAD_DIM)
474
+ acc = tl.dot(p.to(MATMUL_PRECISION), v, acc, input_precision=FLOAT32_PRECISION)
475
+
476
+ # -- update m_i
477
+ m_i = m_ij
478
+
479
+ return acc, l_i, m_i
480
+
481
+ @triton.jit
482
+ def forward_inner(
483
+ arg_Q, arg_K, arg_V, arg_LSE, arg_MAX, arg_KV_NUM_BLKS, arg_KV_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, out_ptr0, ks0, ks1,
484
+ q, K, V,
485
+ desc_k, desc_v, Q_LEN, KV_LEN,
486
+ # accumulated values
487
+ acc, l_i, m_i,
488
+ # Offsets used as inputs to score_mod & mask_mod
489
+ # of size [BLOCK_M, BLOCK_N] or scalar.
490
+ off_z, off_h, offs_m, offs_n,
491
+ # Offsets needed for TMA loads
492
+ kv_start,
493
+ # blocksparse data
494
+ kv_indices, kv_num_blocks,
495
+ # start kv and end kv block
496
+ block_n_start, block_n_end,
497
+ MATMUL_PRECISION,
498
+ # Strides for K and V
499
+ stride_kk, stride_kn, stride_vn, stride_vk,
500
+ IS_FULL_BLOCKS,
501
+ ):
502
+ # Redefines all kernel parameters (BLOCK_M, etc.) so we don't need to plumb them all through
503
+ PRESCALE_QK : tl.constexpr = False
504
+ ROWS_GUARANTEED_SAFE : tl.constexpr = False
505
+ BLOCKS_ARE_CONTIGUOUS : tl.constexpr = False
506
+ WRITE_DQ : tl.constexpr = True
507
+ OUTPUT_LOGSUMEXP : tl.constexpr = True
508
+ OUTPUT_MAX : tl.constexpr = False
509
+ FLOAT32_PRECISION : tl.constexpr = 'tf32'
510
+ IS_DIVISIBLE : tl.constexpr = False
511
+ SM_SCALE : tl.constexpr = 0.08838834764831845
512
+ GQA_SHARED_HEADS : tl.constexpr = 4
513
+ HAS_FULL_BLOCKS : tl.constexpr = True
514
+ QK_HEAD_DIM : tl.constexpr = 128
515
+ QK_HEAD_DIM_ROUNDED : tl.constexpr = 128
516
+ V_HEAD_DIM : tl.constexpr = 128
517
+ V_HEAD_DIM_ROUNDED : tl.constexpr = 128
518
+ SAFE_HEAD_DIM : tl.constexpr = True
519
+ USE_TMA : tl.constexpr = False
520
+ BLOCK_M : tl.constexpr = 128
521
+ BLOCK_N : tl.constexpr = 64
522
+ SPARSE_Q_BLOCK_SIZE : tl.constexpr = 128
523
+ SPARSE_KV_BLOCK_SIZE : tl.constexpr = 128
524
+ INDEX_DTYPE : tl.constexpr = tl.int32
525
+
526
+
527
+ SPARSE_KV_MULTIPLE: tl.constexpr = (SPARSE_KV_BLOCK_SIZE // BLOCK_N)
528
+ RCP_LN2: tl.constexpr = 1.44269504
529
+
530
+ if PRESCALE_QK:
531
+ q = (q * SM_SCALE * RCP_LN2).to(MATMUL_PRECISION)
532
+
533
+ kv_offset = 0
534
+
535
+ # loop over k, v and update accumulator until block_n_end
536
+ for start_n in range(block_n_start, block_n_end):
537
+ # Here IS_DIVISIBLE acts are the start_n = tl.multiple_of(start_n, BLOCK_N) from triton_fused_attention.
538
+ if IS_DIVISIBLE:
539
+ acc, l_i, m_i = forward_block_mn(
540
+ arg_Q, arg_K, arg_V, arg_LSE, arg_MAX, arg_KV_NUM_BLKS, arg_KV_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, out_ptr0, ks0, ks1,
541
+ q, K, V, desc_k, desc_v, Q_LEN, KV_LEN,
542
+ # accumulated values
543
+ acc, l_i, m_i,
544
+ # Offsets
545
+ off_z, off_h, offs_m, offs_n,
546
+ # Offsets needed for TMA loads
547
+ kv_start,
548
+ kv_offset,
549
+ MATMUL_PRECISION, RCP_LN2,
550
+ # Strides for K and V
551
+ stride_kk, stride_kn, stride_vn, stride_vk,
552
+ IS_FULL_BLOCKS,
553
+ )
554
+ else:
555
+ # Benchmark shows even we applied mod & mask to each block for non divisible seqlen,
556
+ # it's on par or slightly faster than only applying to the last block in fwd.
557
+ # However, we choose different strategy for bwd, where we only apply mod & mask
558
+ # to the last block because it's faster a lot.
559
+ acc, l_i, m_i = forward_block_mn(
560
+ arg_Q, arg_K, arg_V, arg_LSE, arg_MAX, arg_KV_NUM_BLKS, arg_KV_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, out_ptr0, ks0, ks1,
561
+ q, K, V, desc_k, desc_v, Q_LEN, KV_LEN,
562
+ # accumulated values
563
+ acc, l_i, m_i,
564
+ # Offsets
565
+ off_z, off_h, offs_m, offs_n,
566
+ # Offsets needed for TMA loads
567
+ kv_start,
568
+ kv_offset,
569
+ MATMUL_PRECISION, RCP_LN2,
570
+ # Strides for K and V
571
+ stride_kk, stride_kn, stride_vn, stride_vk,
572
+ IS_FULL_BLOCKS, CHECK_BLOCK_BOUNDARY=True,
573
+ )
574
+
575
+
576
+
577
+ offset = get_offset_for_next_block(
578
+ start_n, kv_indices, kv_num_blocks,
579
+ SPARSE_KV_BLOCK_SIZE, SPARSE_KV_MULTIPLE, BLOCK_N, BLOCKS_ARE_CONTIGUOUS
580
+ )
581
+
582
+ offs_n = offs_n + offset
583
+ kv_offset += offset
584
+
585
+
586
+ return acc, l_i, m_i
587
+ ''', device_str='cuda')
588
+
589
+
590
+ # kernel path: /workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/uu/cuu2rr2yygwarlbfvcbucg7erbfsky4wxudbfsdny5wzgxewg4ut.py
591
+ # Topologically Sorted Source Nodes: [lse_scaled], Original ATen: [aten.mul]
592
+ # Source node to ATen node mapping:
593
+ # lse_scaled => mul_15
594
+ # Graph fragment:
595
+ # %buf3 : Tensor = PlaceHolder[target=buf3]
596
+ # %mul_15 : Tensor "f32[1, 32, s37][32*Max(1, s37), Max(1, s37), 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%getitem_1, 0.6931471805599453), kwargs = {})
597
+ # return %mul_15
598
+ triton_poi_fused_mul_1 = async_compile.triton('triton_poi_fused_mul_1', '''
599
+ import triton
600
+ import triton.language as tl
601
+
602
+ from torch._inductor.runtime import triton_helpers, triton_heuristics
603
+ from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
604
+ from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
605
+ triton_helpers.set_driver_to_gpu()
606
+
607
+ @triton_heuristics.pointwise(
608
+ size_hints={'x': 4096},
609
+ filename=__file__,
610
+ triton_meta={'signature': {'in_ptr0': '*fp32', 'out_ptr0': '*fp32', 'ks0': 'i64', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]]}]},
611
+ inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_mul_1', 'mutated_arg_names': [], 'optimize_mem': False, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': 'B0E5936CA26D1BCD1B577D0B65F13FE7553DC941EB93358973E9F1902BC212C5', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False},
612
+ min_elem_per_thread=0
613
+ )
614
+ @triton.jit
615
+ def triton_poi_fused_mul_1(in_ptr0, out_ptr0, ks0, xnumel, XBLOCK : tl.constexpr):
616
+ xoffset = tl.program_id(0) * XBLOCK
617
+ xindex = xoffset + tl.arange(0, XBLOCK)[:]
618
+ xmask = xindex < xnumel
619
+ x2 = xindex
620
+ x0 = (xindex % ks0)
621
+ x1 = triton_helpers.div_floor_integer(xindex, ks0)
622
+ tmp0 = tl.load(in_ptr0 + (x2), xmask, eviction_policy='evict_last')
623
+ tmp1 = 0.6931471805599453
624
+ tmp2 = tmp0 * tmp1
625
+ tl.store(out_ptr0 + (x0 + x1*((1) * ((1) >= (ks0)) + (ks0) * ((ks0) > (1)))), tmp2, xmask)
626
+ ''', device_str='cuda')
627
+
628
+
629
+ async_compile.wait(globals())
630
+ del async_compile
631
+
632
+ class Runner:
633
+ def __init__(self, partitions):
634
+ self.partitions = partitions
635
+
636
+ def recursively_apply_fns(self, fns):
637
+ new_callables = []
638
+ for fn, c in zip(fns, self.partitions):
639
+ new_callables.append(fn(c))
640
+ self.partitions = new_callables
641
+
642
+ def call(self, args):
643
+ primals_1, primals_2, primals_3, primals_4, primals_5, primals_6, primals_7, primals_8, primals_9, primals_10, primals_11, primals_12, primals_13, primals_14, primals_15, primals_16 = args
644
+ args.clear()
645
+ s50 = primals_1
646
+ s0 = primals_3
647
+ s43 = primals_5
648
+ s37 = primals_8
649
+ s71 = primals_9
650
+ assert_size_stride(primals_2, (1, 32, s37, 128), (4096*s37, 128, 4096, 1))
651
+ assert_size_stride(primals_4, (1, 8, s0, 128), (1024*s0, 128, 1024, 1))
652
+ assert_size_stride(primals_6, (1, 8, s0, 128), (1024*s0, 128, 1024, 1))
653
+ assert_size_stride(primals_7, (1, 1, 1, 1), (1, 1, 1, 1))
654
+ assert_size_stride(primals_10, (1, 1, 1), (1, 1, 1))
655
+ assert_size_stride(primals_11, (1, 1, 1), (1, 1, 1))
656
+ assert_size_stride(primals_12, (1, 1, 1, 1), (1, 1, 1, 1))
657
+ assert_size_stride(primals_13, (1, 1, 1), (1, 1, 1))
658
+ assert_size_stride(primals_14, (1, 1, 1, 1), (1, 1, 1, 1))
659
+ assert_size_stride(primals_15, (1, 1, 1), (1, 1, 1))
660
+ assert_size_stride(primals_16, (1, 1, 1, 1), (1, 1, 1, 1))
661
+ with torch.cuda._DeviceGuard(0):
662
+ torch.cuda.set_device(0)
663
+ buf0 = empty_strided_cuda((1, 32, s37), (32*s37, s37, 1), torch.float32)
664
+ buf1 = empty_strided_cuda((1, 32, s37), (32*s37, s37, 1), torch.float32)
665
+ buf2 = empty_strided_cuda((1, 32, s37, 128), (4096*s37, 128, 4096, 1), torch.bfloat16)
666
+ # Topologically Sorted Source Nodes: [flex_attention], Original ATen: []
667
+ stream0 = get_raw_stream(0)
668
+ triton_tem_fused_0.run(primals_2, primals_4, primals_6, buf0, buf1, primals_10, primals_7, primals_11, primals_12, buf2, s37, s0, (127 + s37) // 128, 1, 32, stream=stream0)
669
+ del buf1
670
+ buf5 = empty_strided_cuda((1, 32, s37), (32*max(1, s37), max(1, s37), 1), torch.float32)
671
+ # Topologically Sorted Source Nodes: [lse_scaled], Original ATen: [aten.mul]
672
+ triton_poi_fused_mul_1_xnumel = 32*s37
673
+ stream0 = get_raw_stream(0)
674
+ triton_poi_fused_mul_1.run(buf0, buf5, s37, triton_poi_fused_mul_1_xnumel, stream=stream0)
675
+ return (buf2, buf5, primals_2, primals_4, primals_6, primals_7, primals_10, primals_11, primals_12, primals_13, primals_14, primals_15, primals_16, buf2, buf0, s37, s0, )
676
+
677
+ runner = Runner(partitions=[])
678
+ call = runner.call
679
+ recursively_apply_fns = runner.recursively_apply_fns
680
+
681
+
682
+ def benchmark_compiled_module(times=10, repeat=10):
683
+ from torch._dynamo.testing import rand_strided
684
+ from torch._inductor.utils import print_performance
685
+ primals_1 = 128
686
+ primals_2 = rand_strided((1, 32, 128, 128), (524288, 128, 4096, 1), device='cuda:0', dtype=torch.bfloat16)
687
+ primals_3 = 128
688
+ primals_4 = rand_strided((1, 8, 128, 128), (131072, 128, 1024, 1), device='cuda:0', dtype=torch.bfloat16)
689
+ primals_5 = 128
690
+ primals_6 = rand_strided((1, 8, 128, 128), (131072, 128, 1024, 1), device='cuda:0', dtype=torch.bfloat16)
691
+ primals_7 = rand_strided((1, 1, 1, 1), (1, 1, 1, 1), device='cuda:0', dtype=torch.int32)
692
+ primals_8 = 128
693
+ primals_9 = 128
694
+ primals_10 = rand_strided((1, 1, 1), (1, 1, 1), device='cuda:0', dtype=torch.int32)
695
+ primals_11 = rand_strided((1, 1, 1), (1, 1, 1), device='cuda:0', dtype=torch.int32)
696
+ primals_12 = rand_strided((1, 1, 1, 1), (1, 1, 1, 1), device='cuda:0', dtype=torch.int32)
697
+ primals_13 = rand_strided((1, 1, 1), (1, 1, 1), device='cuda:0', dtype=torch.int32)
698
+ primals_14 = rand_strided((1, 1, 1, 1), (1, 1, 1, 1), device='cuda:0', dtype=torch.int32)
699
+ primals_15 = rand_strided((1, 1, 1), (1, 1, 1), device='cuda:0', dtype=torch.int32)
700
+ primals_16 = rand_strided((1, 1, 1, 1), (1, 1, 1, 1), device='cuda:0', dtype=torch.int32)
701
+ fn = lambda: call([primals_1, primals_2, primals_3, primals_4, primals_5, primals_6, primals_7, primals_8, primals_9, primals_10, primals_11, primals_12, primals_13, primals_14, primals_15, primals_16])
702
+ return print_performance(fn, times=times, repeat=repeat)
703
+
704
+
705
+ if __name__ == "__main__":
706
+ from torch._inductor.wrapper_benchmark import compiled_module_main
707
+ compiled_module_main('None', benchmark_compiled_module)
progress/SpecForge/cache/compiled_kernels/32/8d96bbe05a966b7e7756831f09a79e31bf46fad0952af86f36d75557fc1735e8.best_config ADDED
@@ -0,0 +1 @@
 
 
1
+ {"XBLOCK": 512, "num_warps": 4, "num_stages": 1, "configs_hash": "7cced77f371acaa5aa7d90332a90e0c907727cfefb71d9cc9d997c24557fc44f", "found_by_coordesc": false, "time_taken_ms": 13, "triton_cache_hash": "BGHEC74L2RGBNBI3A4UJOTHXFUUKS4KY3KJKVN65FHLWR47O6USQ"}
progress/SpecForge/cache/compiled_kernels/32/c32pbcuz72bjfnkzvckfbbzlzuupc5yxl7t47b3qf74mmk5g2d2z.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import triton
3
+ import triton.language as tl
4
+
5
+ from torch._inductor.runtime import triton_helpers, triton_heuristics
6
+ from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
7
+ from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
8
+ triton_helpers.set_driver_to_gpu()
9
+
10
+ @triton_heuristics.pointwise(
11
+ size_hints={'x': 65536},
12
+ filename=__file__,
13
+ triton_meta={'signature': {'in_ptr0': '*fp32', 'out_ptr0': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=3, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]]}]},
14
+ inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_mul_1', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': 'B0E5936CA26D1BCD1B577D0B65F13FE7553DC941EB93358973E9F1902BC212C5', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'tiling_scores': {'x': 282624}},
15
+ min_elem_per_thread=0
16
+ )
17
+ @triton.jit
18
+ def triton_poi_fused_mul_1(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
19
+ xnumel = 35328
20
+ xoffset = tl.program_id(0) * XBLOCK
21
+ xindex = xoffset + tl.arange(0, XBLOCK)[:]
22
+ xmask = xindex < xnumel
23
+ x0 = xindex
24
+ tmp0 = tl.load(in_ptr0 + (x0), xmask)
25
+ tmp1 = 0.6931471805599453
26
+ tmp2 = tmp0 * tmp1
27
+ tl.store(out_ptr0 + (x0), tmp2, xmask)
progress/SpecForge/cache/compiled_kernels/3b/a0a6b043ab548fdf71e72bbdf5daab7f72e9ed11a9ad9f8824a6263bb6bc5081.best_config ADDED
@@ -0,0 +1 @@
 
 
1
+ {"XBLOCK": 512, "num_warps": 4, "num_stages": 1, "configs_hash": "7cced77f371acaa5aa7d90332a90e0c907727cfefb71d9cc9d997c24557fc44f", "found_by_coordesc": false, "time_taken_ms": 14, "triton_cache_hash": "DSCNRRQHW6TSEFKL6AMK6FYZWMIHBTRCG2BE5YK5T7Q76TMOZ5HQ"}
progress/SpecForge/cache/compiled_kernels/3b/c3bqw7dk7k6dcdrp3ycrthotye7y6zb26752jl4lwmfgaybpvr6y.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import triton
3
+ import triton.language as tl
4
+
5
+ from torch._inductor.runtime import triton_helpers, triton_heuristics
6
+ from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
7
+ from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
8
+ triton_helpers.set_driver_to_gpu()
9
+
10
+ @triton_heuristics.pointwise(
11
+ size_hints={'x': 65536},
12
+ filename=__file__,
13
+ triton_meta={'signature': {'in_ptr0': '*fp32', 'out_ptr0': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=5, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]]}]},
14
+ inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_mul_1', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': 'B0E5936CA26D1BCD1B577D0B65F13FE7553DC941EB93358973E9F1902BC212C5', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'tiling_scores': {'x': 348160}},
15
+ min_elem_per_thread=0
16
+ )
17
+ @triton.jit
18
+ def triton_poi_fused_mul_1(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
19
+ xnumel = 43520
20
+ xoffset = tl.program_id(0) * XBLOCK
21
+ xindex = xoffset + tl.arange(0, XBLOCK)[:]
22
+ xmask = xindex < xnumel
23
+ x0 = xindex
24
+ tmp0 = tl.load(in_ptr0 + (x0), xmask)
25
+ tmp1 = 0.6931471805599453
26
+ tmp2 = tmp0 * tmp1
27
+ tl.store(out_ptr0 + (x0), tmp2, xmask)
progress/SpecForge/cache/compiled_kernels/3f/3f6057605b157d44fd56f748226a63975b79198f94871188e73e46cd6c7f8792.best_config ADDED
@@ -0,0 +1 @@
 
 
1
+ {"XBLOCK": 128, "num_warps": 8, "num_stages": 1, "configs_hash": "1542f544a12adfb1397c535fa16687cc79c79a22e4c9cd8af0b373891f747e62", "found_by_coordesc": false, "time_taken_ms": 60, "triton_cache_hash": "XRPIXE6422Z3WVFKM6FTH3VU3RBLBAM5QFGQDRDJKHCOAJAWTZHQ"}
progress/SpecForge/cache/compiled_kernels/3f/c3fttv7enp2yvnla3r6jkk4galt2qdpxw577ghvkmmx6zqaqla74.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import triton
3
+ import triton.language as tl
4
+
5
+ from torch._inductor.runtime import triton_helpers, triton_heuristics
6
+ from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
7
+ from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
8
+ triton_helpers.set_driver_to_gpu()
9
+
10
+ @triton_heuristics.persistent_reduction(
11
+ size_hints={'x': 524288, 'r0_': 32},
12
+ reduction_hint=ReductionHint.DEFAULT,
13
+ filename=__file__,
14
+ triton_meta={'signature': {'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'out_ptr1': '*bf16', 'ks0': 'i64', 'ks1': 'i64', 'xnumel': 'i32', 'r0_numel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=5, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]], (7,): [['tt.divisibility', 16]], (8,): [['tt.divisibility', 16]]}]},
15
+ inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_per_fused_2', 'mutated_arg_names': [], 'optimize_mem': False, 'no_x_dim': None, 'num_load': 4, 'num_reduction': 1, 'backend_hash': 'B0E5936CA26D1BCD1B577D0B65F13FE7553DC941EB93358973E9F1902BC212C5', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False}
16
+ )
17
+ @triton.jit
18
+ def triton_per_fused_2(in_ptr0, in_ptr1, in_ptr2, in_ptr3, out_ptr1, ks0, ks1, xnumel, r0_numel, XBLOCK : tl.constexpr):
19
+ r0_numel = 32
20
+ R0_BLOCK: tl.constexpr = 32
21
+ rnumel = r0_numel
22
+ RBLOCK: tl.constexpr = R0_BLOCK
23
+ xoffset = tl.program_id(0) * XBLOCK
24
+ xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
25
+ xmask = tl.full([XBLOCK, R0_BLOCK], True, tl.int1)
26
+ r0_index = tl.arange(0, R0_BLOCK)[None, :]
27
+ r0_offset = 0
28
+ r0_mask = tl.full([XBLOCK, R0_BLOCK], True, tl.int1)
29
+ roffset = r0_offset
30
+ rindex = r0_index
31
+ r0_2 = r0_index
32
+ x5 = xindex
33
+ x1 = xindex // 128
34
+ x0 = (xindex % 128)
35
+ x3 = ((xindex // 128) % ks0)
36
+ x4 = xindex // ks1
37
+ tmp0 = tl.load(in_ptr0 + (x5 + 4096*ks0*r0_2), None)
38
+ tmp1 = tl.load(in_ptr1 + (x1), None, eviction_policy='evict_last')
39
+ tmp4 = tl.load(in_ptr2 + (x1 + 32*ks0*r0_2), None, eviction_policy='evict_last')
40
+ tmp13 = tl.load(in_ptr3 + (x1), None, eviction_policy='evict_last')
41
+ tmp2 = float("-inf")
42
+ tmp3 = tmp1 == tmp2
43
+ tmp5 = tmp4 - tmp1
44
+ tmp6 = 0.0
45
+ tmp7 = tl.where(tmp3, tmp6, tmp5)
46
+ tmp8 = libdevice.exp2(tmp7)
47
+ tmp9 = tmp0 * tmp8
48
+ tmp10 = tl.broadcast_to(tmp9, [XBLOCK, R0_BLOCK])
49
+ tmp12 = tl.sum(tmp10, 1)[:, None].to(tl.float32)
50
+ tmp14 = 1.0
51
+ tmp15 = tl.where(tmp3, tmp14, tmp13)
52
+ tmp16 = (tmp12 / tmp15)
53
+ tmp17 = tmp16.to(tl.float32)
54
+ tl.store(out_ptr1 + (x0 + 128*x4 + 4096*x3), tmp17, None)
progress/SpecForge/cache/compiled_kernels/3n/c3nlaqknekmjv2zuxzow4rf42v3gorxnfp6uod3dg3ic5ibp6yp3.py ADDED
@@ -0,0 +1,715 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # AOT ID: ['1_inference']
2
+ from ctypes import c_void_p, c_long, c_int
3
+ import torch
4
+ import math
5
+ import random
6
+ import os
7
+ import tempfile
8
+ from math import inf, nan
9
+ from cmath import nanj
10
+ from torch._inductor.hooks import run_intermediate_hooks
11
+ from torch._inductor.utils import maybe_profile
12
+ from torch._inductor.codegen.memory_planning import _align as align
13
+ from torch import device, empty_strided
14
+ from torch._inductor.async_compile import AsyncCompile
15
+ from torch._inductor.select_algorithm import extern_kernels
16
+ import triton
17
+ import triton.language as tl
18
+ from torch._inductor.runtime.triton_heuristics import start_graph, end_graph
19
+ from torch._C import _cuda_getCurrentRawStream as get_raw_stream
20
+
21
+ aten = torch.ops.aten
22
+ inductor_ops = torch.ops.inductor
23
+ _quantized = torch.ops._quantized
24
+ assert_size_stride = torch._C._dynamo.guards.assert_size_stride
25
+ assert_alignment = torch._C._dynamo.guards.assert_alignment
26
+ empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
27
+ empty_strided_cpu_pinned = torch._C._dynamo.guards._empty_strided_cpu_pinned
28
+ empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
29
+ empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu
30
+ empty_strided_mtia = torch._C._dynamo.guards._empty_strided_mtia
31
+ reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor
32
+ alloc_from_pool = torch.ops.inductor._alloc_from_pool
33
+ async_compile = AsyncCompile()
34
+ empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p
35
+
36
+
37
+ # kernel path: /workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/l7/cl76p6rje3cyrrbyvxjjj7oxbieltfs4p5xqjre35l6wnofhynby.py
38
+ # Topologically Sorted Source Nodes: [flex_attention], Original ATen: []
39
+ # Source node to ATen node mapping:
40
+ # flex_attention => flex_attention
41
+ # Graph fragment:
42
+ # %arg1_1 : Tensor "bf16[1, 32, s37, 128][4096*s37, 128, 4096, 1]cuda:1" = PlaceHolder[target=arg1_1]
43
+ # %arg3_1 : Tensor "bf16[1, 8, s0, 128][1024*s0, 128, 1024, 1]cuda:1" = PlaceHolder[target=arg3_1]
44
+ # %arg5_1 : Tensor "bf16[1, 8, s43, 128][1024*s43, 128, 1024, 1]cuda:1" = PlaceHolder[target=arg5_1]
45
+ # %getitem_1 : Tensor "f32[1, 32, s37][32*s37, s37, 1]cuda:1" = PlaceHolder[target=getitem_1]
46
+ # %buf1 : Tensor "f32[1, 32, s37][32*s37, s37, 1]cuda:1" = PlaceHolder[target=buf1]
47
+ # %arg9_1 : Tensor "i32[1, 1, 5][5, 5, 1]cuda:1" = PlaceHolder[target=arg9_1]
48
+ # %arg6_1 : Tensor "i32[1, 1, 5, 5][25, 25, 5, 1]cuda:1" = PlaceHolder[target=arg6_1]
49
+ # %arg10_1 : Tensor "i32[1, 1, 5][5, 5, 1]cuda:1" = PlaceHolder[target=arg10_1]
50
+ # %arg11_1 : Tensor "i32[1, 1, 5, 5][25, 25, 5, 1]cuda:1" = PlaceHolder[target=arg11_1]
51
+ # %flex_attention : [num_users=2] = call_function[target=torch.ops.higher_order.flex_attention](args = (%arg1_1, %arg3_1, %arg5_1, %sdpa_score0, (%arg7_1, %arg8_1, %arg9_1, %arg6_1, %arg10_1, %arg11_1, %arg12_1, %arg13_1, %arg14_1, %arg15_1, 128, 128, %sdpa_mask0), 0.08838834764831845, {PRESCALE_QK: False, ROWS_GUARANTEED_SAFE: False, BLOCKS_ARE_CONTIGUOUS: False, WRITE_DQ: True, OUTPUT_LOGSUMEXP: True, OUTPUT_MAX: False}, (), ()), kwargs = {})
52
+ # return %getitem
53
+ triton_tem_fused_0 = async_compile.triton('triton_tem_fused_0', '''
54
+ import triton
55
+ import triton.language as tl
56
+
57
+ from torch._inductor.runtime import triton_helpers, triton_heuristics
58
+ from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
59
+ from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
60
+
61
+ @triton_heuristics.template(
62
+
63
+ num_stages=3,
64
+ num_warps=8,
65
+ triton_meta={'signature': {'arg_Q': '*bf16', 'arg_K': '*bf16', 'arg_V': '*bf16', 'arg_LSE': '*fp32', 'arg_MAX': '*fp32', 'arg_KV_NUM_BLKS': '*i32', 'arg_KV_IDX': '*i32', 'arg_FULL_KV_NUM_BLKS': '*i32', 'arg_FULL_KV_IDX': '*i32', 'out_ptr0': '*bf16', 'ks0': 'i32', 'ks1': 'i32', 'ks2': 'i32'}, 'device': DeviceProperties(type='cuda', index=1, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]], (7,): [['tt.divisibility', 16]], (8,): [['tt.divisibility', 16]], (9,): [['tt.divisibility', 16]]}]},
66
+ inductor_meta={'kernel_name': 'triton_tem_fused_0', 'backend_hash': 'B0E5936CA26D1BCD1B577D0B65F13FE7553DC941EB93358973E9F1902BC212C5', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'grid_type': 'FixedGrid', 'fixed_grid': ['_grid_0', '_grid_1', '_grid_2'], 'extra_launcher_args': ['_grid_0', '_grid_1', '_grid_2'], 'config_args': {'PRESCALE_QK': False, 'ROWS_GUARANTEED_SAFE': False, 'BLOCKS_ARE_CONTIGUOUS': False, 'WRITE_DQ': True, 'OUTPUT_LOGSUMEXP': True, 'OUTPUT_MAX': False, 'FLOAT32_PRECISION': "'ieee'", 'IS_DIVISIBLE': False, 'SM_SCALE': 0.08838834764831845, 'GQA_SHARED_HEADS': 4, 'HAS_FULL_BLOCKS': True, 'QK_HEAD_DIM': 128, 'QK_HEAD_DIM_ROUNDED': 128, 'V_HEAD_DIM': 128, 'V_HEAD_DIM_ROUNDED': 128, 'SAFE_HEAD_DIM': True, 'USE_TMA': False, 'BLOCK_M': 128, 'BLOCK_N': 64, 'SPARSE_Q_BLOCK_SIZE': 128, 'SPARSE_KV_BLOCK_SIZE': 128}},
67
+
68
+ )
69
+ @triton.jit
70
+ def triton_tem_fused_0(arg_Q, arg_K, arg_V, arg_LSE, arg_MAX, arg_KV_NUM_BLKS, arg_KV_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, out_ptr0, ks0, ks1, ks2):
71
+ PRESCALE_QK : tl.constexpr = False
72
+ ROWS_GUARANTEED_SAFE : tl.constexpr = False
73
+ BLOCKS_ARE_CONTIGUOUS : tl.constexpr = False
74
+ WRITE_DQ : tl.constexpr = True
75
+ OUTPUT_LOGSUMEXP : tl.constexpr = True
76
+ OUTPUT_MAX : tl.constexpr = False
77
+ FLOAT32_PRECISION : tl.constexpr = 'ieee'
78
+ IS_DIVISIBLE : tl.constexpr = False
79
+ SM_SCALE : tl.constexpr = 0.08838834764831845
80
+ GQA_SHARED_HEADS : tl.constexpr = 4
81
+ HAS_FULL_BLOCKS : tl.constexpr = True
82
+ QK_HEAD_DIM : tl.constexpr = 128
83
+ QK_HEAD_DIM_ROUNDED : tl.constexpr = 128
84
+ V_HEAD_DIM : tl.constexpr = 128
85
+ V_HEAD_DIM_ROUNDED : tl.constexpr = 128
86
+ SAFE_HEAD_DIM : tl.constexpr = True
87
+ USE_TMA : tl.constexpr = False
88
+ BLOCK_M : tl.constexpr = 128
89
+ BLOCK_N : tl.constexpr = 64
90
+ SPARSE_Q_BLOCK_SIZE : tl.constexpr = 128
91
+ SPARSE_KV_BLOCK_SIZE : tl.constexpr = 128
92
+ INDEX_DTYPE : tl.constexpr = tl.int32
93
+ Q = arg_Q
94
+ K = arg_K
95
+ V = arg_V
96
+ LSE = arg_LSE
97
+ MAX = arg_MAX
98
+ KV_NUM_BLKS = arg_KV_NUM_BLKS
99
+ KV_IDX = arg_KV_IDX
100
+ FULL_KV_NUM_BLKS = arg_FULL_KV_NUM_BLKS
101
+ FULL_KV_IDX = arg_FULL_KV_IDX
102
+
103
+ # Sub notation for this kernel:
104
+ #
105
+ # Q: Query, K: Key, V: Value
106
+ # M: Number of queries, N: Number of keys/values, D: Model dimension
107
+ # QK_HEAD_DIM: The dimension of the query and key embeddings
108
+ # V_HEAD_DIM: The dimension of the value embeddings
109
+ # z: Batch size, h: Number of heads, m: Number of queries per head, k: Number of keys per head
110
+ # GQA_SHARED_HEADS: number of query heads sharing one kv head in GQA setups.
111
+ #
112
+ # The following FULL_* and PARTIAL_* is defined in the block sparse mask grid, rather than the thread block grid.
113
+ # KV_NUM_BLKS: The number of KV blocks (that may or may not require masking) for each query.
114
+ # KV_IDX: The indices of KV blocks (that may or may not require masking) for each query.
115
+ # FULL_KV_NUM_BLKS: The number of fully unmasked KV blocks (so we don't need masking) for each query.
116
+ # FULL_KV_IDX: The indices of fully unmasked KV blocks (so we don't need masking) for each query.
117
+ #
118
+ # OUTPUT_LOGSUMEXP: We only need to store the logsumexp if we require grad
119
+ #
120
+ # (Modifiable) Performance tuning options
121
+ # BLOCK_M: The thread block size across the seqlen dim of Q.
122
+ # BLOCK_N: Iterate over BLOCK_N across the seqlen dim of K/V in each thread block.
123
+
124
+ # The below are kernel options that can be applied for certain score_mods,
125
+ # or involve a numerics vs. perf tradeoff
126
+ # PRESCALE_QK: Whether to pre-scale QK by 1/sqrt(d) and change of base. Has
127
+ # about 20% more numerical error, but slightly faster.
128
+ # ROWS_GUARANTEED_SAFE: Is it guaranteed that at least one value in each row
129
+ # is not masked out? If so, we can skip an extra safety check
130
+ # BLOCKS_ARE_CONTIGUOUS: Is it guaranteed that all blocks in the mask are
131
+ # contiguous? If so, we don't need to do an indirect jump for every block
132
+
133
+ tl.static_assert(SPARSE_Q_BLOCK_SIZE >= BLOCK_M and SPARSE_Q_BLOCK_SIZE % BLOCK_M == 0)
134
+ tl.static_assert(SPARSE_KV_BLOCK_SIZE >= BLOCK_N and SPARSE_KV_BLOCK_SIZE % BLOCK_N == 0)
135
+
136
+ # Define strides of inputs
137
+ stride_qz, stride_qh, stride_qm, stride_qk = 4096*ks0, 128, 4096, 1
138
+ stride_kz, stride_kh, stride_kn, stride_kk = 1024*ks1, 128, 1024, 1
139
+ stride_vz, stride_vh, stride_vn, stride_vk = 1024*ks2, 128, 1024, 1
140
+
141
+ ZQ = 1
142
+ HQ = 32
143
+ Q_LEN = ks0
144
+ ZKV = 1
145
+ KV_LEN = ks1
146
+
147
+ MATMUL_PRECISION = Q.dtype.element_ty
148
+
149
+ q_start = tl.program_id(0).to(INDEX_DTYPE)
150
+ off_zq = tl.program_id(1).to(INDEX_DTYPE)
151
+ off_hq = tl.program_id(2).to(INDEX_DTYPE)
152
+
153
+ # We support two cases for batch dimension. a) (ZKV == ZQ) where off_zkv = off_zq.
154
+ # b) (ZKV == 1 and ZQ > 1) where KV is broadcasted along the batch dimension and off_zkv=0.
155
+ off_zkv = off_zq % ZKV
156
+ off_hkv = off_hq // GQA_SHARED_HEADS
157
+ off_g = off_hq % GQA_SHARED_HEADS
158
+
159
+ q_offset = off_zq * stride_qz + off_hq * stride_qh
160
+ k_offset = off_zkv * stride_kz + off_hkv * stride_kh
161
+ v_offset = off_zkv * stride_vz + off_hkv * stride_vh
162
+
163
+ Q = Q + q_offset
164
+ K = K + k_offset
165
+ V = V + v_offset
166
+
167
+ # Setting up the TMA descriptors for Q, K, V
168
+ desc_q = None
169
+ desc_k = None
170
+ desc_v = None
171
+
172
+ SPARSE_Z = 1
173
+ SPARSE_HQ = 1
174
+
175
+ sparse_idx_z = off_zq % SPARSE_Z
176
+ sparse_idx_hq = off_hq % SPARSE_HQ
177
+
178
+ SPARSE_Q_MULTIPLE: tl.constexpr = (SPARSE_Q_BLOCK_SIZE // BLOCK_M)
179
+ SPARSE_KV_MULTIPLE: tl.constexpr = (SPARSE_KV_BLOCK_SIZE // BLOCK_N)
180
+
181
+ stride_kv_num_blks_h = 5
182
+ stride_kv_idx_h = 25
183
+ stride_kv_idx_m = 5
184
+
185
+ # initialize pointer to m and l
186
+ m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf")
187
+ l_i = tl.zeros([BLOCK_M], dtype=tl.float32)
188
+ acc = tl.zeros([BLOCK_M, V_HEAD_DIM_ROUNDED], dtype=tl.float32)
189
+
190
+ offs_m = q_start * BLOCK_M + tl.arange(0, BLOCK_M)
191
+
192
+ # KV_IDX and KV_NUM_BLKS are always contiguous.
193
+ sparse_hz_offset = sparse_idx_z * SPARSE_HQ + sparse_idx_hq
194
+ sparse_kv_num_blks_offset = sparse_hz_offset * stride_kv_num_blks_h + q_start // SPARSE_Q_MULTIPLE
195
+ sparse_kv_idx_offset = sparse_hz_offset * stride_kv_idx_h + (q_start // SPARSE_Q_MULTIPLE) * stride_kv_idx_m # noqa: B950
196
+ offs_m = q_start * BLOCK_M + tl.arange(0, BLOCK_M)
197
+ offs_k = tl.arange(0, QK_HEAD_DIM_ROUNDED)
198
+ q = load_checked_2d(Q, offs_m, offs_k, stride_qm, stride_qk, IS_DIVISIBLE, SAFE_HEAD_DIM, Q_LEN, QK_HEAD_DIM)
199
+
200
+ # ~~~~~~~~~~~~~~ normal blocks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
201
+ # We don't know anything "special" about these blocks, so we need to apply
202
+ # both score_mod and mask_mod to it
203
+ kv_indices = KV_IDX + sparse_kv_idx_offset
204
+ kv_start = tl.load(kv_indices) * SPARSE_KV_BLOCK_SIZE # first kv block we're loading
205
+ kv_num_blocks = tl.load(KV_NUM_BLKS + sparse_kv_num_blks_offset)
206
+ block_n_end = tl.minimum(kv_num_blocks * SPARSE_KV_MULTIPLE, tl.maximum(tl.cdiv(KV_LEN, BLOCK_N), 1))
207
+
208
+
209
+ # K and V pointers will be passed directly to forward_inner
210
+
211
+ offs_n = kv_start + tl.arange(0, BLOCK_N)
212
+
213
+
214
+ acc, l_i, m_i = forward_inner(
215
+ arg_Q, arg_K, arg_V, arg_LSE, arg_MAX, arg_KV_NUM_BLKS, arg_KV_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, out_ptr0, ks0, ks1, ks2,
216
+ q, K, V,
217
+ desc_k, desc_v, Q_LEN, KV_LEN,
218
+ acc, l_i, m_i,
219
+ off_zq, off_hq, offs_m[:, None], offs_n[None, :],
220
+ kv_start,
221
+ kv_indices, kv_num_blocks,
222
+ 0, block_n_end,
223
+ MATMUL_PRECISION,
224
+ stride_kk, stride_kn, stride_vn, stride_vk,
225
+ IS_FULL_BLOCKS=False,
226
+ )
227
+
228
+ # ~~~~~~~~~~~~~~ "full" blocks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
229
+ # We know these blocks are guaranteed to be "full", so we don't need to
230
+ # apply mask_mod to them - only score_mod
231
+ if HAS_FULL_BLOCKS:
232
+ # FULL_KV_IDX and FULL_KV_NUM_BLKS are always contiguous.
233
+ kv_indices = FULL_KV_IDX + sparse_kv_idx_offset
234
+ kv_start = tl.load(kv_indices) * SPARSE_KV_BLOCK_SIZE # first kv block we're loading
235
+ kv_num_blocks = tl.load(FULL_KV_NUM_BLKS + sparse_kv_num_blks_offset)
236
+ block_n_end = tl.minimum(kv_num_blocks * SPARSE_KV_MULTIPLE, tl.maximum(tl.cdiv(KV_LEN, BLOCK_N), 1))
237
+ # K and V pointers will be passed directly to forward_inner
238
+ offs_n = kv_start + tl.arange(0, BLOCK_N)
239
+
240
+ acc, l_i, m_i = forward_inner(
241
+ arg_Q, arg_K, arg_V, arg_LSE, arg_MAX, arg_KV_NUM_BLKS, arg_KV_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, out_ptr0, ks0, ks1, ks2,
242
+ q, K, V,
243
+ desc_k, desc_v, Q_LEN, KV_LEN,
244
+ acc, l_i, m_i,
245
+ off_zq, off_hq, offs_m[:, None], offs_n[None, :],
246
+ kv_start,
247
+ kv_indices, kv_num_blocks,
248
+ 0, block_n_end,
249
+ MATMUL_PRECISION,
250
+ stride_kk, stride_kn, stride_vn, stride_vk,
251
+ IS_FULL_BLOCKS=True,
252
+ )
253
+
254
+
255
+ # [Note] Handle fully masked out rows:
256
+ # Li will be the sum(e^(-inf)) == 0.0 for masked out rows, mi will be -inf.
257
+ # We set Li to 1.0 which will result in lse/out = 0.0 | after the log(li) + mi(0.0) step
258
+ l_i = tl.where(l_i == 0.0, 1, l_i)
259
+
260
+ acc = acc / l_i[:, None]
261
+ idx_zq = tl.program_id(1).to(INDEX_DTYPE)
262
+ idx_hq = tl.program_id(2).to(INDEX_DTYPE)
263
+ idx_m = offs_m[:, None].to(INDEX_DTYPE)
264
+ idx_d = tl.arange(0, V_HEAD_DIM_ROUNDED)[None, :].to(INDEX_DTYPE)
265
+
266
+ mask = (idx_m < Q_LEN) & (idx_d < V_HEAD_DIM)
267
+
268
+ tl.static_assert(acc.shape == [BLOCK_M, V_HEAD_DIM_ROUNDED])
269
+ xindex = idx_d + 128*idx_m + 128*idx_hq*ks0 + 4096*idx_zq*ks0
270
+ tl.store(out_ptr0 + (tl.broadcast_to(idx_d + 128*idx_hq + 4096*idx_m, acc.shape)), acc, mask)
271
+
272
+ if OUTPUT_LOGSUMEXP:
273
+ off_hz = off_zq * HQ + off_hq
274
+ l_ptrs = LSE + off_hz * Q_LEN + offs_m
275
+ lse = m_i + tl.math.log2(l_i)
276
+ if IS_DIVISIBLE:
277
+ tl.store(l_ptrs, lse)
278
+ else:
279
+ tl.store(l_ptrs, lse, mask=offs_m < Q_LEN)
280
+
281
+ if OUTPUT_MAX:
282
+ off_hz = off_zq * HQ + off_hq
283
+ max_ptrs = MAX + off_hz * Q_LEN + offs_m
284
+ if IS_DIVISIBLE:
285
+ tl.store(max_ptrs, m_i)
286
+ else:
287
+ tl.store(max_ptrs, m_i, mask=offs_m < Q_LEN)
288
+
289
+
290
+ # Utility triton funcs
291
+ @triton.jit
292
+ def get_offset_for_next_block(
293
+ loop_iter, col_indices, total_blocks,
294
+ SPARSE_BLOCK, SPARSE_BLOCK_MULTIPLE, BLOCK,
295
+ BLOCKS_ARE_CONTIGUOUS: tl.constexpr
296
+ ):
297
+ if BLOCKS_ARE_CONTIGUOUS:
298
+ return BLOCK
299
+ cur_block_idx = loop_iter // SPARSE_BLOCK_MULTIPLE
300
+ cur_block = tl.load(col_indices + cur_block_idx, eviction_policy="evict_last")
301
+ next_block = tl.load(col_indices + cur_block_idx + 1, eviction_policy="evict_last", mask=cur_block_idx + 1 < total_blocks)
302
+ needs_jump = (loop_iter + 1) % SPARSE_BLOCK_MULTIPLE == 0
303
+ jump_to_block = (next_block - cur_block ) * SPARSE_BLOCK - (SPARSE_BLOCK_MULTIPLE - 1) * BLOCK
304
+ offset = jump_to_block * needs_jump + (1 - needs_jump) * BLOCK
305
+ return offset
306
+
307
+ @triton.jit
308
+ def get_bounded_indices(indices, max_len=None):
309
+ return indices % max_len if max_len is not None else indices
310
+
311
+ @triton.jit
312
+ def load_checked_block(block_ptr, IS_DIVISIBLE: tl.constexpr, SAFE_HEAD_DIM: tl.constexpr):
313
+ if IS_DIVISIBLE and SAFE_HEAD_DIM:
314
+ return tl.load(block_ptr)
315
+ elif IS_DIVISIBLE and not SAFE_HEAD_DIM:
316
+ return tl.load(block_ptr, boundary_check=(1,), padding_option="zero")
317
+ elif not IS_DIVISIBLE and SAFE_HEAD_DIM:
318
+ return tl.load(block_ptr, boundary_check=(0,), padding_option="zero")
319
+ else:
320
+ return tl.load(block_ptr, boundary_check=(0, 1), padding_option="zero")
321
+
322
+ @triton.jit
323
+ def load_checked_2d(
324
+ ptr,
325
+ offs_m,
326
+ offs_n,
327
+ stride_m,
328
+ stride_n,
329
+ IS_DIVISIBLE_M: tl.constexpr,
330
+ IS_DIVISIBLE_N: tl.constexpr,
331
+ M_LEN: tl.constexpr,
332
+ N_LEN: tl.constexpr,
333
+ ):
334
+ # Calculate final pointer if strides are provided
335
+ if stride_m is not None and stride_n is not None:
336
+ ptr = ptr + offs_m[:, None] * stride_m + offs_n[None, :] * stride_n
337
+
338
+ # Handle all masking cases
339
+ if not IS_DIVISIBLE_M and not IS_DIVISIBLE_N:
340
+ return tl.load(ptr, mask=(offs_m[:, None] < M_LEN) & (offs_n[None, :] < N_LEN), other=0.0)
341
+ elif IS_DIVISIBLE_M and not IS_DIVISIBLE_N:
342
+ return tl.load(ptr, mask=(offs_n[None, :] < N_LEN), other=0.0)
343
+ elif not IS_DIVISIBLE_M and IS_DIVISIBLE_N:
344
+ return tl.load(ptr, mask=(offs_m[:, None] < M_LEN), other=0.0)
345
+ else: # Both divisible
346
+ return tl.load(ptr)
347
+
348
+
349
+ # Common Imports
350
+ @triton.jit
351
+ def forward_block_mn(
352
+ arg_Q, arg_K, arg_V, arg_LSE, arg_MAX, arg_KV_NUM_BLKS, arg_KV_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, out_ptr0, ks0, ks1, ks2,
353
+ q, K, V, desc_k, desc_v, Q_LEN, KV_LEN,
354
+ # accumulated values
355
+ acc, l_i, m_i,
356
+ # Offsets
357
+ off_z, off_h, offs_m, offs_n,
358
+ # Offsets needed for TMA loads
359
+ kv_start,
360
+ kv_offset,
361
+ MATMUL_PRECISION, RCP_LN2,
362
+ # Strides for K and V
363
+ stride_kk, stride_kn, stride_vn, stride_vk,
364
+ IS_FULL_BLOCKS, CHECK_BLOCK_BOUNDARY=False,
365
+
366
+ ):
367
+ # Redefines all kernel parameters (BLOCK_M, etc.) so we don't need to plumb them all through
368
+ PRESCALE_QK : tl.constexpr = False
369
+ ROWS_GUARANTEED_SAFE : tl.constexpr = False
370
+ BLOCKS_ARE_CONTIGUOUS : tl.constexpr = False
371
+ WRITE_DQ : tl.constexpr = True
372
+ OUTPUT_LOGSUMEXP : tl.constexpr = True
373
+ OUTPUT_MAX : tl.constexpr = False
374
+ FLOAT32_PRECISION : tl.constexpr = 'ieee'
375
+ IS_DIVISIBLE : tl.constexpr = False
376
+ SM_SCALE : tl.constexpr = 0.08838834764831845
377
+ GQA_SHARED_HEADS : tl.constexpr = 4
378
+ HAS_FULL_BLOCKS : tl.constexpr = True
379
+ QK_HEAD_DIM : tl.constexpr = 128
380
+ QK_HEAD_DIM_ROUNDED : tl.constexpr = 128
381
+ V_HEAD_DIM : tl.constexpr = 128
382
+ V_HEAD_DIM_ROUNDED : tl.constexpr = 128
383
+ SAFE_HEAD_DIM : tl.constexpr = True
384
+ USE_TMA : tl.constexpr = False
385
+ BLOCK_M : tl.constexpr = 128
386
+ BLOCK_N : tl.constexpr = 64
387
+ SPARSE_Q_BLOCK_SIZE : tl.constexpr = 128
388
+ SPARSE_KV_BLOCK_SIZE : tl.constexpr = 128
389
+ INDEX_DTYPE : tl.constexpr = tl.int32
390
+
391
+
392
+ # -- load k --
393
+ # NB reversed order to since K is transposed
394
+ kv_base_offset = kv_start + kv_offset
395
+
396
+ # Load K as [BLOCK_N, QK_HEAD_DIM_ROUNDED] then transpose to [QK_HEAD_DIM_ROUNDED, BLOCK_N]
397
+ offs_k = tl.arange(0, QK_HEAD_DIM_ROUNDED)
398
+ offs_n_load = kv_base_offset + tl.arange(0, BLOCK_N)
399
+ k = load_checked_2d(K, offs_n_load, offs_k, stride_kn, stride_kk, IS_DIVISIBLE, SAFE_HEAD_DIM, KV_LEN, QK_HEAD_DIM)
400
+
401
+ k = tl.trans(k)
402
+ # -- compute qk ---
403
+ qk = tl.dot(q, k, input_precision=FLOAT32_PRECISION) # TODO: use cuda matmul when q_len <= 2.
404
+ if not PRESCALE_QK:
405
+ qk *= SM_SCALE
406
+ # ~~~~~~~~~~~~~~~~~~~ Apply score modification ~~~~~~~~~~~~~~~~~~~
407
+ # If this is the last block of a non divisible seqlen, we still need to load [BLOCK_M, BLOCK_N] elements,
408
+ # which is larger than the actual number of elements. To avoid access memory out of bound,
409
+ # we need to mask out the elements that are out of Q_LEN & KV_LEN.
410
+ m = get_bounded_indices(offs_m, Q_LEN if CHECK_BLOCK_BOUNDARY else None)
411
+ n = get_bounded_indices(offs_n, KV_LEN if CHECK_BLOCK_BOUNDARY else None)
412
+
413
+ tmp0 = (qk)
414
+ post_mod_scores = tmp0
415
+
416
+
417
+ if CHECK_BLOCK_BOUNDARY:
418
+ # Mask out the elements that are out of the KV_LEN for non divisible seqlen.
419
+ post_mod_scores = tl.where(offs_n < KV_LEN, post_mod_scores, float("-inf"))
420
+
421
+ if not IS_FULL_BLOCKS:
422
+ tmp1 = (m)
423
+ tmp2 = tl.full([1], 0, tl.int32)
424
+ tmp3 = tmp1 < tmp2
425
+ tmp4 = (n)
426
+ tmp5 = tmp4 <= tmp1
427
+ tmp6 = tmp3 & tmp5
428
+ tmp7 = tmp1 >= tmp2
429
+ tmp8 = tmp4 < tmp2
430
+ tmp9 = tmp7 & tmp8
431
+ tmp10 = tmp8 == 0
432
+ tmp11 = tmp7 & tmp10
433
+ tmp12 = tmp1 - tmp2
434
+ tmp13 = tl.full([1], 16, tl.int32)
435
+ tmp14 = tl.where((tmp12 < 0) != (tmp13 < 0), tl.where(tmp12 % tmp13 != 0, tmp12 // tmp13 - 1, tmp12 // tmp13), tmp12 // tmp13)
436
+ tmp15 = tmp4 - tmp2
437
+ tmp16 = tl.where((tmp15 < 0) != (tmp13 < 0), tl.where(tmp15 % tmp13 != 0, tmp15 // tmp13 - 1, tmp15 // tmp13), tmp15 // tmp13)
438
+ tmp17 = tmp14 == tmp16
439
+ tmp18 = tmp11 & tmp17
440
+ tmp19 = tmp9 | tmp18
441
+ tmp20 = tmp6 | tmp19
442
+ mask_mod_output = tmp20
443
+
444
+
445
+ if CHECK_BLOCK_BOUNDARY:
446
+ mask_mod_output = tl.where(offs_n < KV_LEN, mask_mod_output, False)
447
+ # apply mask for partially unmasked blocks
448
+ post_mod_scores = tl.where(mask_mod_output, post_mod_scores, float("-inf"))
449
+
450
+ if not PRESCALE_QK:
451
+ post_mod_scores *= RCP_LN2
452
+ # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
453
+
454
+ # -- compute scaling constant ---
455
+ m_ij = tl.maximum(m_i, tl.max(post_mod_scores, 1))
456
+ if not ROWS_GUARANTEED_SAFE:
457
+ masked_out_rows = (m_ij == float("-inf"))
458
+ m_ij_masked = tl.where(masked_out_rows, 0, m_ij)
459
+ else:
460
+ m_ij_masked = m_ij
461
+
462
+ alpha = tl.math.exp2(m_i - m_ij_masked)
463
+ p = tl.math.exp2(post_mod_scores - m_ij_masked[:, None])
464
+
465
+ # NB: l_i update is pulled up here since it's a bit faster
466
+ # NB: For headdim=256, it's faster to move it back down to after m_i =
467
+ # m_ij
468
+ l_i = l_i * alpha + tl.sum(p, 1)
469
+ # # -- scale and update acc --
470
+ acc = acc * alpha[:, None]
471
+ # Calculate offsets for V loading - reuse kv_base_offset from K loading
472
+ offs_v = tl.arange(0, V_HEAD_DIM_ROUNDED)
473
+ v = load_checked_2d(V, offs_n_load, offs_v, stride_vn, stride_vk, IS_DIVISIBLE, SAFE_HEAD_DIM, KV_LEN, V_HEAD_DIM)
474
+ acc = tl.dot(p.to(MATMUL_PRECISION), v, acc, input_precision=FLOAT32_PRECISION)
475
+
476
+ # -- update m_i
477
+ m_i = m_ij
478
+
479
+ return acc, l_i, m_i
480
+
481
+ @triton.jit
482
+ def forward_inner(
483
+ arg_Q, arg_K, arg_V, arg_LSE, arg_MAX, arg_KV_NUM_BLKS, arg_KV_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, out_ptr0, ks0, ks1, ks2,
484
+ q, K, V,
485
+ desc_k, desc_v, Q_LEN, KV_LEN,
486
+ # accumulated values
487
+ acc, l_i, m_i,
488
+ # Offsets used as inputs to score_mod & mask_mod
489
+ # of size [BLOCK_M, BLOCK_N] or scalar.
490
+ off_z, off_h, offs_m, offs_n,
491
+ # Offsets needed for TMA loads
492
+ kv_start,
493
+ # blocksparse data
494
+ kv_indices, kv_num_blocks,
495
+ # start kv and end kv block
496
+ block_n_start, block_n_end,
497
+ MATMUL_PRECISION,
498
+ # Strides for K and V
499
+ stride_kk, stride_kn, stride_vn, stride_vk,
500
+ IS_FULL_BLOCKS,
501
+ ):
502
+ # Redefines all kernel parameters (BLOCK_M, etc.) so we don't need to plumb them all through
503
+ PRESCALE_QK : tl.constexpr = False
504
+ ROWS_GUARANTEED_SAFE : tl.constexpr = False
505
+ BLOCKS_ARE_CONTIGUOUS : tl.constexpr = False
506
+ WRITE_DQ : tl.constexpr = True
507
+ OUTPUT_LOGSUMEXP : tl.constexpr = True
508
+ OUTPUT_MAX : tl.constexpr = False
509
+ FLOAT32_PRECISION : tl.constexpr = 'ieee'
510
+ IS_DIVISIBLE : tl.constexpr = False
511
+ SM_SCALE : tl.constexpr = 0.08838834764831845
512
+ GQA_SHARED_HEADS : tl.constexpr = 4
513
+ HAS_FULL_BLOCKS : tl.constexpr = True
514
+ QK_HEAD_DIM : tl.constexpr = 128
515
+ QK_HEAD_DIM_ROUNDED : tl.constexpr = 128
516
+ V_HEAD_DIM : tl.constexpr = 128
517
+ V_HEAD_DIM_ROUNDED : tl.constexpr = 128
518
+ SAFE_HEAD_DIM : tl.constexpr = True
519
+ USE_TMA : tl.constexpr = False
520
+ BLOCK_M : tl.constexpr = 128
521
+ BLOCK_N : tl.constexpr = 64
522
+ SPARSE_Q_BLOCK_SIZE : tl.constexpr = 128
523
+ SPARSE_KV_BLOCK_SIZE : tl.constexpr = 128
524
+ INDEX_DTYPE : tl.constexpr = tl.int32
525
+
526
+
527
+ SPARSE_KV_MULTIPLE: tl.constexpr = (SPARSE_KV_BLOCK_SIZE // BLOCK_N)
528
+ RCP_LN2: tl.constexpr = 1.44269504
529
+
530
+ if PRESCALE_QK:
531
+ q = (q * SM_SCALE * RCP_LN2).to(MATMUL_PRECISION)
532
+
533
+ kv_offset = 0
534
+
535
+ # loop over k, v and update accumulator until block_n_end
536
+ for start_n in range(block_n_start, block_n_end):
537
+ # Here IS_DIVISIBLE acts are the start_n = tl.multiple_of(start_n, BLOCK_N) from triton_fused_attention.
538
+ if IS_DIVISIBLE:
539
+ acc, l_i, m_i = forward_block_mn(
540
+ arg_Q, arg_K, arg_V, arg_LSE, arg_MAX, arg_KV_NUM_BLKS, arg_KV_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, out_ptr0, ks0, ks1, ks2,
541
+ q, K, V, desc_k, desc_v, Q_LEN, KV_LEN,
542
+ # accumulated values
543
+ acc, l_i, m_i,
544
+ # Offsets
545
+ off_z, off_h, offs_m, offs_n,
546
+ # Offsets needed for TMA loads
547
+ kv_start,
548
+ kv_offset,
549
+ MATMUL_PRECISION, RCP_LN2,
550
+ # Strides for K and V
551
+ stride_kk, stride_kn, stride_vn, stride_vk,
552
+ IS_FULL_BLOCKS,
553
+ )
554
+ else:
555
+ # Benchmark shows even we applied mod & mask to each block for non divisible seqlen,
556
+ # it's on par or slightly faster than only applying to the last block in fwd.
557
+ # However, we choose different strategy for bwd, where we only apply mod & mask
558
+ # to the last block because it's faster a lot.
559
+ acc, l_i, m_i = forward_block_mn(
560
+ arg_Q, arg_K, arg_V, arg_LSE, arg_MAX, arg_KV_NUM_BLKS, arg_KV_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, out_ptr0, ks0, ks1, ks2,
561
+ q, K, V, desc_k, desc_v, Q_LEN, KV_LEN,
562
+ # accumulated values
563
+ acc, l_i, m_i,
564
+ # Offsets
565
+ off_z, off_h, offs_m, offs_n,
566
+ # Offsets needed for TMA loads
567
+ kv_start,
568
+ kv_offset,
569
+ MATMUL_PRECISION, RCP_LN2,
570
+ # Strides for K and V
571
+ stride_kk, stride_kn, stride_vn, stride_vk,
572
+ IS_FULL_BLOCKS, CHECK_BLOCK_BOUNDARY=True,
573
+ )
574
+
575
+
576
+
577
+ offset = get_offset_for_next_block(
578
+ start_n, kv_indices, kv_num_blocks,
579
+ SPARSE_KV_BLOCK_SIZE, SPARSE_KV_MULTIPLE, BLOCK_N, BLOCKS_ARE_CONTIGUOUS
580
+ )
581
+
582
+ offs_n = offs_n + offset
583
+ kv_offset += offset
584
+
585
+
586
+ return acc, l_i, m_i
587
+ ''', device_str='cuda')
588
+
589
+
590
+ # kernel path: /workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/ft/cftmlennkcgyn4ynz7zxqohr2jlirziu3mfte3b4eg5y2466jcwm.py
591
+ # Topologically Sorted Source Nodes: [lse_scaled], Original ATen: [aten.mul]
592
+ # Source node to ATen node mapping:
593
+ # lse_scaled => mul_9
594
+ # Graph fragment:
595
+ # %buf3 : Tensor = PlaceHolder[target=buf3]
596
+ # %mul_9 : Tensor "f32[1, 32, s37][32*Max(1, s37), Max(1, s37), 1]cuda:1"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%getitem_1, 0.6931471805599453), kwargs = {})
597
+ # return %mul_9
598
+ triton_poi_fused_mul_1 = async_compile.triton('triton_poi_fused_mul_1', '''
599
+ import triton
600
+ import triton.language as tl
601
+
602
+ from torch._inductor.runtime import triton_helpers, triton_heuristics
603
+ from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
604
+ from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
605
+ triton_helpers.set_driver_to_gpu()
606
+
607
+ @triton_heuristics.pointwise(
608
+ size_hints={'x': 32768},
609
+ filename=__file__,
610
+ triton_meta={'signature': {'in_ptr0': '*fp32', 'out_ptr0': '*fp32', 'ks0': 'i64', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=1, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]]}]},
611
+ inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_mul_1', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': 'B0E5936CA26D1BCD1B577D0B65F13FE7553DC941EB93358973E9F1902BC212C5', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False},
612
+ min_elem_per_thread=0
613
+ )
614
+ @triton.jit
615
+ def triton_poi_fused_mul_1(in_ptr0, out_ptr0, ks0, xnumel, XBLOCK : tl.constexpr):
616
+ xoffset = tl.program_id(0) * XBLOCK
617
+ xindex = xoffset + tl.arange(0, XBLOCK)[:]
618
+ xmask = xindex < xnumel
619
+ x2 = xindex
620
+ x0 = (xindex % ks0)
621
+ x1 = triton_helpers.div_floor_integer(xindex, ks0)
622
+ tmp0 = tl.load(in_ptr0 + (x2), xmask, eviction_policy='evict_last')
623
+ tmp1 = 0.6931471805599453
624
+ tmp2 = tmp0 * tmp1
625
+ tl.store(out_ptr0 + (x0 + x1*((1) * ((1) >= (ks0)) + (ks0) * ((ks0) > (1)))), tmp2, xmask)
626
+ ''', device_str='cuda')
627
+
628
+
629
+ async_compile.wait(globals())
630
+ del async_compile
631
+
632
+ class Runner:
633
+ def __init__(self, partitions):
634
+ self.partitions = partitions
635
+
636
+ def recursively_apply_fns(self, fns):
637
+ new_callables = []
638
+ for fn, c in zip(fns, self.partitions):
639
+ new_callables.append(fn(c))
640
+ self.partitions = new_callables
641
+
642
+ def call(self, args):
643
+ arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1, arg11_1, arg12_1, arg13_1, arg14_1, arg15_1 = args
644
+ args.clear()
645
+ s50 = arg0_1
646
+ s0 = arg2_1
647
+ s43 = arg4_1
648
+ s37 = arg7_1
649
+ s71 = arg8_1
650
+ assert_size_stride(arg1_1, (1, 32, s37, 128), (4096*s37, 128, 4096, 1))
651
+ assert_size_stride(arg3_1, (1, 8, s0, 128), (1024*s0, 128, 1024, 1))
652
+ assert_size_stride(arg5_1, (1, 8, s43, 128), (1024*s43, 128, 1024, 1))
653
+ assert_size_stride(arg6_1, (1, 1, 5, 5), (25, 25, 5, 1))
654
+ assert_size_stride(arg9_1, (1, 1, 5), (5, 5, 1))
655
+ assert_size_stride(arg10_1, (1, 1, 5), (5, 5, 1))
656
+ assert_size_stride(arg11_1, (1, 1, 5, 5), (25, 25, 5, 1))
657
+ assert_size_stride(arg12_1, (1, 1, 5), (5, 5, 1))
658
+ assert_size_stride(arg13_1, (1, 1, 5, 5), (25, 25, 5, 1))
659
+ assert_size_stride(arg14_1, (1, 1, 5), (5, 5, 1))
660
+ assert_size_stride(arg15_1, (1, 1, 5, 5), (25, 25, 5, 1))
661
+ with torch.cuda._DeviceGuard(1):
662
+ torch.cuda.set_device(1)
663
+ buf0 = empty_strided_cuda((1, 32, s37), (32*s37, s37, 1), torch.float32)
664
+ buf1 = empty_strided_cuda((1, 32, s37), (32*s37, s37, 1), torch.float32)
665
+ buf2 = empty_strided_cuda((1, 32, s37, 128), (4096*s37, 128, 4096, 1), torch.bfloat16)
666
+ # Topologically Sorted Source Nodes: [flex_attention], Original ATen: []
667
+ stream1 = get_raw_stream(1)
668
+ triton_tem_fused_0.run(arg1_1, arg3_1, arg5_1, buf0, buf1, arg9_1, arg6_1, arg10_1, arg11_1, buf2, s37, s0, s43, (127 + s37) // 128, 1, 32, stream=stream1)
669
+ del arg10_1
670
+ del arg11_1
671
+ del arg1_1
672
+ del arg3_1
673
+ del arg5_1
674
+ del arg6_1
675
+ del arg9_1
676
+ del buf1
677
+ buf5 = empty_strided_cuda((1, 32, s37), (32*max(1, s37), max(1, s37), 1), torch.float32)
678
+ # Topologically Sorted Source Nodes: [lse_scaled], Original ATen: [aten.mul]
679
+ triton_poi_fused_mul_1_xnumel = 32*s37
680
+ stream1 = get_raw_stream(1)
681
+ triton_poi_fused_mul_1.run(buf0, buf5, s37, triton_poi_fused_mul_1_xnumel, stream=stream1)
682
+ del buf0
683
+ return (buf2, buf5, )
684
+
685
+ runner = Runner(partitions=[])
686
+ call = runner.call
687
+ recursively_apply_fns = runner.recursively_apply_fns
688
+
689
+
690
+ def benchmark_compiled_module(times=10, repeat=10):
691
+ from torch._dynamo.testing import rand_strided
692
+ from torch._inductor.utils import print_performance
693
+ arg0_1 = 528
694
+ arg1_1 = rand_strided((1, 32, 528, 128), (2162688, 128, 4096, 1), device='cuda:1', dtype=torch.bfloat16)
695
+ arg2_1 = 528
696
+ arg3_1 = rand_strided((1, 8, 528, 128), (540672, 128, 1024, 1), device='cuda:1', dtype=torch.bfloat16)
697
+ arg4_1 = 528
698
+ arg5_1 = rand_strided((1, 8, 528, 128), (540672, 128, 1024, 1), device='cuda:1', dtype=torch.bfloat16)
699
+ arg6_1 = rand_strided((1, 1, 5, 5), (25, 25, 5, 1), device='cuda:1', dtype=torch.int32)
700
+ arg7_1 = 528
701
+ arg8_1 = 528
702
+ arg9_1 = rand_strided((1, 1, 5), (5, 5, 1), device='cuda:1', dtype=torch.int32)
703
+ arg10_1 = rand_strided((1, 1, 5), (5, 5, 1), device='cuda:1', dtype=torch.int32)
704
+ arg11_1 = rand_strided((1, 1, 5, 5), (25, 25, 5, 1), device='cuda:1', dtype=torch.int32)
705
+ arg12_1 = rand_strided((1, 1, 5), (5, 5, 1), device='cuda:1', dtype=torch.int32)
706
+ arg13_1 = rand_strided((1, 1, 5, 5), (25, 25, 5, 1), device='cuda:1', dtype=torch.int32)
707
+ arg14_1 = rand_strided((1, 1, 5), (5, 5, 1), device='cuda:1', dtype=torch.int32)
708
+ arg15_1 = rand_strided((1, 1, 5, 5), (25, 25, 5, 1), device='cuda:1', dtype=torch.int32)
709
+ fn = lambda: call([arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1, arg11_1, arg12_1, arg13_1, arg14_1, arg15_1])
710
+ return print_performance(fn, times=times, repeat=repeat)
711
+
712
+
713
+ if __name__ == "__main__":
714
+ from torch._inductor.wrapper_benchmark import compiled_module_main
715
+ compiled_module_main('None', benchmark_compiled_module)
progress/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import triton
3
+ import triton.language as tl
4
+
5
+ from torch._inductor.runtime import triton_helpers, triton_heuristics
6
+ from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
7
+ from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
8
+ triton_helpers.set_driver_to_gpu()
9
+
10
+ @triton_heuristics.reduction(
11
+ size_hints={'x': 65536, 'r0_': 128},
12
+ reduction_hint=ReductionHint.DEFAULT,
13
+ filename=__file__,
14
+ triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'in_ptr2': '*fp32', 'out_ptr1': '*fp32', 'ks0': 'i64', 'xnumel': 'i32', 'r0_numel': 'i32', 'XBLOCK': 'constexpr', 'R0_BLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=3, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]]}]},
15
+ inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_red_fused_mul_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 3, 'num_reduction': 1, 'backend_hash': 'B0E5936CA26D1BCD1B577D0B65F13FE7553DC941EB93358973E9F1902BC212C5', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False}
16
+ )
17
+ @triton.jit
18
+ def triton_red_fused_mul_0(in_ptr0, in_ptr1, in_ptr2, out_ptr1, ks0, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr):
19
+ r0_numel = 128
20
+ rnumel = r0_numel
21
+ RBLOCK: tl.constexpr = R0_BLOCK
22
+ xoffset = tl.program_id(0) * XBLOCK
23
+ xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
24
+ xmask = xindex < xnumel
25
+ r0_base = tl.arange(0, R0_BLOCK)[None, :]
26
+ rbase = r0_base
27
+ x0 = (xindex % ks0)
28
+ x1 = triton_helpers.div_floor_integer(xindex, ks0)
29
+ _tmp4 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32)
30
+ x3 = xindex
31
+ for r0_offset in range(0, r0_numel, R0_BLOCK):
32
+ r0_index = r0_offset + r0_base
33
+ r0_mask = r0_index < r0_numel
34
+ roffset = r0_offset
35
+ rindex = r0_index
36
+ r0_2 = r0_index
37
+ tmp0 = tl.load(in_ptr0 + (r0_2 + 128*x1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_first', other=0.0).to(tl.float32)
38
+ tmp1 = tl.load(in_ptr1 + (r0_2 + 128*x0 + 128*x1*((1) * ((1) >= (ks0)) + (ks0) * ((ks0) > (1)))), r0_mask & xmask, eviction_policy='evict_first', other=0.0).to(tl.float32)
39
+ tmp2 = tmp0 * tmp1
40
+ tmp3 = tl.broadcast_to(tmp2, [XBLOCK, R0_BLOCK])
41
+ tmp5 = _tmp4 + tmp3
42
+ _tmp4 = tl.where(r0_mask & xmask, tmp5, _tmp4)
43
+ tmp4 = tl.sum(_tmp4, 1)[:, None]
44
+ tmp7 = tl.load(in_ptr2 + (x0 + x1*((1) * ((1) >= (ks0)) + (ks0) * ((ks0) > (1)))), xmask, eviction_policy='evict_last')
45
+ tmp6 = tmp4.to(tl.float32)
46
+ tmp8 = 0.6931471805599453
47
+ tmp9 = tmp7 * tmp8
48
+ tmp10 = 1.4426950408889634
49
+ tmp11 = tmp9 * tmp10
50
+ tmp12 = tmp6 - tmp11
51
+ tl.store(out_ptr1 + (x3), tmp12, xmask)
progress/SpecForge/cache/compiled_kernels/3q/fc5920467dd1501963c976e2b895fc37747fdebfa098fff912209055f3a31828.best_config ADDED
@@ -0,0 +1 @@
 
 
1
+ {"XBLOCK": 64, "R0_BLOCK": 64, "num_warps": 16, "num_stages": 1, "configs_hash": "2685c2d349c32243d4ee216505dfdf1e257d04d8316595ed69d4ca3499146788", "found_by_coordesc": false, "time_taken_ms": 53, "triton_cache_hash": "GBIQTIXLLLI56EMJONBW74RZJ42E6PTSU5N7LA23N4VBEBKK3HNQ"}
progress/SpecForge/cache/compiled_kernels/3r/c3rkwwyedldrjz6sidtx5huqcsdgpdpu4xndmm6h4e4boo6cbg2w.py ADDED
@@ -0,0 +1,702 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # AOT ID: ['0_inference']
2
+ from ctypes import c_void_p, c_long, c_int
3
+ import torch
4
+ import math
5
+ import random
6
+ import os
7
+ import tempfile
8
+ from math import inf, nan
9
+ from cmath import nanj
10
+ from torch._inductor.hooks import run_intermediate_hooks
11
+ from torch._inductor.utils import maybe_profile
12
+ from torch._inductor.codegen.memory_planning import _align as align
13
+ from torch import device, empty_strided
14
+ from torch._inductor.async_compile import AsyncCompile
15
+ from torch._inductor.select_algorithm import extern_kernels
16
+ import triton
17
+ import triton.language as tl
18
+ from torch._inductor.runtime.triton_heuristics import start_graph, end_graph
19
+ from torch._C import _cuda_getCurrentRawStream as get_raw_stream
20
+
21
+ aten = torch.ops.aten
22
+ inductor_ops = torch.ops.inductor
23
+ _quantized = torch.ops._quantized
24
+ assert_size_stride = torch._C._dynamo.guards.assert_size_stride
25
+ assert_alignment = torch._C._dynamo.guards.assert_alignment
26
+ empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
27
+ empty_strided_cpu_pinned = torch._C._dynamo.guards._empty_strided_cpu_pinned
28
+ empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
29
+ empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu
30
+ empty_strided_mtia = torch._C._dynamo.guards._empty_strided_mtia
31
+ reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor
32
+ alloc_from_pool = torch.ops.inductor._alloc_from_pool
33
+ async_compile = AsyncCompile()
34
+ empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p
35
+
36
+
37
+ # kernel path: /workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/nj/cnjtse3xftpnmqvwojj6g7ajl3r3hvxbz3sgyaaznnrxcs7gzj2e.py
38
+ # Topologically Sorted Source Nodes: [flex_attention], Original ATen: []
39
+ # Source node to ATen node mapping:
40
+ # flex_attention => flex_attention
41
+ # Graph fragment:
42
+ # %arg0_1 : Tensor "bf16[1, 32, 976, 128][3997696, 128, 4096, 1]cuda:7" = PlaceHolder[target=arg0_1]
43
+ # %arg1_1 : Tensor "bf16[1, 8, 976, 128][999424, 128, 1024, 1]cuda:7" = PlaceHolder[target=arg1_1]
44
+ # %arg2_1 : Tensor "bf16[1, 8, 976, 128][999424, 128, 1024, 1]cuda:7" = PlaceHolder[target=arg2_1]
45
+ # %getitem_1 : Tensor "f32[1, 32, 976][31232, 976, 1]cuda:7" = PlaceHolder[target=getitem_1]
46
+ # %buf1 : Tensor "f32[1, 32, 976][31232, 976, 1]cuda:7" = PlaceHolder[target=buf1]
47
+ # %arg3_1 : Tensor "i32[1, 1, 8][8, 8, 1]cuda:7" = PlaceHolder[target=arg3_1]
48
+ # %arg4_1 : Tensor "i32[1, 1, 8, 8][64, 64, 8, 1]cuda:7" = PlaceHolder[target=arg4_1]
49
+ # %arg5_1 : Tensor "i32[1, 1, 8][8, 8, 1]cuda:7" = PlaceHolder[target=arg5_1]
50
+ # %arg6_1 : Tensor "i32[1, 1, 8, 8][64, 64, 8, 1]cuda:7" = PlaceHolder[target=arg6_1]
51
+ # %flex_attention : [num_users=2] = call_function[target=torch.ops.higher_order.flex_attention](args = (%arg0_1, %arg1_1, %arg2_1, %sdpa_score0, (976, 976, %arg3_1, %arg4_1, %arg5_1, %arg6_1, %arg7_1, %arg8_1, %arg9_1, %arg10_1, 128, 128, %sdpa_mask0), 0.08838834764831845, {PRESCALE_QK: False, ROWS_GUARANTEED_SAFE: False, BLOCKS_ARE_CONTIGUOUS: False, WRITE_DQ: True, OUTPUT_LOGSUMEXP: True, OUTPUT_MAX: False}, (), ()), kwargs = {})
52
+ # return %getitem
53
+ triton_tem_fused_0 = async_compile.triton('triton_tem_fused_0', '''
54
+ import triton
55
+ import triton.language as tl
56
+
57
+ from torch._inductor.runtime import triton_helpers, triton_heuristics
58
+ from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
59
+ from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
60
+
61
+ @triton_heuristics.template(
62
+
63
+ num_stages=3,
64
+ num_warps=8,
65
+ triton_meta={'signature': {'arg_Q': '*bf16', 'arg_K': '*bf16', 'arg_V': '*bf16', 'arg_LSE': '*fp32', 'arg_MAX': '*fp32', 'arg_KV_NUM_BLKS': '*i32', 'arg_KV_IDX': '*i32', 'arg_FULL_KV_NUM_BLKS': '*i32', 'arg_FULL_KV_IDX': '*i32', 'out_ptr0': '*bf16'}, 'device': DeviceProperties(type='cuda', index=7, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]], (7,): [['tt.divisibility', 16]], (8,): [['tt.divisibility', 16]], (9,): [['tt.divisibility', 16]]}]},
66
+ inductor_meta={'kernel_name': 'triton_tem_fused_0', 'backend_hash': 'B0E5936CA26D1BCD1B577D0B65F13FE7553DC941EB93358973E9F1902BC212C5', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'grid_type': 'FixedGrid', 'fixed_grid': ['_grid_0', '_grid_1', '_grid_2'], 'extra_launcher_args': ['_grid_0', '_grid_1', '_grid_2'], 'config_args': {'PRESCALE_QK': False, 'ROWS_GUARANTEED_SAFE': False, 'BLOCKS_ARE_CONTIGUOUS': False, 'WRITE_DQ': True, 'OUTPUT_LOGSUMEXP': True, 'OUTPUT_MAX': False, 'FLOAT32_PRECISION': "'tf32'", 'IS_DIVISIBLE': False, 'SM_SCALE': 0.08838834764831845, 'GQA_SHARED_HEADS': 4, 'HAS_FULL_BLOCKS': True, 'QK_HEAD_DIM': 128, 'QK_HEAD_DIM_ROUNDED': 128, 'V_HEAD_DIM': 128, 'V_HEAD_DIM_ROUNDED': 128, 'SAFE_HEAD_DIM': True, 'USE_TMA': False, 'BLOCK_M': 128, 'BLOCK_N': 64, 'SPARSE_Q_BLOCK_SIZE': 128, 'SPARSE_KV_BLOCK_SIZE': 128}},
67
+
68
+ )
69
+ @triton.jit
70
+ def triton_tem_fused_0(arg_Q, arg_K, arg_V, arg_LSE, arg_MAX, arg_KV_NUM_BLKS, arg_KV_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, out_ptr0):
71
+ PRESCALE_QK : tl.constexpr = False
72
+ ROWS_GUARANTEED_SAFE : tl.constexpr = False
73
+ BLOCKS_ARE_CONTIGUOUS : tl.constexpr = False
74
+ WRITE_DQ : tl.constexpr = True
75
+ OUTPUT_LOGSUMEXP : tl.constexpr = True
76
+ OUTPUT_MAX : tl.constexpr = False
77
+ FLOAT32_PRECISION : tl.constexpr = 'tf32'
78
+ IS_DIVISIBLE : tl.constexpr = False
79
+ SM_SCALE : tl.constexpr = 0.08838834764831845
80
+ GQA_SHARED_HEADS : tl.constexpr = 4
81
+ HAS_FULL_BLOCKS : tl.constexpr = True
82
+ QK_HEAD_DIM : tl.constexpr = 128
83
+ QK_HEAD_DIM_ROUNDED : tl.constexpr = 128
84
+ V_HEAD_DIM : tl.constexpr = 128
85
+ V_HEAD_DIM_ROUNDED : tl.constexpr = 128
86
+ SAFE_HEAD_DIM : tl.constexpr = True
87
+ USE_TMA : tl.constexpr = False
88
+ BLOCK_M : tl.constexpr = 128
89
+ BLOCK_N : tl.constexpr = 64
90
+ SPARSE_Q_BLOCK_SIZE : tl.constexpr = 128
91
+ SPARSE_KV_BLOCK_SIZE : tl.constexpr = 128
92
+ INDEX_DTYPE : tl.constexpr = tl.int32
93
+ Q = arg_Q
94
+ K = arg_K
95
+ V = arg_V
96
+ LSE = arg_LSE
97
+ MAX = arg_MAX
98
+ KV_NUM_BLKS = arg_KV_NUM_BLKS
99
+ KV_IDX = arg_KV_IDX
100
+ FULL_KV_NUM_BLKS = arg_FULL_KV_NUM_BLKS
101
+ FULL_KV_IDX = arg_FULL_KV_IDX
102
+
103
+ # Sub notation for this kernel:
104
+ #
105
+ # Q: Query, K: Key, V: Value
106
+ # M: Number of queries, N: Number of keys/values, D: Model dimension
107
+ # QK_HEAD_DIM: The dimension of the query and key embeddings
108
+ # V_HEAD_DIM: The dimension of the value embeddings
109
+ # z: Batch size, h: Number of heads, m: Number of queries per head, k: Number of keys per head
110
+ # GQA_SHARED_HEADS: number of query heads sharing one kv head in GQA setups.
111
+ #
112
+ # The following FULL_* and PARTIAL_* is defined in the block sparse mask grid, rather than the thread block grid.
113
+ # KV_NUM_BLKS: The number of KV blocks (that may or may not require masking) for each query.
114
+ # KV_IDX: The indices of KV blocks (that may or may not require masking) for each query.
115
+ # FULL_KV_NUM_BLKS: The number of fully unmasked KV blocks (so we don't need masking) for each query.
116
+ # FULL_KV_IDX: The indices of fully unmasked KV blocks (so we don't need masking) for each query.
117
+ #
118
+ # OUTPUT_LOGSUMEXP: We only need to store the logsumexp if we require grad
119
+ #
120
+ # (Modifiable) Performance tuning options
121
+ # BLOCK_M: The thread block size across the seqlen dim of Q.
122
+ # BLOCK_N: Iterate over BLOCK_N across the seqlen dim of K/V in each thread block.
123
+
124
+ # The below are kernel options that can be applied for certain score_mods,
125
+ # or involve a numerics vs. perf tradeoff
126
+ # PRESCALE_QK: Whether to pre-scale QK by 1/sqrt(d) and change of base. Has
127
+ # about 20% more numerical error, but slightly faster.
128
+ # ROWS_GUARANTEED_SAFE: Is it guaranteed that at least one value in each row
129
+ # is not masked out? If so, we can skip an extra safety check
130
+ # BLOCKS_ARE_CONTIGUOUS: Is it guaranteed that all blocks in the mask are
131
+ # contiguous? If so, we don't need to do an indirect jump for every block
132
+
133
+ tl.static_assert(SPARSE_Q_BLOCK_SIZE >= BLOCK_M and SPARSE_Q_BLOCK_SIZE % BLOCK_M == 0)
134
+ tl.static_assert(SPARSE_KV_BLOCK_SIZE >= BLOCK_N and SPARSE_KV_BLOCK_SIZE % BLOCK_N == 0)
135
+
136
+ # Define strides of inputs
137
+ stride_qz, stride_qh, stride_qm, stride_qk = 3997696, 128, 4096, 1
138
+ stride_kz, stride_kh, stride_kn, stride_kk = 999424, 128, 1024, 1
139
+ stride_vz, stride_vh, stride_vn, stride_vk = 999424, 128, 1024, 1
140
+
141
+ ZQ = 1
142
+ HQ = 32
143
+ Q_LEN = 976
144
+ ZKV = 1
145
+ KV_LEN = 976
146
+
147
+ MATMUL_PRECISION = Q.dtype.element_ty
148
+
149
+ q_start = tl.program_id(0).to(INDEX_DTYPE)
150
+ off_zq = tl.program_id(1).to(INDEX_DTYPE)
151
+ off_hq = tl.program_id(2).to(INDEX_DTYPE)
152
+
153
+ # We support two cases for batch dimension. a) (ZKV == ZQ) where off_zkv = off_zq.
154
+ # b) (ZKV == 1 and ZQ > 1) where KV is broadcasted along the batch dimension and off_zkv=0.
155
+ off_zkv = off_zq % ZKV
156
+ off_hkv = off_hq // GQA_SHARED_HEADS
157
+ off_g = off_hq % GQA_SHARED_HEADS
158
+
159
+ q_offset = off_zq * stride_qz + off_hq * stride_qh
160
+ k_offset = off_zkv * stride_kz + off_hkv * stride_kh
161
+ v_offset = off_zkv * stride_vz + off_hkv * stride_vh
162
+
163
+ Q = Q + q_offset
164
+ K = K + k_offset
165
+ V = V + v_offset
166
+
167
+ # Setting up the TMA descriptors for Q, K, V
168
+ desc_q = None
169
+ desc_k = None
170
+ desc_v = None
171
+
172
+ SPARSE_Z = 1
173
+ SPARSE_HQ = 1
174
+
175
+ sparse_idx_z = off_zq % SPARSE_Z
176
+ sparse_idx_hq = off_hq % SPARSE_HQ
177
+
178
+ SPARSE_Q_MULTIPLE: tl.constexpr = (SPARSE_Q_BLOCK_SIZE // BLOCK_M)
179
+ SPARSE_KV_MULTIPLE: tl.constexpr = (SPARSE_KV_BLOCK_SIZE // BLOCK_N)
180
+
181
+ stride_kv_num_blks_h = 8
182
+ stride_kv_idx_h = 64
183
+ stride_kv_idx_m = 8
184
+
185
+ # initialize pointer to m and l
186
+ m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf")
187
+ l_i = tl.zeros([BLOCK_M], dtype=tl.float32)
188
+ acc = tl.zeros([BLOCK_M, V_HEAD_DIM_ROUNDED], dtype=tl.float32)
189
+
190
+ offs_m = q_start * BLOCK_M + tl.arange(0, BLOCK_M)
191
+
192
+ # KV_IDX and KV_NUM_BLKS are always contiguous.
193
+ sparse_hz_offset = sparse_idx_z * SPARSE_HQ + sparse_idx_hq
194
+ sparse_kv_num_blks_offset = sparse_hz_offset * stride_kv_num_blks_h + q_start // SPARSE_Q_MULTIPLE
195
+ sparse_kv_idx_offset = sparse_hz_offset * stride_kv_idx_h + (q_start // SPARSE_Q_MULTIPLE) * stride_kv_idx_m # noqa: B950
196
+ offs_m = q_start * BLOCK_M + tl.arange(0, BLOCK_M)
197
+ offs_k = tl.arange(0, QK_HEAD_DIM_ROUNDED)
198
+ q = load_checked_2d(Q, offs_m, offs_k, stride_qm, stride_qk, IS_DIVISIBLE, SAFE_HEAD_DIM, Q_LEN, QK_HEAD_DIM)
199
+
200
+ # ~~~~~~~~~~~~~~ normal blocks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
201
+ # We don't know anything "special" about these blocks, so we need to apply
202
+ # both score_mod and mask_mod to it
203
+ kv_indices = KV_IDX + sparse_kv_idx_offset
204
+ kv_start = tl.load(kv_indices) * SPARSE_KV_BLOCK_SIZE # first kv block we're loading
205
+ kv_num_blocks = tl.load(KV_NUM_BLKS + sparse_kv_num_blks_offset)
206
+ block_n_end = tl.minimum(kv_num_blocks * SPARSE_KV_MULTIPLE, tl.maximum(tl.cdiv(KV_LEN, BLOCK_N), 1))
207
+
208
+
209
+ # K and V pointers will be passed directly to forward_inner
210
+
211
+ offs_n = kv_start + tl.arange(0, BLOCK_N)
212
+
213
+
214
+ acc, l_i, m_i = forward_inner(
215
+ arg_Q, arg_K, arg_V, arg_LSE, arg_MAX, arg_KV_NUM_BLKS, arg_KV_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, out_ptr0,
216
+ q, K, V,
217
+ desc_k, desc_v, Q_LEN, KV_LEN,
218
+ acc, l_i, m_i,
219
+ off_zq, off_hq, offs_m[:, None], offs_n[None, :],
220
+ kv_start,
221
+ kv_indices, kv_num_blocks,
222
+ 0, block_n_end,
223
+ MATMUL_PRECISION,
224
+ stride_kk, stride_kn, stride_vn, stride_vk,
225
+ IS_FULL_BLOCKS=False,
226
+ )
227
+
228
+ # ~~~~~~~~~~~~~~ "full" blocks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
229
+ # We know these blocks are guaranteed to be "full", so we don't need to
230
+ # apply mask_mod to them - only score_mod
231
+ if HAS_FULL_BLOCKS:
232
+ # FULL_KV_IDX and FULL_KV_NUM_BLKS are always contiguous.
233
+ kv_indices = FULL_KV_IDX + sparse_kv_idx_offset
234
+ kv_start = tl.load(kv_indices) * SPARSE_KV_BLOCK_SIZE # first kv block we're loading
235
+ kv_num_blocks = tl.load(FULL_KV_NUM_BLKS + sparse_kv_num_blks_offset)
236
+ block_n_end = tl.minimum(kv_num_blocks * SPARSE_KV_MULTIPLE, tl.maximum(tl.cdiv(KV_LEN, BLOCK_N), 1))
237
+ # K and V pointers will be passed directly to forward_inner
238
+ offs_n = kv_start + tl.arange(0, BLOCK_N)
239
+
240
+ acc, l_i, m_i = forward_inner(
241
+ arg_Q, arg_K, arg_V, arg_LSE, arg_MAX, arg_KV_NUM_BLKS, arg_KV_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, out_ptr0,
242
+ q, K, V,
243
+ desc_k, desc_v, Q_LEN, KV_LEN,
244
+ acc, l_i, m_i,
245
+ off_zq, off_hq, offs_m[:, None], offs_n[None, :],
246
+ kv_start,
247
+ kv_indices, kv_num_blocks,
248
+ 0, block_n_end,
249
+ MATMUL_PRECISION,
250
+ stride_kk, stride_kn, stride_vn, stride_vk,
251
+ IS_FULL_BLOCKS=True,
252
+ )
253
+
254
+
255
+ # [Note] Handle fully masked out rows:
256
+ # Li will be the sum(e^(-inf)) == 0.0 for masked out rows, mi will be -inf.
257
+ # We set Li to 1.0 which will result in lse/out = 0.0 | after the log(li) + mi(0.0) step
258
+ l_i = tl.where(l_i == 0.0, 1, l_i)
259
+
260
+ acc = acc / l_i[:, None]
261
+ idx_zq = tl.program_id(1).to(INDEX_DTYPE)
262
+ idx_hq = tl.program_id(2).to(INDEX_DTYPE)
263
+ idx_m = offs_m[:, None].to(INDEX_DTYPE)
264
+ idx_d = tl.arange(0, V_HEAD_DIM_ROUNDED)[None, :].to(INDEX_DTYPE)
265
+
266
+ mask = (idx_m < Q_LEN) & (idx_d < V_HEAD_DIM)
267
+
268
+ tl.static_assert(acc.shape == [BLOCK_M, V_HEAD_DIM_ROUNDED])
269
+ xindex = idx_d + 128*idx_m + 124928*idx_hq + 3997696*idx_zq
270
+ tl.store(out_ptr0 + (tl.broadcast_to(idx_d + 128*idx_hq + 4096*idx_m, acc.shape)), acc, mask)
271
+
272
+ if OUTPUT_LOGSUMEXP:
273
+ off_hz = off_zq * HQ + off_hq
274
+ l_ptrs = LSE + off_hz * Q_LEN + offs_m
275
+ lse = m_i + tl.math.log2(l_i)
276
+ if IS_DIVISIBLE:
277
+ tl.store(l_ptrs, lse)
278
+ else:
279
+ tl.store(l_ptrs, lse, mask=offs_m < Q_LEN)
280
+
281
+ if OUTPUT_MAX:
282
+ off_hz = off_zq * HQ + off_hq
283
+ max_ptrs = MAX + off_hz * Q_LEN + offs_m
284
+ if IS_DIVISIBLE:
285
+ tl.store(max_ptrs, m_i)
286
+ else:
287
+ tl.store(max_ptrs, m_i, mask=offs_m < Q_LEN)
288
+
289
+
290
+ # Utility triton funcs
291
+ @triton.jit
292
+ def get_offset_for_next_block(
293
+ loop_iter, col_indices, total_blocks,
294
+ SPARSE_BLOCK, SPARSE_BLOCK_MULTIPLE, BLOCK,
295
+ BLOCKS_ARE_CONTIGUOUS: tl.constexpr
296
+ ):
297
+ if BLOCKS_ARE_CONTIGUOUS:
298
+ return BLOCK
299
+ cur_block_idx = loop_iter // SPARSE_BLOCK_MULTIPLE
300
+ cur_block = tl.load(col_indices + cur_block_idx, eviction_policy="evict_last")
301
+ next_block = tl.load(col_indices + cur_block_idx + 1, eviction_policy="evict_last", mask=cur_block_idx + 1 < total_blocks)
302
+ needs_jump = (loop_iter + 1) % SPARSE_BLOCK_MULTIPLE == 0
303
+ jump_to_block = (next_block - cur_block ) * SPARSE_BLOCK - (SPARSE_BLOCK_MULTIPLE - 1) * BLOCK
304
+ offset = jump_to_block * needs_jump + (1 - needs_jump) * BLOCK
305
+ return offset
306
+
307
+ @triton.jit
308
+ def get_bounded_indices(indices, max_len=None):
309
+ return indices % max_len if max_len is not None else indices
310
+
311
+ @triton.jit
312
+ def load_checked_block(block_ptr, IS_DIVISIBLE: tl.constexpr, SAFE_HEAD_DIM: tl.constexpr):
313
+ if IS_DIVISIBLE and SAFE_HEAD_DIM:
314
+ return tl.load(block_ptr)
315
+ elif IS_DIVISIBLE and not SAFE_HEAD_DIM:
316
+ return tl.load(block_ptr, boundary_check=(1,), padding_option="zero")
317
+ elif not IS_DIVISIBLE and SAFE_HEAD_DIM:
318
+ return tl.load(block_ptr, boundary_check=(0,), padding_option="zero")
319
+ else:
320
+ return tl.load(block_ptr, boundary_check=(0, 1), padding_option="zero")
321
+
322
+ @triton.jit
323
+ def load_checked_2d(
324
+ ptr,
325
+ offs_m,
326
+ offs_n,
327
+ stride_m,
328
+ stride_n,
329
+ IS_DIVISIBLE_M: tl.constexpr,
330
+ IS_DIVISIBLE_N: tl.constexpr,
331
+ M_LEN: tl.constexpr,
332
+ N_LEN: tl.constexpr,
333
+ ):
334
+ # Calculate final pointer if strides are provided
335
+ if stride_m is not None and stride_n is not None:
336
+ ptr = ptr + offs_m[:, None] * stride_m + offs_n[None, :] * stride_n
337
+
338
+ # Handle all masking cases
339
+ if not IS_DIVISIBLE_M and not IS_DIVISIBLE_N:
340
+ return tl.load(ptr, mask=(offs_m[:, None] < M_LEN) & (offs_n[None, :] < N_LEN), other=0.0)
341
+ elif IS_DIVISIBLE_M and not IS_DIVISIBLE_N:
342
+ return tl.load(ptr, mask=(offs_n[None, :] < N_LEN), other=0.0)
343
+ elif not IS_DIVISIBLE_M and IS_DIVISIBLE_N:
344
+ return tl.load(ptr, mask=(offs_m[:, None] < M_LEN), other=0.0)
345
+ else: # Both divisible
346
+ return tl.load(ptr)
347
+
348
+
349
+ # Common Imports
350
+ @triton.jit
351
+ def forward_block_mn(
352
+ arg_Q, arg_K, arg_V, arg_LSE, arg_MAX, arg_KV_NUM_BLKS, arg_KV_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, out_ptr0,
353
+ q, K, V, desc_k, desc_v, Q_LEN, KV_LEN,
354
+ # accumulated values
355
+ acc, l_i, m_i,
356
+ # Offsets
357
+ off_z, off_h, offs_m, offs_n,
358
+ # Offsets needed for TMA loads
359
+ kv_start,
360
+ kv_offset,
361
+ MATMUL_PRECISION, RCP_LN2,
362
+ # Strides for K and V
363
+ stride_kk, stride_kn, stride_vn, stride_vk,
364
+ IS_FULL_BLOCKS, CHECK_BLOCK_BOUNDARY=False,
365
+
366
+ ):
367
+ # Redefines all kernel parameters (BLOCK_M, etc.) so we don't need to plumb them all through
368
+ PRESCALE_QK : tl.constexpr = False
369
+ ROWS_GUARANTEED_SAFE : tl.constexpr = False
370
+ BLOCKS_ARE_CONTIGUOUS : tl.constexpr = False
371
+ WRITE_DQ : tl.constexpr = True
372
+ OUTPUT_LOGSUMEXP : tl.constexpr = True
373
+ OUTPUT_MAX : tl.constexpr = False
374
+ FLOAT32_PRECISION : tl.constexpr = 'tf32'
375
+ IS_DIVISIBLE : tl.constexpr = False
376
+ SM_SCALE : tl.constexpr = 0.08838834764831845
377
+ GQA_SHARED_HEADS : tl.constexpr = 4
378
+ HAS_FULL_BLOCKS : tl.constexpr = True
379
+ QK_HEAD_DIM : tl.constexpr = 128
380
+ QK_HEAD_DIM_ROUNDED : tl.constexpr = 128
381
+ V_HEAD_DIM : tl.constexpr = 128
382
+ V_HEAD_DIM_ROUNDED : tl.constexpr = 128
383
+ SAFE_HEAD_DIM : tl.constexpr = True
384
+ USE_TMA : tl.constexpr = False
385
+ BLOCK_M : tl.constexpr = 128
386
+ BLOCK_N : tl.constexpr = 64
387
+ SPARSE_Q_BLOCK_SIZE : tl.constexpr = 128
388
+ SPARSE_KV_BLOCK_SIZE : tl.constexpr = 128
389
+ INDEX_DTYPE : tl.constexpr = tl.int32
390
+
391
+
392
+ # -- load k --
393
+ # NB reversed order to since K is transposed
394
+ kv_base_offset = kv_start + kv_offset
395
+
396
+ # Load K as [BLOCK_N, QK_HEAD_DIM_ROUNDED] then transpose to [QK_HEAD_DIM_ROUNDED, BLOCK_N]
397
+ offs_k = tl.arange(0, QK_HEAD_DIM_ROUNDED)
398
+ offs_n_load = kv_base_offset + tl.arange(0, BLOCK_N)
399
+ k = load_checked_2d(K, offs_n_load, offs_k, stride_kn, stride_kk, IS_DIVISIBLE, SAFE_HEAD_DIM, KV_LEN, QK_HEAD_DIM)
400
+
401
+ k = tl.trans(k)
402
+ # -- compute qk ---
403
+ qk = tl.dot(q, k, input_precision=FLOAT32_PRECISION) # TODO: use cuda matmul when q_len <= 2.
404
+ if not PRESCALE_QK:
405
+ qk *= SM_SCALE
406
+ # ~~~~~~~~~~~~~~~~~~~ Apply score modification ~~~~~~~~~~~~~~~~~~~
407
+ # If this is the last block of a non divisible seqlen, we still need to load [BLOCK_M, BLOCK_N] elements,
408
+ # which is larger than the actual number of elements. To avoid access memory out of bound,
409
+ # we need to mask out the elements that are out of Q_LEN & KV_LEN.
410
+ m = get_bounded_indices(offs_m, Q_LEN if CHECK_BLOCK_BOUNDARY else None)
411
+ n = get_bounded_indices(offs_n, KV_LEN if CHECK_BLOCK_BOUNDARY else None)
412
+
413
+ tmp0 = (qk)
414
+ post_mod_scores = tmp0
415
+
416
+
417
+ if CHECK_BLOCK_BOUNDARY:
418
+ # Mask out the elements that are out of the KV_LEN for non divisible seqlen.
419
+ post_mod_scores = tl.where(offs_n < KV_LEN, post_mod_scores, float("-inf"))
420
+
421
+ if not IS_FULL_BLOCKS:
422
+ tmp1 = (m)
423
+ tmp2 = tl.full([1], 0, tl.int32)
424
+ tmp3 = tmp1 < tmp2
425
+ tmp4 = (n)
426
+ tmp5 = tmp4 <= tmp1
427
+ tmp6 = tmp3 & tmp5
428
+ tmp7 = tmp1 >= tmp2
429
+ tmp8 = tmp4 < tmp2
430
+ tmp9 = tmp7 & tmp8
431
+ tmp10 = tmp8 == 0
432
+ tmp11 = tmp7 & tmp10
433
+ tmp12 = tmp1 - tmp2
434
+ tmp13 = tl.full([1], 16, tl.int32)
435
+ tmp14 = tl.where((tmp12 < 0) != (tmp13 < 0), tl.where(tmp12 % tmp13 != 0, tmp12 // tmp13 - 1, tmp12 // tmp13), tmp12 // tmp13)
436
+ tmp15 = tmp4 - tmp2
437
+ tmp16 = tl.where((tmp15 < 0) != (tmp13 < 0), tl.where(tmp15 % tmp13 != 0, tmp15 // tmp13 - 1, tmp15 // tmp13), tmp15 // tmp13)
438
+ tmp17 = tmp14 == tmp16
439
+ tmp18 = tmp11 & tmp17
440
+ tmp19 = tmp9 | tmp18
441
+ tmp20 = tmp6 | tmp19
442
+ mask_mod_output = tmp20
443
+
444
+
445
+ if CHECK_BLOCK_BOUNDARY:
446
+ mask_mod_output = tl.where(offs_n < KV_LEN, mask_mod_output, False)
447
+ # apply mask for partially unmasked blocks
448
+ post_mod_scores = tl.where(mask_mod_output, post_mod_scores, float("-inf"))
449
+
450
+ if not PRESCALE_QK:
451
+ post_mod_scores *= RCP_LN2
452
+ # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
453
+
454
+ # -- compute scaling constant ---
455
+ m_ij = tl.maximum(m_i, tl.max(post_mod_scores, 1))
456
+ if not ROWS_GUARANTEED_SAFE:
457
+ masked_out_rows = (m_ij == float("-inf"))
458
+ m_ij_masked = tl.where(masked_out_rows, 0, m_ij)
459
+ else:
460
+ m_ij_masked = m_ij
461
+
462
+ alpha = tl.math.exp2(m_i - m_ij_masked)
463
+ p = tl.math.exp2(post_mod_scores - m_ij_masked[:, None])
464
+
465
+ # NB: l_i update is pulled up here since it's a bit faster
466
+ # NB: For headdim=256, it's faster to move it back down to after m_i =
467
+ # m_ij
468
+ l_i = l_i * alpha + tl.sum(p, 1)
469
+ # # -- scale and update acc --
470
+ acc = acc * alpha[:, None]
471
+ # Calculate offsets for V loading - reuse kv_base_offset from K loading
472
+ offs_v = tl.arange(0, V_HEAD_DIM_ROUNDED)
473
+ v = load_checked_2d(V, offs_n_load, offs_v, stride_vn, stride_vk, IS_DIVISIBLE, SAFE_HEAD_DIM, KV_LEN, V_HEAD_DIM)
474
+ acc = tl.dot(p.to(MATMUL_PRECISION), v, acc, input_precision=FLOAT32_PRECISION)
475
+
476
+ # -- update m_i
477
+ m_i = m_ij
478
+
479
+ return acc, l_i, m_i
480
+
481
+ @triton.jit
482
+ def forward_inner(
483
+ arg_Q, arg_K, arg_V, arg_LSE, arg_MAX, arg_KV_NUM_BLKS, arg_KV_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, out_ptr0,
484
+ q, K, V,
485
+ desc_k, desc_v, Q_LEN, KV_LEN,
486
+ # accumulated values
487
+ acc, l_i, m_i,
488
+ # Offsets used as inputs to score_mod & mask_mod
489
+ # of size [BLOCK_M, BLOCK_N] or scalar.
490
+ off_z, off_h, offs_m, offs_n,
491
+ # Offsets needed for TMA loads
492
+ kv_start,
493
+ # blocksparse data
494
+ kv_indices, kv_num_blocks,
495
+ # start kv and end kv block
496
+ block_n_start, block_n_end,
497
+ MATMUL_PRECISION,
498
+ # Strides for K and V
499
+ stride_kk, stride_kn, stride_vn, stride_vk,
500
+ IS_FULL_BLOCKS,
501
+ ):
502
+ # Redefines all kernel parameters (BLOCK_M, etc.) so we don't need to plumb them all through
503
+ PRESCALE_QK : tl.constexpr = False
504
+ ROWS_GUARANTEED_SAFE : tl.constexpr = False
505
+ BLOCKS_ARE_CONTIGUOUS : tl.constexpr = False
506
+ WRITE_DQ : tl.constexpr = True
507
+ OUTPUT_LOGSUMEXP : tl.constexpr = True
508
+ OUTPUT_MAX : tl.constexpr = False
509
+ FLOAT32_PRECISION : tl.constexpr = 'tf32'
510
+ IS_DIVISIBLE : tl.constexpr = False
511
+ SM_SCALE : tl.constexpr = 0.08838834764831845
512
+ GQA_SHARED_HEADS : tl.constexpr = 4
513
+ HAS_FULL_BLOCKS : tl.constexpr = True
514
+ QK_HEAD_DIM : tl.constexpr = 128
515
+ QK_HEAD_DIM_ROUNDED : tl.constexpr = 128
516
+ V_HEAD_DIM : tl.constexpr = 128
517
+ V_HEAD_DIM_ROUNDED : tl.constexpr = 128
518
+ SAFE_HEAD_DIM : tl.constexpr = True
519
+ USE_TMA : tl.constexpr = False
520
+ BLOCK_M : tl.constexpr = 128
521
+ BLOCK_N : tl.constexpr = 64
522
+ SPARSE_Q_BLOCK_SIZE : tl.constexpr = 128
523
+ SPARSE_KV_BLOCK_SIZE : tl.constexpr = 128
524
+ INDEX_DTYPE : tl.constexpr = tl.int32
525
+
526
+
527
+ SPARSE_KV_MULTIPLE: tl.constexpr = (SPARSE_KV_BLOCK_SIZE // BLOCK_N)
528
+ RCP_LN2: tl.constexpr = 1.44269504
529
+
530
+ if PRESCALE_QK:
531
+ q = (q * SM_SCALE * RCP_LN2).to(MATMUL_PRECISION)
532
+
533
+ kv_offset = 0
534
+
535
+ # loop over k, v and update accumulator until block_n_end
536
+ for start_n in range(block_n_start, block_n_end):
537
+ # Here IS_DIVISIBLE acts are the start_n = tl.multiple_of(start_n, BLOCK_N) from triton_fused_attention.
538
+ if IS_DIVISIBLE:
539
+ acc, l_i, m_i = forward_block_mn(
540
+ arg_Q, arg_K, arg_V, arg_LSE, arg_MAX, arg_KV_NUM_BLKS, arg_KV_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, out_ptr0,
541
+ q, K, V, desc_k, desc_v, Q_LEN, KV_LEN,
542
+ # accumulated values
543
+ acc, l_i, m_i,
544
+ # Offsets
545
+ off_z, off_h, offs_m, offs_n,
546
+ # Offsets needed for TMA loads
547
+ kv_start,
548
+ kv_offset,
549
+ MATMUL_PRECISION, RCP_LN2,
550
+ # Strides for K and V
551
+ stride_kk, stride_kn, stride_vn, stride_vk,
552
+ IS_FULL_BLOCKS,
553
+ )
554
+ else:
555
+ # Benchmark shows even we applied mod & mask to each block for non divisible seqlen,
556
+ # it's on par or slightly faster than only applying to the last block in fwd.
557
+ # However, we choose different strategy for bwd, where we only apply mod & mask
558
+ # to the last block because it's faster a lot.
559
+ acc, l_i, m_i = forward_block_mn(
560
+ arg_Q, arg_K, arg_V, arg_LSE, arg_MAX, arg_KV_NUM_BLKS, arg_KV_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, out_ptr0,
561
+ q, K, V, desc_k, desc_v, Q_LEN, KV_LEN,
562
+ # accumulated values
563
+ acc, l_i, m_i,
564
+ # Offsets
565
+ off_z, off_h, offs_m, offs_n,
566
+ # Offsets needed for TMA loads
567
+ kv_start,
568
+ kv_offset,
569
+ MATMUL_PRECISION, RCP_LN2,
570
+ # Strides for K and V
571
+ stride_kk, stride_kn, stride_vn, stride_vk,
572
+ IS_FULL_BLOCKS, CHECK_BLOCK_BOUNDARY=True,
573
+ )
574
+
575
+
576
+
577
+ offset = get_offset_for_next_block(
578
+ start_n, kv_indices, kv_num_blocks,
579
+ SPARSE_KV_BLOCK_SIZE, SPARSE_KV_MULTIPLE, BLOCK_N, BLOCKS_ARE_CONTIGUOUS
580
+ )
581
+
582
+ offs_n = offs_n + offset
583
+ kv_offset += offset
584
+
585
+
586
+ return acc, l_i, m_i
587
+ ''', device_str='cuda')
588
+
589
+
590
+ # kernel path: /workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/tf/ctfmgr5xiespvzijrmhgbal75r2upp6hcalbhblnugpejgipxlrx.py
591
+ # Topologically Sorted Source Nodes: [lse_scaled], Original ATen: [aten.mul]
592
+ # Source node to ATen node mapping:
593
+ # lse_scaled => mul
594
+ # Graph fragment:
595
+ # %buf3 : Tensor = PlaceHolder[target=buf3]
596
+ # %mul : Tensor "f32[1, 32, 976][31232, 976, 1]cuda:7"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%getitem_1, 0.6931471805599453), kwargs = {})
597
+ # return %mul
598
+ triton_poi_fused_mul_1 = async_compile.triton('triton_poi_fused_mul_1', '''
599
+ import triton
600
+ import triton.language as tl
601
+
602
+ from torch._inductor.runtime import triton_helpers, triton_heuristics
603
+ from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
604
+ from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
605
+ triton_helpers.set_driver_to_gpu()
606
+
607
+ @triton_heuristics.pointwise(
608
+ size_hints={'x': 32768},
609
+ filename=__file__,
610
+ triton_meta={'signature': {'in_ptr0': '*fp32', 'out_ptr0': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=7, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]]}]},
611
+ inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_mul_1', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': 'B0E5936CA26D1BCD1B577D0B65F13FE7553DC941EB93358973E9F1902BC212C5', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'tiling_scores': {'x': 249856}},
612
+ min_elem_per_thread=0
613
+ )
614
+ @triton.jit
615
+ def triton_poi_fused_mul_1(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
616
+ xnumel = 31232
617
+ xoffset = tl.program_id(0) * XBLOCK
618
+ xindex = xoffset + tl.arange(0, XBLOCK)[:]
619
+ xmask = xindex < xnumel
620
+ x0 = xindex
621
+ tmp0 = tl.load(in_ptr0 + (x0), xmask)
622
+ tmp1 = 0.6931471805599453
623
+ tmp2 = tmp0 * tmp1
624
+ tl.store(out_ptr0 + (x0), tmp2, xmask)
625
+ ''', device_str='cuda')
626
+
627
+
628
+ async_compile.wait(globals())
629
+ del async_compile
630
+
631
+ class Runner:
632
+ def __init__(self, partitions):
633
+ self.partitions = partitions
634
+
635
+ def recursively_apply_fns(self, fns):
636
+ new_callables = []
637
+ for fn, c in zip(fns, self.partitions):
638
+ new_callables.append(fn(c))
639
+ self.partitions = new_callables
640
+
641
+ def call(self, args):
642
+ arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1 = args
643
+ args.clear()
644
+ assert_size_stride(arg0_1, (1, 32, 976, 128), (3997696, 128, 4096, 1))
645
+ assert_size_stride(arg1_1, (1, 8, 976, 128), (999424, 128, 1024, 1))
646
+ assert_size_stride(arg2_1, (1, 8, 976, 128), (999424, 128, 1024, 1))
647
+ assert_size_stride(arg3_1, (1, 1, 8), (8, 8, 1))
648
+ assert_size_stride(arg4_1, (1, 1, 8, 8), (64, 64, 8, 1))
649
+ assert_size_stride(arg5_1, (1, 1, 8), (8, 8, 1))
650
+ assert_size_stride(arg6_1, (1, 1, 8, 8), (64, 64, 8, 1))
651
+ assert_size_stride(arg7_1, (1, 1, 8), (8, 8, 1))
652
+ assert_size_stride(arg8_1, (1, 1, 8, 8), (64, 64, 8, 1))
653
+ assert_size_stride(arg9_1, (1, 1, 8), (8, 8, 1))
654
+ assert_size_stride(arg10_1, (1, 1, 8, 8), (64, 64, 8, 1))
655
+ with torch.cuda._DeviceGuard(7):
656
+ torch.cuda.set_device(7)
657
+ buf0 = empty_strided_cuda((1, 32, 976), (31232, 976, 1), torch.float32)
658
+ buf1 = empty_strided_cuda((1, 32, 976), (31232, 976, 1), torch.float32)
659
+ buf2 = empty_strided_cuda((1, 32, 976, 128), (3997696, 128, 4096, 1), torch.bfloat16)
660
+ # Topologically Sorted Source Nodes: [flex_attention], Original ATen: []
661
+ stream7 = get_raw_stream(7)
662
+ triton_tem_fused_0.run(arg0_1, arg1_1, arg2_1, buf0, buf1, arg3_1, arg4_1, arg5_1, arg6_1, buf2, 8, 1, 32, stream=stream7)
663
+ del arg0_1
664
+ del arg1_1
665
+ del arg2_1
666
+ del arg3_1
667
+ del arg4_1
668
+ del arg5_1
669
+ del arg6_1
670
+ buf5 = buf1; del buf1 # reuse
671
+ # Topologically Sorted Source Nodes: [lse_scaled], Original ATen: [aten.mul]
672
+ stream7 = get_raw_stream(7)
673
+ triton_poi_fused_mul_1.run(buf0, buf5, 31232, stream=stream7)
674
+ del buf0
675
+ return (buf2, buf5, )
676
+
677
+ runner = Runner(partitions=[])
678
+ call = runner.call
679
+ recursively_apply_fns = runner.recursively_apply_fns
680
+
681
+
682
+ def benchmark_compiled_module(times=10, repeat=10):
683
+ from torch._dynamo.testing import rand_strided
684
+ from torch._inductor.utils import print_performance
685
+ arg0_1 = rand_strided((1, 32, 976, 128), (3997696, 128, 4096, 1), device='cuda:7', dtype=torch.bfloat16)
686
+ arg1_1 = rand_strided((1, 8, 976, 128), (999424, 128, 1024, 1), device='cuda:7', dtype=torch.bfloat16)
687
+ arg2_1 = rand_strided((1, 8, 976, 128), (999424, 128, 1024, 1), device='cuda:7', dtype=torch.bfloat16)
688
+ arg3_1 = rand_strided((1, 1, 8), (8, 8, 1), device='cuda:7', dtype=torch.int32)
689
+ arg4_1 = rand_strided((1, 1, 8, 8), (64, 64, 8, 1), device='cuda:7', dtype=torch.int32)
690
+ arg5_1 = rand_strided((1, 1, 8), (8, 8, 1), device='cuda:7', dtype=torch.int32)
691
+ arg6_1 = rand_strided((1, 1, 8, 8), (64, 64, 8, 1), device='cuda:7', dtype=torch.int32)
692
+ arg7_1 = rand_strided((1, 1, 8), (8, 8, 1), device='cuda:7', dtype=torch.int32)
693
+ arg8_1 = rand_strided((1, 1, 8, 8), (64, 64, 8, 1), device='cuda:7', dtype=torch.int32)
694
+ arg9_1 = rand_strided((1, 1, 8), (8, 8, 1), device='cuda:7', dtype=torch.int32)
695
+ arg10_1 = rand_strided((1, 1, 8, 8), (64, 64, 8, 1), device='cuda:7', dtype=torch.int32)
696
+ fn = lambda: call([arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1])
697
+ return print_performance(fn, times=times, repeat=repeat)
698
+
699
+
700
+ if __name__ == "__main__":
701
+ from torch._inductor.wrapper_benchmark import compiled_module_main
702
+ compiled_module_main('None', benchmark_compiled_module)
progress/SpecForge/cache/compiled_kernels/3z/c3zi2pt6zmbthc6ythgt5p4ednhp6m24gpscb2pt6adf6xojetua.py ADDED
@@ -0,0 +1,799 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import triton
3
+ import triton.language as tl
4
+
5
+ from torch._inductor.runtime import triton_helpers, triton_heuristics
6
+ from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
7
+ from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
8
+
9
+ @triton_heuristics.template(
10
+
11
+ num_stages=3,
12
+ num_warps=8,
13
+ triton_meta={'signature': {'arg_Q': '*bf16', 'arg_K': '*bf16', 'arg_V': '*bf16', 'arg_LSE': '*fp32', 'arg_DELTA': '*fp32', 'arg_DO': '*bf16', 'arg_DQ': '*bf16', 'arg_DV': '*bf16', 'arg_KV_NUM_BLKS': '*i32', 'arg_KV_IDX': '*i32', 'arg_Q_NUM_BLKS': '*i32', 'arg_Q_IDX': '*i32', 'arg_FULL_KV_NUM_BLKS': '*i32', 'arg_FULL_KV_IDX': '*i32', 'arg_FULL_Q_NUM_BLKS': '*i32', 'arg_FULL_Q_IDX': '*i32', 'out_ptr0': '*bf16', 'ks0': 'i32', 'ks1': 'i32'}, 'device': DeviceProperties(type='cuda', index=5, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]], (7,): [['tt.divisibility', 16]], (8,): [['tt.divisibility', 16]], (9,): [['tt.divisibility', 16]], (10,): [['tt.divisibility', 16]], (11,): [['tt.divisibility', 16]], (12,): [['tt.divisibility', 16]], (13,): [['tt.divisibility', 16]], (14,): [['tt.divisibility', 16]], (15,): [['tt.divisibility', 16]], (16,): [['tt.divisibility', 16]]}]},
14
+ inductor_meta={'kernel_name': 'triton_tem_fused_mul_1', 'backend_hash': 'B0E5936CA26D1BCD1B577D0B65F13FE7553DC941EB93358973E9F1902BC212C5', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'grid_type': 'FixedGrid', 'fixed_grid': ['_grid_0', '_grid_1', '_grid_2'], 'extra_launcher_args': ['_grid_0', '_grid_1', '_grid_2'], 'config_args': {'PRESCALE_QK': False, 'ROWS_GUARANTEED_SAFE': False, 'BLOCKS_ARE_CONTIGUOUS': False, 'WRITE_DQ': True, 'OUTPUT_LOGSUMEXP': True, 'OUTPUT_MAX': False, 'FLOAT32_PRECISION': "'tf32'", 'IS_DIVISIBLE': False, 'SM_SCALE': 0.08838834764831845, 'GQA_SHARED_HEADS': 4, 'HAS_FULL_BLOCKS': True, 'QK_HEAD_DIM': 128, 'QK_HEAD_DIM_ROUNDED': 128, 'V_HEAD_DIM': 128, 'V_HEAD_DIM_ROUNDED': 128, 'SAFE_HEAD_DIM': True, 'BLOCK_M1': 64, 'BLOCK_N1': 128, 'BLOCK_M2': 128, 'BLOCK_N2': 64, 'SPARSE_Q_BLOCK_SIZE': 128, 'SPARSE_KV_BLOCK_SIZE': 128}},
15
+
16
+ )
17
+ @triton.jit
18
+ def triton_tem_fused_mul_1(arg_Q, arg_K, arg_V, arg_LSE, arg_DELTA, arg_DO, arg_DQ, arg_DV, arg_KV_NUM_BLKS, arg_KV_IDX, arg_Q_NUM_BLKS, arg_Q_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, arg_FULL_Q_NUM_BLKS, arg_FULL_Q_IDX, out_ptr0, ks0, ks1):
19
+ PRESCALE_QK : tl.constexpr = False
20
+ ROWS_GUARANTEED_SAFE : tl.constexpr = False
21
+ BLOCKS_ARE_CONTIGUOUS : tl.constexpr = False
22
+ WRITE_DQ : tl.constexpr = True
23
+ OUTPUT_LOGSUMEXP : tl.constexpr = True
24
+ OUTPUT_MAX : tl.constexpr = False
25
+ FLOAT32_PRECISION : tl.constexpr = 'tf32'
26
+ IS_DIVISIBLE : tl.constexpr = False
27
+ SM_SCALE : tl.constexpr = 0.08838834764831845
28
+ GQA_SHARED_HEADS : tl.constexpr = 4
29
+ HAS_FULL_BLOCKS : tl.constexpr = True
30
+ QK_HEAD_DIM : tl.constexpr = 128
31
+ QK_HEAD_DIM_ROUNDED : tl.constexpr = 128
32
+ V_HEAD_DIM : tl.constexpr = 128
33
+ V_HEAD_DIM_ROUNDED : tl.constexpr = 128
34
+ SAFE_HEAD_DIM : tl.constexpr = True
35
+ BLOCK_M1 : tl.constexpr = 64
36
+ BLOCK_N1 : tl.constexpr = 128
37
+ BLOCK_M2 : tl.constexpr = 128
38
+ BLOCK_N2 : tl.constexpr = 64
39
+ SPARSE_Q_BLOCK_SIZE : tl.constexpr = 128
40
+ SPARSE_KV_BLOCK_SIZE : tl.constexpr = 128
41
+ INDEX_DTYPE : tl.constexpr = tl.int32
42
+ Q = arg_Q
43
+ K = arg_K
44
+ V = arg_V
45
+ LSE = arg_LSE
46
+ DELTA = arg_DELTA
47
+ DO = arg_DO
48
+ DQ = arg_DQ
49
+ DV = arg_DV
50
+ KV_NUM_BLKS = arg_KV_NUM_BLKS
51
+ KV_IDX = arg_KV_IDX
52
+ Q_NUM_BLKS = arg_Q_NUM_BLKS
53
+ Q_IDX = arg_Q_IDX
54
+ FULL_KV_NUM_BLKS = arg_FULL_KV_NUM_BLKS
55
+ FULL_KV_IDX = arg_FULL_KV_IDX
56
+ FULL_Q_NUM_BLKS = arg_FULL_Q_NUM_BLKS
57
+ FULL_Q_IDX = arg_FULL_Q_IDX
58
+
59
+ # Sub notation for this kernel:
60
+ #
61
+ # Q: Query, K: Key, V: Value
62
+ # LSE: logsumexp (logsumexp is always stored in fp32 regardless of the input dtype)
63
+ # DELTA: Precomputed sum(OUT*DO, axis=-1)
64
+ # DO: Derivative of Output, DQ: Derivative of Query, DV: Derivative of Value
65
+ # DK: Derivative of Key, is the written to via the store_output call due to some limitations with
66
+ # inductor codegen
67
+ # M: Number of queries, N: Number of keys/values
68
+ # QK_HEAD_DIM: The dimension of the query and key embeddings
69
+ # V_HEAD_DIM: The dimension of the value embeddings
70
+ # z: Batch size, h: Number of heads, m: Number of queries or keys/values, d: Head dim
71
+ # GQA_SHARED_HEADS: number of query heads sharing one kv head in GQA setups.
72
+ # (Modifiable) Performance tuning options
73
+ # BLOCK_M1: when calculating DK & DV, iterate over BLOCK_M1 across the seqlen dim of Q in each thread block.
74
+ # BLOCK_N1: when calculating DK & DV, the thread block size across the seqlen dim of K/V.
75
+ # BLOCK_M2: when calculating DQ, the thread block size across the seqlen dim of Q.
76
+ # BLOCK_N2: when calculating DQ, iterate over BLOCK_N2 across the seqlen dim of K/V in each thread block.
77
+ #
78
+ # The following FULL_* and PARTIAL_* is defined in the block sparse mask grid, rather than the thread block grid.
79
+ # KV_NUM_BLKS: The number of KV blocks (that may or may not require masking) for each query.
80
+ # KV_IDX: The indices of KV blocks (that may or may not require masking) for each query.
81
+ # Q_NUM_BLKS: The number of Q blocks (that may or may not require masking) for each query.
82
+ # Q_IDX: The indices of Q blocks (that may or may not require masking) for each query.
83
+ # FULL_KV_NUM_BLKS: The number of fully unmasked KV blocks (so we don't need masking) for each query.
84
+ # FULL_KV_IDX: The indices of fully unmasked KV blocks (so we don't need masking) for each query.
85
+ # FULL_Q_NUM_BLKS: The number of fully unmasked Q blocks (so we don't need masking) for each query.
86
+ # FULL_Q_IDX: The indices of fully unmasked Q blocks (so we don't need masking) for each query.
87
+
88
+ # The below are kernel options that can be applied for certain score_mods,
89
+ # or involve a numerics vs. perf tradeoff
90
+ # PRESCALE_QK: Whether to pre-scale QK by 1/sqrt(d) and change of base. Has
91
+ # about 20% more numerical error, but slightly faster.
92
+
93
+ # Define strides of inputs
94
+ stride_qz, stride_qh, stride_qm, stride_qd = 4096*ks0, 128, 4096, 1
95
+ stride_kz, stride_kh, stride_kn, stride_kd = 1024*ks1, 128, 1024, 1
96
+ stride_vz, stride_vh, stride_vn, stride_vd = 1024*ks1, 128, 1024, 1
97
+ stride_doz, stride_doh, stride_dom, stride_dod = 4096*((1) * ((1) >= (ks0)) + (ks0) * ((ks0) > (1))), 128*((1) * ((1) >= (ks0)) + (ks0) * ((ks0) > (1))), 128, 1
98
+
99
+ stride_dqz, stride_dqh, stride_dqm, stride_dqd = 4096*ks0, 128, 4096, 1
100
+ stride_dvz, stride_dvh, stride_dvm, stride_dvd = 1024*ks1, 128, 1024, 1
101
+
102
+ ZQ = 1
103
+ HQ = 32
104
+ HKV = 8
105
+ Q_LEN = ks0
106
+ ZKV = 1
107
+ KV_LEN = ks1
108
+
109
+ MATMUL_PRECISION = Q.dtype.element_ty
110
+
111
+ pid = tl.program_id(0).to(INDEX_DTYPE)
112
+ NUM_KV_BLOCKS = tl.cdiv(KV_LEN, BLOCK_N1)
113
+ NUM_Q_BLOCKS = tl.cdiv(Q_LEN, BLOCK_M2)
114
+
115
+ off_zq = tl.program_id(1).to(INDEX_DTYPE) # q batch idx
116
+ off_hkv = tl.program_id(2).to(INDEX_DTYPE) # kv head idx
117
+ off_zkv = off_zq % ZKV # kv batch idx
118
+
119
+ SPARSE_Z = 1
120
+ SPARSE_HQ = 1
121
+
122
+ sparse_idx_z = off_zq % SPARSE_Z
123
+
124
+ k_adj = (stride_kh * off_hkv + stride_kz * off_zkv).to(tl.int64)
125
+ v_adj = (stride_vh * off_hkv + stride_vz * off_zkv).to(tl.int64)
126
+ # first compute broadcasted dv of shape [Bq, Hkv, KV_LEN, V_HEAD_DIM]
127
+ # then reduce to dv of shape [Bkv, Hkv, KV_LEN, V_HEAD_DIM]
128
+ dv_adj = (stride_dvh * off_hkv + stride_dvz * off_zq).to(tl.int64)
129
+
130
+ # offset K, V, DV pointers for batch/kv-head
131
+ K += k_adj
132
+ V += v_adj
133
+ DV += dv_adj
134
+
135
+ RCP_LN2 = 1.44269504
136
+ offs_k = tl.arange(0, QK_HEAD_DIM_ROUNDED)
137
+ offs_v = tl.arange(0, V_HEAD_DIM_ROUNDED)
138
+
139
+ if pid >= NUM_KV_BLOCKS:
140
+ off_pid = pid - NUM_KV_BLOCKS
141
+ # THIS BLOCK DOES DQ
142
+ SPARSE_Q_MULTIPLE = (SPARSE_Q_BLOCK_SIZE // BLOCK_M2)
143
+ SPARSE_KV_MULTIPLE = (SPARSE_KV_BLOCK_SIZE // BLOCK_N2)
144
+ off_hq2 = off_pid // NUM_Q_BLOCKS + off_hkv * GQA_SHARED_HEADS
145
+ start_m2_block = off_pid % NUM_Q_BLOCKS
146
+ off_pid_mask = start_m2_block // SPARSE_Q_MULTIPLE
147
+ stride_kv_num_blks_h = 1
148
+ stride_kv_idx_h = 1
149
+ stride_kv_idx_m = 1
150
+
151
+ sparse_idx_hq2 = off_hq2 % SPARSE_HQ
152
+ sparse_hz_offset = sparse_idx_z * SPARSE_HQ + sparse_idx_hq2
153
+
154
+ sparse_kv_num_blks_offset = sparse_hz_offset * stride_kv_num_blks_h + off_pid_mask
155
+ sparse_kv_idx_offset = sparse_hz_offset * stride_kv_idx_h + off_pid_mask * stride_kv_idx_m # noqa: B950
156
+
157
+ # Offset Q, DQ, DO, DELTA & LSE. These inputs are offsetted by query heads.
158
+ q_adj2 = (stride_qh * off_hq2 + stride_qz * off_zq).to(tl.int64)
159
+ do_adj2 = (stride_doh * off_hq2 + stride_doz * off_zq).to(tl.int64)
160
+ dq_adj2 = (stride_dqh * off_hq2 + stride_dqz * off_zq).to(tl.int64)
161
+ off_chz2 = ((off_zq * HQ + off_hq2) * Q_LEN).to(tl.int64)
162
+
163
+ Q2 = Q + q_adj2
164
+ DO2 = DO + do_adj2
165
+ # TODO: This does not work if DQ is not the same layout as Q (for example,
166
+ # if Q is broadcasted)
167
+ DQ2 = DQ + dq_adj2
168
+ LSE2 = LSE + off_chz2
169
+ DELTA2 = DELTA + off_chz2
170
+
171
+ # dq = tl.zeros([BLOCK_M2, QK_HEAD_DIM], dtype=tl.float32)
172
+ dq = tl.zeros([BLOCK_M2, QK_HEAD_DIM_ROUNDED], dtype=tl.float32)
173
+
174
+ start_m2 = start_m2_block * BLOCK_M2
175
+ offs_m2 = start_m2 + tl.arange(0, BLOCK_M2)
176
+
177
+ # load Q and do: they stay in SRAM throughout the inner loop.
178
+ q = load_checked_2d(Q2, offs_m2, offs_k, stride_qm, stride_qd, IS_DIVISIBLE, SAFE_HEAD_DIM, Q_LEN, QK_HEAD_DIM)
179
+ do = load_checked_2d(DO2, offs_m2, offs_v, stride_dom, stride_dod, IS_DIVISIBLE, SAFE_HEAD_DIM, Q_LEN, V_HEAD_DIM)
180
+
181
+ if PRESCALE_QK:
182
+ q = (q * SM_SCALE * RCP_LN2).to(MATMUL_PRECISION)
183
+
184
+ if IS_DIVISIBLE:
185
+ Di = tl.load(DELTA2 + offs_m2)
186
+ lse = tl.load(LSE2 + offs_m2)
187
+ else:
188
+ Di = tl.load(DELTA2 + offs_m2, mask=offs_m2 < Q_LEN)
189
+ lse = tl.load(LSE2 + offs_m2, mask=offs_m2 < Q_LEN)
190
+ lse = tl.where(lse == -float("inf"), 0.0, lse)
191
+ lse = lse[:, None]
192
+
193
+ # ~~~~~~~~~~~ fully unmasked blocks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
194
+ # KV_IDX and KV_NUM_BLKS are always contiguous.
195
+ kv_indices = KV_IDX + sparse_kv_idx_offset
196
+ kv_start = tl.load(kv_indices) * SPARSE_KV_BLOCK_SIZE # first kv block we're loading
197
+ sparse_kv_num_blocks = tl.load(KV_NUM_BLKS + sparse_kv_num_blks_offset)
198
+
199
+ offs_n2 = kv_start + tl.arange(0, BLOCK_N2)
200
+ dq = bwd_dq_inner(
201
+ arg_Q, arg_K, arg_V, arg_LSE, arg_DELTA, arg_DO, arg_DQ, arg_DV, arg_KV_NUM_BLKS, arg_KV_IDX, arg_Q_NUM_BLKS, arg_Q_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, arg_FULL_Q_NUM_BLKS, arg_FULL_Q_IDX, out_ptr0, ks0, ks1,
202
+ K, V,
203
+ dq, q, do, Di, lse,
204
+ off_zq, off_hq2, offs_m2, offs_n2,
205
+ stride_kn, stride_kd, stride_vn, stride_vd,
206
+ kv_indices, sparse_kv_num_blocks,
207
+ MATMUL_PRECISION,
208
+ IS_FULL_BLOCKS=False,
209
+ )
210
+
211
+ if HAS_FULL_BLOCKS:
212
+ # ~~~~~~~~~~~ partial unmasked blocks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
213
+ # FULL_KV_IDX and FULL_KV_NUM_BLKS are always contiguous.
214
+ kv_indices = FULL_KV_IDX + sparse_kv_idx_offset
215
+ kv_start = tl.load(kv_indices) * SPARSE_KV_BLOCK_SIZE # first kv block we're loading
216
+ sparse_kv_num_blocks = tl.load(FULL_KV_NUM_BLKS + sparse_kv_num_blks_offset)
217
+
218
+ offs_n2 = kv_start + tl.arange(0, BLOCK_N2)
219
+ dq = bwd_dq_inner(
220
+ arg_Q, arg_K, arg_V, arg_LSE, arg_DELTA, arg_DO, arg_DQ, arg_DV, arg_KV_NUM_BLKS, arg_KV_IDX, arg_Q_NUM_BLKS, arg_Q_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, arg_FULL_Q_NUM_BLKS, arg_FULL_Q_IDX, out_ptr0, ks0, ks1,
221
+ K, V,
222
+ dq, q, do, Di, lse,
223
+ off_zq, off_hq2, offs_m2, offs_n2,
224
+ stride_kn, stride_kd, stride_vn, stride_vd,
225
+ kv_indices, sparse_kv_num_blocks,
226
+ MATMUL_PRECISION,
227
+ IS_FULL_BLOCKS=True,
228
+ )
229
+
230
+ # Write back dQ.
231
+ dq_ptrs = DQ2 + offs_m2[:, None] * stride_dqm + offs_k[None, :] * stride_dqd
232
+ dq *= SM_SCALE
233
+ if IS_DIVISIBLE and SAFE_HEAD_DIM:
234
+ tl.store(dq_ptrs, dq)
235
+ else:
236
+ tl.store(dq_ptrs, dq, mask=(offs_m2[:, None] < Q_LEN) & (offs_k[None, :] < QK_HEAD_DIM))
237
+ else:
238
+ # THIS BLOCK DOES DK & DV
239
+ SPARSE_Q_MULTIPLE = (SPARSE_Q_BLOCK_SIZE // BLOCK_M1)
240
+ SPARSE_KV_MULTIPLE = (SPARSE_KV_BLOCK_SIZE // BLOCK_N1)
241
+
242
+ pid_mask = pid // SPARSE_KV_MULTIPLE
243
+
244
+ stride_q_num_blks_h = 1
245
+ stride_q_idx_h = 1
246
+ stride_q_idx_n = 1
247
+
248
+
249
+ dv = tl.zeros([BLOCK_N1, V_HEAD_DIM_ROUNDED], dtype=tl.float32)
250
+ dk = tl.zeros([BLOCK_N1, QK_HEAD_DIM_ROUNDED], dtype=tl.float32)
251
+
252
+ start_n1 = pid * BLOCK_N1
253
+ offs_n1 = start_n1 + tl.arange(0, BLOCK_N1)
254
+
255
+ # load K and V: they stay in SRAM throughout the inner loop.
256
+ k = load_checked_2d(K, offs_n1, offs_k, stride_kn, stride_kd, IS_DIVISIBLE, SAFE_HEAD_DIM, KV_LEN, QK_HEAD_DIM)
257
+ v = load_checked_2d(V, offs_n1, offs_v, stride_vn, stride_vd, IS_DIVISIBLE, SAFE_HEAD_DIM, KV_LEN, V_HEAD_DIM)
258
+
259
+ if PRESCALE_QK:
260
+ k = (k * SM_SCALE * RCP_LN2).to(MATMUL_PRECISION)
261
+
262
+ for off_g in range(0, GQA_SHARED_HEADS):
263
+ off_hq1 = off_hkv * GQA_SHARED_HEADS + off_g
264
+
265
+ # Offset Q, DQ, DO, DELTA & LSE. These inputs are offsetted by query heads.
266
+ q_adj1 = (stride_qh * off_hq1 + stride_qz * off_zq).to(tl.int64)
267
+ do_adj1 = (stride_doh * off_hq1 + stride_doz * off_zq).to(tl.int64)
268
+ dq_adj1 = (stride_dqh * off_hq1 + stride_dqz * off_zq).to(tl.int64)
269
+ off_chz1 = ((off_zq * HQ + off_hq1) * Q_LEN).to(tl.int64)
270
+
271
+ Q1 = Q + q_adj1
272
+ DO1 = DO + do_adj1
273
+ # TODO: This does not work if DQ is not the same layout as Q (for example,
274
+ # if Q is broadcasted)
275
+ LSE1 = LSE + off_chz1
276
+ DELTA1 = DELTA + off_chz1
277
+
278
+ sparse_idx_hq1 = off_hq1 % SPARSE_HQ
279
+ sparse_hz_offset = sparse_idx_z * SPARSE_HQ + sparse_idx_hq1
280
+
281
+ sparse_q_num_blks_offset = sparse_hz_offset * stride_q_num_blks_h + pid_mask
282
+ sparse_q_idx_offset = sparse_hz_offset * stride_q_idx_h + pid_mask * stride_q_idx_n # noqa: B950
283
+
284
+ # ~~~~~~~~~~~~~~~ fully unmasked blocks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
285
+ # Q_IDX and Q_NUM_BLKS are always contiguous.
286
+ q_indices = Q_IDX + sparse_q_idx_offset
287
+ q_start = tl.load(q_indices) * SPARSE_Q_BLOCK_SIZE # first q block we're loading
288
+ sparse_q_num_blocks = tl.load(Q_NUM_BLKS + sparse_q_num_blks_offset)
289
+
290
+ offs_m1 = q_start + tl.arange(0, BLOCK_M1)
291
+ dk, dv = bwd_dkdv_inner(
292
+ arg_Q, arg_K, arg_V, arg_LSE, arg_DELTA, arg_DO, arg_DQ, arg_DV, arg_KV_NUM_BLKS, arg_KV_IDX, arg_Q_NUM_BLKS, arg_Q_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, arg_FULL_Q_NUM_BLKS, arg_FULL_Q_IDX, out_ptr0, ks0, ks1,
293
+ Q1, DO1, DELTA1, LSE1,
294
+ dk, dv, k, v,
295
+ off_zq, off_hq1, offs_n1, offs_m1,
296
+ stride_qm, stride_qd, stride_dom, stride_dod,
297
+ q_indices, sparse_q_num_blocks,
298
+ MATMUL_PRECISION,
299
+ IS_FULL_BLOCKS=False,
300
+ )
301
+
302
+
303
+ if HAS_FULL_BLOCKS:
304
+ # ~~~~~~~~~~~~~~~ fully unmasked blocks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
305
+ # FULL_Q_IDX and FULL_Q_NUM_BLKS are always contiguous.
306
+ q_indices = FULL_Q_IDX + sparse_q_idx_offset
307
+ q_start = tl.load(q_indices) * SPARSE_Q_BLOCK_SIZE # first q block we're loading
308
+ sparse_q_num_blocks = tl.load(FULL_Q_NUM_BLKS + sparse_q_num_blks_offset)
309
+
310
+ offs_m1 = q_start + tl.arange(0, BLOCK_M1)
311
+ dk, dv = bwd_dkdv_inner(
312
+ arg_Q, arg_K, arg_V, arg_LSE, arg_DELTA, arg_DO, arg_DQ, arg_DV, arg_KV_NUM_BLKS, arg_KV_IDX, arg_Q_NUM_BLKS, arg_Q_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, arg_FULL_Q_NUM_BLKS, arg_FULL_Q_IDX, out_ptr0, ks0, ks1,
313
+ Q1, DO1, DELTA1, LSE1,
314
+ dk, dv, k, v,
315
+ off_zq, off_hq1, offs_n1, offs_m1,
316
+ stride_qm, stride_qd, stride_dom, stride_dod,
317
+ q_indices, sparse_q_num_blocks,
318
+ MATMUL_PRECISION,
319
+ IS_FULL_BLOCKS=True,
320
+ )
321
+
322
+ # Write back dV and dK.
323
+ dv_ptrs = DV + offs_n1[:, None] * stride_dvm + offs_v[None, :] * stride_dvd
324
+
325
+ index_n = offs_n1[:, None]
326
+ index_k = offs_k[None, :]
327
+ index_v = offs_v[None, :]
328
+
329
+ if IS_DIVISIBLE and SAFE_HEAD_DIM:
330
+ tl.store(dv_ptrs, dv)
331
+ else:
332
+ tl.store(dv_ptrs, dv, mask=(index_n < KV_LEN) & (index_v < V_HEAD_DIM))
333
+
334
+ dk *= SM_SCALE
335
+
336
+ if SAFE_HEAD_DIM:
337
+ mask = index_n < KV_LEN
338
+ else:
339
+ mask = (index_n < KV_LEN) & (index_k < QK_HEAD_DIM)
340
+
341
+ # first compute broadcasted dk of shape [Bq, Hkv, KV_LEN, V_HEAD_DIM]
342
+ # then reduce to dk of shape [Bkv, Hkv, KV_LEN, V_HEAD_DIM]
343
+ tl.static_assert(dk.shape == [BLOCK_N1, QK_HEAD_DIM_ROUNDED])
344
+ xindex = index_k + 128*index_n + 128*off_hkv*ks1 + 1024*off_zq*ks1
345
+ tl.store(out_ptr0 + (tl.broadcast_to(index_k + 128*off_hkv + 1024*index_n, dk.shape)), dk, mask)
346
+
347
+ @triton.jit
348
+ def bwd_dq_inner(
349
+ arg_Q, arg_K, arg_V, arg_LSE, arg_DELTA, arg_DO, arg_DQ, arg_DV, arg_KV_NUM_BLKS, arg_KV_IDX, arg_Q_NUM_BLKS, arg_Q_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, arg_FULL_Q_NUM_BLKS, arg_FULL_Q_IDX, out_ptr0, ks0, ks1,
350
+ K, V, # pointers
351
+ dq, q, do, Di, lse,
352
+ off_z, off_hq, offs_m2, offs_n2,
353
+ stride_kn, stride_kd, stride_vn, stride_vd,
354
+ kv_indices, sparse_kv_num_blocks,
355
+ MATMUL_PRECISION,
356
+ IS_FULL_BLOCKS,
357
+ ):
358
+ PRESCALE_QK : tl.constexpr = False
359
+ ROWS_GUARANTEED_SAFE : tl.constexpr = False
360
+ BLOCKS_ARE_CONTIGUOUS : tl.constexpr = False
361
+ WRITE_DQ : tl.constexpr = True
362
+ OUTPUT_LOGSUMEXP : tl.constexpr = True
363
+ OUTPUT_MAX : tl.constexpr = False
364
+ FLOAT32_PRECISION : tl.constexpr = 'tf32'
365
+ IS_DIVISIBLE : tl.constexpr = False
366
+ SM_SCALE : tl.constexpr = 0.08838834764831845
367
+ GQA_SHARED_HEADS : tl.constexpr = 4
368
+ HAS_FULL_BLOCKS : tl.constexpr = True
369
+ QK_HEAD_DIM : tl.constexpr = 128
370
+ QK_HEAD_DIM_ROUNDED : tl.constexpr = 128
371
+ V_HEAD_DIM : tl.constexpr = 128
372
+ V_HEAD_DIM_ROUNDED : tl.constexpr = 128
373
+ SAFE_HEAD_DIM : tl.constexpr = True
374
+ BLOCK_M1 : tl.constexpr = 64
375
+ BLOCK_N1 : tl.constexpr = 128
376
+ BLOCK_M2 : tl.constexpr = 128
377
+ BLOCK_N2 : tl.constexpr = 64
378
+ SPARSE_Q_BLOCK_SIZE : tl.constexpr = 128
379
+ SPARSE_KV_BLOCK_SIZE : tl.constexpr = 128
380
+ INDEX_DTYPE : tl.constexpr = tl.int32
381
+
382
+ SPARSE_KV_MULTIPLE: tl.constexpr = (SPARSE_KV_BLOCK_SIZE // BLOCK_N2)
383
+ RCP_LN2: tl.constexpr = 1.44269504
384
+ Q_LEN = ks0
385
+ KV_LEN = ks1
386
+
387
+ offs_k = tl.arange(0, QK_HEAD_DIM_ROUNDED)
388
+ offs_v = tl.arange(0, V_HEAD_DIM_ROUNDED)
389
+
390
+ kT_ptrs = K + offs_n2[None, :] * stride_kn + offs_k[:, None] * stride_kd
391
+ vT_ptrs = V + offs_n2[None, :] * stride_vn + offs_v[:, None] * stride_vd
392
+ # BLOCK_M2 must be a multiple of BLOCK_N2, otherwise the code wouldn't work.
393
+ tl.static_assert(BLOCK_M2 % BLOCK_N2 == 0)
394
+
395
+ hi = tl.minimum(sparse_kv_num_blocks * SPARSE_KV_MULTIPLE, tl.maximum(tl.cdiv(KV_LEN, BLOCK_N2), 1))
396
+
397
+ for start_n in range(0, hi):
398
+ dq = bwd_dq_block_mn(
399
+ arg_Q, arg_K, arg_V, arg_LSE, arg_DELTA, arg_DO, arg_DQ, arg_DV, arg_KV_NUM_BLKS, arg_KV_IDX, arg_Q_NUM_BLKS, arg_Q_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, arg_FULL_Q_NUM_BLKS, arg_FULL_Q_IDX, out_ptr0, ks0, ks1,
400
+ dq, q, kT_ptrs, vT_ptrs, do, Di, lse, Q_LEN, KV_LEN,
401
+ off_z, off_hq, offs_m2, offs_n2, offs_k, offs_v,
402
+ stride_kn, stride_kd, stride_vn, stride_vd,
403
+ kv_indices, sparse_kv_num_blocks,
404
+ MATMUL_PRECISION, RCP_LN2,
405
+ IS_FULL_BLOCKS,
406
+ )
407
+
408
+ # Increment pointers.
409
+ offset = get_offset_for_next_block(
410
+ start_n, kv_indices, sparse_kv_num_blocks,
411
+ SPARSE_KV_BLOCK_SIZE, SPARSE_KV_MULTIPLE, BLOCK_N2, BLOCKS_ARE_CONTIGUOUS
412
+ )
413
+
414
+ kT_ptrs += offset * stride_kn
415
+ vT_ptrs += offset * stride_vn
416
+
417
+ offs_n2 += offset
418
+
419
+ return dq
420
+
421
+
422
+ @triton.jit
423
+ def bwd_dq_block_mn(
424
+ arg_Q, arg_K, arg_V, arg_LSE, arg_DELTA, arg_DO, arg_DQ, arg_DV, arg_KV_NUM_BLKS, arg_KV_IDX, arg_Q_NUM_BLKS, arg_Q_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, arg_FULL_Q_NUM_BLKS, arg_FULL_Q_IDX, out_ptr0, ks0, ks1,
425
+ dq, q, kT_ptrs, vT_ptrs, do, Di, lse, Q_LEN, KV_LEN,
426
+ off_z, off_hq, offs_m2, offs_n2, offs_k, offs_v,
427
+ stride_kn, stride_kd, stride_vn, stride_vd,
428
+ kv_indices, sparse_kv_num_blocks,
429
+ MATMUL_PRECISION, RCP_LN2,
430
+ IS_FULL_BLOCKS,
431
+ ):
432
+ PRESCALE_QK : tl.constexpr = False
433
+ ROWS_GUARANTEED_SAFE : tl.constexpr = False
434
+ BLOCKS_ARE_CONTIGUOUS : tl.constexpr = False
435
+ WRITE_DQ : tl.constexpr = True
436
+ OUTPUT_LOGSUMEXP : tl.constexpr = True
437
+ OUTPUT_MAX : tl.constexpr = False
438
+ FLOAT32_PRECISION : tl.constexpr = 'tf32'
439
+ IS_DIVISIBLE : tl.constexpr = False
440
+ SM_SCALE : tl.constexpr = 0.08838834764831845
441
+ GQA_SHARED_HEADS : tl.constexpr = 4
442
+ HAS_FULL_BLOCKS : tl.constexpr = True
443
+ QK_HEAD_DIM : tl.constexpr = 128
444
+ QK_HEAD_DIM_ROUNDED : tl.constexpr = 128
445
+ V_HEAD_DIM : tl.constexpr = 128
446
+ V_HEAD_DIM_ROUNDED : tl.constexpr = 128
447
+ SAFE_HEAD_DIM : tl.constexpr = True
448
+ BLOCK_M1 : tl.constexpr = 64
449
+ BLOCK_N1 : tl.constexpr = 128
450
+ BLOCK_M2 : tl.constexpr = 128
451
+ BLOCK_N2 : tl.constexpr = 64
452
+ SPARSE_Q_BLOCK_SIZE : tl.constexpr = 128
453
+ SPARSE_KV_BLOCK_SIZE : tl.constexpr = 128
454
+ INDEX_DTYPE : tl.constexpr = tl.int32
455
+
456
+
457
+ # NB reversed order to since K is transposed
458
+ kT = load_checked_2d(kT_ptrs, offs_k, offs_n2, None, None, SAFE_HEAD_DIM, IS_DIVISIBLE, QK_HEAD_DIM, KV_LEN)
459
+ qk = tl.dot(q, kT, input_precision=FLOAT32_PRECISION)
460
+ if not PRESCALE_QK:
461
+ qk *= SM_SCALE
462
+ # ~~~~~~~~~~~~~~~~~~~ Apply score modification ~~~~~~~~~~~~~~~~~~~
463
+ pre_mod_scores = qk
464
+ n = get_bounded_indices(offs_n2[None, :], KV_LEN if not IS_DIVISIBLE else None)
465
+ # The boundary check is done for the outer loop, but here it's possible since we're iterating across N dim
466
+ # that the M reads out of bounds for the PIDS spanning the Q_LEN boundary
467
+ m = get_bounded_indices(offs_m2[:, None], Q_LEN if not IS_DIVISIBLE else None)
468
+
469
+ tmp0 = (qk)
470
+ post_mod_scores = tmp0
471
+
472
+
473
+
474
+
475
+ if not IS_DIVISIBLE:
476
+ post_mod_scores = tl.where(offs_n2[None, :] < KV_LEN, post_mod_scores, float("-inf"))
477
+
478
+ if not IS_FULL_BLOCKS:
479
+ tmp1 = (m)
480
+ tmp2 = tl.full([1], 0, tl.int32)
481
+ tmp3 = tmp1 < tmp2
482
+ tmp4 = (n)
483
+ tmp5 = tmp4 <= tmp1
484
+ tmp6 = tmp3 & tmp5
485
+ tmp7 = tmp1 >= tmp2
486
+ tmp8 = tmp4 < tmp2
487
+ tmp9 = tmp7 & tmp8
488
+ tmp10 = tmp8 == 0
489
+ tmp11 = tmp7 & tmp10
490
+ tmp12 = tmp1 - tmp2
491
+ tmp13 = tl.full([1], 16, tl.int32)
492
+ tmp14 = tl.where((tmp12 < 0) != (tmp13 < 0), tl.where(tmp12 % tmp13 != 0, tmp12 // tmp13 - 1, tmp12 // tmp13), tmp12 // tmp13)
493
+ tmp15 = tmp4 - tmp2
494
+ tmp16 = tl.where((tmp15 < 0) != (tmp13 < 0), tl.where(tmp15 % tmp13 != 0, tmp15 // tmp13 - 1, tmp15 // tmp13), tmp15 // tmp13)
495
+ tmp17 = tmp14 == tmp16
496
+ tmp18 = tmp11 & tmp17
497
+ tmp19 = tmp9 | tmp18
498
+ tmp20 = tmp6 | tmp19
499
+ mask_mod_output = tmp20
500
+
501
+
502
+ # apply mask for partial masked block
503
+ post_mod_scores = tl.where(mask_mod_output, post_mod_scores, float("-inf"))
504
+ # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
505
+ if not PRESCALE_QK:
506
+ post_mod_scores *= RCP_LN2
507
+ p = tl.math.exp2(post_mod_scores - lse)
508
+ # Compute dP and dS.
509
+ # NB reversed order to since V is transposed
510
+ vT = load_checked_2d(vT_ptrs, offs_v, offs_n2, None, None, SAFE_HEAD_DIM, IS_DIVISIBLE, V_HEAD_DIM, KV_LEN)
511
+
512
+ dp = tl.dot(do, vT, input_precision=FLOAT32_PRECISION)
513
+ ds = p * (dp - Di[:, None])
514
+ # ~~~~~~~~~~~~~~~~~~~ Apply joint modification ~~~~~~~~~~~~~~~~~~~
515
+ tmp21 = (ds)
516
+ grad_scores = tmp21
517
+
518
+
519
+ if not IS_DIVISIBLE:
520
+ grad_scores = tl.where(offs_n2[None, :] < KV_LEN, grad_scores, 0.0)
521
+
522
+ # ~~~~~~~~~~~~~~~~~~~ Apply other buffer grad writes ~~~~~~~~~~~~~
523
+ if WRITE_DQ:
524
+ scatter_mask = (offs_m2[:, None] < Q_LEN ) & (offs_n2[None, :] < KV_LEN)
525
+
526
+ # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
527
+ ds = grad_scores
528
+
529
+ if not IS_FULL_BLOCKS:
530
+ # (grads) apply mask for partially unmasked block
531
+ ds = tl.where(mask_mod_output, ds, 0.0)
532
+ # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
533
+ ds = ds.to(MATMUL_PRECISION)
534
+ # Compute dQ.
535
+ dq += tl.dot(ds, tl.trans(kT), input_precision=FLOAT32_PRECISION)
536
+
537
+ return dq
538
+
539
+
540
+ @triton.jit
541
+ def bwd_dkdv_inner(
542
+ arg_Q, arg_K, arg_V, arg_LSE, arg_DELTA, arg_DO, arg_DQ, arg_DV, arg_KV_NUM_BLKS, arg_KV_IDX, arg_Q_NUM_BLKS, arg_Q_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, arg_FULL_Q_NUM_BLKS, arg_FULL_Q_IDX, out_ptr0, ks0, ks1,
543
+ Q, DO, DELTA, LSE, # pointers
544
+ dk, dv, k, v,
545
+ off_z, off_hq, offs_n1, offs_m1,
546
+ stride_qm, stride_qd, stride_dom, stride_dod,
547
+ q_indices, sparse_q_num_blocks,
548
+ MATMUL_PRECISION,
549
+ IS_FULL_BLOCKS,
550
+ ):
551
+ PRESCALE_QK : tl.constexpr = False
552
+ ROWS_GUARANTEED_SAFE : tl.constexpr = False
553
+ BLOCKS_ARE_CONTIGUOUS : tl.constexpr = False
554
+ WRITE_DQ : tl.constexpr = True
555
+ OUTPUT_LOGSUMEXP : tl.constexpr = True
556
+ OUTPUT_MAX : tl.constexpr = False
557
+ FLOAT32_PRECISION : tl.constexpr = 'tf32'
558
+ IS_DIVISIBLE : tl.constexpr = False
559
+ SM_SCALE : tl.constexpr = 0.08838834764831845
560
+ GQA_SHARED_HEADS : tl.constexpr = 4
561
+ HAS_FULL_BLOCKS : tl.constexpr = True
562
+ QK_HEAD_DIM : tl.constexpr = 128
563
+ QK_HEAD_DIM_ROUNDED : tl.constexpr = 128
564
+ V_HEAD_DIM : tl.constexpr = 128
565
+ V_HEAD_DIM_ROUNDED : tl.constexpr = 128
566
+ SAFE_HEAD_DIM : tl.constexpr = True
567
+ BLOCK_M1 : tl.constexpr = 64
568
+ BLOCK_N1 : tl.constexpr = 128
569
+ BLOCK_M2 : tl.constexpr = 128
570
+ BLOCK_N2 : tl.constexpr = 64
571
+ SPARSE_Q_BLOCK_SIZE : tl.constexpr = 128
572
+ SPARSE_KV_BLOCK_SIZE : tl.constexpr = 128
573
+ INDEX_DTYPE : tl.constexpr = tl.int32
574
+
575
+ SPARSE_Q_MULTIPLE: tl.constexpr = (SPARSE_Q_BLOCK_SIZE // BLOCK_M1)
576
+ RCP_LN2: tl.constexpr = 1.44269504
577
+ Q_LEN = ks0
578
+ KV_LEN = ks1
579
+
580
+ offs_k = tl.arange(0, QK_HEAD_DIM_ROUNDED)
581
+ offs_v = tl.arange(0, V_HEAD_DIM_ROUNDED)
582
+
583
+ qT_ptrs = Q + offs_m1[None, :] * stride_qm + offs_k[:, None] * stride_qd
584
+ do_ptrs = DO + offs_m1[:, None] * stride_dom + offs_v[None, :] * stride_dod
585
+ # BLOCK_N1 must be a multiple of BLOCK_M1, otherwise the code wouldn't work.
586
+ tl.static_assert(BLOCK_N1 % BLOCK_M1 == 0)
587
+
588
+ # The minimum is needed to handle the case where we run with a super large
589
+ # SPARSE_BLOCK_SIZE (i.e. no block-mask!)
590
+ hi = tl.minimum(sparse_q_num_blocks * SPARSE_Q_MULTIPLE, tl.maximum(tl.cdiv(Q_LEN, BLOCK_M1), 1))
591
+
592
+ for start_m in range(0, hi):
593
+ dk, dv = bwd_dkdv_block_mn(
594
+ arg_Q, arg_K, arg_V, arg_LSE, arg_DELTA, arg_DO, arg_DQ, arg_DV, arg_KV_NUM_BLKS, arg_KV_IDX, arg_Q_NUM_BLKS, arg_Q_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, arg_FULL_Q_NUM_BLKS, arg_FULL_Q_IDX, out_ptr0, ks0, ks1,
595
+ dk, dv, qT_ptrs, k, v, do_ptrs, DELTA, LSE, Q_LEN, KV_LEN,
596
+ off_z, off_hq, offs_n1, offs_m1, offs_k, offs_v,
597
+ stride_qm, stride_qd, stride_dom, stride_dod,
598
+ q_indices, sparse_q_num_blocks,
599
+ MATMUL_PRECISION, RCP_LN2,
600
+ IS_FULL_BLOCKS,
601
+ )
602
+ # Increment pointers.
603
+ offset = get_offset_for_next_block(
604
+ start_m, q_indices, sparse_q_num_blocks,
605
+ SPARSE_Q_BLOCK_SIZE, SPARSE_Q_MULTIPLE, BLOCK_M1, BLOCKS_ARE_CONTIGUOUS
606
+ )
607
+
608
+ qT_ptrs += offset * stride_qm
609
+ do_ptrs += offset * stride_dom
610
+ offs_m1 += offset
611
+
612
+ return dk, dv
613
+
614
+
615
+ @triton.jit
616
+ def bwd_dkdv_block_mn(
617
+ arg_Q, arg_K, arg_V, arg_LSE, arg_DELTA, arg_DO, arg_DQ, arg_DV, arg_KV_NUM_BLKS, arg_KV_IDX, arg_Q_NUM_BLKS, arg_Q_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, arg_FULL_Q_NUM_BLKS, arg_FULL_Q_IDX, out_ptr0, ks0, ks1,
618
+ dk, dv, qT_ptrs, k, v, do_ptrs, DELTA, LSE, Q_LEN, KV_LEN,
619
+ off_z, off_hq, offs_n1, offs_m1, offs_k, offs_v,
620
+ stride_qm, stride_qd, stride_dom, stride_dod,
621
+ q_indices, sparse_q_num_blocks,
622
+ MATMUL_PRECISION, RCP_LN2,
623
+ IS_FULL_BLOCKS,
624
+ ):
625
+ PRESCALE_QK : tl.constexpr = False
626
+ ROWS_GUARANTEED_SAFE : tl.constexpr = False
627
+ BLOCKS_ARE_CONTIGUOUS : tl.constexpr = False
628
+ WRITE_DQ : tl.constexpr = True
629
+ OUTPUT_LOGSUMEXP : tl.constexpr = True
630
+ OUTPUT_MAX : tl.constexpr = False
631
+ FLOAT32_PRECISION : tl.constexpr = 'tf32'
632
+ IS_DIVISIBLE : tl.constexpr = False
633
+ SM_SCALE : tl.constexpr = 0.08838834764831845
634
+ GQA_SHARED_HEADS : tl.constexpr = 4
635
+ HAS_FULL_BLOCKS : tl.constexpr = True
636
+ QK_HEAD_DIM : tl.constexpr = 128
637
+ QK_HEAD_DIM_ROUNDED : tl.constexpr = 128
638
+ V_HEAD_DIM : tl.constexpr = 128
639
+ V_HEAD_DIM_ROUNDED : tl.constexpr = 128
640
+ SAFE_HEAD_DIM : tl.constexpr = True
641
+ BLOCK_M1 : tl.constexpr = 64
642
+ BLOCK_N1 : tl.constexpr = 128
643
+ BLOCK_M2 : tl.constexpr = 128
644
+ BLOCK_N2 : tl.constexpr = 64
645
+ SPARSE_Q_BLOCK_SIZE : tl.constexpr = 128
646
+ SPARSE_KV_BLOCK_SIZE : tl.constexpr = 128
647
+ INDEX_DTYPE : tl.constexpr = tl.int32
648
+
649
+
650
+ # NB reversed order since Q is transposed
651
+ qT = load_checked_2d(qT_ptrs, offs_k, offs_m1, None, None, SAFE_HEAD_DIM, IS_DIVISIBLE, QK_HEAD_DIM, Q_LEN)
652
+ # Load LSE before computing qk to reduce pipeline stall.
653
+ if IS_DIVISIBLE:
654
+ lse = tl.load(LSE + offs_m1)
655
+ else:
656
+ lse = tl.load(LSE + offs_m1, mask=offs_m1 < Q_LEN)
657
+ lse = tl.where(lse == -float("inf"), 0.0, lse)
658
+ qkT = tl.dot(k, qT, input_precision=FLOAT32_PRECISION)
659
+ if not PRESCALE_QK:
660
+ qkT *= SM_SCALE
661
+ # ~~~~~~~~~~~~~~~~~~~ Apply score modification ~~~~~~~~~~~~~~~~~~~
662
+ m = get_bounded_indices(offs_m1[None, :], Q_LEN if not IS_DIVISIBLE else None)
663
+ # The boundary check is done for the outer loop, but here it's possible since we're iterating across M dim
664
+ # that the n reads out of bounds for the PIDS spanning the KV_LEN boundary
665
+ n = get_bounded_indices(offs_n1[:, None], KV_LEN if not IS_DIVISIBLE else None)
666
+
667
+ pre_mod_scores = qkT
668
+ tmp22 = (qkT)
669
+ post_mod_scores = tmp22
670
+
671
+
672
+
673
+ if not IS_DIVISIBLE:
674
+ post_mod_scores = tl.where(offs_m1[None, :] < Q_LEN, post_mod_scores, float("-inf"))
675
+
676
+ if not IS_FULL_BLOCKS:
677
+ tmp23 = (m)
678
+ tmp24 = tl.full([1], 0, tl.int32)
679
+ tmp25 = tmp23 < tmp24
680
+ tmp26 = (n)
681
+ tmp27 = tmp26 <= tmp23
682
+ tmp28 = tmp25 & tmp27
683
+ tmp29 = tmp23 >= tmp24
684
+ tmp30 = tmp26 < tmp24
685
+ tmp31 = tmp29 & tmp30
686
+ tmp32 = tmp30 == 0
687
+ tmp33 = tmp29 & tmp32
688
+ tmp34 = tmp23 - tmp24
689
+ tmp35 = tl.full([1], 16, tl.int32)
690
+ tmp36 = tl.where((tmp34 < 0) != (tmp35 < 0), tl.where(tmp34 % tmp35 != 0, tmp34 // tmp35 - 1, tmp34 // tmp35), tmp34 // tmp35)
691
+ tmp37 = tmp26 - tmp24
692
+ tmp38 = tl.where((tmp37 < 0) != (tmp35 < 0), tl.where(tmp37 % tmp35 != 0, tmp37 // tmp35 - 1, tmp37 // tmp35), tmp37 // tmp35)
693
+ tmp39 = tmp36 == tmp38
694
+ tmp40 = tmp33 & tmp39
695
+ tmp41 = tmp31 | tmp40
696
+ tmp42 = tmp28 | tmp41
697
+ mask_mod_output = tmp42
698
+
699
+ # (grads) apply mask for fully masked block
700
+ post_mod_scores = tl.where(mask_mod_output, post_mod_scores, float("-inf"))
701
+ # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
702
+ if not PRESCALE_QK:
703
+ post_mod_scores *= RCP_LN2
704
+ pT = tl.math.exp2(post_mod_scores - lse[None, :])
705
+ do = load_checked_2d(do_ptrs, offs_m1, offs_v, None, None, IS_DIVISIBLE, SAFE_HEAD_DIM, Q_LEN, V_HEAD_DIM)
706
+ # Compute dV.
707
+ ppT = pT
708
+ dv += tl.dot(ppT.to(MATMUL_PRECISION), do, input_precision=FLOAT32_PRECISION)
709
+ if IS_DIVISIBLE:
710
+ Di = tl.load(DELTA + offs_m1)
711
+ else:
712
+ Di = tl.load(DELTA + offs_m1, mask=offs_m1 < Q_LEN)
713
+ # Compute dP and dS.
714
+ dpT = tl.dot(v, tl.trans(do), input_precision=FLOAT32_PRECISION)
715
+ dsT = pT * (dpT - Di[None, :])
716
+ # ~~~~~~~~~~~~~~~~~~~ Apply joint modification ~~~~~~~~~~~~~~~~~~~
717
+ tmp43 = (dsT)
718
+ grad_scores = tmp43
719
+
720
+
721
+
722
+ if not IS_DIVISIBLE:
723
+ grad_scores = tl.where(offs_m1[None, :] < Q_LEN, grad_scores, 0.0)
724
+
725
+ # ~~~~~~~~~~~~~~~~~~~ Apply other buffer grad writes ~~~~~~~~~~~~~
726
+ if not WRITE_DQ:
727
+ idx_b = off_z
728
+ idx_h = off_hq
729
+ idx_m = m
730
+ idx_n = n
731
+ scatter_mask = (offs_m1[None, :] < Q_LEN) & (offs_n1[:, None] < KV_LEN)
732
+
733
+ # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
734
+ dsT = grad_scores
735
+ if not IS_FULL_BLOCKS:
736
+ # (grads) apply mask for partially unmasked block
737
+ dsT = tl.where(mask_mod_output, dsT, 0.0)
738
+ # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
739
+ dk += tl.dot(dsT.to(MATMUL_PRECISION), tl.trans(qT), input_precision=FLOAT32_PRECISION)
740
+
741
+ return dk, dv
742
+
743
+ # Utility triton funcs
744
+ @triton.jit
745
+ def get_offset_for_next_block(
746
+ loop_iter, col_indices, total_blocks,
747
+ SPARSE_BLOCK, SPARSE_BLOCK_MULTIPLE, BLOCK,
748
+ BLOCKS_ARE_CONTIGUOUS: tl.constexpr
749
+ ):
750
+ if BLOCKS_ARE_CONTIGUOUS:
751
+ return BLOCK
752
+ cur_block_idx = loop_iter // SPARSE_BLOCK_MULTIPLE
753
+ cur_block = tl.load(col_indices + cur_block_idx, eviction_policy="evict_last")
754
+ next_block = tl.load(col_indices + cur_block_idx + 1, eviction_policy="evict_last", mask=cur_block_idx + 1 < total_blocks)
755
+ needs_jump = (loop_iter + 1) % SPARSE_BLOCK_MULTIPLE == 0
756
+ jump_to_block = (next_block - cur_block ) * SPARSE_BLOCK - (SPARSE_BLOCK_MULTIPLE - 1) * BLOCK
757
+ offset = jump_to_block * needs_jump + (1 - needs_jump) * BLOCK
758
+ return offset
759
+
760
+ @triton.jit
761
+ def get_bounded_indices(indices, max_len=None):
762
+ return indices % max_len if max_len is not None else indices
763
+
764
+ @triton.jit
765
+ def load_checked_block(block_ptr, IS_DIVISIBLE: tl.constexpr, SAFE_HEAD_DIM: tl.constexpr):
766
+ if IS_DIVISIBLE and SAFE_HEAD_DIM:
767
+ return tl.load(block_ptr)
768
+ elif IS_DIVISIBLE and not SAFE_HEAD_DIM:
769
+ return tl.load(block_ptr, boundary_check=(1,), padding_option="zero")
770
+ elif not IS_DIVISIBLE and SAFE_HEAD_DIM:
771
+ return tl.load(block_ptr, boundary_check=(0,), padding_option="zero")
772
+ else:
773
+ return tl.load(block_ptr, boundary_check=(0, 1), padding_option="zero")
774
+
775
+ @triton.jit
776
+ def load_checked_2d(
777
+ ptr,
778
+ offs_m,
779
+ offs_n,
780
+ stride_m,
781
+ stride_n,
782
+ IS_DIVISIBLE_M: tl.constexpr,
783
+ IS_DIVISIBLE_N: tl.constexpr,
784
+ M_LEN: tl.constexpr,
785
+ N_LEN: tl.constexpr,
786
+ ):
787
+ # Calculate final pointer if strides are provided
788
+ if stride_m is not None and stride_n is not None:
789
+ ptr = ptr + offs_m[:, None] * stride_m + offs_n[None, :] * stride_n
790
+
791
+ # Handle all masking cases
792
+ if not IS_DIVISIBLE_M and not IS_DIVISIBLE_N:
793
+ return tl.load(ptr, mask=(offs_m[:, None] < M_LEN) & (offs_n[None, :] < N_LEN), other=0.0)
794
+ elif IS_DIVISIBLE_M and not IS_DIVISIBLE_N:
795
+ return tl.load(ptr, mask=(offs_n[None, :] < N_LEN), other=0.0)
796
+ elif not IS_DIVISIBLE_M and IS_DIVISIBLE_N:
797
+ return tl.load(ptr, mask=(offs_m[:, None] < M_LEN), other=0.0)
798
+ else: # Both divisible
799
+ return tl.load(ptr)
progress/SpecForge/cache/compiled_kernels/3z/c3zilfzjywngbdehwphwkhzpt6qcv6jecvzdajl2d5hb73xe6yzw.py ADDED
@@ -0,0 +1,582 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import triton
3
+ import triton.language as tl
4
+
5
+ from torch._inductor.runtime import triton_helpers, triton_heuristics
6
+ from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
7
+ from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
8
+
9
+ @triton_heuristics.template(
10
+
11
+ num_stages=3,
12
+ num_warps=2,
13
+ triton_meta={'signature': {'arg_Q': '*bf16', 'arg_K': '*bf16', 'arg_V': '*bf16', 'arg_M': '*fp32', 'arg_L': '*fp32', 'arg_KV_NUM_BLKS': '*i32', 'arg_KV_IDX': '*i32', 'arg_FULL_KV_NUM_BLKS': '*i32', 'arg_FULL_KV_IDX': '*i32', 'out_ptr0': '*fp32', 'ks0': 'i32', 'ks1': 'i32'}, 'device': DeviceProperties(type='cuda', index=4, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]], (7,): [['tt.divisibility', 16]], (8,): [['tt.divisibility', 16]], (9,): [['tt.divisibility', 16]]}]},
14
+ inductor_meta={'kernel_name': 'Placeholder.DESCRIPTIVE_NAME', 'backend_hash': 'B0E5936CA26D1BCD1B577D0B65F13FE7553DC941EB93358973E9F1902BC212C5', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'grid_type': 'FixedGrid', 'fixed_grid': ['_grid_0', '_grid_1', '_grid_2'], 'extra_launcher_args': ['_grid_0', '_grid_1', '_grid_2'], 'config_args': {'PRESCALE_QK': False, 'ROWS_GUARANTEED_SAFE': False, 'BLOCKS_ARE_CONTIGUOUS': False, 'WRITE_DQ': True, 'OUTPUT_LOGSUMEXP': True, 'OUTPUT_MAX': False, 'FLOAT32_PRECISION': "'tf32'", 'IS_DIVISIBLE': False, 'GQA_SHARED_HEADS': 4, 'HAS_FULL_BLOCKS': True, 'SM_SCALE': 0.08838834764831845, 'SPLIT_KV': 32, 'QK_HEAD_DIM': 128, 'QK_HEAD_DIM_ROUNDED': 128, 'V_HEAD_DIM': 128, 'V_HEAD_DIM_ROUNDED': 128, 'SAFE_HEAD_DIM': True, 'BLOCK_M': 512, 'SAFE_M_BOUNDARY': False, 'SAFE_N_BOUNDARY': True, 'BLOCK_N': 64, 'SPARSE_KV_BLOCK_SIZE': 128, 'USE_TMA': False}},
15
+
16
+ )
17
+ @triton.jit
18
+ def triton_flex_decoding(arg_Q, arg_K, arg_V, arg_M, arg_L, arg_KV_NUM_BLKS, arg_KV_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, out_ptr0, ks0, ks1):
19
+ PRESCALE_QK : tl.constexpr = False
20
+ ROWS_GUARANTEED_SAFE : tl.constexpr = False
21
+ BLOCKS_ARE_CONTIGUOUS : tl.constexpr = False
22
+ WRITE_DQ : tl.constexpr = True
23
+ OUTPUT_LOGSUMEXP : tl.constexpr = True
24
+ OUTPUT_MAX : tl.constexpr = False
25
+ FLOAT32_PRECISION : tl.constexpr = 'tf32'
26
+ IS_DIVISIBLE : tl.constexpr = False
27
+ GQA_SHARED_HEADS : tl.constexpr = 4
28
+ HAS_FULL_BLOCKS : tl.constexpr = True
29
+ SM_SCALE : tl.constexpr = 0.08838834764831845
30
+ SPLIT_KV : tl.constexpr = 32
31
+ QK_HEAD_DIM : tl.constexpr = 128
32
+ QK_HEAD_DIM_ROUNDED : tl.constexpr = 128
33
+ V_HEAD_DIM : tl.constexpr = 128
34
+ V_HEAD_DIM_ROUNDED : tl.constexpr = 128
35
+ SAFE_HEAD_DIM : tl.constexpr = True
36
+ BLOCK_M : tl.constexpr = 512
37
+ SAFE_M_BOUNDARY : tl.constexpr = False
38
+ SAFE_N_BOUNDARY : tl.constexpr = True
39
+ BLOCK_N : tl.constexpr = 64
40
+ SPARSE_KV_BLOCK_SIZE : tl.constexpr = 128
41
+ USE_TMA : tl.constexpr = False
42
+ INDEX_DTYPE : tl.constexpr = tl.int32
43
+ Q = arg_Q
44
+ K = arg_K
45
+ V = arg_V
46
+ M = arg_M
47
+ L = arg_L
48
+ KV_NUM_BLKS = arg_KV_NUM_BLKS
49
+ KV_IDX = arg_KV_IDX
50
+ FULL_KV_NUM_BLKS = arg_FULL_KV_NUM_BLKS
51
+ FULL_KV_IDX = arg_FULL_KV_IDX
52
+
53
+ # Sub notation for this kernel:
54
+ # Q: Query, K: Key, V: Value
55
+ # reduction buffers: M rowmax across local KV split, L local sumexp across local KV split
56
+ # M: Number of queries, N: Number of keys/values
57
+ # QK_HEAD_DIM: The dimension of the query and key embeddings
58
+ # V_HEAD_DIM: The dimension of the value embeddings
59
+ # BLOCK_M, QK_HEAD_DIM: M, and D dimemsion are always assigned to the same block
60
+ # z: Batch size, h: Number of heads, m: Number of queries per head, k: Number of keys per head t: Number of kv splits
61
+ # (Modifiable) Config options:
62
+ # SPLIT_KV: number of blocks K & V are split into
63
+ # TILE_KV: length of each local KV split
64
+ # BLOCK_M: block size that Q is padded along seqlen dim.
65
+ # BLOCK_N: block size of K & V along N dimension.
66
+ # GQA_SHARED_HEADS: number of query heads sharing one kv head in GQA setups.
67
+ #
68
+ # change of base out of the loop
69
+ # ROWS_GUARANTEED_SAFE: Is it guaranteed that at least one value in each row
70
+ # is not masked out? If so, we can skip an extra safety check
71
+ # SAFE_M_BOUNDARY: Is Q seqlen a multiple of BLOCK_M? If so, we can skip an extra boundary check for loading query.
72
+ # SAFE_N_BOUNDARY: Is KV seqlen a multiple of BLOCK_N? If so, we can skip an extra boundary check for loading key/value.
73
+
74
+ # PRESCALE_QK: Whether to pre-scale QK by 1/sqrt(d) and change of base.
75
+ #
76
+ # SPARSE_KV_BLOCK_SIZE: sparse mask block size along KV seqlen dim.
77
+ # KV_NUM_BLKS: The number of KV blocks (that may or may not require masking) for each query.
78
+ # KV_IDX: The indices of KV blocks (that may or may not require masking) for each query.
79
+ #
80
+ #
81
+ # Output: ACC output accumulated across local KV split.
82
+
83
+ tl.static_assert(SPARSE_KV_BLOCK_SIZE >= BLOCK_N and SPARSE_KV_BLOCK_SIZE % BLOCK_N == 0)
84
+
85
+ # Define Q Strides
86
+ stride_qz, stride_qh, stride_qg, stride_qm, stride_qk = 4096*ks0, 512, 128, 4096, 1
87
+ stride_kz, stride_kh, stride_kn, stride_kk = 1024*ks1, 128, 1024, 1
88
+ stride_vz, stride_vh, stride_vn, stride_vk = 1024*ks1, 128, 1024, 1
89
+ stride_mz, stride_mt, stride_mh, stride_mm = 1024*ks0, 32*ks0, ks0, 1
90
+ stride_lz, stride_lt, stride_lh, stride_lm = 1024*ks0, 32*ks0, ks0, 1
91
+
92
+
93
+ Z = 1
94
+ ZKV = 1
95
+ HKV = 8
96
+ G: tl.constexpr = GQA_SHARED_HEADS
97
+ HQ = HKV * G
98
+ Q_LEN = ks0
99
+ KV_LEN = ks1
100
+
101
+ MATMUL_PRECISION = Q.dtype.element_ty
102
+
103
+ # Make sure each split is a multiple of BLOCK_N
104
+ TILE_KV_OG = tl.cdiv(KV_LEN, SPLIT_KV)
105
+ TILE_KV = tl.cdiv(TILE_KV_OG, BLOCK_N) * BLOCK_N
106
+ TILE_KV_MULTIPLE: tl.constexpr = (TILE_KV // BLOCK_N)
107
+
108
+ off_z = tl.program_id(0).to(INDEX_DTYPE) // HKV
109
+ off_zkv = off_z % ZKV
110
+ off_hkv = tl.program_id(0).to(INDEX_DTYPE) % HKV
111
+ off_t = tl.program_id(1).to(INDEX_DTYPE)
112
+
113
+ q_offset = off_z * stride_qz + off_hkv * stride_qh
114
+ k_offset = off_zkv * stride_kz + off_hkv * stride_kh
115
+ v_offset = off_zkv * stride_vz + off_hkv * stride_vh
116
+
117
+ K = K + k_offset
118
+ V = V + v_offset
119
+
120
+ SPARSE_Z = 1
121
+ SPARSE_HQ = 1
122
+
123
+ sparse_idx_z = off_z % SPARSE_Z
124
+ sparse_idx_h = off_hkv % SPARSE_HQ
125
+
126
+ SPARSE_KV_MULTIPLE: tl.constexpr = (SPARSE_KV_BLOCK_SIZE // BLOCK_N)
127
+ SPARSE_KV_BLOCK_CNT = tl.cdiv(KV_LEN, SPARSE_KV_BLOCK_SIZE)
128
+
129
+ # initialize pointer to m and l
130
+ m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf")
131
+ l_i = tl.zeros([BLOCK_M], dtype=tl.float32)
132
+ acc = tl.zeros([BLOCK_M, V_HEAD_DIM_ROUNDED], dtype=tl.float32)
133
+
134
+ # initialize offsets
135
+ tl.device_assert(BLOCK_M % G == 0)
136
+ BLOCK_M_PER_HQ: tl.constexpr = BLOCK_M // G
137
+ off_g = tl.arange(0, G) # [G]
138
+ offs_g = tl.ravel(tl.broadcast_to(off_g[:, None], [G, BLOCK_M_PER_HQ])) # [BLOCK_M]
139
+ offs_hq = offs_g + off_hkv * G
140
+ off_m = tl.arange(0, BLOCK_M_PER_HQ) # [BLOCK_M_PER_HQ]
141
+ offs_m = tl.ravel(tl.broadcast_to(off_m[None, :], [G, BLOCK_M_PER_HQ])) # [BLOCK_M]
142
+ offs_d = tl.arange(0, QK_HEAD_DIM_ROUNDED)
143
+ offs_vd = tl.arange(0, V_HEAD_DIM_ROUNDED)
144
+
145
+ # Get HZ offsets for KV_NUM_BLKS and KV_IDX
146
+ stride_block_z, stride_block_h, stride_block_row = 1, 1, 1
147
+ sparse_block_hz_offset = sparse_idx_z * stride_block_z + sparse_idx_h * stride_block_h
148
+ stride_kv_z, stride_kv_h, stride_kv_row, stride_kv_col = 1, 1, 1, 1
149
+ sparse_idx_hz_offset = sparse_idx_z * stride_kv_z + sparse_idx_h * stride_kv_h
150
+
151
+ # Calculate KV blocks that belong this CTA.
152
+ block_n_start = off_t * TILE_KV_MULTIPLE # n_offset inside sparse block
153
+ block_n_end = block_n_start + TILE_KV_MULTIPLE # end BLOCK_N
154
+
155
+ q_range = stride_qg * off_g[:, None, None] + stride_qm * off_m[None, :, None] + stride_qk * offs_d[None, None, :]
156
+
157
+ if not SAFE_M_BOUNDARY and not SAFE_HEAD_DIM:
158
+ q = tl.load(Q + q_offset + q_range, mask=(offs_d[None, None, :] < QK_HEAD_DIM) & (off_m[None, :, None] < Q_LEN))
159
+ elif SAFE_M_BOUNDARY and not SAFE_HEAD_DIM:
160
+ q = tl.load(Q + q_offset + q_range, mask=offs_d[None, None, :] < QK_HEAD_DIM)
161
+ elif not SAFE_M_BOUNDARY and SAFE_HEAD_DIM:
162
+ q = tl.load(Q + q_offset + q_range, mask=off_m[None, :, None] < Q_LEN)
163
+ else:
164
+ q = tl.load(Q + q_offset + q_range)
165
+
166
+ q = tl.reshape(q, [BLOCK_M, QK_HEAD_DIM_ROUNDED])
167
+
168
+
169
+ # ~~~~~~~~~~~~~~ normal blocks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
170
+ # find first kv block we are loading and the number of blocks we are loading
171
+ # Offset the kv_indices tensor by the correct batch and head
172
+ kv_indices = KV_IDX + sparse_idx_hz_offset
173
+ kv_num_blocks = tl.load(KV_NUM_BLKS + sparse_block_hz_offset)
174
+ MAX_KV_IDX = 1
175
+ indices_idx = (block_n_start // SPARSE_KV_MULTIPLE) % (MAX_KV_IDX)
176
+ off_n_block_in_sparse = block_n_start % SPARSE_KV_MULTIPLE
177
+ off_n = tl.load(kv_indices + indices_idx) * SPARSE_KV_BLOCK_SIZE + off_n_block_in_sparse * BLOCK_N
178
+ # first kv block we're loading
179
+
180
+ # last valid block according to sparse mask
181
+ block_n_last_valid = tl.minimum(kv_num_blocks * SPARSE_KV_MULTIPLE, tl.maximum(tl.cdiv(KV_LEN, BLOCK_N), 1))
182
+
183
+ offs_n = tl.arange(0, BLOCK_N) + off_n
184
+
185
+ desc_k = None
186
+ desc_v = None
187
+
188
+ acc, l_i, m_i = forward_inner(
189
+ arg_Q, arg_K, arg_V, arg_M, arg_L, arg_KV_NUM_BLKS, arg_KV_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, out_ptr0, ks0, ks1,
190
+ q, K, V, desc_k, desc_v, Q_LEN, KV_LEN,
191
+ # accumulatd values
192
+ acc, l_i, m_i,
193
+ #offsets
194
+ off_z, offs_hq[:, None], offs_m[:, None], offs_n[None, :],
195
+ off_n,
196
+ #block sparse data
197
+ kv_indices, kv_num_blocks,
198
+ block_n_start, block_n_end if block_n_end <= block_n_last_valid else block_n_last_valid,
199
+ MATMUL_PRECISION,
200
+ stride_kk, stride_kn, stride_vn, stride_vk,
201
+ IS_FULL_BLOCKS=False,
202
+ )
203
+
204
+
205
+ # ~~~~~~~~~~~~~~ "full" blocks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
206
+ # We know these blocks are guaranteed to be "full", so we don't need to
207
+ # apply mask_mod to them - only score_mod
208
+ if HAS_FULL_BLOCKS:
209
+ kv_indices = FULL_KV_IDX + sparse_idx_hz_offset
210
+ kv_num_blocks = tl.load(FULL_KV_NUM_BLKS + sparse_block_hz_offset)
211
+ # Assign full block in a reverse order for off_t. Prioritize the last CTA.
212
+ block_n_start = (SPLIT_KV - off_t - 1) * TILE_KV_MULTIPLE
213
+ block_n_end = block_n_start + TILE_KV_MULTIPLE
214
+ indices_idx = (block_n_start // SPARSE_KV_MULTIPLE) % (MAX_KV_IDX)
215
+ off_n_block_in_sparse = block_n_start % SPARSE_KV_MULTIPLE
216
+ off_n = tl.load(kv_indices + indices_idx) * SPARSE_KV_BLOCK_SIZE + off_n_block_in_sparse * BLOCK_N
217
+
218
+ # last valid block according to sparse mask
219
+ block_n_last_valid = tl.minimum(kv_num_blocks * SPARSE_KV_MULTIPLE, tl.maximum(tl.cdiv(KV_LEN, BLOCK_N), 1))
220
+
221
+ offs_n = tl.arange(0, BLOCK_N) + off_n
222
+
223
+ acc, l_i, m_i = forward_inner(
224
+ arg_Q, arg_K, arg_V, arg_M, arg_L, arg_KV_NUM_BLKS, arg_KV_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, out_ptr0, ks0, ks1,
225
+ q, K, V, desc_k, desc_v, Q_LEN, KV_LEN,
226
+ # accumulatd values
227
+ acc, l_i, m_i,
228
+ #offsets
229
+ off_z, offs_hq[:, None], offs_m[:, None], offs_n[None, :],
230
+ off_n,
231
+ #block sparse data
232
+ kv_indices, kv_num_blocks,
233
+ block_n_start, block_n_end if block_n_end <= block_n_last_valid else block_n_last_valid,
234
+ MATMUL_PRECISION,
235
+ stride_kk, stride_kn, stride_vn, stride_vk,
236
+ IS_FULL_BLOCKS=True,
237
+ )
238
+
239
+ m_offset = off_t * stride_mt + off_z * stride_mz
240
+ l_offset = off_t * stride_lt + off_z * stride_lz
241
+
242
+ M_block_ptr = tl.make_block_ptr(
243
+ base=M + m_offset,
244
+ shape=(G, Q_LEN), # (G, M)
245
+ strides=(stride_mh, stride_mm),
246
+ offsets=(off_hkv*G, 0),
247
+ block_shape=(G, BLOCK_M_PER_HQ),
248
+ order=(1, 0)
249
+ )
250
+ L_block_ptr = tl.make_block_ptr(
251
+ base=L + l_offset,
252
+ shape=(G, Q_LEN), # (G, M)
253
+ strides=(stride_lh, stride_lm),
254
+ offsets=(off_hkv*G, 0),
255
+ block_shape=(G, BLOCK_M_PER_HQ),
256
+ order=(1, 0)
257
+ )
258
+
259
+ # Store output, logsumexp and rowmax for cross CTA reduction. (all in float32, even when input data are in fp16)
260
+ m_i = m_i.reshape(G, BLOCK_M_PER_HQ)
261
+ l_i = l_i.reshape(G, BLOCK_M_PER_HQ)
262
+ if SAFE_M_BOUNDARY:
263
+ tl.store(M_block_ptr, m_i)
264
+ tl.store(L_block_ptr, l_i)
265
+ else:
266
+ tl.store(M_block_ptr, m_i, boundary_check=(1,))
267
+ tl.store(L_block_ptr, l_i, boundary_check=(1,))
268
+
269
+ # -- store output
270
+ idx_z = off_z
271
+ idx_t = off_t
272
+ idx_hq = off_hkv*G + off_g[:, None, None]
273
+ idx_m = off_m[None, :, None]
274
+ idx_d = offs_vd[None, None, :]
275
+
276
+ mask = (idx_m < Q_LEN) & (idx_d < V_HEAD_DIM)
277
+ acc = acc.reshape(G, BLOCK_M_PER_HQ, V_HEAD_DIM)
278
+ xindex = idx_d + 128*idx_m + 128*idx_hq*ks0 + 4096*idx_t*ks0 + 131072*idx_z*ks0
279
+ tl.store(out_ptr0 + (tl.broadcast_to(idx_d + 128*idx_m + 128*idx_hq*ks0 + 4096*idx_t*ks0, acc.shape)), acc, mask)
280
+
281
+
282
+ # Utility triton funcs
283
+ @triton.jit
284
+ def get_offset_for_next_block(
285
+ loop_iter, col_indices, total_blocks,
286
+ SPARSE_BLOCK, SPARSE_BLOCK_MULTIPLE, BLOCK,
287
+ BLOCKS_ARE_CONTIGUOUS: tl.constexpr
288
+ ):
289
+ if BLOCKS_ARE_CONTIGUOUS:
290
+ return BLOCK
291
+ cur_block_idx = loop_iter // SPARSE_BLOCK_MULTIPLE
292
+ cur_block = tl.load(col_indices + cur_block_idx, eviction_policy="evict_last")
293
+ next_block = tl.load(col_indices + cur_block_idx + 1, eviction_policy="evict_last", mask=cur_block_idx + 1 < total_blocks)
294
+ needs_jump = (loop_iter + 1) % SPARSE_BLOCK_MULTIPLE == 0
295
+ jump_to_block = (next_block - cur_block ) * SPARSE_BLOCK - (SPARSE_BLOCK_MULTIPLE - 1) * BLOCK
296
+ offset = jump_to_block * needs_jump + (1 - needs_jump) * BLOCK
297
+ return offset
298
+
299
+ @triton.jit
300
+ def get_bounded_indices(indices, max_len=None):
301
+ return indices % max_len if max_len is not None else indices
302
+
303
+ @triton.jit
304
+ def load_checked_block(block_ptr, IS_DIVISIBLE: tl.constexpr, SAFE_HEAD_DIM: tl.constexpr):
305
+ if IS_DIVISIBLE and SAFE_HEAD_DIM:
306
+ return tl.load(block_ptr)
307
+ elif IS_DIVISIBLE and not SAFE_HEAD_DIM:
308
+ return tl.load(block_ptr, boundary_check=(1,), padding_option="zero")
309
+ elif not IS_DIVISIBLE and SAFE_HEAD_DIM:
310
+ return tl.load(block_ptr, boundary_check=(0,), padding_option="zero")
311
+ else:
312
+ return tl.load(block_ptr, boundary_check=(0, 1), padding_option="zero")
313
+
314
+ @triton.jit
315
+ def load_checked_2d(
316
+ ptr,
317
+ offs_m,
318
+ offs_n,
319
+ stride_m,
320
+ stride_n,
321
+ IS_DIVISIBLE_M: tl.constexpr,
322
+ IS_DIVISIBLE_N: tl.constexpr,
323
+ M_LEN: tl.constexpr,
324
+ N_LEN: tl.constexpr,
325
+ ):
326
+ # Calculate final pointer if strides are provided
327
+ if stride_m is not None and stride_n is not None:
328
+ ptr = ptr + offs_m[:, None] * stride_m + offs_n[None, :] * stride_n
329
+
330
+ # Handle all masking cases
331
+ if not IS_DIVISIBLE_M and not IS_DIVISIBLE_N:
332
+ return tl.load(ptr, mask=(offs_m[:, None] < M_LEN) & (offs_n[None, :] < N_LEN), other=0.0)
333
+ elif IS_DIVISIBLE_M and not IS_DIVISIBLE_N:
334
+ return tl.load(ptr, mask=(offs_n[None, :] < N_LEN), other=0.0)
335
+ elif not IS_DIVISIBLE_M and IS_DIVISIBLE_N:
336
+ return tl.load(ptr, mask=(offs_m[:, None] < M_LEN), other=0.0)
337
+ else: # Both divisible
338
+ return tl.load(ptr)
339
+
340
+
341
+ # Common Imports
342
@triton.jit
def forward_block_mn(
    arg_Q, arg_K, arg_V, arg_M, arg_L, arg_KV_NUM_BLKS, arg_KV_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, out_ptr0, ks0, ks1,
    q, K, V, desc_k, desc_v, Q_LEN, KV_LEN,
    # accumulated values
    acc, l_i, m_i,
    # Offsets
    off_z, off_h, offs_m, offs_n,
    # Offsets needed for TMA loads
    kv_start,
    kv_offset,
    MATMUL_PRECISION, RCP_LN2,
    # Strides for K and V
    stride_kk, stride_kn, stride_vn, stride_vk,
    IS_FULL_BLOCKS, CHECK_BLOCK_BOUNDARY=False,

):
    """Process one [BLOCK_M, BLOCK_N] attention tile and fold it into the
    running online-softmax state.

    Loads a K tile, computes scaled qk scores, applies the generated mask
    predicate (unless IS_FULL_BLOCKS), then performs the streaming-softmax
    update: rescale (acc, l_i) by exp2(m_i - m_new) and accumulate p @ V.

    Returns the updated (acc, l_i, m_i) triple.

    NOTE(review): this is machine-generated (inductor FlexAttention template);
    several parameters (arg_Q..arg_FULL_KV_IDX, desc_k, desc_v, off_z, off_h,
    out_ptr0, ks0, ks1) are never read in this body — presumably kept for a
    uniform template signature; confirm against the generator before pruning.
    """
    # Redefines all kernel parameters (BLOCK_M, etc.) so we don't need to plumb them all through
    PRESCALE_QK : tl.constexpr = False
    ROWS_GUARANTEED_SAFE : tl.constexpr = False
    BLOCKS_ARE_CONTIGUOUS : tl.constexpr = False
    WRITE_DQ : tl.constexpr = True
    OUTPUT_LOGSUMEXP : tl.constexpr = True
    OUTPUT_MAX : tl.constexpr = False
    FLOAT32_PRECISION : tl.constexpr = 'tf32'
    IS_DIVISIBLE : tl.constexpr = False
    GQA_SHARED_HEADS : tl.constexpr = 4
    HAS_FULL_BLOCKS : tl.constexpr = True
    # NOTE(review): 0.0883883... ≈ 1/sqrt(128), matching QK_HEAD_DIM=128 below.
    SM_SCALE : tl.constexpr = 0.08838834764831845
    SPLIT_KV : tl.constexpr = 32
    QK_HEAD_DIM : tl.constexpr = 128
    QK_HEAD_DIM_ROUNDED : tl.constexpr = 128
    V_HEAD_DIM : tl.constexpr = 128
    V_HEAD_DIM_ROUNDED : tl.constexpr = 128
    SAFE_HEAD_DIM : tl.constexpr = True
    BLOCK_M : tl.constexpr = 512
    SAFE_M_BOUNDARY : tl.constexpr = False
    SAFE_N_BOUNDARY : tl.constexpr = True
    BLOCK_N : tl.constexpr = 64
    SPARSE_KV_BLOCK_SIZE : tl.constexpr = 128
    USE_TMA : tl.constexpr = False
    INDEX_DTYPE : tl.constexpr = tl.int32


    # -- load k --
    # NB reversed order to since K is transposed
    kv_base_offset = kv_start + kv_offset

    # Load K as [BLOCK_N, QK_HEAD_DIM_ROUNDED] then transpose to [QK_HEAD_DIM_ROUNDED, BLOCK_N]
    offs_k = tl.arange(0, QK_HEAD_DIM_ROUNDED)
    offs_n_load = kv_base_offset + tl.arange(0, BLOCK_N)
    k = load_checked_2d(K, offs_n_load, offs_k, stride_kn, stride_kk, IS_DIVISIBLE, SAFE_HEAD_DIM, KV_LEN, QK_HEAD_DIM)

    k = tl.trans(k)
    # -- compute qk ---
    qk = tl.dot(q, k, input_precision=FLOAT32_PRECISION) # TODO: use cuda matmul when q_len <= 2.
    if not PRESCALE_QK:
        qk *= SM_SCALE
    # ~~~~~~~~~~~~~~~~~~~ Apply score modification ~~~~~~~~~~~~~~~~~~~
    # If this is the last block of a non divisible seqlen, we still need to load [BLOCK_M, BLOCK_N] elements,
    # which is larger than the actual number of elements. To avoid access memory out of bound,
    # we need to mask out the elements that are out of Q_LEN & KV_LEN.
    # get_bounded_indices is defined elsewhere in this file; with a None bound
    # it is presumably a passthrough — confirm against its definition.
    m = get_bounded_indices(offs_m, Q_LEN if CHECK_BLOCK_BOUNDARY else None)
    n = get_bounded_indices(offs_n, KV_LEN if CHECK_BLOCK_BOUNDARY else None)

    # Identity score_mod: scores pass through unchanged.
    tmp0 = (qk)
    post_mod_scores = tmp0


    if CHECK_BLOCK_BOUNDARY:
        # Mask out the elements that are out of the KV_LEN for non divisible seqlen.
        post_mod_scores = tl.where(offs_n < KV_LEN, post_mod_scores, float("-inf"))

    if not IS_FULL_BLOCKS:
        # Generated mask_mod predicate over (query index m, kv index n).
        # NOTE(review): the arithmetic compares sign-corrected floor divisions
        # of m and n by 16 (tmp13), i.e. 16-wide index-block membership, OR'd
        # with ordering tests on m and n — the exact user mask it encodes is
        # not recoverable from here; treat as opaque generated code.
        tmp1 = (m)
        tmp2 = tl.full([1], 0, tl.int32)
        tmp3 = tmp1 < tmp2
        tmp4 = (n)
        tmp5 = tmp4 <= tmp1
        tmp6 = tmp3 & tmp5
        tmp7 = tmp1 >= tmp2
        tmp8 = tmp4 < tmp2
        tmp9 = tmp7 & tmp8
        tmp10 = tmp8 == 0
        tmp11 = tmp7 & tmp10
        tmp12 = tmp1 - tmp2
        tmp13 = tl.full([1], 16, tl.int32)
        # Floor division emulation: adjusts truncating // toward -inf when
        # operand signs differ and there is a remainder.
        tmp14 = tl.where((tmp12 < 0) != (tmp13 < 0), tl.where(tmp12 % tmp13 != 0, tmp12 // tmp13 - 1, tmp12 // tmp13), tmp12 // tmp13)
        tmp15 = tmp4 - tmp2
        tmp16 = tl.where((tmp15 < 0) != (tmp13 < 0), tl.where(tmp15 % tmp13 != 0, tmp15 // tmp13 - 1, tmp15 // tmp13), tmp15 // tmp13)
        tmp17 = tmp14 == tmp16
        tmp18 = tmp11 & tmp17
        tmp19 = tmp9 | tmp18
        tmp20 = tmp6 | tmp19
        mask_mod_output = tmp20


        if CHECK_BLOCK_BOUNDARY:
            # Out-of-range kv positions must never pass the mask.
            mask_mod_output = tl.where(offs_n < KV_LEN, mask_mod_output, False)
        # apply mask for partially unmasked blocks
        post_mod_scores = tl.where(mask_mod_output, post_mod_scores, float("-inf"))

    if not PRESCALE_QK:
        # Convert to base-2 domain (RCP_LN2 = 1/ln 2) so exp2 below computes exp.
        post_mod_scores *= RCP_LN2
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    # -- compute scaling constant ---
    # Online softmax: new running row-max across this tile.
    m_ij = tl.maximum(m_i, tl.max(post_mod_scores, 1))
    if not ROWS_GUARANTEED_SAFE:
        # Rows where every score is -inf would make exp2(-inf - -inf) NaN;
        # substitute 0 for the max on such rows.
        masked_out_rows = (m_ij == float("-inf"))
        m_ij_masked = tl.where(masked_out_rows, 0, m_ij)
    else:
        m_ij_masked = m_ij

    # alpha rescales previous (acc, l_i) to the new max; p are tile weights.
    alpha = tl.math.exp2(m_i - m_ij_masked)
    p = tl.math.exp2(post_mod_scores - m_ij_masked[:, None])

    # NB: l_i update is pulled up here since it's a bit faster
    # NB: For headdim=256, it's faster to move it back down to after m_i =
    # m_ij
    l_i = l_i * alpha + tl.sum(p, 1)
    # # -- scale and update acc --
    acc = acc * alpha[:, None]
    # Calculate offsets for V loading - reuse kv_base_offset from K loading
    offs_v = tl.arange(0, V_HEAD_DIM_ROUNDED)
    v = load_checked_2d(V, offs_n_load, offs_v, stride_vn, stride_vk, IS_DIVISIBLE, SAFE_HEAD_DIM, KV_LEN, V_HEAD_DIM)
    acc = tl.dot(p.to(MATMUL_PRECISION), v, acc, input_precision=FLOAT32_PRECISION)

    # -- update m_i
    m_i = m_ij

    return acc, l_i, m_i
474
+
475
@triton.jit
def forward_inner(
    arg_Q, arg_K, arg_V, arg_M, arg_L, arg_KV_NUM_BLKS, arg_KV_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, out_ptr0, ks0, ks1,
    q, K, V,
    desc_k, desc_v, Q_LEN, KV_LEN,
    # accumulated values
    acc, l_i, m_i,
    # Offsets used as inputs to score_mod & mask_mod
    # of size [BLOCK_M, BLOCK_N] or scalar.
    off_z, off_h, offs_m, offs_n,
    # Offsets needed for TMA loads
    kv_start,
    # blocksparse data
    kv_indices, kv_num_blocks,
    # start kv and end kv block
    block_n_start, block_n_end,
    MATMUL_PRECISION,
    # Strides for K and V
    stride_kk, stride_kn, stride_vn, stride_vk,
    IS_FULL_BLOCKS,
):
    """Iterate over the sparse KV blocks [block_n_start, block_n_end) and
    fold each [BLOCK_M, BLOCK_N] tile into the online-softmax state via
    forward_block_mn.

    Between tiles, offs_n / kv_offset are advanced by the block-sparse
    offset from get_offset_for_next_block (defined elsewhere in this file).

    Returns the updated (acc, l_i, m_i) triple.
    """
    # Redefines all kernel parameters (BLOCK_M, etc.) so we don't need to plumb them all through
    PRESCALE_QK : tl.constexpr = False
    ROWS_GUARANTEED_SAFE : tl.constexpr = False
    BLOCKS_ARE_CONTIGUOUS : tl.constexpr = False
    WRITE_DQ : tl.constexpr = True
    OUTPUT_LOGSUMEXP : tl.constexpr = True
    OUTPUT_MAX : tl.constexpr = False
    FLOAT32_PRECISION : tl.constexpr = 'tf32'
    IS_DIVISIBLE : tl.constexpr = False
    GQA_SHARED_HEADS : tl.constexpr = 4
    HAS_FULL_BLOCKS : tl.constexpr = True
    SM_SCALE : tl.constexpr = 0.08838834764831845
    SPLIT_KV : tl.constexpr = 32
    QK_HEAD_DIM : tl.constexpr = 128
    QK_HEAD_DIM_ROUNDED : tl.constexpr = 128
    V_HEAD_DIM : tl.constexpr = 128
    V_HEAD_DIM_ROUNDED : tl.constexpr = 128
    SAFE_HEAD_DIM : tl.constexpr = True
    BLOCK_M : tl.constexpr = 512
    SAFE_M_BOUNDARY : tl.constexpr = False
    SAFE_N_BOUNDARY : tl.constexpr = True
    BLOCK_N : tl.constexpr = 64
    SPARSE_KV_BLOCK_SIZE : tl.constexpr = 128
    USE_TMA : tl.constexpr = False
    INDEX_DTYPE : tl.constexpr = tl.int32


    # How many BLOCK_N tiles fit in one sparse KV block (128 // 64 = 2).
    SPARSE_KV_MULTIPLE: tl.constexpr = (SPARSE_KV_BLOCK_SIZE // BLOCK_N)
    # 1/ln 2: converts natural-log-domain scores to base-2 for exp2.
    RCP_LN2: tl.constexpr = 1.44269504

    if PRESCALE_QK:
        # Fold softmax scale + base-2 conversion into q once, up front.
        q = (q * SM_SCALE * RCP_LN2).to(MATMUL_PRECISION)

    kv_offset = 0

    # loop over k, v and update accumulator until block_n_end
    for start_n in range(block_n_start, block_n_end):
        # Here IS_DIVISIBLE acts are the start_n = tl.multiple_of(start_n, BLOCK_N) from triton_fused_attention.
        if IS_DIVISIBLE:
            acc, l_i, m_i = forward_block_mn(
                arg_Q, arg_K, arg_V, arg_M, arg_L, arg_KV_NUM_BLKS, arg_KV_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, out_ptr0, ks0, ks1,
                q, K, V, desc_k, desc_v, Q_LEN, KV_LEN,
                # accumulated values
                acc, l_i, m_i,
                # Offsets
                off_z, off_h, offs_m, offs_n,
                # Offsets needed for TMA loads
                kv_start,
                kv_offset,
                MATMUL_PRECISION, RCP_LN2,
                # Strides for K and V
                stride_kk, stride_kn, stride_vn, stride_vk,
                IS_FULL_BLOCKS,
            )
        else:
            # Benchmark shows even we applied mod & mask to each block for non divisible seqlen,
            # it's on par or slightly faster than only applying to the last block in fwd.
            # However, we choose different strategy for bwd, where we only apply mod & mask
            # to the last block because it's faster a lot.
            acc, l_i, m_i = forward_block_mn(
                arg_Q, arg_K, arg_V, arg_M, arg_L, arg_KV_NUM_BLKS, arg_KV_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, out_ptr0, ks0, ks1,
                q, K, V, desc_k, desc_v, Q_LEN, KV_LEN,
                # accumulated values
                acc, l_i, m_i,
                # Offsets
                off_z, off_h, offs_m, offs_n,
                # Offsets needed for TMA loads
                kv_start,
                kv_offset,
                MATMUL_PRECISION, RCP_LN2,
                # Strides for K and V
                stride_kk, stride_kn, stride_vn, stride_vk,
                IS_FULL_BLOCKS, CHECK_BLOCK_BOUNDARY=True,
            )



        # Advance to the next block-sparse KV tile; the helper is defined
        # elsewhere in this file (indexes kv_indices / kv_num_blocks).
        offset = get_offset_for_next_block(
            start_n, kv_indices, kv_num_blocks,
            SPARSE_KV_BLOCK_SIZE, SPARSE_KV_MULTIPLE, BLOCK_N, BLOCKS_ARE_CONTIGUOUS
        )

        offs_n = offs_n + offset
        kv_offset += offset


    return acc, l_i, m_i
progress/SpecForge/cache/compiled_kernels/4a/7887d45b1aa6124e232769adbe995f9cc2af0dd187cb9928540172d82c7b8631.best_config ADDED
@@ -0,0 +1 @@
 
 
1
+ {"XBLOCK": 256, "num_warps": 4, "num_stages": 1, "configs_hash": "1b2cc4dbebb9680d3ce31843331593b159e4046c056f195ca1ccf2464d5b37d1", "found_by_coordesc": false, "time_taken_ms": 11, "triton_cache_hash": "BZAXIZYYJGUVREZ5ANMEKVK5UU77TPVNED7QAB22EKNJIKFVURYA"}